scripts/getAlleleByLocus_mod.R

##' function to return of list of variant GRanges that overlap a particular gene/locus for a set of ids 
##'
##' This function takes a geneID, list of sample IDs, a metadata table, and returns a list of
##' alleles/variants that overlap a gene/locus or a set of genes/loci from these samples
##' @title 
##' @param gene the Entrez gene id(s) for the gene(s) of intest. if multiple it should be a vector
##' @param locus a GRanges object with the locus (loci) positional information and allele informtion. see help for example
##' @param txdb txdb of all genes
##' @param ids a list of ids for the samples of interest
##' @param meta a metaData table as generated by getCGPMetadata()
##' @param tech a character string denoting the technology ('rna_seq' or 'exome') to use
##' @param type a character string denoting the type of variants to be checked. Can be somatic, called, or raw
##' @param exact only used with the locus option, is a true/false to determine if exact matches to alleles are wanted.
##'              If true the GR must have a elementMetaData called variant with the allele of interest given  
##' @param save_dir directory to save VCF files. If null, no vcf files will be generated
##' @param cores number of cores to run on
##' @return a list of variant GRanges
##' @author Jeremiah Degenhardt
##'
##' 
##' @export
getAlleleByLocus_mod<- function(gene=NULL,
                              locus =NULL,
                              gene_db,
                              ids,
                              meta,
                              tech = "exon",
                              type="somatic",
                              exact=FALSE,
                              save_dir=NULL,
                              cores=detectCores()){
  warning("This function is hard coded with the genome information from the CGP3.0 release...")
  genome_dir = '/gne/research/workspace/degenhj2'
  genome = 'hg19_IGIS21'
  if(tech=="exon"){
    bq = 56
  }else{
    bq=23
  }
  if(!is.null(gene) && !is.null(locus)){
    stop("must supply either a gene ID or locus GRanges but not both")
  }
  if(!is.null(gene)){
    ##genes <- normalizeGeneNames(gene_db)
    gId <- paste("GeneID:", gene, sep = "")
    InTxdb <- gId %in% names(genes)
    if(any(!InTxdb)){
      missing <- gId[which(!InTxdb)]
      gId <- gId[InTxdb] 
      message(missing, " is not in the txdb. \nRemoving this gene from further analysis")
    }
    gene <- genes[gId]
    gene <-unlist(gene)
    gene<-reduce(gene)
  }
  if(!is.null(locus)){
    gene <- locus
  }
  ##gene <- normalizeChrNames(gene)
  if(type =="somatic"){
    post <- ".sample_specific_variants_granges.RData"
  }else{
    post <- ".merged.uniques.bam"
  }
  in_gene <- mclapply(ids, mc.cores=cores, function(sam){
    if(tech == "exome"){
      meta<-meta[meta$project == "exome",]
    }else if(tech =="rna_seq"){
      meta<-meta[meta$project == "rna_seq",]
    }
    ##dir<-meta$results_dir[meta$srcid==sam]
    if(type =="somatic"){
      dir <-"/gnet/is3/research/data/bioinfo/ngs_analysis/degenhj2/CGP3.0_colon_var_rerun/TN/"
    }else{
     dir<-meta$results_dir[meta$srcid==sam]
    }
    if(length(dir)==0){
      message(paste("The SAM ", sam, " does not exist in meta data", sep=""))
      return(GRanges())
    }
    if(type =="somatic"){
      if(file.exists(file.path(dir, sam, paste(sam, post, sep = "")))){
        gr <- load(file.path(dir, sam, paste(sam, post, sep = "")))
        gr <- get(gr)
      }else{
        message(paste("The SAM ", sam ," does not have sample specific variants", sep =""))
        return(GRanges())
      }
    }
    if(type == "called"){
      if(file.exists(file.path(dir,"bams/merged/", paste(sam, post, sep = "")))){
        bam_file <- file.path(dir,"bams/merged/", paste(sam, post, sep = ""))
        gg <- GmapGenome(genome=genome, directory=genome_dir)
        param <- BamTallyParam(which=gene, 
                               high_quality_cutoff = as.integer(bq),
                               minimum_mapq = 13L,
                               min_depth = 0L, variant_strand = 1L,
                               ignore_query_Ns = TRUE,
                               indels = FALSE)
        gr <- bam_tally(bam_file, gg, param)
        ##gr <- tally2GR(bamfiles=bam_file, genome=genome,genome_dir=genome_dir,chr_gr=gene, variant_strand=1, map_qual=13, bqual_thresh=bq)
        gr <-variantFilter(gr, useQual=TRUE)
        gr <- gr$filtered_granges
      }else{
        message(paste("The SAM ", sam, " does not have bam files", sep = ""))
        return(GRanges())
      }
    }
    if(type == "raw"){
      bam_file <- file.path(dir,"bams/merged/", paste(sam, post, sep = ""))
      gg <- GmapGenome(genome=genome, directory=genome_dir)
      param <- BamTallyParam(which=gene,
                             high_quality_cutoff = as.integer(bq),
                             minimum_mapq = 13L,
                             min_depth = 0L, variant_strand = 0L,
                             ignore_query_Ns = TRUE,
                             indels = FALSE)
      gr <- bam_tally(bam_file, gg, param)
      gr <- tally2GR(bamfiles=bam_file, genome=genome,genome_dir=genome_dir,chr_gr=gene, variant_strand=0, map_qual=13, bqual_thresh=bq)
    }

    ##gr <- normalizeChrNames(gr)
    if(exact && !is.null(locus)){
      vec1 <- paste(seqnames(gr), start(gr), values(gr)$read, sep = ":")
      vec2 <- paste(seqnames(gene), start(gene), values(gene)$variant, sep = ":")
      ind <- which(vec1 %in% vec2)
      InGene <- gr[ind]
    }else{
      seqlevels(gene) <- seqlevels(gr)
      seqlengths(gene) <- seqlengths(gr)
      over <- suppressWarnings(findOverlaps(gr, gene))
      InGene <- gr[unique(queryHits(over))]
      if(!is.null(save_dir)){
        InGene <- sort(InGene)
        cgpGr2vcf(GR=InGene, filename=paste(save_dir,"/", sam, ".vcf", sep = ""), project="Mutations_by_gene", sample_id=sam)
      }
    }
    return(InGene)
  })
  names(in_gene) <- as.character(ids)
  return(in_gene)
}
lawremi/VariantTools documentation built on March 4, 2024, 11:54 a.m.