R/xSymbol2GeneID.r

Defines functions xSymbol2GeneID

Documented in xSymbol2GeneID

#' Function to convert gene symbols to entrez geneid
#'
#' \code{xSymbol2GeneID} is supposed to convert gene symbols to entrez geneid.
#'
#' @param data an input vector containing gene symbols
#' @param org a character specifying an organism. Currently supported organisms are 'human' and 'mouse'. It can be an object 'EG'
#' @param check.symbol.identity logical to indicate whether to match the input data via Synonyms for those unmatchable by official gene symbols. By default, it sets to false
#' @param details logical to indicate whether to result in a data frame (in great details). By default, it sets to false
#' @param verbose logical to indicate whether the messages will be displayed in the screen. By default, it sets to false for no display
#' @param RData.location the characters to tell the location of built-in RData files. See \code{\link{xRDataLoader}} for details
#' @param guid a valid (5-character) Global Unique IDentifier for an OSF project. See \code{\link{xRDataLoader}} for details
#' @return a vector containing entrez geneid with 'NA' for the unmatched if (details set to false); otherwise, a data frame is returned
#' @note If a symbol mapped many times, the one assiged as the "protein-coding" type of gene is preferred.
#' @export
#' @seealso \code{\link{xRDataLoader}}
#' @include xSymbol2GeneID.r
#' @examples
#' RData.location <- "http://galahad.well.ox.ac.uk/bigdata"
#' \dontrun{
#' 
#' # a) provide the input Genes of interest (eg 100 randomly chosen human genes)
#' ## load human genes
#' org.Hs.eg <- xRDataLoader(RData='org.Hs.eg')
#' Symbol <- as.character(sample(org.Hs.eg$gene_info$Symbol, 100))
#' Symbol
#' 
#' # b) convert into GeneID
#' GeneID <- xSymbol2GeneID(Symbol)
#' 
#' # c) convert into a data frame
#' df <- xSymbol2GeneID(Symbol, details=TRUE)
#' 
#' 
#' # advanced use
#' df <- xSymbol2GeneID(Symbol, org=org.Hs.eg, details=TRUE)
#' }

xSymbol2GeneID <- function(data, org=c("human","mouse"), check.symbol.identity=FALSE, details=FALSE, verbose=TRUE, RData.location="http://galahad.well.ox.ac.uk/bigdata", guid=NULL)
{
    
    if (!is.vector(data)){
        stop("The input data must be a vector.\n")
    }
    Symbol <- as.character(data)
    
    ## load Enterz Gene information
	if(is(org,"EG")){
		df_eg <- org$gene_info
		if(verbose){
			message(sprintf("Customised organism (%s)", as.character(Sys.time())), appendLF=TRUE)
		}
	}else{
		org <- org[1]
		if(org=='human'){
			df_eg <- xRDataLoader('org.Hs.eg', RData.location=RData.location, guid=guid, verbose=verbose)$gene_info
		}else if(org=='mouse'){
			df_eg <- xRDataLoader('org.Mm.eg', RData.location=RData.location, guid=guid, verbose=verbose)$gene_info
		}
		if(verbose){
			message(sprintf("%s organism (%s)", org, as.character(Sys.time())), appendLF=TRUE)
		}
	}
    
    ## subdived into two parts: "protein-coding" and the rest
    type_of_gene <- ''
    df_eg <- rbind(subset(df_eg,type_of_gene=='protein-coding'), subset(df_eg,type_of_gene!='protein-coding'))
    
	allGeneID <- df_eg$GeneID
	allSymbol <- as.vector(df_eg$Symbol)
	allSynonyms <- as.vector(df_eg$Synonyms)
	allDescription <- as.vector(df_eg$description)
	
    ## correct for those symbols being shown as DATE format
    if(0){
        ## for those starting with 'Mar' in a excel-input date format
        a <- Symbol
        flag <- grep("-Mar$", a, ignore.case=TRUE, perl=TRUE, value=FALSE)
        if(length(flag)>=1){
        	b <- a[flag]
        	c <- sub("-Mar$", "", b, ignore.case=TRUE, perl=TRUE)
        	d <- sub("^0", "", c, ignore.case=TRUE, perl=TRUE)
        	e <- sapply(d, function(x) paste(c("March",x), collapse=""))
        	a[flag] <- e
        	Symbol <- a
    	}

        ## for those starting with 'Sep' in a excel-input date format
        a <- Symbol
        flag <- grep("-Sep$", a, ignore.case=TRUE, perl=TRUE, value=FALSE)
        if(length(flag)>=1){
        	b <- a[flag]
            c <- sub("-Sep$", "", b, ignore.case=TRUE, perl=TRUE)
            d <- sub("^0", "", c, ignore.case=TRUE, perl=TRUE)
            e <- sapply(d, function(x) paste(c("Sept",x), collapse=""))
            a[flag] <- e
            Symbol <- a
        }
    }
    
    ## case-insensitive
    ## only keep the first matched one
    #match_flag <- match(tolower(Symbol),tolower(allSymbol))
    match_flag <- match(Symbol,allSymbol)
    
    ## match via Synonyms for those unmatchable by official gene symbols
    if(check.symbol.identity){
    	## match Synonyms (if not found via Symbol)
        na_flag <- is.na(match_flag)
        a <- Symbol[na_flag]

        ###
        tmp_flag <- is.na(match(tolower(allSymbol), tolower(Symbol)))
        tmp_Synonyms <- allSynonyms[tmp_flag]
        Orig.index <- seq(1,length(allSynonyms))
        Orig.index <- Orig.index[tmp_flag]
        ###

        b <- sapply(1:length(a), function(x){
        	tmp_pattern1 <- paste("^",a[x],"\\|", sep="")
            tmp_pattern2 <- paste("\\|",a[x],"\\|", sep="")
            tmp_pattern3 <- paste("\\|",a[x],"$", sep="")
            tmp_pattern <- paste(tmp_pattern1,"|",tmp_pattern2,"|",tmp_pattern3, sep="")
            tmp_result <- grep(tmp_pattern, tmp_Synonyms, ignore.case=TRUE, perl=TRUE, value=FALSE)
            ifelse(length(tmp_result)==1, Orig.index[tmp_result[1]], NA)
        })
        match_flag[na_flag] <- b
        
        if(verbose){
        	now <- Sys.time()
            message(sprintf("Among %d symbols of input data, there are %d mappable via official gene symbols, %d mappable via gene alias but %d left unmappable", length(Symbol), (length(Symbol)-length(a)), sum(!is.na(b)), sum(is.na(b))), appendLF=TRUE)
        }
    
    }else{
    	if(verbose){
        	now <- Sys.time()
            message(sprintf("Among %d symbols of input data, there are %d mappable via official gene symbols but %d left unmappable", length(Symbol), (sum(!is.na(match_flag))), (sum(is.na(match_flag)))), appendLF=TRUE)
        }
    }
    
	## convert into GeneID
	df_res <- df_eg[match_flag, ]
	
	if(details){
		df_res <- data.frame(Input=Symbol, df_res, stringsAsFactors=FALSE)
		return(df_res)
	}else{
		return(df_res$GeneID)
	}
}

Try the Pi package in your browser

Any scripts or data that you put into this service are public.

Pi documentation built on Nov. 26, 2020, 2:01 a.m.