#' @author Liya Ming
#' @title convert_mzid
#' @description Convertion of mzid format to the input file required by function mms2plot.
#' @export convert_mzid
#' @param path_user_table File path of the table that contains the information of
#' user-specified modification.The columns in the table represent respectively
#' the paths of the raw file (format.mzML), the first two letters of the
#' modification of interest, the mass shift of the modification
#' (e.g.The mass shift of oxidation is 15.994915),and the acceptable mass error.
#' @param dir_mzid The full path of searching result mzid format
#' (e.g.result of comet).
#' @param label.by.Raw_seq_charge a logical value about the way of adding lables,
#' if FALSE, the PSMs from the same .raw(.mzML or .mzXML) file and were
#' identified as the same sequence will be labled as the same number,
#' whether or not the number of charges is the same;If TRUE,only psms
#' have the same number of charges will be labled the same number,
#' the psms labled the same number will be outputed to the same PDF
#' file when generating mirrored or aligned spectra using mms2plot.
#' @return the input file required by mms2plot.
#' @import stringr
#' @import mzID
#' @importFrom tools file_path_sans_ext
#' @importFrom dplyr select
#' @importFrom data.table fread
#' @note See vignettes for more details
#' @examples
#' ###
#' general_path = system.file( package = "mms2plot",dir = "extdata" )
#' setwd(general_path)
#' ######
#' #Mzid files generated by comet
#' path_prot = 'mzid_convert/acc_genename.txt'
#' path_user_table = 'mzid_convert/user_table_test.txt'
#' dir_mzid = 'mzid_convert/mzid_comet'
#' lf = list.files(dir_mzid,full.names = T)
#' a=mapply(unzip,lf[grep('\\.zip',lf)],exdir = dir_mzid,USE.NAMES = F)
#' # mms2plot_input = convert_mzid(path_user_table,dir_mzid,label.by.Raw_seq_charge)#for test
#' ###
#' #Mzid files generated by MSGFPlus
#' path_prot = 'mzid_convert/acc_genename.txt'
#' path_user_table = 'mzid_convert/user_table_test.txt'
#' dir_mzid = 'mzid_convert/mzid_MSGFPlus'
#' lf = list.files(dir_mzid,full.names = T)
#' a=mapply(unzip,lf[grep('\\.zip',lf)],exdir = dir_mzid,USE.NAMES = F)
#' # mms2plot_input = convert_mzid(path_user_table,dir_mzid,label.by.Raw_seq_charge)#for test
#rm(list=ls())
options(stringsAsFactors = FALSE)
options(digits = 15)
######Path of protein information:
test = F
if(test){
general_path = 'M:/Software/Git/new/inst/extdata'
}else{
general_path = system.file( package = "mms2plot",dir = "extdata" )
}
#setwd(general_path)
path_prot = paste(general_path,'mzid_convert/acc_genename.txt',sep = '/')
###########******FUNCTION******#########
####***AddModLabel:add labels by raw_sequence***#########
Add_label=function(i,df,keys){
df_sub=df[keys == unique(keys)[i],]
df_sub$label = i
return(df_sub)
}
###***AddModLabel:add modification symbols into sequence***####
AddModLabel = function(i,seqs,poss,modlabel){
seq = seqs[i]
pos = as.numeric(poss[[i]])
split = unlist(strsplit(seq,''))
mod.res = split[pos]
split[pos] = paste0(mod.res,modlabel)
modseq = paste(split,collapse = '')
return(modseq)
}
#Get indicies beyond the mw range of user-specified modification and fixed modification
Ind_modtype_err = function(MW,flo,cei,c_flo,c_cei){
mw = as.numeric(MW)
ind_within = which(flo < mw & mw < cei | c_flo < mw & mw < c_cei)
}
#get the mw and position within range of fixed modification
Ind_mod_fix = function(MW,c_flo,c_cei){
mw = as.numeric(MW)
ind_within = which(c_flo < mw & mw < c_cei)
}
#paste mws and positions
Ind_mod_fix1 = function(MW_POS,c_flo,c_cei){
mw = as.numeric(unlist(stringr::str_extract_all(MW_POS,"[^\\(]\\d+\\.?\\d+[^\\)]")))
pos = stringr::str_remove_all(string = unlist(stringr::str_extract_all(MW_POS,'\\(\\d+\\)')),pattern = '[\\(\\)]')
ind_within = which(c_flo < mw & mw < c_cei)
mw_1 = mw[-ind_within]
pos_1 = pos[-ind_within]
pas = paste(paste(mw_1,paste0('(',pos_1,')')),collapse=';')
return(pas)
}
Convert_by_usr <- function(i,table,paths_mzid){
# i is the row number of table
# table is user_table which is a dataframe
user_table = table[i,]
raw_path = user_table$fullfilepath
rawName = basename(tools::file_path_sans_ext(raw_path))
#>>>>>>
if(!file.exists(raw_path)){
lf=list.files(dirname(raw_path),full.names = T)
if(file.exists(paste(dirname(raw_path),paste0(rawName,'.zip'),sep = '/'))){
unzip(paste(dirname(raw_path),paste0(rawName,'.zip'),sep = '/'),exdir = dirname(raw_path))
}else{
stop(paste0('The raw file does NOT exist in the path: ',
dirname(raw_path)),'! Please check the folder...')
}
}
path_mzid_user = paths_mzid[grep(paste0(rawName,'.mzid'),basename(paths_mzid),fixed = T)]
######******MZID --> flatten file
mzid = mzID::mzID(path_mzid_user)
fla = mzID::flatten(mzid)
fla = fla[with(fla,rank == 1),]#comet
# browser()
# #msgfplus:
inv.col = c("start","end","pre","post","accession","description")
if(length(which(inv.col %in% colnames(fla))) > 0){
fla = fla[!duplicated(subset(fla,select = -which(colnames(fla) %in% inv.col))),]
}
# browser()
######******filter modification mass shift in flatten file
mod = subset(fla,fla$modified == TRUE)
unmod = subset(fla,fla$modified == FALSE)
###***range of mw,only for user_table with single row
accu_usr = user_table$delta
mw_usr = user_table$mass_shift
mw_floor = as.numeric(mw_usr) - as.numeric(accu_usr)
mw_ceiling = as.numeric(mw_usr) + as.numeric(accu_usr)
fix_c_mw = 57.021464
c_fixed_floor = as.numeric(fix_c_mw) - as.numeric(accu_usr)
c_fixed_ceiling = as.numeric(fix_c_mw) + as.numeric(accu_usr)
######******Extract all of mw and midified positions
mw_pos=mod$modification#mass shift and position
extra_mw = stringr::str_extract_all(mw_pos,"[^\\(]\\d+\\.?\\d+[^\\)]")
extra_pos = mapply(FUN = stringr::str_remove_all,
string = stringr::str_extract_all(mw_pos,'\\(\\d+\\)'),pattern = '[\\(\\)]')
#####***Save mw within range [mw-variance,mw+variance]
#remove mw not within modification specified by user and fixed modification
inds_correct_ls = lapply(FUN = Ind_modtype_err,X = extra_mw,
flo = mw_floor,cei = mw_ceiling,
c_flo = c_fixed_floor,c_cei = c_fixed_ceiling)
count_mw = unlist(lapply(extra_mw,length))
count_correct = unlist(lapply(inds_correct_ls,length))
inds_correct = which(count_mw == count_correct)
mod_user_1 = mod[inds_correct,]#subset within mw range and fixed modification mw range
mw_pos_1=mod_user_1$modification
extra_mw_1 = stringr::str_extract_all(mw_pos_1,"[^\\(]\\d+\\.?\\d+[^\\)]")
inds_fix_within = lapply(FUN=Ind_mod_fix,X = extra_mw_1,c_flo = c_fixed_floor,c_cei = c_fixed_ceiling)
count_fix = unlist(lapply(inds_fix_within,length))
inds_fix = which(count_fix != 0)
mod_fix_within = mod_user_1[inds_fix,]#subset containing fixed modification
mod_nonfix = mod_user_1[-inds_fix,]#subset DOES NOT contain fixed modification
#####***Save mw within range [mw-variance,mw+variance]
mw_pos_2=mod_fix_within$modification
extra_mw_2 = stringr::str_extract_all(mw_pos_2,"[^\\(]\\d+\\.?\\d+[^\\)]")
count_mw_2 = unlist(lapply(extra_mw_2,length))
#When a peptide has only fixed modification,
#the modification column will be changed to NA, modified to FALSE, and merged into the unmod subset
ind_only_fix = which(count_fix[count_fix != 0] == count_mw_2)
mod_only_fix = mod_fix_within[ind_only_fix,]#subset has only fixed modification
mod_only_fix$modified = FALSE
mod_only_fix$modification = NA
unmod_1 = rbind(mod_only_fix,unmod)#merge into unmod subset
mod_mul_types = mod_fix_within[-ind_only_fix,]#subset containing fixed and variable modifications simultaneously
mw_pos_clean = unlist(lapply(X=mod_mul_types$modification,c_flo=c_fixed_floor,c_cei=c_fixed_ceiling,FUN=Ind_mod_fix1))
mod_mul_types$modification = mw_pos_clean
mod_1 = rbind(mod_mul_types,mod_nonfix)#merge into mod subset
######***Save pairs/groups***######
seq_intsect = intersect(unmod_1$pepseq,mod_1$pepseq)
ind_sub = which(mod_1$pepseq %in% seq_intsect)
mod_1_sub = mod_1[ind_sub,]
unmod_sub = unmod_1[unmod_1$pepseq %in% seq_intsect,]
######******Extract mw and modified position from flatten file******######
mw_pos_mod_1 = mod_1_sub$modification
MWs = stringr::str_extract_all(mw_pos_mod_1,"[^\\(]\\d+\\.?\\d+[^\\)]")
POSs = mapply(FUN = stringr::str_remove_all,
string = stringr::str_extract_all(mw_pos_mod_1,'\\(\\d+\\)'),pattern = '[\\(\\)]')
######******Get modified sequence with modification labels such as "(ox)"******######
SEQ = mod_1_sub$pepseq
modlabel = paste0('(',tolower(user_table$modification),')')
Modified_seq = mapply(AddModLabel,seq(nrow(mod_1_sub)),MoreArgs = list(seqs=SEQ,poss=POSs,modlabel=modlabel))#modified sequences
######******Get the dataframe required by mms2plot******######
unmod_sub$`Modified sequence` = unmod_sub$pepseq
mod_1_sub$`Modified sequence` = Modified_seq
unmod_sub$Modifications = 'Unmodified'
mod_1_sub$Modifications = user_table$modification
bind_mod_unmod = rbind(mod_1_sub,unmod_sub)
bind_mod_unmod$`Raw file` = raw_path
if('description' %in% colnames(bind_mod_unmod)){
cols_select = c('Raw file','acquisitionnum','Modifications','pepseq',
'Modified sequence','chargestate','description')
df_mms2plot = dplyr::select(bind_mod_unmod,cols_select)
colnames(df_mms2plot) = c('Raw file','Scan number','Modifications','Sequence',
'Modified sequence','Charge','description')
GNs=stringr::str_remove(string=stringr::str_extract(df_mms2plot$description,'GN=\\w+'),
pattern = 'GN=')
df_mms2plot = dplyr::select(df_mms2plot,-description)
df_mms2plot$`Gene Names` = GNs
}else{
cols_select = c('Raw file','acquisitionnum','Modifications','pepseq',
'Modified sequence','chargestate','accession')
df_mms2plot = dplyr::select(bind_mod_unmod,cols_select)
colnames(df_mms2plot) = c('Raw file','Scan number','Modifications','Sequence',
'Modified sequence','Charge','Protein')
acc.names = stringr::str_remove_all(stringr::str_extract(df_mms2plot$Protein,'\\|\\w+\\|'),'\\|')
prot_acc_gn = data.table::fread(path_prot,header=T)
GNs = prot_acc_gn$gene_name[match(acc.names,prot_acc_gn$accession)]
df_mms2plot$`Gene Names` = GNs
df_mms2plot = dplyr::select(df_mms2plot,-Protein)
}
return(df_mms2plot)
}
####>>>>>> Main function:convert_mzid <<<<<<####
convert_mzid = function(path_user_table,dir_mzid,label.by.Raw_seq_charge){
user_table = data.table::fread(path_user_table,header = T)#read user table
paths_mzid = list.files(dir_mzid,recursive = T,full.names = T)
result_df = lapply(FUN=Convert_by_usr,X=seq(nrow(user_table)),
table=user_table,
paths_mzid=paths_mzid)
df = do.call(rbind,result_df)
df_mod = df[with(df,Modifications == 'Unmodified'),]
df_unmod = df[with(df,Modifications != 'Unmodified'),]
if(label.by.Raw_seq_charge){
key_unmod = unique(paste(df_unmod$`Raw file`,df_unmod$Sequence,df_unmod$Charge))
key_mod = unique(paste(df_mod$`Raw file`,df_mod$Sequence,df_mod$Charge))
key_intersect = intersect(key_unmod,key_mod)
keep_unmod = df_unmod[with(df_unmod,paste(`Raw file`,`Sequence`,`Charge`) %in% key_intersect),]
keep_mod = df_mod[with(df_mod,paste(`Raw file`,`Sequence`,`Charge`) %in% key_intersect),]
bind_groups = rbind(keep_mod,keep_unmod)
keys = paste(bind_groups$`Raw file`,bind_groups$Sequence,bind_groups$Charge)
}else{
key_unmod = unique(paste(df_unmod$`Raw file`,df_unmod$Sequence))
key_mod = unique(paste(df_mod$`Raw file`,df_mod$Sequence))
key_intersect = intersect(key_unmod,key_mod)
keep_unmod = df_unmod[with(df_unmod,paste(`Raw file`,`Sequence`) %in% key_intersect),]
keep_mod = df_mod[with(df_mod,paste(`Raw file`,`Sequence`) %in% key_intersect),]
bind_groups = rbind(keep_mod,keep_unmod)
keys = paste(bind_groups$`Raw file`,bind_groups$Sequence)
}
df_sub_ls = lapply(seq(length(key_intersect)),Add_label,df = bind_groups,keys = keys)
df_labeled = do.call(rbind,df_sub_ls)
return(df_labeled)
}
###***Run the main function:convert_mz id
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.