inst/scripts/downsize.R

#---------------------------------------------------------------------------
#
#   ATKIN HYPO
#   https://dom-pubs.pericles-prod.literatumonline.com/doi/10.1111/dom.13602
#
#----------------------------------------------------------------------------

mfile <- download_data('atkin.metabolon.xlsx')
sfile <- download_data('atkin.somascan.adat')
mobj <- read_metabolon(mfile)
sobj <- read_somascan(sfile)
sdt(mobj)
sdt(sobj)
mobj$Bmi <- NULL
sobj$Bmi <- NULL
mobj$Age <- NULL
sobj$Age <- NULL
sdt(mobj)
sdt(sobj)

PLOT_EXPRS <- function(obj)  plot_exprs(obj, block = 'Subject', coefs = NULL, shape = 'Diabetes', size = 'Diabetes') + 
                             scale_size_manual(values = c(Control = 1, T2DM = 2))

  mobj[ 'glucose',                                    ]  %>%  PLOT_EXPRS() # 1
  mobj[ 'cortisol',                                   ]  %>%  PLOT_EXPRS() # 2
  mobj[ 'docosatrienoate (22:3n3)',                   ]  %>%  PLOT_EXPRS() # 3
  mobj[ '10-nonadecenoate (19:1n9)',                  ]  %>%  PLOT_EXPRS() # 4 
# mobj[ 'X - 12450',                                  ]  %>%  PLOT_EXPRS() # 5
# mobj[ 'dihomo-linoleate (20:2n6)',                  ]  %>%  PLOT_EXPRS() # 6
# mobj[ 'linolenate [alpha or gamma; (18:3n3 or 6)]', ]  %>%  PLOT_EXPRS() # 7

  sobj['Activated Protein C',                         ]  %>%  PLOT_EXPRS() #  1
  sobj['BOC',                                         ]  %>%  PLOT_EXPRS() #  2
  sobj['CRDL1',                                       ]  %>%  PLOT_EXPRS() #  3
  sobj['Dynactin subunit 2',                          ]  %>%  PLOT_EXPRS() #  4
# sobj['Ephrin-A5',                                   ]  %>%  PLOT_EXPRS() #  5
# sobj['FABP',                                        ]  %>%  PLOT_EXPRS() #  6
# sobj['Insulin',                                     ]  %>%  PLOT_EXPRS() #  7
# sobj['Myoglobin',                                   ]  %>%  PLOT_EXPRS() #  8
# sobj['PH',                                          ]  %>%  PLOT_EXPRS() #  9
# sobj['PRL',                                         ]  %>%  PLOT_EXPRS() # 10
# sobj['RGMA',                                        ]  %>%  PLOT_EXPRS() # 11
# sobj['RSPO2',                                       ]  %>%  PLOT_EXPRS() # 12
# sobj['WIF-1',                                       ]  %>%  PLOT_EXPRS() # 13

  mfeatures <- c('glucose', 'cortisol', 'docosatrienoate (22:3n3)', '10-nonadecenoate (19:1n9)')
  sfeatures <- c('Activated Protein C', 'BOC', 'CRDL1', 'Dynactin subunit 2')
  
  mfeatures %<>% c(fnames(mobj)[order(rowVars(values(mobj), na.rm = TRUE))[1:16]])
  sfeatures %<>% c(fnames(sobj)[order(rowVars(values(sobj), na.rm = TRUE))[1:16]])

  mobj %<>% extract(mfeatures, )
  sobj %<>% extract(sfeatures, )
  
  biplot(pca(mobj))
  biplot(pca(sobj))
  
  plot_volcano(fit_limma(mobj), coefs = 't2')
  plot_volcano(fit_limma(sobj), coefs = 't2')

    
  read_metabolon('inst/extdata/atkin.metabolon.xlsx')
  read_somascan( 'inst/extdata/atkin.somascan.adat')


#---------------------------------------------------------------------------
#
#   FUKUDA ZEBRAFISH DEVELOPMENT
#   https://www.embopress.org/doi/full/10.15252/embr.201949752
#
#----------------------------------------------------------------------------
  
# Read
    file <- download_data('fukuda20.proteingroups.txt')
    object <- read_maxquant_proteingroups(file, contaminants = TRUE, reverse = TRUE)
    fdt(object)
    fdt(object)$protein    <- NULL # identical to feature_id
    fdt(object)$log2maxlfq <- NULL # not needed here
    fdt(object)$organism   <- NULL # DANRE everywhere
    fdt(object)$isoform    <- NULL # not needed now
    table(systematic_nas(object))
    object %<>% filter_exprs_replicated_in_some_subgroup()
    table(systematic_nas(object))
    
# Three types of detections. Impute systematic NAs.
    fdt(object)$detection <- 'none'
    fdt(object)$detection[ systematic_nas(object) ] <- 'systematic'
    fdt(object)$detection[     random_nas(object) ] <- 'random'
    fdt(object)$detection[         no_nas(object) ] <- 'full'
    table(fdt(object)$detection)
    object %<>% impute()
    fdt(object)
    
# LinMod
    object %<>% fit_limma(coefs = 'Adult')
    object %<>% extract( order(fdt(.)$`p~Adult~limma`) , )
    fdt(object)
    
# Lets compose a representative set                                                                                                                 dir    det    dif
#                                                                                                                                                   ---    ---    ---
    fdt(object)[ contaminant == '' ][ `effect~Adult~limma` < 0 ][ detection == 'systematic' ]  #    54  mcm6        F1R5P3_DANRE      -8e-9         dn     sys     1    (1)
                                                                                               #  5575  polr2d      Q6DRG4_DANRE      -5e-1                        0    (2)
                                                                                               #  6266  riox2       Q7T3G6_DANRE      -2e-1                        0    (3)
    fdt(object)[ contaminant == '' ][ `effect~Adult~limma` < 0 ][ detection == 'full'       ]  #  5045  hmbsb       Q503D2_DANRE      -4e-8                full    1    (4)
                                                                                               #  2769  synm        B7ZUY8_DANRE      -1                           0    (5)
                                                                                               #  4750  pmpcb       Q1L8E2_DANRE      -1                           0    (6)
    fdt(object)[ contaminant == '' ][ `effect~Adult~limma` < 0 ][ detection == 'random'     ]  #  6198  hbbe2       Q7T1B0_DANRE      -1e-7                rnd     1    (7)
                                                                                               #  6020  atg3        ATG3_DANRE        -1                           0    (8)
                                                                                               #   641  lztfl1      A0A0R4ISB5_DANRE  -1                           0    (9)
    
    fdt(object)[ contaminant == '' ][ `effect~Adult~limma` > 0 ][ detection == 'systematic' ]  #  3568  vtg4        F1Q7L0_DANRE      +2e-8         up     sys     1   (10)
                                                                                               #  5065  ovca2       OVCA2_DANRE       +2e-1                        0   (11)
                                                                                               #  3335  mief2       E7FFR5_DANRE      +1e-1                        0   (12)
    fdt(object)[ contaminant == '' ][ `effect~Adult~limma` > 0 ][ detection == 'full'       ]  #  5477  hbba2       Q6DGK4_DANRE      +3e-6                full    1   (13)
                                                                                               #  3416  lrrc57      LRC57_DANRE       +1                           0   (14)
                                                                                               #   471  ryr2b       A0A0R4IKT8_DANRE  +1                           0   (15)
    fdt(object)[ contaminant == '' ][ `effect~Adult~limma` > 0 ][ detection == 'random'     ]  #  5522  bfb         Q6DHC4_DANRE      +1e-5                rnd     1   (16)
                                                                                               #   599  utp20       A0A0R4IQW7_DANRE  +1                           0   (17)
                                                                                               #  2711  plbd1       B3DJ80_DANRE      +1                           0   (18)
    
    fdt(object)[ contaminant == '+' ]                                                          #  2936  ALB         CON__ALBU_BOVIN   -8e-1         dn     full    0   (19)
    fdt(object)[     reverse == '+' ]                                                          #  6754  REV__elmo3  REV__F1QSV8_DANRE -3e-1         dn     full    0   (20)

    features <- c( 'F1R5P3_DANRE',  'Q6DRG4_DANRE', 'Q7T3G6_DANRE',  'Q503D2_DANRE', 'B7ZUY8_DANRE',     'Q1L8E2_DANRE',  'Q7T1B0_DANRE',       'ATG3_DANRE', 'A0A0R4ISB5_DANRE', 
                   'F1Q7L0_DANRE',   'OVCA2_DANRE', 'E7FFR5_DANRE',  'Q6DGK4_DANRE',  'LRC57_DANRE', 'A0A0R4IKT8_DANRE',  'Q6DHC4_DANRE', 'A0A0R4IQW7_DANRE',     'B3DJ80_DANRE' , 
                   'CON__ALBU_BOVIN', 'REV__F1QSV8_DANRE' )
    all( features %in% fdt(object)$feature_id )
    fdt(object)[ feature_id %in% features ][ order(as.numeric(proId)) ]

  read_maxquant_proteingroups('inst/extdata/fukuda20.proteingroups.txt') # works
  read_maxquant_proteingroups(download_data('fukuda20.proteingroups.txt')) # works
  
  
#---------------------------------------------------------------------------
#
#   BILLING STEM CELL COMPARISON
#   https://www.nature.com/articles/srep21507
#
#----------------------------------------------------------------------------

    file <- download_data('billing16.proteingroups.txt')
    object <- read_maxquant_proteingroups(file)
    sdt(object)
  

    
#---------------------------------------------------------------------------------
#
#   BILLING STEM CELL DIFFERENTIATION
#   https://www.sciencedirect.com/science/article/pii/S1535947620315243?via%3Dihub
#
#---------------------------------------------------------------------------------
    # Read
        rnafile <- download_data('billing19.rnacounts.txt')
        profile <- download_data('billing19.proteingroups.txt')
        fosfile <- download_data('billing19.phosphosites.txt')
        rnafile1 <- 'inst/extdata/billing19.rnacounts.txt'
        profile1 <- 'inst/extdata/billing19.proteingroups.txt'
        fosfile1 <- 'inst/extdata/billing19.phosphosites.txt'
        subgroups <- sprintf('%s_STD', c('E00', 'E01', 'E02', 'E05', 'E15', 'E30', 'M00'))
        rna  <- read_rnaseq_counts(rnafile)
        rna1 <- read_rnaseq_counts(rnafile1)
        pro  <- read_maxquant_proteingroups(profile,  subgroups = subgroups, contaminant = TRUE, reverse = TRUE)
        pro1 <- read_maxquant_proteingroups(profile1, subgroups = subgroups)
        fos  <- read_maxquant_phosphosites( fosfile = fosfile,  profile = profile,  subgroups = subgroups, contaminant = TRUE, reverse = TRUE )
        fos1 <- read_maxquant_phosphosites( fosfile = fosfile1, profile = profile1, subgroups = subgroups)
        pro$subgroup  %<>% split_extract_fixed('_', 1)
        pro1$subgroup %<>% split_extract_fixed('_', 1)
        fos$subgroup  %<>% split_extract_fixed('_', 1)
        fos1$subgroup %<>% split_extract_fixed('_', 1)
        fdt(fos)$`Fasta headers` <- NULL
        fvars(rna) %<>% stri_replace_first_fixed('gene_name', 'gene')
    # Pca
        pcalist <- list( p1 = biplot(pca(rna ), nx = 1, ny = 1, feature_label = 'gene') + ggtitle('RNA'),   #  COL3A1 and COL11A1
                         p3 = biplot(pca(pro ), nx = 1, ny = 1, feature_label = 'gene') + ggtitle('PRO'),   #   HSPB6 and LCP1
                         p5 = biplot(pca(fos ), nx = 1, ny = 1, feature_label = 'gene') + ggtitle('FOS') )  #     VIM and NES
        gridExtra::grid.arrange(grobs = pcalist, nrow = 3)
    # Model
        rna  %>% impute() # no NA
        pro %<>% impute()
        fos %<>% impute()
        rna %<>% fit_limma(coefs = 'M00') # differentiation E00 -> M00
        pro %<>% fit_limma(coefs = 'M00')
        fos %<>% fit_limma(coefs = 'M00')
        rna %<>% extract(order(fdt(.)$`p~M00~limma`), )
        pro %<>% extract(order(fdt(.)$`p~M00~limma`), )
        fos %<>% extract(order(fdt(.)$`p~M00~limma`), )
    # Rm missing values
        fdt(rna)[is.na(`p~M00~limma`)]  #  0 % NA
        fdt(pro)[is.na(`p~M00~limma`)]  #  7 % NA :  586/8017
        fdt(fos)[is.na(`p~M00~limma`)]  # 27 % NA : 2314/8590
        pro %<>% filter_features(!is.na(`p~M00~limma`))
        fos %<>% filter_features(!is.na(`p~M00~limma`))
        fdt(rna) %<>% add_adjusted_pvalues('fdr')
        fdt(pro) %<>% add_adjusted_pvalues('fdr')
        fdt(fos) %<>% add_adjusted_pvalues('fdr')
        rna %<>% abstract_fit()
        pro %<>% abstract_fit()
        fos %<>% abstract_fit()
    # Drops features with no gene annotation
        fdt(rna)[gene=='' | is.na(gene) | gene == 'NA'] # 0
        fdt(pro)[gene=='' | is.na(gene) | gene == 'NA'] # 108 - incomplete fastahdrs - rm
        fdt(fos)[gene=='' | is.na(gene) | gene == 'NA'] #  71 - incomplete fastahdrs - rm
        pro %<>% filter_features( gene!='' & !is.na(gene) & gene != 'NA' )
        fos %<>% filter_features( gene!='' & !is.na(gene) & gene != 'NA' )
    # Fix Excel genes
        fdt(rna)[order(gene)]  # excel genes issue
        fdt(pro)[order(gene)]  # no issue
        fdt(fos)[order(gene)]  # no issue
        fdt(rna)$gene %<>% fix_xlgenes()
        fdt(pro)$gene %<>% fix_xlgenes()
        fdt(fos)$gene %<>% fix_xlgenes()
        fdt(rna)[order(gene)]  # excel genes issue
    # Contaminant
        procontaminants <- fdt(pro)[contaminant == '+' & gene != '', unique(gene) ]  # Lets take a contaminant in both PRO and FOS
        foscontaminants <- fdt(fos)[contaminant == '+' & gene != '', unique(gene) ]  # That makes things a bit easier
        commoncontaminants <- intersect( procontaminants, foscontaminants )          # KRT19 catches the eye. Well-known contaminant
        fdt(pro)[commoncontaminants, on = 'gene']   # PRO : 1 down - ok, gets filtered out anyways  - take
        fdt(rna)[gene == 'KRT19', on = 'gene']   # RNA : 1 flat - contributes to flat background - take
        fdt(fos)[gene == 'KRT19', on = 'gene']   # FOS : 4 down (imputed), 1 flat (detected) - take the detected
        contaminants <- union(procontaminants, foscontaminants)
        rna %<>% filter_features(! gene %in% c('', contaminants))
        pro %<>% filter_features(! gene %in% c('', contaminants))
        fos %<>% filter_features(! gene %in% c('', contaminants))
    # Reverse
        proreverse <- fdt(pro)[reverse == '+', unique(gene)]  # Lets find one in both PRO and FOS
        fosreverse <- fdt(fos)[reverse == '+', unique(gene)]  # REV__SYNE2 is the only such protein!
        intersect(proreverse, fosreverse)                     # PRO : 1 flat - take
        fdt(pro)[gene == 'REV__SYNE2']                        # FOS : 2 up (imp) - same protein - take first (12087)
        fdt(fos)[gene == 'REV__SYNE2']                        # Interestingly the fwd protein is also detected!
        fdt(rna)[gene == 'SYNE2']                             # RNA : 1 down - take
        fdt(pro)[gene == 'SYNE2']                             # PRO : 1 down - take
        fdt(fos)[gene == 'SYNE2']                             # FOS : 1 down (imputed)
        pro %<>% filter_features(reverse=='')                 #         will not contribute to modeling background because NA
        fos %<>% filter_features(reverse=='')                 #         still good to take because of the pairing with PRO
    # Multiprotein genes
        multiproteindt <- fdt(pro)[imputed==FALSE]
        multiproteindt %<>% extract(, .SD[.N>1], by = 'gene')                                                    # multiprotein gene
        multiproteindt %<>% extract(gene %in% unique(fdt(rna)[`p~M00~limma`>0.1]$gene))                          # RNA flat
        multiproteindt %<>% extract(, .SD[ sum(   `fdr~M00~limma`<0.05 & `effect~M00~limma`>0)>0], by = 'gene')  # PRO up (SYNE2 was already down)
        multiproteindt %<>% extract(, .SD[ sum(     `p~M00~limma`>0.5 )>0], by = 'gene')                         # PRO flat
        fdt(rna)[gene == 'ACOT9']  # RNA: 1 flat
        fdt(pro)[gene == 'ACOT9']  # PRO: 1 up, 1 flat, genenames intuitive
        fdt(fos)[gene == 'ACOT9']  # FOS: none
        multiproteingenes <- fdt(pro)[, .SD[.N>1], by = 'gene']$gene
        rna %<>% filter_features(!gene %in% multiproteingenes)
        pro %<>% filter_features(!gene %in% multiproteingenes)
        fos %<>% filter_features(!gene %in% multiproteingenes)
    # RNA down - PRO flat - FOS none
        unique(fdt(pro)[  `p~M00~limma`>0.5                         ]$gene)   %>% intersect(  
        unique(fdt(rna)[`fdr~M00~limma`<5e-8 & `effect~M00~limma`<0 ]$gene) ) %>% setdiff(
        unique(fdt(fos)$gene))
        fdt(rna)[gene == 'KIF5A']
        fdt(pro)[gene == 'KIF5A']
        fdt(fos)[gene == 'KIF5A']
    # RNA up - PRO flat - FOS none
        unique(fdt(pro)[  `p~M00~limma`>0.5                         ]$gene)   %>% intersect(  
        unique(fdt(rna)[`fdr~M00~limma`<5e-9 & `effect~M00~limma`>0 ]$gene) ) %>% setdiff(
        unique(fdt(fos)$gene))
        fdt(rna)[gene == 'CHD3']
        fdt(pro)[gene == 'CHD3']
        fdt(fos)[gene == 'CHD3']
    # RNA flat, PRO flat, FOS down+up (and some flat)
        candgenes <-             unique(fdt(rna)[  `p~M00~limma` > 0.5 ]$gene)                              # RNA flat
        candgenes %<>% intersect(unique(fdt(pro)[  `p~M00~limma` > 0.5 ]$gene))                             # PRO flat
        candgenes %<>% intersect(unique(fdt(fos)[imputed==FALSE][`fdr~M00~limma` < 0.05 & `effect~M00~limma` < 0 ]$gene))   # FOS down
        candgenes %<>% intersect(unique(fdt(fos)[imputed==FALSE][`fdr~M00~limma` < 0.05 & `effect~M00~limma` > 0 ]$gene))   # FOS up
        candgenes %<>% intersect(unique(fdt(fos)[imputed==FALSE][  `p~M00~limma` > 0.05                          ]$gene))   # FOS flat
        fdt(rna)['TNKS1BP1', on = 'gene']
        fdt(pro)['TNKS1BP1', on = 'gene']
        fdt(fos)['TNKS1BP1', on = 'gene'][imputed==FALSE]
    # RNA flat, PRO flat, FOS down (and some flat)
        candgenes <-             unique(fdt(rna)[  `p~M00~limma` > 0.5 ]$gene)                              # RNA flat
        candgenes %<>% intersect(unique(fdt(pro)[  `p~M00~limma` > 0.5 ]$gene))                             # PRO flat
        candgenes %<>% intersect(unique(fdt(fos)[imputed==FALSE][`fdr~M00~limma` < 5e-5 & `effect~M00~limma` < 0 ]$gene))   # FOS down
        candgenes %<>% intersect(unique(fdt(fos)[imputed==FALSE][  `p~M00~limma` > 0.05                          ]$gene))   # FOS flat
        candgenes
        fdt(rna)[gene == candgenes[2]]
        fdt(rna)['KLC4', on = 'gene']  # flat
        fdt(pro)['KLC4', on = 'gene']  # flat
        fdt(fos)['KLC4', on = 'gene'][imputed==FALSE]  # one down, (one mildly up), one flat
    # RNA up, PRO flat, FOS up (and some flat)
        candgenes <-             unique(fdt(rna)[  `p~M00~limma` > 0.5 ]$gene)                              # RNA flat
        candgenes %<>% intersect(unique(fdt(pro)[  `p~M00~limma` > 0.5 ]$gene))                             # PRO flat
        candgenes %<>% intersect(unique(fdt(fos)[imputed==FALSE][`fdr~M00~limma` < 5e-4 & `effect~M00~limma` > 0 ]$gene))   # FOS down
        candgenes %<>% intersect(unique(fdt(fos)[imputed==FALSE][  `p~M00~limma` > 0.05                          ]$gene))   # FOS flat
        candgenes
        fdt(rna)['MFF', on = 'gene']                  # flat
        fdt(pro)['MFF', on = 'gene']                  # flat. but proteingroups file shows 3 isoforms
        fdt(fos)['MFF', on = 'gene'][imputed==FALSE]  # one up, one flat
        fdt(rna)['SLC4A7', on = 'gene']                  # flat
        fdt(pro)['SLC4A7', on = 'gene']                  # flat
        fdt(fos)['SLC4A7', on = 'gene'][imputed==FALSE]  # take 1 up and 2 flat
    # RNA flat, PRO impute down, FOS impute down
        candgenes <-      unique(fdt(rna)[ `p~M00~limma` > 0.5 ]$gene)                             # RNA flat
        candgenes %<>% intersect(fdt(pro)[imputed==TRUE]$gene)                                     # PRO imputed
        candgenes %<>% intersect(fdt(fos)[imputed==TRUE]$gene)                                     # FOS imputed
        candgenes %<>% intersect(fdt(pro)[ `fdr~M00~limma` < 5e-6 & `effect~M00~limma` < 0 ]$gene) # PRO down
        candgenes %<>% intersect(fdt(fos)[ `fdr~M00~limma` < 5e-6 & `effect~M00~limma` < 0 ]$gene) # FOS down
        candgenes
        fdt(rna)['MNAT1', on = 'gene']
        fdt(pro)['MNAT1', on = 'gene']
        fdt(fos)['MNAT1', on = 'gene']
    # RNA flat, PRO impute up, FOS impute up
        candgenes <-      unique(fdt(rna)[ `p~M00~limma` > 0.5 ]$gene)                             # RNA flat
        candgenes %<>% intersect(fdt(pro)[imputed==TRUE]$gene)                                     # PRO imputed
        candgenes %<>% intersect(fdt(fos)[imputed==TRUE]$gene)                                     # FOS imputed
        candgenes %<>% intersect(fdt(pro)[ `fdr~M00~limma` < 5e-3 & `effect~M00~limma` > 0 ]$gene) # PRO up
        candgenes %<>% intersect(fdt(fos)[ `fdr~M00~limma` < 5e-3 & `effect~M00~limma` > 0 ]$gene) # FOS up
        candgenes
        fdt(rna)['FYN', on = 'gene']
        fdt(pro)['FYN', on = 'gene']
        fdt(fos)['FYN', on = 'gene']
    # RNA flat, PRO flat, FOS flat, random NA
        candgenes <-             unique(fdt(rna)[ `p~M00~limma` > 0.2 ]$gene)   # RNA flat
        candgenes %<>% intersect(unique(fdt(pro)[ `p~M00~limma` > 0.2 ]$gene))  # PRO flat
        candgenes %<>% intersect(unique(fdt(fos)[ `p~M00~limma` > 0.2 ]$gene))  # FOS flat
        candgenes %<>% intersect(fdt(pro)$gene[random_nas(pro)])
        candgenes %<>% intersect(fdt(fos)$gene[random_nas(fos)])
        candgenes
        fdt(rna)['MFF', on = 'gene']
        fdt(pro)['MFF', on = 'gene'] # multiple isoforms
        fdt(fos)['MFF', on = 'gene']
        fdt(rna)['DDA1', on = 'gene']
        fdt(pro)['DDA1', on = 'gene']
        fdt(fos)['DDA1', on = 'gene']
        fdt(rna)['NEK1', on = 'gene']
        fdt(pro)['NEK1', on = 'gene']
        fdt(fos)['NEK1', on = 'gene']
        
    # Flatliner: RNA and FOS (not in PRO)
        candidates <- rnagenes %>% setdiff(progenes) %>% intersect(fosgenes)
        candidates %<>% intersect(fdt(rna)[`p~M00~limma` > 0.30, unique(gene)])  # flat rna - flat pro
        candidates %<>% intersect(fdt(fos)[`p~M00~limma` > 0.30, unique(gene)])  # flat rna - flat pro
        fdt(rna)['EAF1', on = 'gene']
        fdt(pro)['EAF1', on = 'gene']
        fdt(fos)['EAF1', on = 'gene']
    # Review
        rnafile <- download_data('billing19.rnacounts.txt')
        profile <- download_data('billing19.proteingroups.txt')
        fosfile <- download_data('billing19.phosphosites.txt')
        rnafile1 <- 'inst/extdata/billing19.rnacounts.txt'
        profile1 <- 'inst/extdata/billing19.proteingroups.txt'
        fosfile1 <- 'inst/extdata/billing19.phosphosites.txt'
        subgroups <- sprintf('%s_STD', c('E00', 'E01', 'E02', 'E05', 'E15', 'E30', 'M00'))
        rna  <- read_rnaseq_counts(rnafile)
        rna1 <- read_rnaseq_counts(rnafile1)
        pro  <- read_maxquant_proteingroups(profile,  subgroups = subgroups, contaminant = TRUE, reverse = TRUE)
        pro1 <- read_maxquant_proteingroups(profile1, subgroups = subgroups)
        fos  <- read_maxquant_phosphosites( fosfile = fosfile,  profile = profile,  subgroups = subgroups, contaminant = TRUE, reverse = TRUE )
        fos1 <- read_maxquant_phosphosites( fosfile = fosfile1, profile = profile1, subgroups = subgroups)
        pro$subgroup  %<>% split_extract_fixed('_', 1)
        pro1$subgroup %<>% split_extract_fixed('_', 1)
        fos$subgroup  %<>% split_extract_fixed('_', 1)
        fos1$subgroup %<>% split_extract_fixed('_', 1)
        fdt(fos)$`Fasta headers` <- NULL
        fvars(rna) %<>% stri_replace_first_fixed('gene_name', 'gene')
        pcalist <- list( p1 = biplot(pca(rna ), nx = 1, ny = 1, feature_label = 'gene') + ggtitle('RNA'),  #  COL3A1
                         p2 = biplot(pca(rna1), nx = 1, ny = 1, feature_label = 'gene') + ggtitle('rna'),  #  COL11A1
                         p3 = biplot(pca(pro ), nx = 1, ny = 1, feature_label = 'gene') + ggtitle('PRO'),  #  HSPB6
                         p4 = biplot(pca(pro1), nx = 1, ny = 1, feature_label = 'gene') + ggtitle('pro'),  #  LCP1
                         p5 = biplot(pca(fos ), nx = 1, ny = 1, feature_label = 'gene') + ggtitle('FOS'),  #  VIM
                         p6 = biplot(pca(fos1), nx = 1, ny = 1, feature_label = 'gene') + ggtitle('fos') ) #  NES
        gridExtra::grid.arrange(grobs = pcalist, layout_matrix = matrix(1:6, nrow = 3, byrow = TRUE))
        rna  %<>% fit_limma()
        pro  %<>% fit_limma()
        fos  %<>% fit_limma()
        rna1 %<>% fit_limma()
        pro1 %<>% fit_limma()
        fos1 %<>% fit_limma()
        gridExtra::grid.arrange( plot_volcano(rna,  label = 'gene_name'), 
                                 plot_volcano(rna1, label = 'gene'     ) )
        gridExtra::grid.arrange( plot_volcano(pro,  label = 'gene'     ), 
                                 plot_volcano(pro1, label = 'gene'     ) )
        gridExtra::grid.arrange( plot_volcano(fos,  label = 'gene'     ), 
                                 plot_volcano(fos1, label = 'gene'     ) )
        
        plotlist <- list( p1 = plot_volcano(rna,  label = 'gene_name' ),
                          p2 = plot_volcano(rna1, label = 'gene'),
                          p3 = plot_volcano(pro,  label = 'gene'),
                          p4 = plot_volcano(pro1, label = 'gene'),
                          p5 = plot_volcano(fos,  label = 'gene'),
                          p6 = plot_volcano(fos1, label = 'gene') )
        gridExtra::grid.arrange(grobs = plotlist, layout_matrix = matrix(1:6, nrow = 3, byrow = TRUE))


        
#---------------------------------------------------------------------------------
#
#   BILLING STEM CELL COMPARISON
#   https://www.nature.com/articles/srep21507
#
#---------------------------------------------------------------------------------
        # RNA, PRO, and SOMA exist
        # But the examples us only PRO
        # So for now, lets just downsize that.
        profile <-  download_data('billing16.proteingroups.txt')
        somafile <- download_data('billing16.somascan.adat')
        rna <- read_rnaseq_counts(rnafile)
        pro <- read_maxquant_proteingroups(profile)
        soma <- read_somascan(somafile)
        rna # already a reduced set file !
        pro
        soma
        
#---------------------------------------------------------------------------------
#
#   UNIPROT_HSA_20140515
#
#---------------------------------------------------------------------------------
        profile <- system.file('extdata/billing19.proteingroups.txt', package = 'autonomics')
        fosfile <- system.file('extdata/billing19.phosphosites.txt',  package = 'autonomics')
        prodt <- fread(profile, select = c('id', 'Majority protein IDs', 'Gene names'))
        fosdt <- fread(fosfile, select = c('id', 'Protein group IDs', 'Proteins', 'Gene names'))
        
        uniprots <-         unique(unlist(stri_split_fixed(prodt$`Majority protein IDs`, ';')))
        uniprots %<>% union(unique(unlist(stri_split_fixed(fosdt$`Proteins`,             ';'))))
        uniprots %<>% extract(. != '')
        uniprots %<>% extract(!stri_detect_fixed(., 'REV__'))
        uniprots %<>% extract(!stri_detect_fixed(., 'CON__'))
      # uniprots %<>% split_extract_fixed('-', 1)
        uniprots %<>% unique()

        fastainfile <- download_data('uniprot_hsa_20140515.fasta')
        fastaobj <- Biostrings::readAAStringSet(fastainfile)
        fastaups <- names(fastaobj) %>% split_extract_fixed('|', 2)
        idx <- match(uniprots, fastaups)
        fastaobj %<>% extract(idx)
        fastaoutfile <- 'inst/extdata/uniprot_hsa_20140515.fasta'
        Biostrings::writeXStringSet(fastaobj, filepath = fastaoutfile)
        
        pro1 <- read_maxquant_proteingroups(profile)
        pro2 <- read_maxquant_proteingroups(profile, fastafile = fastafile)
        fdt(pro1)
        fdt(pro2)

        
#----------------
#
#   DIANN DATASET
#   https://www.nature.com/articles/s41467-023-40596-0
#
#----------------
        # PCA and PLS: major variation drivers
        file <- '../../../02_analysis/ag_serrano/ag_pogge-serrano-pkp2-001/atnomx_report_names.tsv'
        object <- read_diann_proteingroups(file)
        sdt(object)
        p1 <- biplot(pca(object), nx = 2, ny = 4, feature_label = 'gene')
        p2 <- biplot(pls(object), nx = 2, ny = 2, feature_label = 'gene') # maybe better these to raise less questions
        gridExtra::grid.arrange(p1, p2, nrow = 1)
        genes <- c(  
                    'PI16',        #  down in MA
                    'CPM',         #    up in MA
                    'RPL36AL',     #  down in KD (MA)
                    'ANKRD54',     #    up in KD (MA)
                    'TFB1M',       #  down in KD (MA + PA)
                    'NAV1'         #    up in KD (MA + PA)
                 )
        
        # Explore feature clusters
        fcluster(object)            # 1 :   up (MA.WT, MA.KD) - down (PA.WT, PA.KD)
                                    # 2 : down (MA.WT, MA.KD) - up (PA.WT, PA.KD)
                                    # 3 : seems to be more flat backgrounders
                                    # But is clustering suited to pick flat backgrounders ?
                                    # Zscoring actually inflates signal
                                    # And flat backgrounders will not have a consistent pattern anyways
                                    # F test would actually be more suited
        fdt(object) %<>% extract(, 1:2)
        fdt(fit_limma(object))
        fdt(fit_lm(object))
        
        
        
        
bhagwataditya/importomics documentation built on Oct. 29, 2024, 3:19 p.m.