In Carldeboer/MAUDE: Mean Alterations Using Discrete Expression

knitr::opts_chunk$set(echo = TRUE)

Introduction

In this example, we use MAUDE to analyze a CRISPR base editor screen targeting a region containing a autoimmune-linked varaint (rs72928038) within a BACH2 enhancer. Data are from Mouri et al (Prioritization of autoimmune disease-associated genetic variants that perturb regulatory element activity in T cells), which also contains further details about the experiment: https://www.biorxiv.org/content/10.1101/2021.05.30.445673v1

In this experiment, CRISPR/Cas9 base editors were targeted to rs72928038, where they mutate the DNA. Expression is then measured by sorting cells by BACH2 expression (Flow-FISH) into 4 expression bins. We then use MAUDE to estimate the mean expression of each of the alleles generated by the base editors. Only a minority of the alleles created correspond to rs72928038.

Here, we have several experiments and replicates, four sorting bins per experiment, as well as unsorted cells. For more details, please see the before referenced publication.

Loading the data/libraries

library(ggplot2)
library(reshape)
library(MAUDE)
maudeGitPathRoot = "https://raw.githubusercontent.com/de-Boer-Lab/MAUDE/master"

Load sample metadata.

allSamples = read.table(file=sprintf("%s/vignettes/BACH2_data/sample_metadata.txt",maudeGitPathRoot), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F)
head(allSamples)
#remove some of the control samples
allSamples = allSamples[!grepl("HEK62", allSamples$replicate),]

Load the data.

if (FALSE){
  #this was run on my computer to load the data from many files spread out over many subdirectories. Rather than upload all these files separately, I have loaded them all locally and saved the resulting concatenated file onto github. I leave this code here so that others may view how the data was loaded, should they have their own CRISPResso files to analyze.
  inDir = "/Path/To/CRISPResso/Files";
  setwd(inDir)


  allCRISPRessoData= data.frame()
  for (i in 1:nrow(allSamples)){
    curData = read.table(file=sprintf("%s/%s/Alleles_frequency_table.txt",inDir, allSamples$directory[i]), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F, comment.char = "")
    curData2 = read.table(file=sprintf("%s/Deep_resequencing_analysis/%s/Alleles_frequency_table.txt",inDir, allSamples$directory[i]), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F, comment.char = "")
    curData = rbind(curData, curData2);
    curData$locus = allSamples$locus[i];
    curData$replicate = allSamples$replicate[i];
    curData$sortBin = allSamples$sortBin[i];
    allCRISPRessoData = rbind(allCRISPRessoData, curData)
  }
  #remove gaps from sequences
  allCRISPRessoData$SeqSpecies = gsub("-","",allCRISPRessoData$Aligned_Sequence); # Allele
  #remove unwanted fields
  allCRISPRessoData$Aligned_Sequence=NULL; allCRISPRessoData$Reference_Sequence=NULL;  allCRISPRessoData$X.Reads.1=NULL;
  write.table(allCRISPRessoData, file=sprintf("%s/loaded_BACH2_BE_CRISPResso_data.txt", inDir),row.names = F, col.names = T, quote=F, sep="\t")
  #I then gzipped this file and uploaded it to github
}else{
  #load the CRISPResso files from GitHub. 
  z= gzcon(url(sprintf("%s/vignettes/BACH2_data/loaded_BACH2_BE_CRISPResso_data.txt.gz",maudeGitPathRoot)));
  fileConn=textConnection(readLines(z));
  allCRISPRessoData =  read.table(fileConn, sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F)
  close(fileConn)
}

Polish CRISPResso genotype data, calculate reads per sample and genotype composition per sample.

#CRISPResso has split some sequences into two or maybe more lines; below is an example
#We also input multiple files from different sequencing runs. This merges all identical seq species'/samples
allCRISPRessoData = cast(melt(allCRISPRessoData, id.vars = c("SeqSpecies","Reference_Name","Read_Status","n_deleted","n_inserted","n_mutated", "locus","replicate","sortBin")), SeqSpecies + Reference_Name + Read_Status + n_deleted + n_inserted + n_mutated + locus + replicate + sortBin ~ variable, value="value", fun.aggregate=sum)

#Get read totals per replicate
allCRISPRessoDataTotals = cast(allCRISPRessoData, locus + replicate + sortBin ~ ., value="X.Reads", fun.aggregate = sum)
names(allCRISPRessoDataTotals)[ncol(allCRISPRessoDataTotals)] = "totalReads";

#Add totalReads column to allCRISPRessoData, calculate read fractions
allCRISPRessoData = merge(allCRISPRessoData, allCRISPRessoDataTotals, by=c("locus","replicate","sortBin"))
allCRISPRessoData$readFraction = allCRISPRessoData$X.Reads/allCRISPRessoData$totalReads;

Quality Control analysis for the experiment

Reads per sample

p = ggplot(allCRISPRessoDataTotals, aes(x=replicate, fill=sortBin, y=totalReads)) + geom_bar(stat="identity", position="dodge") + facet_grid(. ~ locus) + theme_classic() + scale_y_continuous(expand=c(0,0))+theme(axis.text.x=element_text(hjust=1, angle=90)); print(p)

Same on a log scale

p = ggplot(allCRISPRessoDataTotals, aes(x=replicate, fill=sortBin, y=totalReads)) + geom_bar(stat="identity", position="dodge") + facet_grid(. ~ locus) + theme_classic() + scale_y_log10(expand=c(0,0))+theme(axis.text.x=element_text(hjust=1, angle=90)); print(p)

Toss all the cases where the locus was unmodified but sequenced anyway. Note that this means each was effectively only 50K cells, not 100k because the gDNA was divided in half

allCRISPRessoData = allCRISPRessoData[ !(allCRISPRessoData$locus=="HEK" & grepl("^[ABCR][12]$", allCRISPRessoData$replicate)) & !(allCRISPRessoData$locus=="BACH2" & grepl("^HEK9[12]", allCRISPRessoData$replicate)),]

Plot a CDF showing the cumulative fraction of genotypes (y axis) per sample (colour) by their abundance in the sequencing data (x axis). This illustrates how biased the data are towards rare genotypes (left) vs abundant genotypes (right). The more the curves shift to the right, the less complex the modifications to the DNA. A lot of the ones to the left are from sequencing artifacts.

p = ggplot(allCRISPRessoData[allCRISPRessoData$sortBin=="NS",], aes(x= readFraction, colour=replicate)) + stat_ecdf()+facet_grid(locus ~ .) + scale_x_log10() + theme_bw(); print(p)

Estimate the %unmodified in each sample.

temp = cast(allCRISPRessoData[allCRISPRessoData$Read_Status=="UNMODIFIED" & allCRISPRessoData$Reference_Name=="Reference",], formula = sortBin + locus+ replicate ~. ,value = "readFraction", fun.aggregate = max)
names(temp)[ncol(temp)]="readFraction";
p = ggplot(temp, aes(x=sortBin, y=replicate, fill=readFraction)) + geom_tile() + facet_grid(locus ~., scales="free_y")+ggtitle("just unmodified read fractions") + scale_fill_gradientn(colours=c("red","orange", "green","cyan","blue","violet"), limits=c(0,0.8)); print(p)
min(temp$readFraction, na.rm = T) # 0.0356963

Sample exclusion

allCRISPRessoData = allCRISPRessoData[allCRISPRessoData$replicate!="A1-HEK",] # this sample had no data for bin D (BACH2) because it didn't PCR well

allCRISPRessoData = allCRISPRessoData[allCRISPRessoData$replicate!="B2-HEK",] # this sample had very low coverage for bin D for BACH2

Identify and retain only those alleles that are present in all samples

seqsObserved = cast(allCRISPRessoData, SeqSpecies + locus + Read_Status + Reference_Name~ .)
names(seqsObserved)[ncol(seqsObserved)]="seqRunsObserved";

retainedSamples = unique(allCRISPRessoData[,c("locus","replicate","sortBin")])

for (l in unique(allSamples$locus)){
  seqsObserved$inAll[seqsObserved$locus == l] = seqsObserved$seqRunsObserved[seqsObserved$locus == l]==sum(retainedSamples$locus==l)
}

#require that all samples have at least one read for every species considered. This will help exclude read errors
keepAlleles = seqsObserved[seqsObserved$inAll,]

MAUDE Analysis of base editor screen

#First make a count matrix, but only of alleles seen in all replicates
readCountMat = cast(allCRISPRessoData[allCRISPRessoData$SeqSpecies %in% keepAlleles$SeqSpecies,], SeqSpecies  + replicate +locus ~ sortBin, value="X.Reads")
readCountMat[is.na(readCountMat)] = 0
readCountMat = readCountMat[order(readCountMat$NS, decreasing = T),]

#Label WT alleles
wtSeqs = data.frame(locus = c("HEK","BACH2"), SeqSpecies =  c("GGTAGCCAGAGACCCGCTGGTCTTCTTTCCCCTCCCCTGCCCTCCCCTCCCTTCAAGATGGCTGACAA","TGCCCCACCCTGTGCCCTTTTTACATTACACACAAATAGGGACGGATTTCCTGTAAGCTGATCTTGAAGAAAAAAAACATGTTAGACAAAGAAAATCAGAACTAAGA"), isWT=T, stringsAsFactors = F);
readCountMat = merge(readCountMat, wtSeqs, by=c("locus","SeqSpecies"), all.x=T)
readCountMat$isWT[is.na(readCountMat$isWT)]=F;

allCRISPRessoData = merge(allCRISPRessoData, wtSeqs, by=c("locus","SeqSpecies"), all.x=T)
allCRISPRessoData$isWT[is.na(allCRISPRessoData$isWT)]=F;


binBounds = makeBinModel(data.frame(Bin=c("A","B","C","D","E","F"), fraction=rep(0.105,6))) # 10.5% bins
# but only the top ~21% (CD) and bottom ~21% (AB) were retained
binBounds = binBounds[binBounds$Bin %in% c("A","B","E","F"),] 
binBounds$Bin[binBounds$Bin=="E"]="C"
binBounds$Bin[binBounds$Bin=="F"]="D"

The bin layout:

p = ggplot(binBounds, aes(colour=Bin))  + 
  geom_density(data=data.frame(x=rnorm(100000)), aes(x=x), fill="gray", colour=NA)+ 
  ggplot2::geom_segment(aes(x=binStartZ, xend=binEndZ, y=fraction, yend=fraction)) + 
  xlab("Bin bounds as expression Z-scores") + 
  ylab("Fraction of the distribution captured") +theme_classic()+scale_y_continuous(expand=c(0,0))+
  coord_cartesian(ylim=c(0,0.7)); print(p)

Replicate bin stats for each experiment (the same sorting parameters were used for each)

curExpts = unique(readCountMat[c("replicate","locus")])

binStatsAll = data.frame();
for(i in 1:nrow(curExpts)){
  binStatsAll = rbind(binStatsAll, cbind(binBounds, curExpts[i,]))
}

#perform MAUDE analysis at guide (allele in this case) level for each locus.
guideLevelStats = data.frame();
for( l in unique(allSamples$locus)){
  message(l);
  statsA = findGuideHitsAllScreens(experiments = curExpts[curExpts$locus==l,], countDataFrame = readCountMat[readCountMat$locus==l,], binStats = binStatsAll[binStatsAll$locus==l,], sortBins = c("A","B","C","D"), unsortedBin = "NS", negativeControl = "isWT")
  guideLevelStats = rbind(guideLevelStats, statsA)
}

Summarize base changes per allele

baseChanges = data.frame()
for (l in wtSeqs$locus){
  wtSeq = wtSeqs$SeqSpecies[wtSeqs$locus==l];
  wtSplit = strsplit(wtSeq,"")[[1]]
  curSeqs = unique(guideLevelStats$SeqSpecies[guideLevelStats$locus==l & guideLevelStats$libFraction > 0.001])
  for (i in 1:length(curSeqs)){
    curSplit = strsplit(curSeqs[i],"")[[1]]
    mismatches = c();
    mismatchPoss=c();
    for (j in 1:min(length(wtSplit),length(curSplit))){
      if (wtSplit[j]!=curSplit[j]){
        mismatches = c(mismatches, curSplit[j]);
        mismatchPoss = c(mismatchPoss, j)
      }
    }
    if (length(mismatchPoss)>0){
      baseChanges = rbind(baseChanges, data.frame(mismatch=mismatches, position = mismatchPoss, SeqSpecies=curSeqs[i], locus=l))
    }
  }
}

baseChangesSummary = cast(baseChanges, SeqSpecies +locus~ ., value="mismatch")
names(baseChangesSummary)[ncol(baseChangesSummary)] = "numMismatches";
p = ggplot(baseChangesSummary, aes(x=numMismatches, colour=locus)) + stat_ecdf()+ geom_vline(xintercept = 15); print(p)

#exclude any where the number of mismatches is greater than 15 - these are also likely read or PCR artifacts.
baseChangesSummary$keepAlleles= baseChangesSummary$numMismatches<15;

baseChanges$has = 1;

baseChangesNumWith = cast(baseChanges[baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles],], position + mismatch + locus~ ., value="has", fun.aggregate = sum)
names(baseChangesNumWith)[ncol(baseChangesNumWith)] = "numSeqsWith"
baseChanges = merge(baseChanges, baseChangesNumWith, by=c("position","mismatch","locus"))
baseChanges$mismatch = as.character(baseChanges$mismatch)
baseChanges$SeqSpecies = as.character(baseChanges$SeqSpecies)


p = ggplot(baseChanges[baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles] & !(baseChanges$SeqSpecies %in% baseChanges$SeqSpecies[baseChanges$numSeqsWith==1]),], aes(x=position, y=SeqSpecies, fill=mismatch)) + geom_tile()+scale_fill_manual(values=c("orange","green","blue","red"))+scale_x_continuous(expand=c(0,0))+theme_classic() + theme(axis.text.y = element_text(size=5, family = "Courier")) + facet_grid(locus ~ ., scales="free", space ="free"); print(p)

#A_53 is the mutation of interest

baseChangesMatrix = cast(baseChanges[baseChanges$locus == "BACH2" & baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles] & !(baseChanges$SeqSpecies %in% baseChanges$SeqSpecies[baseChanges$numSeqsWith==1]),], SeqSpecies ~ mismatch + position, value="has", fill = 0)
A53_seq = baseChangesMatrix$SeqSpecies[apply(baseChangesMatrix[2:ncol(baseChangesMatrix)],1, sum)==1 & baseChangesMatrix$A_53==1]

guideLevelStats$pooled = grepl("-",guideLevelStats$replicate);

wtAndVarMaudeMu = cast(guideLevelStats[guideLevelStats$SeqSpecies %in% c(A53_seq, wtSeqs$SeqSpecies[wtSeqs$locus=="BACH2"]),], replicate +pooled ~ SeqSpecies, value="mean")
names(wtAndVarMaudeMu)[names(wtAndVarMaudeMu)==A53_seq]="rs72928038";
names(wtAndVarMaudeMu)[names(wtAndVarMaudeMu)==wtSeqs$SeqSpecies[wtSeqs$locus=="BACH2"]]="WT";

ttestResults = t.test(x = wtAndVarMaudeMu$rs72928038[!wtAndVarMaudeMu$pooled], y=wtAndVarMaudeMu$WT[!wtAndVarMaudeMu$pooled], alternative="less", paired=TRUE, var.equal = FALSE)

ttestResults_mixedCells = t.test(x = wtAndVarMaudeMu$rs72928038[wtAndVarMaudeMu$pooled], y=wtAndVarMaudeMu$WT[wtAndVarMaudeMu$pooled], alternative="less", paired=TRUE, var.equal = FALSE)

wtAndVarMaudeMu$replicate2 = gsub("-HEK","",wtAndVarMaudeMu$replicate)

Make plot of WT vs rs72928038 mean expression levels

meltedSNPMus = melt(as.data.frame(wtAndVarMaudeMu), id.vars=c("pooled","replicate", "replicate2"));
meltedSNPMus$genotype = factor(ifelse(meltedSNPMus$variable=="WT", "G", "A (risk)"),levels=c("G","A (risk)"))
p = ggplot(meltedSNPMus, aes(x=genotype, y=value, group=replicate)) + geom_point() + geom_line()+facet_grid(. ~ pooled)+theme_bw()+xlab("Genotype") + ylab("Mean expression") + ggtitle(sprintf("P=%f; P=%f", ttestResults$p.value, ttestResults_mixedCells$p.value)); print(p)