knitr::opts_chunk$set(echo = TRUE)
In this example, we use MAUDE to analyze a CRISPR base editor screen targeting a region containing a autoimmune-linked varaint (rs72928038) within a BACH2 enhancer. Data are from Mouri et al (Prioritization of autoimmune disease-associated genetic variants that perturb regulatory element activity in T cells), which also contains further details about the experiment: https://www.biorxiv.org/content/10.1101/2021.05.30.445673v1
In this experiment, CRISPR/Cas9 base editors were targeted to rs72928038, where they mutate the DNA. Expression is then measured by sorting cells by BACH2 expression (Flow-FISH) into 4 expression bins. We then use MAUDE to estimate the mean expression of each of the alleles generated by the base editors. Only a minority of the alleles created correspond to rs72928038.
Here, we have several experiments and replicates, four sorting bins per experiment, as well as unsorted cells. For more details, please see the before referenced publication.
library(ggplot2) library(reshape) library(MAUDE) maudeGitPathRoot = "https://raw.githubusercontent.com/de-Boer-Lab/MAUDE/master"
Load sample metadata.
allSamples = read.table(file=sprintf("%s/vignettes/BACH2_data/sample_metadata.txt",maudeGitPathRoot), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F) head(allSamples) #remove some of the control samples allSamples = allSamples[!grepl("HEK62", allSamples$replicate),]
Load the data.
if (FALSE){ #this was run on my computer to load the data from many files spread out over many subdirectories. Rather than upload all these files separately, I have loaded them all locally and saved the resulting concatenated file onto github. I leave this code here so that others may view how the data was loaded, should they have their own CRISPResso files to analyze. inDir = "/Path/To/CRISPResso/Files"; setwd(inDir) allCRISPRessoData= data.frame() for (i in 1:nrow(allSamples)){ curData = read.table(file=sprintf("%s/%s/Alleles_frequency_table.txt",inDir, allSamples$directory[i]), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F, comment.char = "") curData2 = read.table(file=sprintf("%s/Deep_resequencing_analysis/%s/Alleles_frequency_table.txt",inDir, allSamples$directory[i]), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F, comment.char = "") curData = rbind(curData, curData2); curData$locus = allSamples$locus[i]; curData$replicate = allSamples$replicate[i]; curData$sortBin = allSamples$sortBin[i]; allCRISPRessoData = rbind(allCRISPRessoData, curData) } #remove gaps from sequences allCRISPRessoData$SeqSpecies = gsub("-","",allCRISPRessoData$Aligned_Sequence); # Allele #remove unwanted fields allCRISPRessoData$Aligned_Sequence=NULL; allCRISPRessoData$Reference_Sequence=NULL; allCRISPRessoData$X.Reads.1=NULL; write.table(allCRISPRessoData, file=sprintf("%s/loaded_BACH2_BE_CRISPResso_data.txt", inDir),row.names = F, col.names = T, quote=F, sep="\t") #I then gzipped this file and uploaded it to github }else{ #load the CRISPResso files from GitHub. z= gzcon(url(sprintf("%s/vignettes/BACH2_data/loaded_BACH2_BE_CRISPResso_data.txt.gz",maudeGitPathRoot))); fileConn=textConnection(readLines(z)); allCRISPRessoData = read.table(fileConn, sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F) close(fileConn) }
Polish CRISPResso genotype data, calculate reads per sample and genotype composition per sample.
#CRISPResso has split some sequences into two or maybe more lines; below is an example #We also input multiple files from different sequencing runs. This merges all identical seq species'/samples allCRISPRessoData = cast(melt(allCRISPRessoData, id.vars = c("SeqSpecies","Reference_Name","Read_Status","n_deleted","n_inserted","n_mutated", "locus","replicate","sortBin")), SeqSpecies + Reference_Name + Read_Status + n_deleted + n_inserted + n_mutated + locus + replicate + sortBin ~ variable, value="value", fun.aggregate=sum) #Get read totals per replicate allCRISPRessoDataTotals = cast(allCRISPRessoData, locus + replicate + sortBin ~ ., value="X.Reads", fun.aggregate = sum) names(allCRISPRessoDataTotals)[ncol(allCRISPRessoDataTotals)] = "totalReads"; #Add totalReads column to allCRISPRessoData, calculate read fractions allCRISPRessoData = merge(allCRISPRessoData, allCRISPRessoDataTotals, by=c("locus","replicate","sortBin")) allCRISPRessoData$readFraction = allCRISPRessoData$X.Reads/allCRISPRessoData$totalReads;
Reads per sample
p = ggplot(allCRISPRessoDataTotals, aes(x=replicate, fill=sortBin, y=totalReads)) + geom_bar(stat="identity", position="dodge") + facet_grid(. ~ locus) + theme_classic() + scale_y_continuous(expand=c(0,0))+theme(axis.text.x=element_text(hjust=1, angle=90)); print(p)
Same on a log scale
p = ggplot(allCRISPRessoDataTotals, aes(x=replicate, fill=sortBin, y=totalReads)) + geom_bar(stat="identity", position="dodge") + facet_grid(. ~ locus) + theme_classic() + scale_y_log10(expand=c(0,0))+theme(axis.text.x=element_text(hjust=1, angle=90)); print(p)
Toss all the cases where the locus was unmodified but sequenced anyway. Note that this means each was effectively only 50K cells, not 100k because the gDNA was divided in half
allCRISPRessoData = allCRISPRessoData[ !(allCRISPRessoData$locus=="HEK" & grepl("^[ABCR][12]$", allCRISPRessoData$replicate)) & !(allCRISPRessoData$locus=="BACH2" & grepl("^HEK9[12]", allCRISPRessoData$replicate)),]
Plot a CDF showing the cumulative fraction of genotypes (y axis) per sample (colour) by their abundance in the sequencing data (x axis). This illustrates how biased the data are towards rare genotypes (left) vs abundant genotypes (right). The more the curves shift to the right, the less complex the modifications to the DNA. A lot of the ones to the left are from sequencing artifacts.
p = ggplot(allCRISPRessoData[allCRISPRessoData$sortBin=="NS",], aes(x= readFraction, colour=replicate)) + stat_ecdf()+facet_grid(locus ~ .) + scale_x_log10() + theme_bw(); print(p)
Estimate the %unmodified in each sample.
temp = cast(allCRISPRessoData[allCRISPRessoData$Read_Status=="UNMODIFIED" & allCRISPRessoData$Reference_Name=="Reference",], formula = sortBin + locus+ replicate ~. ,value = "readFraction", fun.aggregate = max) names(temp)[ncol(temp)]="readFraction"; p = ggplot(temp, aes(x=sortBin, y=replicate, fill=readFraction)) + geom_tile() + facet_grid(locus ~., scales="free_y")+ggtitle("just unmodified read fractions") + scale_fill_gradientn(colours=c("red","orange", "green","cyan","blue","violet"), limits=c(0,0.8)); print(p) min(temp$readFraction, na.rm = T) # 0.0356963
Sample exclusion
allCRISPRessoData = allCRISPRessoData[allCRISPRessoData$replicate!="A1-HEK",] # this sample had no data for bin D (BACH2) because it didn't PCR well allCRISPRessoData = allCRISPRessoData[allCRISPRessoData$replicate!="B2-HEK",] # this sample had very low coverage for bin D for BACH2
Identify and retain only those alleles that are present in all samples
seqsObserved = cast(allCRISPRessoData, SeqSpecies + locus + Read_Status + Reference_Name~ .) names(seqsObserved)[ncol(seqsObserved)]="seqRunsObserved"; retainedSamples = unique(allCRISPRessoData[,c("locus","replicate","sortBin")]) for (l in unique(allSamples$locus)){ seqsObserved$inAll[seqsObserved$locus == l] = seqsObserved$seqRunsObserved[seqsObserved$locus == l]==sum(retainedSamples$locus==l) } #require that all samples have at least one read for every species considered. This will help exclude read errors keepAlleles = seqsObserved[seqsObserved$inAll,]
#First make a count matrix, but only of alleles seen in all replicates readCountMat = cast(allCRISPRessoData[allCRISPRessoData$SeqSpecies %in% keepAlleles$SeqSpecies,], SeqSpecies + replicate +locus ~ sortBin, value="X.Reads") readCountMat[is.na(readCountMat)] = 0 readCountMat = readCountMat[order(readCountMat$NS, decreasing = T),] #Label WT alleles wtSeqs = data.frame(locus = c("HEK","BACH2"), SeqSpecies = c("GGTAGCCAGAGACCCGCTGGTCTTCTTTCCCCTCCCCTGCCCTCCCCTCCCTTCAAGATGGCTGACAA","TGCCCCACCCTGTGCCCTTTTTACATTACACACAAATAGGGACGGATTTCCTGTAAGCTGATCTTGAAGAAAAAAAACATGTTAGACAAAGAAAATCAGAACTAAGA"), isWT=T, stringsAsFactors = F); readCountMat = merge(readCountMat, wtSeqs, by=c("locus","SeqSpecies"), all.x=T) readCountMat$isWT[is.na(readCountMat$isWT)]=F; allCRISPRessoData = merge(allCRISPRessoData, wtSeqs, by=c("locus","SeqSpecies"), all.x=T) allCRISPRessoData$isWT[is.na(allCRISPRessoData$isWT)]=F; binBounds = makeBinModel(data.frame(Bin=c("A","B","C","D","E","F"), fraction=rep(0.105,6))) # 10.5% bins # but only the top ~21% (CD) and bottom ~21% (AB) were retained binBounds = binBounds[binBounds$Bin %in% c("A","B","E","F"),] binBounds$Bin[binBounds$Bin=="E"]="C" binBounds$Bin[binBounds$Bin=="F"]="D"
The bin layout:
p = ggplot(binBounds, aes(colour=Bin)) + geom_density(data=data.frame(x=rnorm(100000)), aes(x=x), fill="gray", colour=NA)+ ggplot2::geom_segment(aes(x=binStartZ, xend=binEndZ, y=fraction, yend=fraction)) + xlab("Bin bounds as expression Z-scores") + ylab("Fraction of the distribution captured") +theme_classic()+scale_y_continuous(expand=c(0,0))+ coord_cartesian(ylim=c(0,0.7)); print(p)
Replicate bin stats for each experiment (the same sorting parameters were used for each)
curExpts = unique(readCountMat[c("replicate","locus")]) binStatsAll = data.frame(); for(i in 1:nrow(curExpts)){ binStatsAll = rbind(binStatsAll, cbind(binBounds, curExpts[i,])) }
#perform MAUDE analysis at guide (allele in this case) level for each locus. guideLevelStats = data.frame(); for( l in unique(allSamples$locus)){ message(l); statsA = findGuideHitsAllScreens(experiments = curExpts[curExpts$locus==l,], countDataFrame = readCountMat[readCountMat$locus==l,], binStats = binStatsAll[binStatsAll$locus==l,], sortBins = c("A","B","C","D"), unsortedBin = "NS", negativeControl = "isWT") guideLevelStats = rbind(guideLevelStats, statsA) }
Summarize base changes per allele
baseChanges = data.frame() for (l in wtSeqs$locus){ wtSeq = wtSeqs$SeqSpecies[wtSeqs$locus==l]; wtSplit = strsplit(wtSeq,"")[[1]] curSeqs = unique(guideLevelStats$SeqSpecies[guideLevelStats$locus==l & guideLevelStats$libFraction > 0.001]) for (i in 1:length(curSeqs)){ curSplit = strsplit(curSeqs[i],"")[[1]] mismatches = c(); mismatchPoss=c(); for (j in 1:min(length(wtSplit),length(curSplit))){ if (wtSplit[j]!=curSplit[j]){ mismatches = c(mismatches, curSplit[j]); mismatchPoss = c(mismatchPoss, j) } } if (length(mismatchPoss)>0){ baseChanges = rbind(baseChanges, data.frame(mismatch=mismatches, position = mismatchPoss, SeqSpecies=curSeqs[i], locus=l)) } } } baseChangesSummary = cast(baseChanges, SeqSpecies +locus~ ., value="mismatch") names(baseChangesSummary)[ncol(baseChangesSummary)] = "numMismatches"; p = ggplot(baseChangesSummary, aes(x=numMismatches, colour=locus)) + stat_ecdf()+ geom_vline(xintercept = 15); print(p)
#exclude any where the number of mismatches is greater than 15 - these are also likely read or PCR artifacts. baseChangesSummary$keepAlleles= baseChangesSummary$numMismatches<15; baseChanges$has = 1; baseChangesNumWith = cast(baseChanges[baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles],], position + mismatch + locus~ ., value="has", fun.aggregate = sum) names(baseChangesNumWith)[ncol(baseChangesNumWith)] = "numSeqsWith" baseChanges = merge(baseChanges, baseChangesNumWith, by=c("position","mismatch","locus")) baseChanges$mismatch = as.character(baseChanges$mismatch) baseChanges$SeqSpecies = as.character(baseChanges$SeqSpecies) p = ggplot(baseChanges[baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles] & !(baseChanges$SeqSpecies %in% baseChanges$SeqSpecies[baseChanges$numSeqsWith==1]),], aes(x=position, y=SeqSpecies, fill=mismatch)) + geom_tile()+scale_fill_manual(values=c("orange","green","blue","red"))+scale_x_continuous(expand=c(0,0))+theme_classic() + theme(axis.text.y = element_text(size=5, family = "Courier")) + facet_grid(locus ~ ., scales="free", space ="free"); print(p)
#A_53 is the mutation of interest baseChangesMatrix = cast(baseChanges[baseChanges$locus == "BACH2" & baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles] & !(baseChanges$SeqSpecies %in% baseChanges$SeqSpecies[baseChanges$numSeqsWith==1]),], SeqSpecies ~ mismatch + position, value="has", fill = 0) A53_seq = baseChangesMatrix$SeqSpecies[apply(baseChangesMatrix[2:ncol(baseChangesMatrix)],1, sum)==1 & baseChangesMatrix$A_53==1] guideLevelStats$pooled = grepl("-",guideLevelStats$replicate);
wtAndVarMaudeMu = cast(guideLevelStats[guideLevelStats$SeqSpecies %in% c(A53_seq, wtSeqs$SeqSpecies[wtSeqs$locus=="BACH2"]),], replicate +pooled ~ SeqSpecies, value="mean") names(wtAndVarMaudeMu)[names(wtAndVarMaudeMu)==A53_seq]="rs72928038"; names(wtAndVarMaudeMu)[names(wtAndVarMaudeMu)==wtSeqs$SeqSpecies[wtSeqs$locus=="BACH2"]]="WT"; ttestResults = t.test(x = wtAndVarMaudeMu$rs72928038[!wtAndVarMaudeMu$pooled], y=wtAndVarMaudeMu$WT[!wtAndVarMaudeMu$pooled], alternative="less", paired=TRUE, var.equal = FALSE) ttestResults_mixedCells = t.test(x = wtAndVarMaudeMu$rs72928038[wtAndVarMaudeMu$pooled], y=wtAndVarMaudeMu$WT[wtAndVarMaudeMu$pooled], alternative="less", paired=TRUE, var.equal = FALSE) wtAndVarMaudeMu$replicate2 = gsub("-HEK","",wtAndVarMaudeMu$replicate)
Make plot of WT vs rs72928038 mean expression levels
meltedSNPMus = melt(as.data.frame(wtAndVarMaudeMu), id.vars=c("pooled","replicate", "replicate2")); meltedSNPMus$genotype = factor(ifelse(meltedSNPMus$variable=="WT", "G", "A (risk)"),levels=c("G","A (risk)")) p = ggplot(meltedSNPMus, aes(x=genotype, y=value, group=replicate)) + geom_point() + geom_line()+facet_grid(. ~ pooled)+theme_bw()+xlab("Genotype") + ylab("Mean expression") + ggtitle(sprintf("P=%f; P=%f", ttestResults$p.value, ttestResults_mixedCells$p.value)); print(p)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.