knitr::opts_chunk$set(echo = TRUE)
Load the data.
library(ExperimentHub) ehub <- ExperimentHub() EH1 <- ehub[["EH1"]] library(SummarizedExperiment) EH1 <- as(EH1, "SummarizedExperiment") EH1
Tabulate numbers of known male and female subjects.
table(EH1$gender)
Normalize in CPM.
libSize <- colSums(assay(EH1, "exprs")) assay(EH1, "CPM") <- t(t(assay(EH1, "exprs")) / libSize) * 1E6
Define a signature including all genes on the Y chromosome.
library(EnsDb.Hsapiens.v86) columns(EnsDb.Hsapiens.v86) Ygenes <- unique(select(EnsDb.Hsapiens.v86, keys = "Y", "SYMBOL", "SEQNAME")) Ygenes <- Ygenes$SYMBOL table(Ygenes %in% rownames(EH1))
Let us use only the Y chromosome genes that are present in the data set.
Ygenes <- intersect(Ygenes, rownames(EH1))
library(GSEABase) pbmc3k_markers_gsc <- GeneSetCollection(list( GeneColorSet( setName="Y chromosome expression in male", Ygenes, phenotype=c("Male"), geneColor=factor(rep(TRUE, length(Ygenes))), phenotypeColor=factor(rep(TRUE, length(Ygenes))) ), GeneColorSet( setName="Y chromosome expression in female", Ygenes, phenotype=c("Female"), geneColor=factor(rep(FALSE, length(Ygenes))), phenotypeColor=factor(rep(FALSE, length(Ygenes))) ) )) pbmc3k_markers_gsc
Aggregate counts across all genes on the Y chromosome
yCPM <- colSums(assay(EH1, "CPM")[Ygenes, ])
# Cluster on the pairwise distance between total Y-chromosome CPM. hc <- hclust(dist(yCPM)) plot(hc, labels=FALSE)
K-mean clustering with k=2 (expected number of genders)
km <- kmeans(matrix(yCPM, ncol = 1), 2)
Test if the total Y-CPM of the two groups is significantly different
testDataFrame <- data.frame(yCPM=yCPM, kmean=km$cluster) ttOut <- t.test(yCPM ~ kmean, testDataFrame) if (ttOut$p.value < 1E-3) { # there are both genders clusterMeans <- with(testDataFrame, tapply(yCPM, kmean, "mean")) clusterFemale <- which.min(clusterMeans) clusterMale <- which.max(clusterMeans) # Annotate known gender testDataFrame[colnames(EH1), "gender_annotated"] <- EH1$gender # Annotate predicted gender testDataFrame$gender_predicted <- factor(c("MALE", "FEMALE")[c(clusterMale, clusterFemale)][testDataFrame$kmean]) # return the group with the highest km$centers as male # return the group with the highest km$centers as female } else { # there is only one gender # we can only tell which gender if we're given an idea of what either "low" or "high" means } # Tabulate the predictions against the annotations with(testDataFrame, table(gender_annotated, gender_predicted))
A large proportion of male subjects are classified as female. How does the distribution of summed CPM look between genders?
yCpmDataFrame <- data.frame( yCPM=yCPM, gender=colData(EH1[, names(yCPM)])[, "gender"] ) require(ggplot2) require(cowplot) ggplot(yCpmDataFrame, aes(gender, yCPM)) + geom_point()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.