knitr::opts_chunk$set(echo = TRUE)
Load the data.
library(airway) data(airway) airway
Normalize in CPM.
libSize <- colSums(assay(airway, "counts")) assay(airway, "CPM") <- t(t(assay(airway, "counts")) / libSize) * 1E6
Define a signature including all genes on the Y chromosome.
Ygenes <- unique(names(which(table(seqnames(airway) == "Y")[, "TRUE"] > 0))) library(GSEABase) pbmc3k_markers_gsc <- GeneSetCollection(list( GeneColorSet( setName="Y chromosome expression in male", Ygenes, phenotype=c("Male"), geneColor=factor(rep(TRUE, length(Ygenes))), phenotypeColor=factor(rep(TRUE, length(Ygenes))) ), GeneColorSet( setName="Y chromosome expression in female", Ygenes, phenotype=c("Female"), geneColor=factor(rep(FALSE, length(Ygenes))), phenotypeColor=factor(rep(FALSE, length(Ygenes))) ) )) pbmc3k_markers_gsc
Aggregate counts across all genes on the Y chromosome
yCPM <- colSums(assay(airway, "CPM")[Ygenes, ])
# Cluster on the pairwise distance between total Y-chromosome CPM. hc <- hclust(dist(yCPM)) plot(hc)
K-mean clustering with k=2 (expected number of genders)
km <- kmeans(matrix(yCPM, ncol = 1), 2)
Test if the total Y-CPM of the two groups is significantly different
testDataFrame <- data.frame(yCPM=yCPM, kmean=km$cluster) ttOut <- t.test(yCPM ~ kmean, testDataFrame) if (ttOut$p.value < 1E-3) { # there are both genders # return the group with the highest km$centers as male # return the group with the highest km$centers as female } else { # there is only one gender # we can only tell which gender if we're given an idea of what either "low" or "high" means }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.