In drisso/scRNAseq: Collection of Public Single-Cell RNA-Seq Datasets

library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the count data

We obtain a single-cell RNA sequencing dataset of the mouse and human midbrains from @lamanno2016molecular. Counts for cells from various developmental stages in both species are available from the Gene Expression Omnibus using the accession number GSE76381. We download and cache it using the r Biocpkg("BiocFileCache") package.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
base.url <- file.path("ftp://ftp.ncbi.nlm.nih.gov/geo/series",
    "GSE76nnn/GSE76381/suppl")

es.count.file <- bfcrpath(bfc, file.path(base.url,
    "GSE76381_ESMoleculeCounts.cef.txt.gz"))
embryo.count.file <- bfcrpath(bfc, file.path(base.url,
    "GSE76381_EmbryoMoleculeCounts.cef.txt.gz"))
ips.count.file <- bfcrpath(bfc, file.path(base.url,
    "GSE76381_iPSMoleculeCounts.cef.txt.gz"))

madult.count.file <- bfcrpath(bfc, file.path(base.url,
    "GSE76381_MouseAdultDAMoleculeCounts.cef.txt.gz"))
membryo.count.file <- bfcrpath(bfc, file.path(base.url,
    "GSE76381_MouseEmbryoMoleculeCounts.cef.txt.gz"))

Reading the data in

We create a function to extract data from each file.

library(S4Vectors)
FUN <- function(path, as.csv=FALSE, skip=0) {
    if (as.csv) {
        FUN <- read.csv
    } else {
        FUN <- read.delim
    }

    x <- FUN(path, header=FALSE, stringsAsFactors=FALSE, skip=skip)
    is.gene <- which(x[,1]=="Gene")

    metadata <- t(x[2:(is.gene-1L),-(1:2)])
    df <- data.frame(metadata, stringsAsFactors=FALSE)
    df <- DataFrame(df)
    colnames(df) <- x[2:(is.gene-1L),2]
    rownames(df) <- NULL

    counts <- as.matrix(x[-(1:(is.gene+1L)),])
    rownames(counts) <- counts[,1]
    colnames(counts) <- NULL

    # checking that second column has nothing interesting.
    stopifnot(length(unique(counts[,2]))==1L) 
    counts <- counts[,-(1:2)]
    storage.mode(counts) <- "integer"

    list(counts=counts, coldata=df)
}

We run this on all the human datasets:

es.data <- FUN(es.count.file)
dim(es.data$counts)
es.data$coldata
embryo.data <- FUN(embryo.count.file)
dim(embryo.data$counts)
embryo.data$coldata
ips.data <- FUN(ips.count.file)
dim(ips.data$counts)
ips.data$coldata

We repeat the process for the mouse data.

madult.data <- FUN(madult.count.file, as.csv=TRUE, skip=1)
dim(madult.data$counts)
madult.data$coldata
membryo.data <- FUN(membryo.count.file, skip=1)
dim(membryo.data$counts)
membryo.data$coldata

Saving objects

Rather frustratingly, each of the stages has a different set of genes, so we need to save them separately. We set up a simple function do to so:

path <- file.path("scRNAseq", "lamanno-brain", "2.0.0")
SAVEFUN <- function(input, suffix) {
    dir.create(path, showWarnings=FALSE, recursive=TRUE)
    saveRDS(input$counts, file=file.path(path, 
        sprintf("counts-%s.rds", suffix)))
    saveRDS(input$coldata, file=file.path(path, 
        sprintf("coldata-%s.rds", suffix)))
}

We save all of the relevant components to file for upload to r Biocpkg("ExperimentHub").

SAVEFUN(es.data, "human-es")
SAVEFUN(embryo.data, "human-embryo")
SAVEFUN(ips.data, "human-ips")
SAVEFUN(madult.data, "mouse-adult")
SAVEFUN(membryo.data, "mouse-embryo")