knitr::opts_chunk$set(echo = TRUE)
The file Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz
was downloaded from the ARMOR repository at https://github.com/csoneson/ARMOR/blob/master/example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz.
The equivalence classes (eq_classes.txt
) and transcript estimated counts (quant.sf
) were obtained by aligning the paired-ended reads from the ARMOR repository with STAR and checking the transcripts compatible with each alignment via salmon.
The reads were downloaded from https://github.com/csoneson/ARMOR/tree/master/example_data/FASTQ.
# Download the ARMOR github repository git clone https://github.com/csoneson/ARMOR.git # set the base_dir to the downloaded repo: base_dir="~/ARMOR" # input reads: fastq_files=$base_dir/example_data/FASTQ ################################################################################################################# # RUN STAR alignment and Salmon on the alignment obtained from STAR: ################################################################################################################# # fasta file, reference genome (DNA) fasta=$base_dir/example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.dna.chromosome.1.1.10M.fa # gtf file gtf=$base_dir/example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.93.1.1.10M.gtf # make directory for STAR output: mkdir $base_dir/STAR # folder where we will create a genome index: mkdir $base_dir/STAR/genome_index GDIR=$base_dir/STAR/genome_index # Generate Genome index: STAR --runMode genomeGenerate --runThreadN 4 --genomeDir $GDIR \ --genomeFastaFiles $fasta --sjdbGTFfile $gtf --sjdbOverhang 62 ls $GDIR # sjdbOverhang ideally should be the lenght of the reads -1 (our reads are 63 bps). # output directory mkdir $base_dir/STAR/alignment outDir=$base_dir/STAR/alignment # change directory to the output directory: cd $outDir # align reads with STAR: # --quantMode TranscriptomeSAM is essential to obtain the transcript alignments. STAR --runMode alignReads --runThreadN 4 --genomeDir $GDIR \ --readFilesIn <(zcat $fastq_files/SRR1039508_R1.fastq.gz) <(zcat $fastq_files/SRR1039508_R2.fastq.gz) \ --outFileNamePrefix sample1 --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM STAR --runMode alignReads --runThreadN 4 --genomeDir $GDIR \ --readFilesIn <(zcat $fastq_files/SRR1039509_R1.fastq.gz) <(zcat $fastq_files/SRR1039509_R2.fastq.gz) \ --outFileNamePrefix sample2 --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM STAR --runMode alignReads --runThreadN 4 --genomeDir $GDIR \ --readFilesIn <(zcat $fastq_files/SRR1039512_R1.fastq.gz) <(zcat $fastq_files/SRR1039512_R2.fastq.gz) \ --outFileNamePrefix sample3 --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM STAR --runMode alignReads --runThreadN 4 --genomeDir $GDIR \ --readFilesIn <(zcat $fastq_files/SRR1039513_R1.fastq.gz) <(zcat $fastq_files/SRR1039513_R2.fastq.gz) \ --outFileNamePrefix sample4 --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM # use gffread to build a reference transcriptome (fasta format) compatible with the DNA fasta and gtf files used for STAR: gffread -w cDNA.fa -g $fasta $gtf # crete a variable to point at the newly created transcriptome fasta file: cdna=$outDir/cDNA.fa # Use salmon on the transcript alignments to compute the equivalence classes: # --dumpEq is essential to obtain the equivalence classes from salmon. $salmon quant -t $cdna -l A -a sample1Aligned.toTranscriptome.out.bam -o sample1 -p 4 --dumpEq $salmon quant -t $cdna -l A -a sample2Aligned.toTranscriptome.out.bam -o sample2 -p 4 --dumpEq $salmon quant -t $cdna -l A -a sample3Aligned.toTranscriptome.out.bam -o sample3 -p 4 --dumpEq $salmon quant -t $cdna -l A -a sample4Aligned.toTranscriptome.out.bam -o sample4 -p 4 --dumpEq # In the output folders, the `quant.sf` files contain the estimated transcript level abundances, while the equivalence classes (and respective counts) are stored in the `aux_info/eq_classes.txt` files.
The equivalence classes (pseudoalignments.ec
) and respective counts counts (pseudoalignments.tsv
) were obtained by aligning the paired-ended reads from the ARMOR repository with kallisto.
Continue after running the code above.
# create a directory for kallisto mkdir $base_dir/kallisto # create a directory for the genome index mkdir $base_dir/kallisto/kallisto_index idx=$base_dir/kallisto/kallisto_index # create a directory for the output of the alignment mkdir $base_dir/kallisto/alignment out_kallisto_alignment=$base_dir/kallisto/alignment # create a directory for the output of the equivalence classes mkdir $base_dir/kallisto/pseudo out_kallisto_pseudo=$base_dir/kallisto/pseudo #Build kallisto index kallisto index -i $idx/transcripts.idxl $cdna # Align reads and quantify transcript abundance with kallisto: kallisto quant -i $idx/transcripts.idx -o $out_kallisto_alignment/sample1 --bias --threads 4 \ $fastq_files/SRR1039508_R1.fastq.gz $fastq_files/SRR1039508_R2.fastq.gz kallisto quant -i $idx/transcripts.idx -o $out_kallisto_alignment/sample2 --bias --threads 4 \ $fastq_files/SRR1039509_R1.fastq.gz $fastq_files/SRR1039509_R2.fastq.gz kallisto quant -i $idx/transcripts.idx -o $out_kallisto_alignment/sample3 --bias --threads 4 \ $fastq_files/SRR1039512_R1.fastq.gz $fastq_files/SRR1039512_R2.fastq.gz kallisto quant -i $idx/transcripts.idx -o $out_kallisto_alignment/sample4 --bias --threads 4 \ $fastq_files/SRR1039513_R1.fastq.gz $fastq_files/SRR1039513_R2.fastq.gz # In the output folders, the `abundance.tsv` files contain the estimated transcript level abundances. # Compute the equivalence classes and respective counts with kallisto: kallisto pseudo -i $idx/transcripts.idx -o $out_kallisto_pseudo/sample1 \ $fastq_files/SRR1039508_R1.fastq.gz $fastq_files/SRR1039508_R2.fastq.gz kallisto pseudo -i $idx/transcripts.idx -o $out_kallisto_pseudo/sample2 \ $fastq_files/SRR1039509_R1.fastq.gz $fastq_files/SRR1039509_R2.fastq.gz kallisto pseudo -i $idx/transcripts.idx -o $out_kallisto_pseudo/sample3 \ $fastq_files/SRR1039512_R1.fastq.gz $fastq_files/SRR1039512_R2.fastq.gz kallisto pseudo -i $idx/transcripts.idx -o $out_kallisto_pseudo/sample4 \ $fastq_files/SRR1039513_R1.fastq.gz $fastq_files/SRR1039513_R2.fastq.gz # In the output folders, the `pseudoalignments.ec` files contain the transcripts forming each equivalence class, while the `pseudoalignments.tsv` files contain the equivalence classes counts.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.