# Stop on errors. Print commands to standard output. set -uex # Download and unpack the data. # # We have already organized into the reads/refs folders. # URL=http://data.biostarhandbook.com/rnaseq/projects/griffith/griffith-data.tar.gz # The file that collects the program outputs. LOG=runlog.txt # Download and unpack the data on the fly. curl -s $URL | tar xz # # This script will download a subset of data for the Griffith Experiment. # # The archive contains sequenced reads of mRNA from biological samples # mixed with a spike-in control (ERCC92) of known abundances. # # The archive also contains the genome for chromosome 22 and the known sequence # of the spike-in controls. # # # See: https://www.biostarhandbook.com/rnaseq/rnaseq-griffith-data.html # # # Limit the reference genome to chromosome 22 of the human genome. REF=refs/22.fa # The root name for the indices. IDX=refs/22.fa # Annotations for chromosome 22 of human genome. GTF=refs/22.gtf # Build the indices for both the spike-in genome and chromosome 22. hisat2-build $REF $IDX 1>>$LOG 2>> $LOG # Build the genome indices for IGV. samtools faidx $REF # Output folder for bam files. BAMDIR=xbam # Create output folder for bam files. mkdir -p $BAMDIR # Note: the flag below is -1 (minus 1) not little L. # This -1 flag lists the file names with no other information. # Align against human genome. ls -1 reads/*.fq | parallel -N 2 "hisat2 $IDX -1 {1} -2 {2} 2>> $LOG" '|' "samtools sort 2>>$LOG" '>' "$BAMDIR/{1/.}.bam 2>> $LOG" # Generate the index for each bam file. ls -1 $BAMDIR/*bam | parallel "samtools index {} 2>> $LOG" # Count features relative to the human annotations. # This tool "decides" what type of counts are produced: gene level, transcript level # The "gene_name" is an attribute set inside the GTF file. You may use other attributes inside of GTF. featureCounts -a $GTF -g gene_name -o counts_full.txt $BAMDIR/U*.bam $BAMDIR/H*.bam 1>> $LOG 2>> $LOG # This directory will store the R script that analyze the counts. mkdir -p code # # The reecipe will now run three different differential expression (DE) methods # on the resulting counts table. # For each analysis it will create a DE results table and a heatmap. # # Download each script into the code directory) (cd code && curl -sO http://data.biostarhandbook.com/rnaseq/code/deseq1.r) (cd code && curl -sO http://data.biostarhandbook.com/rnaseq/code/deseq2.r) (cd code && curl -sO http://data.biostarhandbook.com/rnaseq/code/edger.r) (cd code && curl -sO http://data.biostarhandbook.com/rnaseq/code/draw-heatmap.r) # Simplify the counts file to keep only the count columns. cat counts_full.txt | cut -f 1,7-12 > counts_simplified.txt # # Your R system needs to be set up to know about the tools below. # This setup needs to be done once. See the chapter: # https://www.biostarhandbook.com/rnaseq/rnaseq-bioconductor.html # # Run the DESeq1 method on the simplified count file. cat counts_simplified.txt | Rscript code/deseq1.r 3x3 > counts_DE_deseq1.results.txt 2> $LOG cat norm-matrix-deseq1.txt | Rscript code/draw-heatmap.r > counts_DE_deseq1.heatmap.pdf 2>> $LOG # Run the DESeq2 method on the simplified count file. cat counts_simplified.txt| Rscript code/deseq2.r 3x3 > counts_DE_deseq2.results.txt 2>> $LOG cat norm-matrix-deseq2.txt | Rscript code/draw-heatmap.r > counts_DE_deseq2.heatmap.pdf 2>> $LOG # Run the EdgeR method on the simplified count file. cat counts_simplified.txt | Rscript code/edger.r 3x3 > counts_DE_edger.results.txt 2>> $LOG cat norm-matrix-edgeR.txt | Rscript code/draw-heatmap.r > counts_DE_edger.heatmap.pdf 2>> $LOG