# Stop on errors. Print commands to standard output.
set -uex

# Download and unpack the data.
#
# We have already organized into the reads/refs folders.
#
URL=http://data.biostarhandbook.com/rnaseq/projects/griffith/griffith-data.tar.gz

# The file that collects the program outputs.
LOG=runlog.txt

# Download and unpack the data on the fly.
curl -s $URL | tar xz

#
# This script will download a subset of data for the Griffith Experiment.
#
# The archive contains sequenced reads of mRNA from biological samples
# mixed with a spike-in control (ERCC92) of known abundances.
#
# The archive also contains the genome for chromosome 22 and the known sequence
# of the spike-in controls.
#
#
# See: https://www.biostarhandbook.com/rnaseq/rnaseq-griffith-data.html
#
#

# Limit the reference genome to chromosome 22 of the human genome.
REF=refs/22.fa

# The root name for the indices.
IDX=refs/22.fa

# Annotations for chromosome 22 of human genome.
GTF=refs/22.gtf

# Build the indices for both the spike-in genome and chromosome 22.
hisat2-build $REF $IDX 1>>$LOG 2>> $LOG

# Build the genome indices for IGV.
samtools faidx $REF


# Output folder for bam files.
BAMDIR=xbam

# Create output folder for bam files.
mkdir -p $BAMDIR

# Note: the flag below is -1 (minus 1) not little L.
# This -1 flag lists the file names with no other information.

# Align against human genome.
ls -1 reads/*.fq | parallel -N 2 "hisat2 $IDX -1 {1} -2 {2} 2>> $LOG" '|' "samtools sort 2>>$LOG"  '>' "$BAMDIR/{1/.}.bam 2>> $LOG"

# Generate the index for each bam file.
ls -1 $BAMDIR/*bam | parallel "samtools index {} 2>> $LOG"

# Count features relative to the human annotations.
# This tool "decides" what type of counts are produced: gene level, transcript level
# The "gene_name" is an attribute set inside the GTF file.  You may use other attributes inside of GTF.
featureCounts -a $GTF -g gene_name -o counts_full.txt  $BAMDIR/U*.bam  $BAMDIR/H*.bam 1>> $LOG 2>> $LOG

# This directory will store the R script that analyze the counts.
mkdir -p code

#
# The reecipe will now run three different differential expression (DE) methods
# on the resulting counts table.
# For each analysis it will create a DE results table and a heatmap.
#

# Download each script into the code directory)
(cd code && curl -sO http://data.biostarhandbook.com/rnaseq/code/deseq1.r)
(cd code && curl -sO http://data.biostarhandbook.com/rnaseq/code/deseq2.r)
(cd code && curl -sO http://data.biostarhandbook.com/rnaseq/code/edger.r)
(cd code && curl -sO http://data.biostarhandbook.com/rnaseq/code/draw-heatmap.r)

# Simplify the counts file to keep only the count columns.
cat counts_full.txt | cut -f 1,7-12 > counts_simplified.txt

#
# Your R system needs to be set up to know about the tools below.
# This setup needs to be done once. See the chapter:
# https://www.biostarhandbook.com/rnaseq/rnaseq-bioconductor.html
#

# Run the DESeq1 method on the simplified count file.
cat counts_simplified.txt | Rscript code/deseq1.r 3x3 > counts_DE_deseq1.results.txt  2> $LOG
cat norm-matrix-deseq1.txt | Rscript code/draw-heatmap.r > counts_DE_deseq1.heatmap.pdf 2>> $LOG

# Run the DESeq2 method on the simplified count file.
cat counts_simplified.txt| Rscript code/deseq2.r 3x3 > counts_DE_deseq2.results.txt  2>> $LOG
cat norm-matrix-deseq2.txt | Rscript code/draw-heatmap.r > counts_DE_deseq2.heatmap.pdf 2>> $LOG

# Run the EdgeR method on the simplified count file.
cat counts_simplified.txt | Rscript code/edger.r 3x3 > counts_DE_edger.results.txt  2>> $LOG
cat norm-matrix-edgeR.txt | Rscript code/draw-heatmap.r > counts_DE_edger.heatmap.pdf 2>> $LOG