# Print the commands as are executed.
# Stop on errors.
set -uex

# The URL the data is located at.
URL=http://data.biostarhandbook.com/rnaseq/projects/griffith/griffith-data.tar.gz

# Obtain and download the data
wget -q -nc $URL

# Unpack the dataset.
tar zxvf griffith-data.tar.gz

# This is the reference genome.
REF=refs/ERCC92.fa

# Represents the annotations.
GTF=refs/ERCC92.gtf

# Build a hisat2 index for the genome.
hisat2-build $REF $REF 2>> log.txt 1>> log.txt

# Generate the root names for the files.
parallel echo {1}_{2} ::: UHR HBR ::: 1 2 3 > names.txt

# Make a directory for BAM files.
mkdir -p bam

# Run the alignment per each root name.
cat names.txt | parallel "hisat2 $REF -1 reads/{}_R1.fq -2 reads/{}_R2.fq | samtools sort > bam/{}.bam" 2>> stats.txt

# Count the features that overlap annotations.
featureCounts -a $GTF -g gene_name -o counts.txt  bam/*.bam 2>> log.txt

# Get the script that runs the deseq 1 method.
wget -q -nc  http://data.biostarhandbook.com/rnaseq/code/deseq1.r

# Simplify the count matrix to only contain gene names and read counts.
cat counts.txt | cut -f 1,7-12 > simple_counts.txt

# Perform the differential expression study.
cat simple_counts.txt | Rscript deseq1.r 3x3 > results.txt 2>> log.txt

# Download the script to draw the heatmaps (requires the glplots package).
wget -q -nc http://data.biostarhandbook.com/rnaseq/code/draw-heatmap.r

# Generate heatmap from the deseq1 normalized matrix.
cat norm-matrix-deseq1.txt | Rscript draw-heatmap.r > heatmap-norm-matrix-deseq1.pdf  2>> log.txt