#
# The script assumes that reference data have been already installed with
#
# http://data.biostarhandbook.com/redo/zika/zika-references.sh
#

# Stop on errors.
set -uex

# Set the project name.
PROJECT=PRJNA313294

# Shortcut to annotations
GTF=~/refs/Homo_sapiens.GRCh38.96.chr.gtf

# Full run information.
RUNINFO=runinfo.all.csv

# Run info for single end reads.
SINGLE=runinfo.single.csv

# Run information for paired end reads.
PAIRED=runinfo.paired.csv

# How many reads per sample to unpack.
READNUM=1000000

# The HiSat2 index name.
# The index was installed with
# http://data.biostarhandbook.com/redo/zika/zika-references.sh
IDX=~/refs/grch38/genome

# Downloading the run information.
esearch -db sra -query $PROJECT | efetch -format runinfo > $RUNINFO

# Separate SRR numbers for the single end files.
cat $RUNINFO | grep  SRR.*SINGLE | cut -f 1 -d , > $SINGLE

# Separate SRR numbers for the paired end files.
cat $RUNINFO | grep  SRR.*PAIRED | cut -f 1 -d , > $PAIRED

# Download and unpack single end reads.
cat $SINGLE  | parallel fastq-dump -X $READNUM -O reads >> log.txt

# Download and unpack paired end reads
cat $PAIRED  | parallel fastq-dump -X $READNUM --split-files -O reads >> log.txt

# Store bam files here.
mkdir -p bam

# How many CPU cores on the system.
CPUS=4

# Run hisat2 on the paired data.
cat $PAIRED | parallel "hisat2 -p $CPUS -x $IDX -1 reads/{}_1.fastq -2 reads/{}_2.fastq 2>> log.txt | samtools sort > bam/{}.bam"

# Run hisat2 on the single end data.
cat $SINGLE | parallel  "hisat2 -p $CPUS -x $IDX -U reads/{}.fastq 2>> log.txt | samtools sort > bam/{}.bam"

# Run a samtools index on all BAM files
ls -1 bam/*.bam | parallel samtools index {}

# These are SRR number of the the control (MOCK) numbers
MOCK=(SRR3191542 SRR3191543 SRR3194428 SRR3194429)

# These are the SRR number of infected (ZIKV) samples.
ZIKV=(SRR3191544 SRR3191545 SRR3194430 SRR3194431)

# We do some extra work to generate bam file names
# such as bam/SRR3191542.bam grouped by condition

# Generate MOCK bam file names.
for SRR in ${MOCK[@]}; do
    MOCK_BAMS+="bam/${SRR}.bam "
done

# Generate ZIKV bam file names.
for SRR in ${ZIKV[@]}; do
    ZIKV_BAMS+="bam/${SRR}.bam "
done

# Count the features over each gene name.
featureCounts -g gene_name -a $GTF -o counts.txt ${MOCK_BAMS} ${ZIKV_BAMS} 2>> log.txt

# Simplify the counts.
cat counts.txt | cut -f 1,7-14 > simple_counts.txt

# Get the script that runs the deseq 1 method.
wget -q -nc  http://data.biostarhandbook.com/rnaseq/code/deseq1.r

# Perform the differential expression study.
cat simple_counts.txt | Rscript deseq1.r 4x4 > results.txt 2>> log.txt

# Download the script to draw the heatmaps (requires the glplots package).
wget -q -nc http://data.biostarhandbook.com/rnaseq/code/draw-heatmap.r

# Generate heatmap from the deseq1 normalized matrix.
cat norm-matrix-deseq1.txt | Rscript draw-heatmap.r > heatmap-norm-matrix-deseq1.pdf  2>> log.txt