# Stop on any error. Show the commands.
set -uex

# NCBI Bioproject ID.
ID=PRJNA257197

# The number of samples to process.
N=5

# NCBI accession number for the reference genome.
ACC=KJ660346

# The reference data in GENBANK format.
GENBANK=refs/${ACC}.gb

# The reference data in FASTA format.
FASTA=refs/${ACC}.fa

# The reference data in GFF format.
GFF=refs/${ACC}.gff

# The directory that stores the reference files.
mkdir -p refs

# Get the reference sequence in GenBank format.
efetch -db=nuccore -format=gb -id=$ACC > $GENBANK

# Reformat GenBank to FASTA.
cat $GENBANK | readseq -p -f fasta > $FASTA 2>> log.txt

# Reformat GenBank to GFF. Keep only lines that match CDS.
cat $GENBANK | readseq -p -f gff 2>> log.txt | grep CDS > $GFF

# Index the FASTA file for bwa.
bwa index $FASTA 2>> log.txt

# Index the FASTA file for IGV.
samtools faidx $FASTA >> log.txt

# Obtain the SRR run number for the data deposited into the BioProject
esearch -db sra -query $ID | efetch -format runinfo > runinfo.csv

# Keep the top N SRR numbers for now.
cat runinfo.csv  | cut -f 1 -d , | grep SRR | head -$N  > samples.txt

# Make a directory for the sequencing data.
mkdir -p reads

# Limit each dataset to these many reads.
LIMIT=100000

# Download the sequencing data in parallel.
cat samples.txt | parallel "fastq-dump -X $LIMIT -O reads --split-files {} >> log.txt"

# Generate alignments
mkdir -p bam

# Run the bwa mem processes in parallel.
# Put an echo in front of the command to understand what it does.
cat samples.txt | parallel "bwa mem $FASTA reads/{}_1.fastq reads/{}_2.fastq 2>> log.txt | samtools sort > bam/{}.bam 2>> log.txt"

# Unix trick: once your commands become more advances redirecting inputs can be
# a bit challenging. Here we want to run samples in parallel, redirect each parallel process into
# its own file while collecting the overall messages into the log.txt file.

# Index the BAM files.
cat samples.txt | parallel "samtools index bam/{}.bam 2>> log.txt"

# Call variants on all BAM files in parallel.
bcftools mpileup -Ou -f $FASTA  bam/*.bam 2>> log.txt | bcftools call --ploidy 1 -vm -Ou |  bcftools norm -Ov -f $FASTA -d all - > all_variants.vcf

# Annotate the variants with SNPeff.
cat all_variants.vcf | snpEff ebola_zaire -s all_effects_report.html > all_variants_annotated.vcf