# Stop on errors. Show the commands.
set -uex

# Reference genome accession number.
ACC=AF086833

# SRR run number.
SRR=SRR1553500

# The directory to store the reference in.
mkdir -p refs

# The name of the file that stores the reference genome.
REF=refs/$ACC.fa

# The resulting alignment file name.
BAM=$SRR.bam

# Get the reference sequence.
efetch -db=nuccore -format=fasta -id=$ACC | seqret -filter -sid $ACC > $REF

# Index reference for the aligner.
bwa index $REF >> log.txt 2>> log.txt

# Index the reference genome for IGV
samtools faidx $REF

# Get data from an Ebola sequencing run.
fastq-dump -X 100000 --split-files $SRR >> log.txt

# Shortcut to read names.
R1=${SRR}_1.fastq
R2=${SRR}_2.fastq

# Tag the alignments. GATK will only work when reads are tagged.
TAG="@RG\tID:$SRR\tSM:$SRR\tLB:$SRR"

# Align and generate a BAM file.
bwa mem -R $TAG $REF $R1 $R2 2>> log.txt | samtools sort > $BAM 2>> log.txt

# Index the BAM file.
samtools index $BAM

# Determine the genotype likelihoods for each base.
bcftools mpileup -Ovu -f $REF $BAM > genotypes.vcf

# Call variants with bcftools.
bcftools call -vm -Ov genotypes.vcf > variants1.vcf 2>> log.txt

# Call variants with freebayes.
freebayes --ploidy 1 -f $REF $BAM > variants2.vcf

# Call variants with GATK. See the book for GATK installation.
# These commands are turned off in this example.

# It needs yet another so called Dictionary index.
# picard CreateSequenceDictionary REFERENCE=$REF  OUTPUT=refs/$ACC.dict

# Generate the variants.
#gatk HaplotypeCaller -R $REF -I $BAM -O variants3.vcf