# Stop on errors. Show all commands.
set -eux 

# Install bio if you don't have it.
#pip install bio --upgrade

# Substitute reference genome accession number.
ACC=AF086833

# Substitute the SRR run number.
SRR=SRR1972739

# The alignment file name.
BAM=${SRR}.bam

# All genotype likelihoods.
ALL=${SRR}.all.vcf

# The variants selected from all genotypes.
VCF=${SRR}.variants.vcf

# The reference genome in FASTA format.
REF=refs/${ACC}.fa

# The reference genome in GFF format.
GFF=refs/${ACC}.gff

#
# Stage 1: Obtain the data
#

# Directory for various reference information.
mkdir -p refs

# Obtain the reference in FASTA format.
bio fetch -format fasta $ACC > $REF

# Obtain the reference in GFF format.
bio fetch -format gff $ACC > $GFF

# Index reference for the bwa aligner.
bwa index $REF

# Index the reference genome so that it can be loaded into IGV.
samtools faidx $REF

# The directory that stores the sequencing reads
mkdir -p reads

# Get sequencing data from a SRR number.
# Use fastq-dump-orig if you get an error
fastq-dump -X 10000 --split-files -O reads $SRR

# The paired end read names.
R1=reads/${SRR}_1.fastq
R2=reads/${SRR}_2.fastq

#
# Stage 2: aligning and calling variants.
#

# Align and generate a sorted BAM file.
bwa mem $REF $R1 $R2 | samtools sort > $BAM

# Index the BAM file.
samtools index $BAM

# Compute all genotypes with bcftools.
bcftools mpileup -O v -f $REF $BAM > $ALL

# From all genotypes call only the variants.
cat $ALL | bcftools call --ploidy 1 -mv -O v -o $VCF 

# We could run both steps in one stage by pipe the two into one call like so.
#bcftools mpileup -O v -f $REF $BAM | bcftools call --ploidy 1 -mv -O v -o $VCF