# Stop on errors. Show all commands.
set -uxeo pipefail

# Set the reference genome accession number.
ACC=AF086833

# Set the SRR run number.
SRR=SRR1972739

# Directory for various reference information.
mkdir -p refs

# The reference genome in GENBANK format.
GB=refs/$ACC.gb

# The reference genome in FASTA format.
REF=refs/$ACC.fa

# The reference genome in GFF (Gene Feature Format).
GFF=refs/$ACC.gff

# Get the reference sequence in GenBank format.
efetch -db=nuccore -format=gb -id=$ACC > $GB

# Reformat GenBank to FASTA.
cat $GB | readseq -p -f fasta > $REF 2>> log.txt

# Reformat GenBank to GFF. Keep only lines that match CDS.
cat $GB | readseq -p -f gff 2>> log.txt | grep CDS > $GFF 

# Index reference for the bwa aligner.
bwa index $REF 2>> log.txt

# Index the reference genome so that it can be loaded into IGV.
samtools faidx $REF

# The directory that stores the sequencing reads
mkdir -p reads

# Get sequencing data from a SRR number.
fastq-dump -X 10000 --split-files -O reads $SRR

# Files names that store read pairs
R1=reads/${SRR}_1.fastq
R2=reads/${SRR}_2.fastq

# The alignment file name.
BAM=$SRR.bam

# The variant file name.
VCF=$SRR.vcf

# Align and generate a sorted BAM file.
bwa mem $REF $R1 $R2 2>> log.txt | samtools sort > $BAM

# Index the BAM file.
samtools index $BAM

# Call snps with bcftools
bcftools mpileup -Ou -f $REF $BAM 2>> log.txt | bcftools call -mv -Ov -o $VCF 2>> log.txt