# Show the commands as are executed.
set -uex

# Install bio with:
# pip install bio --upgrade

# Create a directory to store the reference and all indices.
mkdir -p refs

# The accession number.
ACC=AF086833

# The reference genome we align to.
REF=refs/REFERENCE.fa

# The "real" genome that is different from the reference.
GENOME=GENOME.fa

# The read alignment for the simulated reads.
RESULTS=results.bam

# The semi-global alignment of the genome to the reference.
REALITY=reality.bam

# Error rate for the simulation.
ERR=0.05

# How many reads to generate.
N=2500

# Obtain the data save the first 3kb of it as our reference FASTA.
bio fetch -format fasta $ACC | bio fasta --end 3kb > $REF

# We create the modified genome here. 
# We are replacing a sequence pattern with a different version.
# You are welcome to experiment with your changes.
MATCHED=AGGGTGGACAACAGAAGAACA
REPLACE=AGGGTGGACAACAGAGAA

# Modify the genome.
cat $REF | biosed -filter -target $MATCHED -replace $REPLACE > $GENOME

# Show the replacement
bio align $MATCHED $REPLACE -matrix NUC.4.4

# Create a genome wide alignment of the refence vs the genome.
# This is only possilble because we "know" what the actual genome is.
minimap2 -a $REF $GENOME | samtools sort > $REALITY

# Read names
R1=read1.fq
R2=read2.fq

# Generate simulated reads with no other mutations.
wgsim -N $N -e $ERR -r 0 $GENOME $R1 $R2

# Index the reference genome with bwa index.
bwa index $REF

# READ tagging means adding more information into each alignment.
TAG="@RG\tID:variants\tSM:${ACC}\tLB:simulation"

# Align the reads and generate athe BAM file.
bwa mem -R $TAG $REF $R1 $R2 | samtools sort > $RESULTS

# Create the indexes for the BAM files.
samtools index $RESULTS
samtools index $REALITY

# Variant call with bcftools.
bcftools mpileup -Ov -f $REF $RESULTS | bcftools call -mv -Ov -o bcftools.vcf

# Variant call with freebayes. Filter at 0.1 pval.
freebayes -P 0.1 -f $REF $RESULTS > freebayes.vcf