# Stop on errors.
set -uexo pipefail

# UNIX trick: Adding the -x flag will also show the commands as these are executed.
# This can help you track down the errors as these happen.

# There will be many ancillary files.
# Better keep them out of sight.
mkdir -p db

# The Genome Accesion Number
ACC=AF086833

# This will be the name of the file that holds the reference genome that we align against.
REF=db/${ACC}.fa

# It is best to specify all variables at the top ot the program.
# That way it is easier to see their usage.

# The read pairs that will hold the simulation results.
READ1=pair1.fq
READ2=pair2.fq

# Fetch the reference genome from NCBI.
efetch -db=nuccore -format=fasta -id=${ACC} > $REF

# Index the reference genome. This needs to be done only once.
bwa index $REF 2>> log.txt

# Create annother index file for the reference 
#so that we can load it up with IGV
samtools faidx $REF

# Unix trick: bash functions allow repeating batch a commands repeatedly.
# This function construct below will run when called by its name.
# The first parameter will be assigned to $1 and so on.
run_experiment()
{
    # This will be the first parameter assigned to the function.
    BAM=$1
  
    # Run the simulation using the GENOME.fa file.
    wgsim -e 0 -r 0 -R 0 -X 0 -N 2500 GENOME.fa ${READ1} ${READ2} >> simulation.txt 2>> log.txt
  
    # Align the simulated reads against the genome.
    bwa mem ${REF} ${READ1} ${READ2} 2>> log.txt | samtools sort > ${BAM}
  
    # Index the resulting BAM file.
    samtools index ${BAM}
}

# Experiment 1. Take the genome to be exactly the reference.
cp $REF GENOME.fa
run_experiment align1.bam

# Experiment 2. Large deletion. Deletion applied from 2000-3000
cat $REF | seqret --filter -sbegin 1 -send 2000 > part1
cat $REF | seqret --filter -sbegin 3000 > part2
cat part1 part2 | union -filter  > GENOME.fa
run_experiment align2.bam

# Experiment 3. Copy number variation. The first 2000 bases are present three times.
cat $REF | seqret --filter -sbegin 1 -send 2000 > part1
cat part1 part1 $REF | union -filter  > GENOME.fa
run_experiment align3.bam

# Experiment 4. Swap the genome. First 5000 bp  go last.
cat $REF | seqret --filter -send 5000 > part1
cat $REF  | seqret --filter -sbegin 5000 > part2
cat part2 part1 | union -filter  > GENOME.fa
run_experiment align4.bam

# Experiment 5. Reverse complement a region of the genome (1000 to 2000).
cat $REF | seqret --filter -sbegin 1 -send 1000 > part1
cat $REF | seqret --filter -sbegin 1000 -send 2000 -sreverse1 > part2
cat $REF | seqret --filter -sbegin 2000 > part3
cat part1 part2 part3 | union -filter  > GENOME.fa
run_experiment align5.bam

# Experiment 6. Ten unknown mutations, both point and block mutations.
# How many can your identify?
cat $REF | msbar -filter -count 10 -point 1 -block 1 -minimum 10 -maximum 500 > GENOME.fa
run_experiment align6.bam

# Remove temporary files
rm -f part?