# Stop on errors. set -uexo pipefail # UNIX trick: Adding the -x flag will also show the commands as these are executed. # This can help you track down the errors as these happen. # There will be many ancillary files. # Better keep them out of sight. mkdir -p db # The Genome Accesion Number ACC=AF086833 # This will be the name of the file that holds the reference genome that we align against. REF=db/${ACC}.fa # It is best to specify all variables at the top ot the program. # That way it is easier to see their usage. # The read pairs that will hold the simulation results. READ1=pair1.fq READ2=pair2.fq # Fetch the reference genome from NCBI. efetch -db=nuccore -format=fasta -id=${ACC} > $REF # Index the reference genome. This needs to be done only once. bwa index $REF 2>> log.txt # Create annother index file for the reference #so that we can load it up with IGV samtools faidx $REF # Unix trick: bash functions allow repeating batch a commands repeatedly. # This function construct below will run when called by its name. # The first parameter will be assigned to $1 and so on. run_experiment() { # This will be the first parameter assigned to the function. BAM=$1 # Run the simulation using the GENOME.fa file. wgsim -e 0 -r 0 -R 0 -X 0 -N 2500 GENOME.fa ${READ1} ${READ2} >> simulation.txt 2>> log.txt # Align the simulated reads against the genome. bwa mem ${REF} ${READ1} ${READ2} 2>> log.txt | samtools sort > ${BAM} # Index the resulting BAM file. samtools index ${BAM} } # Experiment 1. Take the genome to be exactly the reference. cp $REF GENOME.fa run_experiment align1.bam # Experiment 2. Large deletion. Deletion applied from 2000-3000 cat $REF | seqret --filter -sbegin 1 -send 2000 > part1 cat $REF | seqret --filter -sbegin 3000 > part2 cat part1 part2 | union -filter > GENOME.fa run_experiment align2.bam # Experiment 3. Copy number variation. The first 2000 bases are present three times. cat $REF | seqret --filter -sbegin 1 -send 2000 > part1 cat part1 part1 $REF | union -filter > GENOME.fa run_experiment align3.bam # Experiment 4. Swap the genome. First 5000 bp go last. cat $REF | seqret --filter -send 5000 > part1 cat $REF | seqret --filter -sbegin 5000 > part2 cat part2 part1 | union -filter > GENOME.fa run_experiment align4.bam # Experiment 5. Reverse complement a region of the genome (1000 to 2000). cat $REF | seqret --filter -sbegin 1 -send 1000 > part1 cat $REF | seqret --filter -sbegin 1000 -send 2000 -sreverse1 > part2 cat $REF | seqret --filter -sbegin 2000 > part3 cat part1 part2 part3 | union -filter > GENOME.fa run_experiment align5.bam # Experiment 6. Ten unknown mutations, both point and block mutations. # How many can your identify? cat $REF | msbar -filter -count 10 -point 1 -block 1 -minimum 10 -maximum 500 > GENOME.fa run_experiment align6.bam # Remove temporary files rm -f part?