# Stop on errors. set -uex # The accession number of the genome. ACC=AF086833 # The number of reads to generate. COUNT=10000 # The resulting BAM file. BAM=align.bam # The name of the reference file. REF=refs/${ACC}.fa # The directory for the reference genome. mkdir -p refs # Fetch the reference genome from NCBI. efetch -db nuccore -format fasta -id ${ACC} > $REF # Index the reference genome for bwa. bwa index $REF 2>> runlog.txt # Index the reference genome for IGV. samtools faidx $REF # This will be our "real" genome that we will introduce mutations into. REAL=real.fa # Create the REAL genome that has the mutation. # Extract the first 2000 bases. cat $REF | seqret --filter -sbegin 1 -send 2000 > part1 # Extract the bases from 3000 and on. cat $REF | seqret --filter -sbegin 3000 > part2 # Combine the two pieces as the "real" genome. # The net effect is that we lost the 2000-3000 region from the genome. cat part1 part2 | union -filter > ${REAL} # The read pairs that will hold the simulation results. R1=r1.fq R2=r2.fq # Generate sequencing reads. # We will set other mutation rates to zero so that only our variation is present. wgsim -e 0 -r 0 -R 0 -N $COUNT ${REAL} ${R1} ${R2} > mutations.txt 2>> runlog.txt # Align the simulated reads against the genome. bwa mem ${REF} ${R1} ${R2} 2>> log.txt | samtools sort > ${BAM} # Index the resulting BAM file. samtools index ${BAM} # Generate flag statistics. samtools flagstat ${BAM} > flagstat.txt # # Here are a few other mutations you may want to explore. # Apply these before the wgsim step then visualize the results. # # Experiment 2. Large deletion. Deletion applied from 2000-3000 # cat $REF | seqret --filter -sbegin 1 -send 2000 > part1 # cat $REF | seqret --filter -sbegin 3000 > part2 # cat part1 part2 | union -filter > ${REAL} # Experiment 3. Copy number variation. The first 2000 bases are present three times. # cat $REF | seqret --filter -sbegin 1 -send 2000 > part1 # cat part1 part1 $REF | union -filter > ${REAL} # Experiment 4. Swap the genome. First 5000 bp go last. # cat $REF | seqret --filter -send 5000 > part1 # cat $REF | seqret --filter -sbegin 5000 > part2 # cat part2 part1 | union -filter > ${REAL} # Experiment 5. Reverse complement a region of the genome (1000 to 2000). # cat $REF | seqret --filter -sbegin 1 -send 1000 > part1 # cat $REF | seqret --filter -sbegin 1000 -send 2000 -sreverse1 > part2 # cat $REF | seqret --filter -sbegin 2000 > part3 # cat part1 part2 part3 | union -filter > ${REAL} # Experiment 6. Ten unknown mutations, both point and block mutations. # cat $REF | msbar -filter -count 10 -point 1 -block 1 -minimum 10 -maximum 500 > ${REAL}