# Show the commands as are executed. set -uex # Install bio with: # pip install bio --upgrade # Create a directory to store the reference and all indices. mkdir -p refs # The accession number. ACC=AF086833 # The reference genome we align to. REF=refs/REFERENCE.fa # The "real" genome that is different from the reference. GENOME=GENOME.fa # The read alignment for the simulated reads. RESULTS=results.bam # The semi-global alignment of the genome to the reference. REALITY=reality.bam # Error rate for the simulation. ERR=0.05 # How many reads to generate. N=2500 # Obtain the data save the first 3kb of it as our reference FASTA. bio fetch -format fasta $ACC | bio fasta --end 3kb > $REF # We create the modified genome here. # We are replacing a sequence pattern with a different version. # You are welcome to experiment with your changes. MATCHED=AGGGTGGACAACAGAAGAACA REPLACE=AGGGTGGACAACAGAGAA # Modify the genome. cat $REF | biosed -filter -target $MATCHED -replace $REPLACE > $GENOME # Show the replacement bio align $MATCHED $REPLACE -matrix NUC.4.4 # Create a genome wide alignment of the refence vs the genome. # This is only possilble because we "know" what the actual genome is. minimap2 -a $REF $GENOME | samtools sort > $REALITY # Read names R1=read1.fq R2=read2.fq # Generate simulated reads with no other mutations. wgsim -N $N -e $ERR -r 0 $GENOME $R1 $R2 # Index the reference genome with bwa index. bwa index $REF # READ tagging means adding more information into each alignment. TAG="@RG\tID:variants\tSM:${ACC}\tLB:simulation" # Align the reads and generate athe BAM file. bwa mem -R $TAG $REF $R1 $R2 | samtools sort > $RESULTS # Create the indexes for the BAM files. samtools index $RESULTS samtools index $REALITY # Variant call with bcftools. bcftools mpileup -Ov -f $REF $RESULTS | bcftools call -mv -Ov -o bcftools.vcf # Variant call with freebayes. Filter at 0.1 pval. freebayes -P 0.1 -f $REF $RESULTS > freebayes.vcf