# Stop on errors. Show all commands. set -uxeo pipefail # Set the reference genome accession number. ACC=AF086833 # Set the SRR run number. SRR=SRR1972739 # Directory for various reference information. mkdir -p refs # The reference genome in GENBANK format. GB=refs/$ACC.gb # The reference genome in FASTA format. REF=refs/$ACC.fa # The reference genome in GFF (Gene Feature Format). GFF=refs/$ACC.gff # Get the reference sequence in GenBank format. efetch -db=nuccore -format=gb -id=$ACC > $GB # Reformat GenBank to FASTA. cat $GB | readseq -p -f fasta > $REF 2>> log.txt # Reformat GenBank to GFF. Keep only lines that match CDS. cat $GB | readseq -p -f gff 2>> log.txt | grep CDS > $GFF # Index reference for the bwa aligner. bwa index $REF 2>> log.txt # Index the reference genome so that it can be loaded into IGV. samtools faidx $REF # The directory that stores the sequencing reads mkdir -p reads # Get sequencing data from a SRR number. fastq-dump -X 10000 --split-files -O reads $SRR # Files names that store read pairs R1=reads/${SRR}_1.fastq R2=reads/${SRR}_2.fastq # The alignment file name. BAM=$SRR.bam # The variant file name. VCF=$SRR.vcf # Align and generate a sorted BAM file. bwa mem $REF $R1 $R2 2>> log.txt | samtools sort > $BAM # Index the BAM file. samtools index $BAM # Call snps with bcftools bcftools mpileup -Ou -f $REF $BAM 2>> log.txt | bcftools call -mv -Ov -o $VCF 2>> log.txt