# Stop on errors. set -uexo pipefail # Install the bio package # pip install bio --upgrade # The reference genome. ACC=NZ_CP008918 # Select sequencing run. SRR=SRR4124989 # The directory that stores the reference sequence. mkdir -p ref # The file that stores the reference genome as FASTA. REF=ref/${ACC}.fa # The file that stores the reference genome as GFF. GFF=ref/${ACC}.gff # The resulting alignment file. BAM=$ACC.bam # Download the reference accession number. bio fetch -format fasta $ACC > $REF # Download the accessions as intervals. bio fetch -format gff $ACC > $GFF # The directory that stores the sequencing reads. mkdir -p reads # Download the sequencing data. fastq-dump --origfmt --split-files -O reads $SRR # Generate a report on the data size seqkit stat reads/${SRR}_?.fastq # The one-liner code comes from: # # https://thegenomefactory.blogspot.com/2018/10/a-unix-one-liner-to-call-bacterial.html # The number of CPUs to use. CPUS=4 # The names for the read pairs. R1=reads/${SRR}_1.fastq R2=reads/${SRR}_2.fastq # Unix trick: A single command line that is too long may be broken # into several lines as long as each ends with a \ symbol. # # Generate the alignments. minimap2 -a -x sr -t $CPUS $REF $R1 $R2 \ | samtools sort -l 0 --threads $CPUS > $BAM # Call the variants. cat $BAM | bcftools mpileup -Ou -B --min-MQ 60 -f $REF - \ | bcftools call -Ou -v -m - \ | bcftools norm -Ou -f $REF -d all - \ | bcftools filter -Ov -e 'QUAL<40 || DP<10 || GT!="1/1"' \ > variants.vcf # Generate a statistics on variants. bcftools stats variants.vcf > variants-stats.txt # Plot the variant statistics. plot-vcfstats -P -p plots variants-stats.txt