# Stop on errors. set -uexo pipefail # The reference genome. ACC=NZ_CP008918 # Select sequencing run. SRR=SRR4124989 # The directory that stores the reference sequence. mkdir -p ref # The file that stores the reference genome. REF=ref/${ACC}.fa # Download the reference accession number. efetch -db nuccore -id $ACC -format fasta > $REF # The directory that stores the sequencing reads. mkdir -p reads # Download the sequencing data. fastq-dump --origfmt --split-files -O reads $SRR # Optional step. Generate a report on the data size echo "*** Sequence statistics" seqkit stat reads/${SRR}_?.fastq echo "*** " # # The one-liner code comes from: # # https://thegenomefactory.blogspot.com/2018/10/a-unix-one-liner-to-call-bacterial.html # # The number of CPUs to use. CPUS=4 # The names for the read pairs. R1=reads/${SRR}_1.fastq R2=reads/${SRR}_2.fastq # Unix trick: A single command line that is too long may be broken # into several lines as long as each ends with a \ symbol. # # Call the variants. minimap2 -a -x sr -t $CPUS $REF $R1 $R2 \ | samtools sort -l 0 --threads $CPUS \ | bcftools mpileup -Ou -B --min-MQ 60 -f $REF - \ | bcftools call -Ou -v -m - \ | bcftools norm -Ou -f $REF -d all - \ | bcftools filter -Ov -e 'QUAL<40 || DP<10 || GT!="1/1"' \ > variants.vcf # Generate a statistics on variants. bcftools stats variants.vcf > variants-stats.txt # Plot the variant statistics. plot-vcfstats -P -p plots variants-stats.txt