# Stop on errors. Show the commands. set -uex # Reference genome accession number. ACC=AF086833 # SRR run number. SRR=SRR1553500 # The directory to store the reference in. mkdir -p refs # The name of the file that stores the reference genome. REF=refs/$ACC.fa # The resulting alignment file name. BAM=$SRR.bam # Get the reference sequence. efetch -db=nuccore -format=fasta -id=$ACC | seqret -filter -sid $ACC > $REF # Index reference for the aligner. bwa index $REF >> log.txt 2>> log.txt # Index the reference genome for IGV samtools faidx $REF # Get data from an Ebola sequencing run. fastq-dump -X 100000 --split-files $SRR >> log.txt # Shortcut to read names. R1=${SRR}_1.fastq R2=${SRR}_2.fastq # Tag the alignments. GATK will only work when reads are tagged. TAG="@RG\tID:$SRR\tSM:$SRR\tLB:$SRR" # Align and generate a BAM file. bwa mem -R $TAG $REF $R1 $R2 2>> log.txt | samtools sort > $BAM 2>> log.txt # Index the BAM file. samtools index $BAM # Determine the genotype likelihoods for each base. bcftools mpileup -Ovu -f $REF $BAM > genotypes.vcf # Call variants with bcftools. bcftools call -vm -Ov genotypes.vcf > variants1.vcf 2>> log.txt # Call variants with freebayes. freebayes --ploidy 1 -f $REF $BAM > variants2.vcf # Call variants with GATK. See the book for GATK installation. # These commands are turned off in this example. # It needs yet another so called Dictionary index. # picard CreateSequenceDictionary REFERENCE=$REF OUTPUT=refs/$ACC.dict # Generate the variants. #gatk HaplotypeCaller -R $REF -I $BAM -O variants3.vcf