# Stop on errors. # Print all commands as these are executed. set -uexo pipefail # This recipe is a partial replication of the results from the paper published as: # # "Evaluating variant calling accuracy with CHM1 and CHM13 haploid data" # # https://www.nature.com/articles/s41592-018-0054-7 # # https://www.ebi.ac.uk/ena/data/view/PRJEB13208 # # # For efficiency considerations this script will take only the data # that aligns to chromosome 18 of the human genome. # # Directory to store our sequencing reads. mkdir -p reads # What range to extract read from the BAM file. RANGE=18:1-1000000 # The local BAM file that will contain alignments from the selected RANGE. UBAM=reads/chr18.bam # The url to the BAM file that contains the alignments. URL=ftp://ftp.sra.ebi.ac.uk/vol1/ERA596/ERA596361/bam/CHM1_CHM13_2.bam # It can be challenging to restore "pairedness" in data when you take a slice from an aligned BAM file. # To avoid dealing with that we will treat the data as single end and use the first-in-pair only. samtools view -b $URL $RANGE > $UBAM # The index file for the above BAM file is also downloaded. We don't need it. rm -f *.bai # The name of the files that contain the read pairs. R1=reads/r1.fq R2=reads/r2.fq # Unpack the BAM file into FASTQ read1 and read2. samtools fastq -f 2 -1 $R1 -2 $R2 $UBAM # Generate statistics on the sequencing reads. seqkit stat $R1 $R2 # Directory to store our reference genomes and indices. mkdir -p ref # This will be the reference genome. REF=ref/chr18.fa # Get the human chromosome 18 from USCS download site. # The paper uses the hg19 genomic build. So will follow that as well. # # http://hgdownload.soe.ucsc.edu/goldenPath/hg19/chromosomes/ # GENOME_URL=http://hgdownload.soe.ucsc.edu/goldenPath/hg19/chromosomes/chr18.fa.gz # Download the the reference genome. curl -s $GENOME_URL | gunzip -c > $REF # Index the reference with bwa. bwa index $REF 2> log.txt # Index the reference genome with samtools. samtools faidx $REF # The final alignment file. BAM=align18.bam # Align the data with a read group tag. Will only use one of the pairs. # Using the same read groups as contained in the BAM file. TAG='@RG\tID:1\tSM:CHM1_CHM13\tLB:Pond-482886' bwa mem -t 4 -R $TAG $REF $R1 2>> log.txt | samtools sort -@ 4 > $BAM # Index the resulting sorted BAM file. samtools index $BAM # Obtain the ground truth variants established via whole genome comparisons. # The link is at https://github.com/lh3/CHM-eval/ TRUTH_URL=https://github.com/lh3/CHM-eval/releases/download/v0.5/rep2.37.broad.hc.raw.vcf.gz # Download the ground truth VCF. curl -sL $TRUTH_URL > ref/truth.vcf.gz # Index the truth VCF file so that we can slice it. bcftools index ref/truth.vcf.gz # Subselect variants only from chromosome 18. bcftools view ref/truth.vcf.gz 18 > ${BAM}.truth.vcf # Call SNPs with freebayes. Filter for a probability over 50%. # Normalize variants to standard format. freebayes -f $REF -P .5 $BAM | vt normalize -r $REF - > ${BAM}.freebayes.vcf 2>> log.txt # Call SNPs with bcftools. Filter for a mapping quality of 40 bcftools mpileup -Ou -B --min-MQ 40 -f $REF $BAM | bcftools call -Ou -v -m - | bcftools norm -Ov -f $REF -d all > ${BAM}.bcftools.vcf # Call SNPs with GATK. # First we need to create *yet another* index for the reference. picard CreateSequenceDictionary REFERENCE=${REF} OUTPUT=ref/chr18.dict 2>> log.txt # Call SNPS with GATK HaploType caller. # You will need to replace the path to GATK to that that matches yours. # See installation help in the lecture. ~/bin/gatk HaplotypeCaller -R $REF -I $BAM -O ${BAM}.gatk.vcf 2>> log.txt