# Stop on errors.
# Print all commands as these are executed.
set -uexo pipefail

# This recipe is a partial replication of the results from the paper published as:
#
# "Evaluating variant calling accuracy with CHM1 and CHM13 haploid data"
#
# https://www.nature.com/articles/s41592-018-0054-7
#
# https://www.ebi.ac.uk/ena/data/view/PRJEB13208
#

#
# For efficiency considerations this script will take only the data
# that aligns to chromosome 18 of the human genome.
#

# Directory to store our sequencing reads.
mkdir -p reads

# What range to extract read from the BAM file.
RANGE=18:1-1000000

# The local BAM file that will contain alignments from the selected RANGE.
UBAM=reads/chr18.bam

# The url to the BAM file that contains the alignments.
URL=ftp://ftp.sra.ebi.ac.uk/vol1/ERA596/ERA596361/bam/CHM1_CHM13_2.bam

# It can be challenging to restore "pairedness" in data when you take a slice from an aligned BAM file.
# To avoid dealing with that we will treat the data as single end and use the first-in-pair only.
samtools view -b $URL $RANGE > $UBAM

# The index file for the above BAM file is also downloaded. We don't need it.
rm -f *.bai

# The name of the files that contain the read pairs.
R1=reads/r1.fq
R2=reads/r2.fq

# Unpack the BAM file into FASTQ read1 and read2.
samtools fastq -f 2 -1 $R1 -2 $R2 $UBAM

# Generate statistics on the sequencing reads.
seqkit stat $R1 $R2

# Directory to store our reference genomes and indices.
mkdir -p ref

# This will be the reference genome.
REF=ref/chr18.fa

# Get the human chromosome 18 from USCS download site.
# The paper uses the hg19 genomic build. So will follow that as well.
#
# http://hgdownload.soe.ucsc.edu/goldenPath/hg19/chromosomes/
#
GENOME_URL=http://hgdownload.soe.ucsc.edu/goldenPath/hg19/chromosomes/chr18.fa.gz

# Download the the reference genome.
curl -s $GENOME_URL | gunzip -c > $REF

# Index the reference with bwa.
bwa index $REF 2> log.txt

# Index the reference genome with samtools.
samtools faidx $REF

# The final alignment file.
BAM=align18.bam

# Align the data with a read group tag. Will only use one of the pairs.
# Using the same read groups as contained in the BAM file.
TAG='@RG\tID:1\tSM:CHM1_CHM13\tLB:Pond-482886'
bwa mem -t 4 -R $TAG $REF $R1 2>> log.txt | samtools sort -@ 4 > $BAM

# Index the resulting sorted BAM file.
samtools index $BAM

# Obtain the ground truth variants established via whole genome comparisons.
# The link is at https://github.com/lh3/CHM-eval/
TRUTH_URL=https://github.com/lh3/CHM-eval/releases/download/v0.5/rep2.37.broad.hc.raw.vcf.gz

# Download the ground truth VCF.
curl -sL $TRUTH_URL > ref/truth.vcf.gz

# Index the truth VCF file so that we can slice it.
bcftools index ref/truth.vcf.gz

# Subselect variants only from chromosome 18.
bcftools view ref/truth.vcf.gz 18 > ${BAM}.truth.vcf

# Call SNPs with freebayes. Filter for a probability over 50%.
# Normalize variants to standard format.
freebayes -f $REF -P .5 $BAM | vt normalize -r $REF - > ${BAM}.freebayes.vcf 2>> log.txt

# Call SNPs with bcftools. Filter for a mapping quality of 40
bcftools mpileup -Ou -B --min-MQ 40 -f $REF $BAM | bcftools call -Ou -v -m - | bcftools norm -Ov -f $REF -d all > ${BAM}.bcftools.vcf

# Call SNPs with GATK.
# First we need to create *yet another* index for the reference.
picard CreateSequenceDictionary REFERENCE=${REF}  OUTPUT=ref/chr18.dict 2>> log.txt

# Call SNPS with GATK HaploType caller.
# You will need to replace the path to GATK to that that matches yours.
# See installation help in the lecture.
~/bin/gatk HaplotypeCaller -R $REF -I $BAM -O ${BAM}.gatk.vcf 2>> log.txt