set -uex

#
# Download the taxonomy for kraken2.
# Needs to be done separately and only once.
#
# mkdir -p ~/refs
#
# (cd ~/refs && kraken2-build --download-taxonomy --db kraken2)
#

set -uex

#
# Download the taxonomy for kraken2.
# Needs to be done separately and only once.
#
# mkdir -p ~/refs
#
# (cd ~/refs && kraken2-build --download-taxonomy --db kraken2)
#

# Input file that contains the accession numbers.
ACC=FILENAME

# How many reads to simulate.
N=1000

# Lenght of reads.
L=100

# The error rate in the simulated reads.
E=0.02

# Will store data in these locations
mkdir -p input output code

# The FASTA file storing the sequence for accession numbers.
FASTA=input/reference.fa

# The FASTQ files that store the read pairs generated from reference.
R1=input/r1.fq
R2=input/r2.fq

# The accesion taxonomy file
TAXONOMY=input/taxonomy.txt

# The Kraken 2 classification report.
REPORT=output/kraken2-report.txt

# The Kraken2 output.
OUTPUT=output/kraken2-output.txt

# The Kraken2 accuracy report.
ACCURACY=accuracy-kraken2.txt

# Copy over the accession file to input folder to aid reproducibility.
cp $ACC input/accessions.txt

# Get sequences from NCBI.
epost -db nuccore -input $ACC | efetch -format fasta > $FASTA

# Identify the taxonomy for each accession number.
epost -db nuccore -input $ACC  | esummary | xtract -pattern DocumentSummary -element Caption,TaxId,Title > $TAXONOMY

# To avoid recomputing the same kraken2 library store with a unique name.
SHA=`shasum $FASTA | cut -c 1-8`

# Store the Kraken2 database under the checksum name.
DB=~/tmp/$SHA

# Create the directory for the kraken2 database.
mkdir -p $DB

# Link the taxonomy.
ln -sf ~/refs/kraken2/taxonomy $DB

# Add marker sequences to the kraken2 library.
kraken2-build --add-to-library $FASTA --db $DB

# Build the marker library.
kraken2-build --build --db $DB

# Install the plac command parser.
pip install plac -q

# Clone the read simulator. Must use wgsim2.
git clone --depth=1 https://github.com/ialbert/wgsim2.git > log.txt

# Compile the read simulator.
(cd wgsim2 && make 2> log.txt)

# Delete the .git folder as it creates too many unneeded files)
(cd wgsim2 && rm -rf .git)

# Simulate reads with errors in fixed N mode.
wgsim2/wgsim2 -f -r 0 -R 0 -N $N -e $E -1 $L -2 $L $FASTA $R1 $R2 > log.txt

# Run kraken2 classifier.
kraken2  -db $DB --paired $R1 $R2 --report-zero-counts --report $REPORT > $OUTPUT

# The location of the code that generates the validation.
URL2=https://raw.githubusercontent.com/biostars/biocode/master/scripts/classify/validate.py

# Get the code.
curl $URL2 > code/validate.py

# Run the validator
python code/validate.py -f $OUTPUT -t $TAXONOMY -c $N > $ACCURACY