set -ue

# Fail a pipeline when ANY stage fails, not just the last one. This script
# is pipeline-heavy (epost|efetch, parallel|tr, ...), so without pipefail
# upstream failures would be silently swallowed.
set -o pipefail

# Load the conda commands.
source /home/www/miniconda3/etc/profile.d/conda.sh

# Input file that contains the accession numbers (one per line).
ACC=FILENAME

# How many reads to simulate.
N=1000

# Length of reads.
L=100

# Create directories to store files.
mkdir -p input output code

# The output file for the accuracy.
ACCURACY=classification-qiime2-accuracy.txt

# The FASTA file storing the sequence for accession numbers.
REF=input/reference.fa

# The FASTA file with accession sequences in QIIME 2 compatible format.
REFERENCE=output/reference.qza

# Trained reference.
TRAINED_REF=output/reference_trained.qza

# The FASTA file storing the reads simulated from the reference.
DATA=input/reads.fa

# Sequence reads in QIIME 2 compatible format.
READS=output/reads.qza

# The accession taxonomy file.
TAXONOMY=input/taxonomy.txt

# File with taxa lineage.
LINEAGE=input/taxa_lineage.txt

# QIIME 2 compatible taxa lineage file.
TAXA=output/taxa_lineage.qza

# File with accessions without version.
ACCESSIONS=input/accessions.txt

# File with taxonomic classification in QIIME format.
CLASSIFIED=output/classification.qza

# File with classification results for each read.
RESULT=output/classified.txt

# NCBI taxonomy database.
TAXDIR=/export/refs/taxonomy

# A custom database with accessions, taxid and lineage can be created as
# (this needs to be done only once):
# URL=https://gist.githubusercontent.com/aswathyseb/98049028cdd72641485c9fc05715bdc1/raw/taxa_db.py
# curl $URL > taxa_db.py
# python taxa_db.py --dbpath taxa_db --accession nucl_gb.accession2taxid --lineage rankedlineage.dmp

# Custom database with taxonomy lineage.
TAXADB=$TAXDIR/taxa_db

# Query to extract lineage from accession. The '{}' placeholder is filled
# in by GNU parallel with each input accession.
QUERY1="SELECT acc,lineage FROM accession INNER JOIN taxa ON accession.taxid=taxa.taxid WHERE acc='{}'"

# Query to extract taxid from lineage. '{}' is filled in by GNU parallel.
QUERY2="SELECT taxid,lineage FROM taxa WHERE lineage='{}'"

# Create lineage file by querying the database with accessions.
# Header expected by the QIIME 2 taxonomy importer.
echo -e "Feature ID\tTaxon" > "$LINEAGE"

# Look up the lineage for every accession in parallel; sqlite3 -list output
# uses '|' separators, converted here to tabs.
parallel -j 4 -q --trim lr sqlite3 -list "$TAXADB" "$QUERY1;" < "$ACC" | tr "|" "\t" >> "$LINEAGE"

# Get sequences from NCBI.
epost -db nuccore -input "$ACC" | efetch -format fasta > "$REF"

# Identify the taxonomy for each accession number.
epost -db nuccore -input "$ACC" | esummary | xtract -pattern DocumentSummary -element Caption,TaxId,Title > "$TAXONOMY"

# Strip the version number from FASTA headers (>ACC.1 -> >ACC) so they match
# the taxonomy file. The edit is restricted to header lines (/^>/) so
# sequence data can never be altered by accident.
sed -E '/^>/ s/\.[0-9]+//' "$REF" > tmp.fa
mv tmp.fa "$REF"

# Install the plac command parser (required by simulate.py).
pip install plac -q

# The location of the code that simulates the reads.
URL2=https://raw.githubusercontent.com/biostars/biocode/master/scripts/fasta/simulate.py

# Get the code.
curl "$URL2" > code/simulate.py

# Generate the simulated reads.
python code/simulate.py --fname "$REF" --count "$N" > input/reads.fq

# Create reads fasta.
seqtk seq -A input/reads.fq > "$DATA"

# Switch to the qiime environment. conda's activation scripts reference
# unset variables, so relax -u around the activate call.
set +u
conda activate qiime2
set -ue

# Convert taxonomy lineage to QIIME compatible format.
qiime tools import --type 'FeatureData[Taxonomy]' --input-path "$LINEAGE" --output-path "$TAXA"

# Convert reference sequence to QIIME compatible format.
qiime tools import --type 'FeatureData[Sequence]' --input-path "$REF" --output-path "$REFERENCE"

# Convert reads to QIIME compatible format.
qiime tools import --type 'FeatureData[Sequence]' --input-path "$DATA" --output-path "$READS"

# Method 1: Sequence classification with sklearn.
# Train classifier.
# qiime feature-classifier fit-classifier-naive-bayes --i-reference-reads $REFERENCE --i-reference-taxonomy $TAXA --o-classifier $TRAINED_REF
# Classify input data with previously trained classifier.
# qiime feature-classifier classify-sklearn --i-classifier $TRAINED_REF --i-reads $READS --o-classification $CLASSIFIED --p-n-jobs 2

# Method 2: Sequence classification with consensus blast.
# Classify sequences using consensus blast.
# Stringent alternative (kept commented; swap in for the default run below):
# qiime feature-classifier classify-consensus-blast --i-query $READS --i-reference-reads $REFERENCE --i-reference-taxonomy $TAXA \
#    --output-dir blast --verbose --p-maxaccepts 1 --p-perc-identity 0.99

# Classify sequences using consensus blast defaults.
qiime feature-classifier classify-consensus-blast --i-query "$READS" --i-reference-reads "$REFERENCE" --i-reference-taxonomy "$TAXA" --output-dir blast --verbose

# Move blast results to output. Comment this out when running sklearn.
mv blast/classification.qza output

# Remove the now-empty blast directory. Comment this out when running sklearn.
rm -rf blast

# Export classified qza file to tsv format (produces output/taxonomy.tsv).
qiime tools export --input-path "$CLASSIFIED" --output-path output

# Install required libraries.
pip install csvkit plac --upgrade

# Extract the taxid corresponding to each unique lineage; sqlite3 -list
# output uses '|' separators, converted here to tabs.
echo -e "Taxid\tTaxon" > output/taxid.txt
cut -f 2 output/taxonomy.tsv | sort -u | parallel -j 4 -q --trim lr sqlite3 -list "$TAXADB" "$QUERY2;" | tr "|" "\t" >> output/taxid.txt

# Join the taxid to the classification (join on lineage, column 2 of both).
csvjoin --left -t -c 2,2 output/taxonomy.tsv output/taxid.txt | awk 'BEGIN{FS=",";OFS="\t"}{print $1,$4,$2,$3}' > "$RESULT"

# The location of the code that generates the validation.
# (Distinct variable name: URL2 above points at simulate.py.)
URL3=https://raw.githubusercontent.com/biostars/biocode/master/scripts/classify/validate.py

# Get the code.
curl "$URL3" > code/validate.py

# Run the validator.
python code/validate.py -f "$RESULT" -t "$TAXONOMY" -c "$N" > "$ACCURACY"