set -ue

# Fail a pipeline when ANY stage fails, not just the last one. This script
# is pipeline-heavy (epost|efetch, parallel|tr, ...), so without pipefail
# upstream failures would be silently swallowed.
set -o pipefail

# Load the conda commands.
source /home/www/miniconda3/etc/profile.d/conda.sh

# Input file that contains the accession numbers (one per line).
ACC=FILENAME

# How many reads to simulate.
N=1000

# Length of reads.
L=100

# Create directories to store files.
mkdir -p input output code

# The output file for the accuracy.
ACCURACY=classification-qiime2-accuracy.txt

# The FASTA file storing the sequence for accession numbers.
REF=input/reference.fa

# The FASTA file with accession sequences in QIIME 2 compatible format.
REFERENCE=output/reference.qza

# Trained reference.
TRAINED_REF=output/reference_trained.qza

# The FASTA file storing the reads simulated from the reference.
DATA=input/reads.fa

# Sequence reads in QIIME 2 compatible format.
READS=output/reads.qza

# The accession taxonomy file.
TAXONOMY=input/taxonomy.txt

# File with taxa lineage.
LINEAGE=input/taxa_lineage.txt

# QIIME 2 compatible taxa lineage file.
TAXA=output/taxa_lineage.qza

# File with accessions without version.
ACCESSIONS=input/accessions.txt

# File with taxonomic classification in QIIME format.
CLASSIFIED=output/classification.qza

# File with classification results for each read.
RESULT=output/classified.txt

# NCBI taxonomy database.
TAXDIR=/export/refs/taxonomy

# A custom database with accessions, taxid and lineage can be created as
# (this needs to be done only once):
# URL=https://gist.githubusercontent.com/aswathyseb/98049028cdd72641485c9fc05715bdc1/raw/taxa_db.py
# curl $URL > taxa_db.py
# python taxa_db.py --dbpath taxa_db --accession nucl_gb.accession2taxid --lineage rankedlineage.dmp

# Custom database with taxonomy lineage.
TAXADB=$TAXDIR/taxa_db

# Query to extract lineage from accession. The '{}' placeholder is filled
# in by GNU parallel with each input accession.
QUERY1="SELECT acc,lineage FROM accession INNER JOIN taxa ON accession.taxid=taxa.taxid WHERE acc='{}'"

# Query to extract taxid from lineage. '{}' is filled in by GNU parallel.
QUERY2="SELECT taxid,lineage FROM taxa WHERE lineage='{}'"

# Create lineage file by querying the database with accessions.
# Header expected by the QIIME 2 taxonomy importer.
echo -e "Feature ID\tTaxon" > "$LINEAGE"

# Look up the lineage for every accession in parallel; sqlite3 -list output
# uses '|' separators, converted here to tabs.
parallel -j 4 -q --trim lr sqlite3 -list "$TAXADB" "$QUERY1;" < "$ACC" | tr "|" "\t" >> "$LINEAGE"

# Get sequences from NCBI.
epost -db nuccore -input "$ACC" | efetch -format fasta > "$REF"

# Identify the taxonomy for each accession number.
epost -db nuccore -input "$ACC" | esummary | xtract -pattern DocumentSummary -element Caption,TaxId,Title > "$TAXONOMY"

# Strip the version number from FASTA headers (>ACC.1 -> >ACC) so they match
# the taxonomy file. The edit is restricted to header lines (/^>/) so
# sequence data can never be altered by accident.
sed -E '/^>/ s/\.[0-9]+//' "$REF" > tmp.fa
mv tmp.fa "$REF"

# Install the plac command parser (required by simulate.py).
pip install plac -q

# The location of the code that simulates the reads.
URL2=https://raw.githubusercontent.com/biostars/biocode/master/scripts/fasta/simulate.py

# Get the code.
curl "$URL2" > code/simulate.py

# Generate the simulated reads.
python code/simulate.py --fname "$REF" --count "$N" > input/reads.fq

# Create reads fasta.
seqtk seq -A input/reads.fq > "$DATA"

# Switch to the qiime environment. conda's activation scripts reference
# unset variables, so relax -u around the activate call.
set +u
conda activate qiime2
set -ue

# Convert taxonomy lineage to QIIME compatible format.
qiime tools import --type 'FeatureData[Taxonomy]' --input-path "$LINEAGE" --output-path "$TAXA"

# Convert reference sequence to QIIME compatible format.
qiime tools import --type 'FeatureData[Sequence]' --input-path "$REF" --output-path "$REFERENCE"

# Convert reads to QIIME compatible format.
qiime tools import --type 'FeatureData[Sequence]' --input-path "$DATA" --output-path "$READS"

# Method 1: Sequence classification with sklearn.
# Train classifier.
# qiime feature-classifier fit-classifier-naive-bayes --i-reference-reads $REFERENCE --i-reference-taxonomy $TAXA --o-classifier $TRAINED_REF
# Classify input data with previously trained classifier.
# qiime feature-classifier classify-sklearn --i-classifier $TRAINED_REF --i-reads $READS --o-classification $CLASSIFIED --p-n-jobs 2

# Method 2: Sequence classification with consensus blast.
# Classify sequences using consensus blast.
# Stringent alternative (kept commented; swap in for the default run below):
# qiime feature-classifier classify-consensus-blast --i-query $READS --i-reference-reads $REFERENCE --i-reference-taxonomy $TAXA \
#    --output-dir blast --verbose --p-maxaccepts 1 --p-perc-identity 0.99

# Classify sequences using consensus blast defaults.
qiime feature-classifier classify-consensus-blast --i-query "$READS" --i-reference-reads "$REFERENCE" --i-reference-taxonomy "$TAXA" --output-dir blast --verbose

# Move blast results to output. Comment this out when running sklearn.
mv blast/classification.qza output

# Remove the now-empty blast directory. Comment this out when running sklearn.
rm -rf blast

# Export classified qza file to tsv format (produces output/taxonomy.tsv).
qiime tools export --input-path "$CLASSIFIED" --output-path output

# Install required libraries.
pip install csvkit plac --upgrade

# Extract the taxid corresponding to each unique lineage; sqlite3 -list
# output uses '|' separators, converted here to tabs.
echo -e "Taxid\tTaxon" > output/taxid.txt
cut -f 2 output/taxonomy.tsv | sort -u | parallel -j 4 -q --trim lr sqlite3 -list "$TAXADB" "$QUERY2;" | tr "|" "\t" >> output/taxid.txt

# Join the taxid to the classification (join on lineage, column 2 of both).
csvjoin --left -t -c 2,2 output/taxonomy.tsv output/taxid.txt | awk 'BEGIN{FS=",";OFS="\t"}{print $1,$4,$2,$3}' > "$RESULT"

# The location of the code that generates the validation.
# (Distinct variable name: URL2 above points at simulate.py.)
URL3=https://raw.githubusercontent.com/biostars/biocode/master/scripts/classify/validate.py

# Get the code.
curl "$URL3" > code/validate.py

# Run the validator.
python code/validate.py -f "$RESULT" -t "$TAXONOMY" -c "$N" > "$ACCURACY"