# Stop script on any error.
set -ue

#
# One of the assembled ebola genomes of the 2014 outbreak.
#
efetch -db nucleotide -id KM233118 -format fasta > KM233118.fa

#
# Create the two help files for easier reference.
#
blastdbcmd -help > HELP-blastdbcmd.txt
blastn -help > HELP-blastn.txt

#
# Make a blast database out of it
#
mkdir -p xdb
makeblastdb -dbtype nucl -parse_seqids -in KM233118.fa -out xdb/ebola

#
# Make a query from the genome
#
blastdbcmd -db xdb/ebola -entry KM233118 -range 1-40 > query-40bp.fa


#
# Run blastn to find our query we just took from the genome.
#

echo "*********************************************************"
echo "Blastn on the 40bp query"
echo "*********************************************************"
blastn -db xdb/ebola -query query-40bp.fa

#
# Make the query shorter.
#
blastdbcmd -db xdb/ebola -entry KM233118 -range 1-15 > query-15bp.fa

#
# Run blastn to find our query we just took from the genome.
#
echo "*********************************************************"
echo "Blastn on the 15bp query:"
echo "*********************************************************"
blastn -db xdb/ebola -query query-15bp.fa

#
# The default task is megablast
# Use a different strategy here.
#
echo "*********************************************************"
echo "Blastn with the 'blastn' task on the 15bp query:"
echo "*********************************************************"
blastn -task blastn  -db xdb/ebola -query query-15bp.fa

#
# Make the query shorter. Now both megablast and blastn will fail to find it.
# Use the 'short' task.
#
blastdbcmd -db xdb/ebola -entry KM233118 -range 1-7 > query-07bp.fa
echo "*********************************************************"
echo "Blastn with the 'blastn-short' task on the 15bp query"
echo "Note how many hits it generates!"
echo "*********************************************************"
blastn -task blastn-short -db xdb/ebola -query query-15bp.fa

#
# But wait there is more! The rabbit hole is deeper.
#
# Get chromosome 1 for the yeast genome and build a blast database for it.
#
efetch -db nucleotide -id NC_001133 -format fasta > NC_001133.fa
mkdir -p xdb
makeblastdb -dbtype nucl -parse_seqids -in NC_001133.fa -out xdb/yeast
blastdbcmd -db xdb/yeast -entry NC_001133 -range 1-80 > query-yeast.fa

#
# Why does this not work?
#
echo "*********************************************************"
echo "Blastn with the  on the 80bp query. No hits!!!! Why??????"
echo "*********************************************************"
blastn -db xdb/yeast -query query-yeast.fa

#
# By default low complexity regions are filtered out!
#
# See blastn -help
#

#
# Turn off the low complexity filtering that happens to be called 'dust'
# No really the parameter is called '-dust no'
#
# Thanks @NCBI!
#
echo "*********************************************************"
echo "Blastn with the '--dust-no' task on the 80bp query:"
echo "*********************************************************"
blastn -db xdb/yeast -dust no -query query-yeast.fa