# Stop script on any error. set -ue # # One of the assembled ebola genomes of the 2014 outbreak. # efetch -db nucleotide -id KM233118 -format fasta > KM233118.fa # # Create the two help files for easier reference. # blastdbcmd -help > HELP-blastdbcmd.txt blastn -help > HELP-blastn.txt # # Make a blast database out of it # mkdir -p xdb makeblastdb -dbtype nucl -parse_seqids -in KM233118.fa -out xdb/ebola # # Make a query from the genome # blastdbcmd -db xdb/ebola -entry KM233118 -range 1-40 > query-40bp.fa # # Run blastn to find our query we just took from the genome. # echo "*********************************************************" echo "Blastn on the 40bp query" echo "*********************************************************" blastn -db xdb/ebola -query query-40bp.fa # # Make the query shorter. # blastdbcmd -db xdb/ebola -entry KM233118 -range 1-15 > query-15bp.fa # # Run blastn to find our query we just took from the genome. # echo "*********************************************************" echo "Blastn on the 15bp query:" echo "*********************************************************" blastn -db xdb/ebola -query query-15bp.fa # # The default task is megablast # Use a different strategy here. # echo "*********************************************************" echo "Blastn with the 'blastn' task on the 15bp query:" echo "*********************************************************" blastn -task blastn -db xdb/ebola -query query-15bp.fa # # Make the query shorter. Now both megablast and blastn will fail to find it. # Use the 'short' task. # blastdbcmd -db xdb/ebola -entry KM233118 -range 1-7 > query-07bp.fa echo "*********************************************************" echo "Blastn with the 'blastn-short' task on the 15bp query" echo "Note how many hits it generates!" echo "*********************************************************" blastn -task blastn-short -db xdb/ebola -query query-15bp.fa # # But wait there is more! The rabbit hole is deeper. # # Get chromosome 1 for the yeast genome and build a blast database for it. # efetch -db nucleotide -id NC_001133 -format fasta > NC_001133.fa mkdir -p xdb makeblastdb -dbtype nucl -parse_seqids -in NC_001133.fa -out xdb/yeast blastdbcmd -db xdb/yeast -entry NC_001133 -range 1-80 > query-yeast.fa # # Why does this not work? # echo "*********************************************************" echo "Blastn with the on the 80bp query. No hits!!!! Why??????" echo "*********************************************************" blastn -db xdb/yeast -query query-yeast.fa # # By default low complexity regions are filtered out! # # See blastn -help # # # Turn off the low complexity filtering that happens to be called 'dust' # No really the parameter is called '-dust no' # # Thanks @NCBI! # echo "*********************************************************" echo "Blastn with the '--dust-no' task on the 80bp query:" echo "*********************************************************" blastn -db xdb/yeast -dust no -query query-yeast.fa