#!/usr/bin/env bash # Stop on errors. set -uexo pipefail # Genome Accesion Number ACC=AF086833 # The SRR number for the sequencing data. SRR=SRR1972739 # Make directory for database. mkdir -p db # The reference genome name. REF=db/${ACC}.fa # The files names for the paired end reads. READ1=${SRR}_1.fastq READ2=${SRR}_2.fastq # Obtain the reference genome. efetch -db=nuccore -format=fasta -id=${ACC} > $REF # Obtain the sequencing data. fastq-dump -X 10000 --split-files $SRR 1>> log.txt 2>> log.txt # Index the reference genome. # This only needs to be done once! bwa index $REF 1>> log.txt 2>> log.txt # Align the sequencing reads to produce the BAM file. bwa mem $REF $READ1 $READ2 2>> log.txt | samtools sort > results.bam 2>> log.txt # Index the BAM file. samtools index results.bam # We have the BAM file sorted, indexed. # UNIX trick: you can put the results of a command into a variable with $() # # FOO=$(ls -1 | wc -l) # echo $FOO # # It runs in a NEW subshell not the original bash shell you started. # How many alignments ALN_COUNT=$(samtools view -c results.bam) UNMAP_COUNT=$(samtools view -c -f 4 results.bam) MAPPED_COUNT=$(samtools view -c -F 4 results.bam) PROPER_PAIR=$(samtools view -c -F 4 -f 2 results.bam) FORWARD_STRAND=$(samtools view -c -F 4 -F 16 results.bam) REVERSE_STRAND=$(samtools view -c -F 4 -f 16 results.bam) MUNMAP_COUNT=$(samtools view -c -F 4 -f 8 results.bam) FILE1_REV_COUNT=$(samtools view -c -F 4 -f 96 results.bam) FILE2_REV_COUNT=$(samtools view -c -F 20 -f 160 results.bam) echo "---" echo "Reference genome," $ACC echo "SRR Run," $SRR echo "---" echo "Total alignments," $ALN_COUNT echo "Unmapped, $UNMAP_COUNT echo "Mapped," $MAPPED_COUNT" echo "Proper pair," $PROPER_PAIR echo "Reads on forward strand," $FORWARD_STRAND echo "Reads on reverse strand," $REVERSE_STRAND echo "Reads mapped but mate unmapped", $MUNMAP_COUNT echo "Reads in file 1 where mate is on reverse strand", $FILE1_REV_COUNT echo "Reads in file 2 on the forward strand where mate in file 1 is on reverse strand", $FILE2_REV_COUNT # UNIX trick: You can store the command in a variable then execute it with eval # CMD="ls -l" # eval $CMD # # With this we can both show a command and its results. # # Caveats: this only works for simple commands, for more complex situations you will need to # quote the various components. See https://unix.stackexchange.com/questions/175444/execute-command-within-variable # CMD1="samtools flags MREVERSE,READ1" RES1=$(eval $CMD1) CMD2="samtools flags 96" RES2=$(eval $CMD2) echo "----" echo "Flags" echo "Command: $CMD1" echo "$RES1" echo "" echo "Command: $CMD2" echo "$RES2" echo "" echo "--- Work strategy ---" echo "Suppose you need: supplementary alignments from file 1" echo "on the forward strand where the mate is on the reverse" echo "First find flags for each attribute:" echo "" samtools flags REVERSE samtools flags SUPPLEMENTARY samtools flags MREVERSE samtools flags READ1 echo "" echo "# Perform the filtering (long form):" CMD3="samtools view -c -F 16 -f 2048 -f 32 -f 64 results.bam" RES3=$(eval $CMD3) echo $CMD3 echo $RES3 echo "" echo "# Perform the filtering (short form, all flags combined into one):" CMD4="samtools view -c -F 16 -f 2144 results.bam" RES4=$(eval $CMD4) echo $CMD4 echo $RES4