# Stop on any error. Show the commands. set -uex # NCBI Bioproject ID. ID=PRJNA257197 # The number of samples to process. N=5 # NCBI accession number for the reference genome. ACC=KJ660346 # The reference data in GENBANK format. GENBANK=refs/${ACC}.gb # The reference data in FASTA format. FASTA=refs/${ACC}.fa # The reference data in GFF format. GFF=refs/${ACC}.gff # The directory that stores the reference files. mkdir -p refs # Get the reference sequence in GenBank format. efetch -db=nuccore -format=gb -id=$ACC > $GENBANK # Reformat GenBank to FASTA. cat $GENBANK | readseq -p -f fasta > $FASTA 2>> log.txt # Reformat GenBank to GFF. Keep only lines that match CDS. cat $GENBANK | readseq -p -f gff 2>> log.txt | grep CDS > $GFF # Index the FASTA file for bwa. bwa index $FASTA 2>> log.txt # Index the FASTA file for IGV. samtools faidx $FASTA >> log.txt # Obtain the SRR run number for the data deposited into the BioProject esearch -db sra -query $ID | efetch -format runinfo > runinfo.csv # Keep the top N SRR numbers for now. cat runinfo.csv | cut -f 1 -d , | grep SRR | head -$N > samples.txt # Make a directory for the sequencing data. mkdir -p reads # Limit each dataset to these many reads. LIMIT=100000 # Download the sequencing data in parallel. cat samples.txt | parallel "fastq-dump -X $LIMIT -O reads --split-files {} >> log.txt" # Generate alignments mkdir -p bam # Run the bwa mem processes in parallel. # Put an echo in front of the command to understand what it does. cat samples.txt | parallel "bwa mem $FASTA reads/{}_1.fastq reads/{}_2.fastq 2>> log.txt | samtools sort > bam/{}.bam 2>> log.txt" # Unix trick: once your commands become more advances redirecting inputs can be # a bit challenging. Here we want to run samples in parallel, redirect each parallel process into # its own file while collecting the overall messages into the log.txt file. # Index the BAM files. cat samples.txt | parallel "samtools index bam/{}.bam 2>> log.txt" # Call variants on all BAM files in parallel. bcftools mpileup -Ou -f $FASTA bam/*.bam 2>> log.txt | bcftools call --ploidy 1 -vm -Ou | bcftools norm -Ov -f $FASTA -d all - > all_variants.vcf # Annotate the variants with SNPeff. cat all_variants.vcf | snpEff ebola_zaire -s all_effects_report.html > all_variants_annotated.vcf