#
# This recipe downloads sequencing data from an NCBI project.
#

#
# The script limits the number of runs and the number of reads that it unpacks from each run.
#

# Stop on any error or unset variable. Print the commands as they execute.
set -uex

# Also fail when any stage of a pipeline fails (otherwise a failing esearch
# would be masked by efetch). Probed in a subshell first so that shells
# without pipefail support keep working.
if (set -o pipefail) 2>/dev/null; then
    set -o pipefail
fi

# Folder that receives the downloaded FASTQ files.
SRA=/export/sra

# NCBI BioProject ID.
PRJN=PRJNA257197

# How many sequencing runs to get for the project.
RUNS=5

# How many reads to download from each run.
READS=2000

# Obtain the run information.
esearch -db sra -query "$PRJN" | efetch -format runinfo > runinfo.csv

# Obtain the first few SRR numbers: first comma-separated column, only
# values containing SRR, at most $RUNS of them. A single awk pass is used
# instead of cut | grep | head so that no stage is killed by SIGPIPE when
# the reader stops early.
awk -F, -v limit="$RUNS" '
    found >= limit { exit }
    $1 ~ /SRR/     { print $1; found++ }
' runinfo.csv > runids.txt

# Nothing to download means something went wrong upstream; say so here
# rather than failing later with a confusing fastqc error.
if [ ! -s runids.txt ]; then
    echo "No SRR run ids found in runinfo.csv for project $PRJN" >&2
    exit 1
fi

# Download the FASTQ data for each SRR number into the SRA folder.
parallel fastq-dump --split-files -X "$READS" --outdir "$SRA" {} < runids.txt

# Save quality control plots into a separate directory.
mkdir -p fastqc

# Run fastqc on each downloaded file (they live in the SRA folder, which is
# where fastq-dump was told to write them). Generate the output into the
# fastqc directory.
fastqc -o fastqc "$SRA"/*.fastq

# Remove zip files created by fastqc
rm -f fastqc/*.zip