#
# This recipe works for single-end sequencing runs.
#

# Stop on errors, treat unset variables as errors, and trace each command.
set -uex

# Also fail when any stage of a pipeline fails, not only the last one.
# The option is probed in a subshell first so that a shell without
# pipefail support keeps running the script instead of aborting here.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

# The bioproject id.
PROJECT=PRJNA272617

# The 3' adapter sequence
ADAPTER=TGGAATTCTCGGGTGCCAAGG

# Get the run information
esearch -db sra -query "$PROJECT" | efetch -format runinfo > runinfo.csv

# This will install csvkit if you don't already have it.
pip install csvkit -q

# Get the smallest file, so the download is quickest.
# The ids file contains a single SRR number that corresponds to the smallest file.
# Rows are sorted numerically on size_MB; awk keeps the Run field of the first
# row that contains SRR (this also skips the CSV header). awk reads its whole
# input rather than quitting early, so the upstream commands never receive a
# SIGPIPE that pipefail would report as a failure.
csvcut -x -c Run,size_MB < runinfo.csv \
  | sort -t , -k 2,2n \
  | awk -F , '/SRR/ && !found { print $1; found = 1 }' > ids

# Without this check an empty ids file would turn every step below into a
# silent no-op and the script would still exit successfully.
if [ ! -s ids ]; then
  printf 'No SRR run found for project %s in runinfo.csv\n' "$PROJECT" >&2
  exit 1
fi

# Get the data
parallel fastq-dump --split-files -X 10000 {} < ids

# Run fastqc on each file.
parallel fastqc {}_1.fastq < ids

# Run cutadapt to trim reads on the single end files.
# The escaped > is passed through to parallel so that the redirection happens
# inside each job rather than once in this shell.
# NOTE(review): ADAPTER is described above as a 3' adapter, but -b tells
# cutadapt the adapter may sit at either end of the read; -a is the 3'-only
# option. Confirm which one is intended.
parallel cutadapt -q 10 -b "$ADAPTER" {}_1.fastq \> {.}_cutadapt.fq < ids

# With fastqc we're less worried about mismatching files so listing is fine.
parallel fastqc {.}_cutadapt.fq < ids