#!/usr/bin/env bash
#
# This recipe works for single end sequencing runs.
#
# It fetches the run information of one bioproject, selects the run with the
# smallest size_MB value, downloads a subset of that run, and runs fastqc on
# the reads both before and after trimming them with cutadapt.
#
# Tools expected on PATH: esearch, efetch, pip, parallel, fastq-dump, fastqc
# and cutadapt. The files runinfo.csv and ids, plus the per-run outputs, are
# written into the current directory.
#

# Stop on errors (-e), on unset variables (-u) and on a failure in any stage
# of a pipeline (pipefail). Print each command as it runs (-x).
set -euxo pipefail

# The bioproject id.
readonly PROJECT=PRJNA272617

# The 3' adapter sequence.
readonly ADAPTER=TGGAATTCTCGGGTGCCAAGG

# Get the run information.
esearch -db sra -query "$PROJECT" | efetch -format runinfo > runinfo.csv

# Install csvkit only if its csvcut tool is not already available.
command -v csvcut > /dev/null 2>&1 || pip install csvkit -q

# Get the smallest file, so the download is quickest.
# The ids file contains a single SRR number that corresponds to the smallest
# file. The /SRR/ filter also drops the CSV header row. The final awk prints
# the first sorted record but still reads all of its input, so no upstream
# stage is killed by a closed pipe (which pipefail would report as a failure).
csvcut -x -c Run,size_MB runinfo.csv \
  | awk '/SRR/' \
  | LC_ALL=C sort -t , -k 2,2n \
  | awk -F , 'NR == 1 { print $1 }' > ids

# Without a run id every step below would silently do nothing.
if [[ ! -s ids ]]; then
  printf 'No SRR run found for project %s in runinfo.csv\n' "$PROJECT" >&2
  exit 1
fi

# Get the data. The -X 10000 option caps how much of the run is downloaded.
parallel fastq-dump --split-files -X 10000 {} < ids

# Run fastqc on each downloaded file.
parallel fastqc {}_1.fastq < ids

# Run cutadapt to trim reads on the single end files.
# The trimmed reads are written with -o rather than a shell redirect, so no
# escaped '>' has to survive the trip through parallel.
# NOTE(review): -b makes cutadapt accept the adapter at either end of a read;
# for a purely 3' adapter the usual option is -a. Confirm which is intended.
parallel cutadapt -q 10 -b "$ADAPTER" -o {}_cutadapt.fq {}_1.fastq < ids

# Run fastqc again, this time on the trimmed reads.
parallel fastqc {}_cutadapt.fq < ids