#!/usr/bin/env bash
#
# This recipe downloads sequencing data from an NCBI BioProject and runs
# FastQC on the downloaded reads.
#
# The script limits the number of runs it fetches and the number of reads
# that it unpacks from each run.
#
# Files written to the current directory:
#   runinfo.csv  - run metadata for the project (efetch "runinfo" format)
#   runids.txt   - the SRR accessions selected for download
#   fastqc/      - FastQC reports (the .zip bundles are removed at the end)
# FASTQ files are written to the directory named by SRA.
#
# Required tools: esearch, efetch, parallel, fastq-dump, fastqc.
#

# Stop on any error, on unset variables and on a failure in any pipeline
# stage. Print the commands as they execute.
set -euxo pipefail

# Folder that receives the FASTQ files.
readonly SRA=/export/sra

# NCBI BioProject ID.
readonly PRJN=PRJNA257197

# How many sequencing runs to get for the project.
readonly RUNS=5

# How many reads to download from each run.
readonly READS=2000

# Obtain the run information.
esearch -db sra -query "$PRJN" | efetch -format runinfo > runinfo.csv

# Obtain the first few SRR numbers: the first CSV column of every line whose
# first column contains "SRR", capped at RUNS entries. A single awk process is
# used instead of cut | grep | head so that no pipeline stage is killed by
# SIGPIPE (which would abort the script under pipefail) when the cap is hit.
awk -F, -v n="$RUNS" '
  c >= n     { exit }
  $1 ~ /SRR/ { print $1; c++ }
' runinfo.csv > runids.txt

# Without any run ids there is nothing to download or to check.
if [[ ! -s runids.txt ]]; then
  echo "error: no SRR run ids found in runinfo.csv for $PRJN" >&2
  exit 1
fi

# Make sure the download directory exists before the parallel jobs start.
mkdir -p -- "$SRA"

# Download the FASTQ data for each SRR number.
parallel fastq-dump --split-files -X "$READS" --outdir "$SRA" {} < runids.txt

# Save quality control plots into a separate directory.
mkdir -p fastqc

# Run fastqc on each downloaded file. The files live in the directory that
# fastq-dump wrote to (SRA). Generate the output into the fastqc directory.
fastqc -o fastqc "$SRA"/*.fastq

# Remove zip files created by fastqc.
rm -f -- fastqc/*.zip