# # The script assumes that reference data have been already installed with # # http://data.biostarhandbook.com/redo/zika/zika-references.sh # # Stop on errors. set -uex # Set the project name. PROJECT=PRJNA313294 # Shortcut to annotations GTF=~/refs/Homo_sapiens.GRCh38.96.chr.gtf # Full run information. RUNINFO=runinfo.all.csv # Run info for single end reads. SINGLE=runinfo.single.csv # Run information for paired end reads. PAIRED=runinfo.paired.csv # How many reads per sample to unpack. READNUM=1000000 # The HiSat2 index name. # The index was installed with # http://data.biostarhandbook.com/redo/zika/zika-references.sh IDX=~/refs/grch38/genome # Downloading the run information. esearch -db sra -query $PROJECT | efetch -format runinfo > $RUNINFO # Separate SRR numbers for the single end files. cat $RUNINFO | grep SRR.*SINGLE | cut -f 1 -d , > $SINGLE # Separate SRR numbers for the paired end files. cat $RUNINFO | grep SRR.*PAIRED | cut -f 1 -d , > $PAIRED # Download and unpack single end reads. cat $SINGLE | parallel fastq-dump -X $READNUM -O reads >> log.txt # Download and unpack paired end reads cat $PAIRED | parallel fastq-dump -X $READNUM --split-files -O reads >> log.txt # Store bam files here. mkdir -p bam # How many CPU cores on the system. CPUS=4 # Run hisat2 on the paired data. cat $PAIRED | parallel "hisat2 -p $CPUS -x $IDX -1 reads/{}_1.fastq -2 reads/{}_2.fastq 2>> log.txt | samtools sort > bam/{}.bam" # Run hisat2 on the single end data. cat $SINGLE | parallel "hisat2 -p $CPUS -x $IDX -U reads/{}.fastq 2>> log.txt | samtools sort > bam/{}.bam" # Run a samtools index on all BAM files ls -1 bam/*.bam | parallel samtools index {} # These are SRR number of the the control (MOCK) numbers MOCK=(SRR3191542 SRR3191543 SRR3194428 SRR3194429) # These are the SRR number of infected (ZIKV) samples. ZIKV=(SRR3191544 SRR3191545 SRR3194430 SRR3194431) # We do some extra work to generate bam file names # such as bam/SRR3191542.bam grouped by condition # Generate MOCK bam file names. for SRR in ${MOCK[@]}; do MOCK_BAMS+="bam/${SRR}.bam " done # Generate ZIKV bam file names. for SRR in ${ZIKV[@]}; do ZIKV_BAMS+="bam/${SRR}.bam " done # Count the features over each gene name. featureCounts -g gene_name -a $GTF -o counts.txt ${MOCK_BAMS} ${ZIKV_BAMS} 2>> log.txt # Simplify the counts. cat counts.txt | cut -f 1,7-14 > simple_counts.txt # Get the script that runs the deseq 1 method. wget -q -nc http://data.biostarhandbook.com/rnaseq/code/deseq1.r # Perform the differential expression study. cat simple_counts.txt | Rscript deseq1.r 4x4 > results.txt 2>> log.txt # Download the script to draw the heatmaps (requires the glplots package). wget -q -nc http://data.biostarhandbook.com/rnaseq/code/draw-heatmap.r # Generate heatmap from the deseq1 normalized matrix. cat norm-matrix-deseq1.txt | Rscript draw-heatmap.r > heatmap-norm-matrix-deseq1.pdf 2>> log.txt