## Cleaning and filtering 16S files in dada2; demo with SRA data
## input:  path to fastq files, other filtering/trimming parameters
## output: folder with filtered fastq files

# load the R packages
# library() errors immediately if a package is missing, unlike require(),
# which only returns FALSE and lets the script limp on to a confusing failure
library(dada2)
library(phyloseq)

## ---- setVariables ----

# path to directory that contains fastq files
PATH <- "/export/sra/"

# desired SRA files; escape the dot so the pattern matches a literal
# ".fastq" rather than any character followed by "fastq"
SRA <- list.files(PATH, pattern = "\\.fastq")

# choose 16S reference database: "SILVA" or "GREENGENES"
REF_FILE <- "SILVA"
if (REF_FILE == "SILVA") {
  # SILVA training set
  DBPATH <- "/export/refs/silva/silva_nr_v132_train_set.fa.gz"
} else if (REF_FILE == "GREENGENES") {
  # GreenGenes training set
  DBPATH <- "/export/refs/green/green_gene_13_8_train_set_97.fa.gz"
} else {
  # error if neither database is chosen
  stop("Incorrect database option.")
}

# paired-end characterization; most Illumina files are sample name +
# "_R1_001.fastq" for forward reads, but sequences downloaded from NCBI use
# the patterns "_1.fastq" (forward) and "_2.fastq" (reverse)
PATTERNF <- "_1.fastq"
PATTERNR <- "_2.fastq"

# Quality cut: "Truncate reads at the first instance of a quality score less
# than or equal to truncQ"
TRUNCQ <- 0

# Length cut: "Truncate reads after truncLen bases; reads shorter than this
# are discarded"
# NOTE: if paired-end, two values are needed (forward, reverse)
TRUNCLEN <- c(0, 0)

# Head trim: number of nucleotides to remove from the start of each read
TRIMLEFT <- 0

# Tail trim: number of nucleotides to remove from the end of each read
TRIMRIGHT <- 0

# Minimum length: remove reads with length less than minLen -- this happens
# AFTER other trims/truncations
MINLEN <- 100

# Maximum N: after truncation, sequences with more than maxN Ns are discarded
# NOTE: dada2 does not allow Ns
MAXN <- 0

# Minimum quality: after truncation, reads containing a quality score less
# than minQ are discarded
MINQ <- 0

## ---- getFiles ----
# are files already split?
if (any(grepl(PATTERNF, SRA))) {
  # files are already split into forward/reverse: pick them out by pattern
  # (rvs, not rev, to avoid shadowing base::rev)
  fwd <- SRA[grep(PATTERNF, SRA)]
  rvs <- SRA[grep(PATTERNR, SRA)]
  # get file paths
  fwdFiles <- paste0(PATH, fwd)
  revFiles <- paste0(PATH, rvs)
} else {
  # files are not split: build the expected paired file names
  fwdFiles <- paste0(PATH, SRA, PATTERNF)
  revFiles <- paste0(PATH, SRA, PATTERNR)
}

# error catch: file.exists() tests the path directly; the previous
# length(file(...)) == 0 check was always FALSE (a connection object has
# length 1), so it never fired and leaked an open connection
if (length(fwdFiles) == 0 || !file.exists(fwdFiles[1])) {
  stop("cannot read forward file path")
}
if (length(revFiles) == 0 || !file.exists(revFiles[1])) {
  stop("cannot read reverse file path")
}

# check to make sure that the lengths of both file sets are the same
if (length(fwdFiles) != length(revFiles)) {
  stop("There is an unequal number of forward and reverse files")
}

# derive sample names by stripping the read pattern from each file name
fwdNames <- sapply(strsplit(basename(fwdFiles), PATTERNF), `[`, 1)
revNames <- sapply(strsplit(basename(revFiles), PATTERNR), `[`, 1)

# error catch if unordered
if (any(!fwdNames %in% revNames)) {
  stop("forward and reverse files are out of order")
}

### ---- filterAndTrim ----
# destination paths in the ./filtered subdirectory (filterAndTrim creates
# missing output directories itself)
filtForward <- file.path("./filtered", paste0(fwdNames, "_F_filt.fastq.gz"))
filtReverse <- file.path("./filtered", paste0(revNames, "_R_filt.fastq.gz"))

# filter and trim with the parameters the user selected in the previous chunk
cleaned <- filterAndTrim(
  fwd = fwdFiles, rev = revFiles,
  filt = filtForward, filt.rev = filtReverse,
  truncQ = TRUNCQ, truncLen = TRUNCLEN,
  trimLeft = TRIMLEFT, trimRight = TRIMRIGHT,
  maxN = MAXN, minLen = MINLEN, minQ = MINQ
)

# track how many reads were removed during the filtering process
write.table(cleaned, file = "./filtered/filteredReads.txt", sep = "\t")

## ---- checkFilteredFiles ----
# path to filtered and cleaned reads
CLEANEDPATH <- "./filtered/"

# patterns that mark forward vs reverse filtered reads
# if single-read pairs, only specify forward
FILTEREDF <- "_F_filt.fastq.gz"
FILTEREDR <- "_R_filt.fastq.gz"

## ---- dada2 ----
# get forward and reverse reads
forward <-
sort(list.files(CLEANEDPATH, pattern = FILTEREDF, full.names = TRUE))
reverse <- sort(list.files(CLEANEDPATH, pattern = FILTEREDR, full.names = TRUE))

# check that the numbers of files match and that they pair up; split on the
# literal suffix (fixed = TRUE) rather than treating "." as a regex wildcard
fwdNames <- sapply(strsplit(basename(forward), FILTEREDF, fixed = TRUE), `[`, 1)
revNames <- sapply(strsplit(basename(reverse), FILTEREDR, fixed = TRUE), `[`, 1)
if (length(fwdNames) != length(revNames)) {
  stop("The number of forward and reverse files do not match.")
} else if (any(!fwdNames %in% revNames)) {
  stop("Forward and reverse reads are out of order.")
}

# perform error learning
errF <- learnErrors(forward, multithread = FALSE)
errR <- learnErrors(reverse, multithread = FALSE)

# perform denoising
dadaForward <- dada(forward, err = errF, multithread = FALSE)
dadaReverse <- dada(reverse, err = errR, multithread = FALSE)

# merge paired reads
mergers <- mergePairs(dadaF = dadaForward, derepF = forward,
                      dadaR = dadaReverse, derepR = reverse, verbose = TRUE)

# construct sequence table of ASVs
seqtab <- makeSequenceTable(mergers)

# remove chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus",
                                    multithread = FALSE, verbose = FALSE)

# assign taxonomy against the reference database chosen above
taxa <- assignTaxonomy(seqtab.nochim, DBPATH, multithread = FALSE)

# OUTPUT: write ASV table and TAXA table to file
write.table(seqtab.nochim, file = "AmpliconSequenceVariableTable.txt", sep = "\t")
write.table(taxa, file = "TaxonomyTable.txt", sep = "\t")

### ---- phyloseq ----
# create demo sample data; rep_len() recycles the demo treatment labels to
# however many samples are present (identical to the original vector for 5
# samples, but the hard-coded c("T","T","C","C","T") made data.frame() error
# whenever the sample count was not a multiple of 5)
sampleDF <- data.frame(
  sample = sapply(strsplit(rownames(seqtab.nochim), "_"), `[`, 1),
  treatment = rep_len(c("T", "T", "C", "C", "T"), nrow(seqtab.nochim))
)
rownames(sampleDF) <- rownames(seqtab.nochim)

# note that we do not have real sample data right now
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows = FALSE),
               sample_data(sampleDF),
               tax_table(taxa))

# replace strings of DNA with short ASV identifiers
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))

# plot alpha diversity
# open graphics device
pdf(file = "AlphaDiversity.pdf",
width = 4, height = 4)
# explicitly print() each ggplot object: top-level autoprinting does not
# happen when this script is source()d, which previously left the PDFs empty
print(plot_richness(ps, x = "treatment",
                    measures = c("Shannon", "Simpson"),
                    color = "treatment"))
# close graphics device
dev.off()

## create a beta diversity ordination
# transform data to proportions as appropriate for Bray-Curtis distances
ps.prop <- transform_sample_counts(ps, function(otu) otu / sum(otu))
ord.nmds.bray <- ordinate(ps.prop, method = "NMDS", distance = "bray")

# plot the ordination
# open graphics device
pdf(file = "BetaDiversity.pdf", width = 4, height = 4)
print(plot_ordination(ps.prop, ord.nmds.bray, color = "treatment",
                      title = "Bray-Curtis NMDS"))
# close graphics device
dev.off()

## Abundance barplot by Sample
# taxonomic rank used to fill the bars
tax <- "Phylum"
# open graphics device
pdf(file = "AbundanceBarplot.pdf", width = 4, height = 4)
print(plot_bar(ps, x = "sample", fill = tax,
               title = "Abundance Barplot by Sample"))
# close graphics device
dev.off()