## Cleaning and filtering 16S files in dada2; demo with SRA data
## input:  path to fastq files, other filtering/trimming parameters
## output: folder with filtered fastq files

# load the R packages
# library() errors immediately if a package is missing, unlike require(),
# which only returns FALSE and lets the script limp on to a confusing failure
library(dada2)
library(phyloseq)

## ---- setVariables ----

# path to directory that contains fastq files
PATH <- "/export/sra/"

# desired SRA files; escape the dot so the pattern matches a literal
# ".fastq" rather than any character followed by "fastq"
SRA <- list.files(PATH, pattern = "\\.fastq")

# choose 16S reference database: "SILVA" or "GREENGENES"
REF_FILE <- "SILVA"
if (REF_FILE == "SILVA") {
  # SILVA training set
  DBPATH <- "/export/refs/silva/silva_nr_v132_train_set.fa.gz"
} else if (REF_FILE == "GREENGENES") {
  # GreenGenes training set
  DBPATH <- "/export/refs/green/green_gene_13_8_train_set_97.fa.gz"
} else {
  # error if neither database is chosen
  stop("Incorrect database option.")
}

# paired-end characterization; most Illumina files are sample name +
# "_R1_001.fastq" for forward reads, but sequences downloaded from NCBI use
# the patterns "_1.fastq" (forward) and "_2.fastq" (reverse)
PATTERNF <- "_1.fastq"
PATTERNR <- "_2.fastq"

# Quality cut: "Truncate reads at the first instance of a quality score less
# than or equal to truncQ"
TRUNCQ <- 0

# Length cut: "Truncate reads after truncLen bases; reads shorter than this
# are discarded"
# NOTE: if paired-end, two values are needed (forward, reverse)
TRUNCLEN <- c(0, 0)

# Head trim: number of nucleotides to remove from the start of each read
TRIMLEFT <- 0

# Tail trim: number of nucleotides to remove from the end of each read
TRIMRIGHT <- 0

# Minimum length: remove reads with length less than minLen -- this happens
# AFTER other trims/truncations
MINLEN <- 100

# Maximum N: after truncation, sequences with more than maxN Ns are discarded
# NOTE: dada2 does not allow Ns
MAXN <- 0

# Minimum quality: after truncation, reads containing a quality score less
# than minQ are discarded
MINQ <- 0

## ---- getFiles ----
# are files already split?
if (any(grepl(PATTERNF, SRA))) {
  # files are already split into forward/reverse: pick them out by pattern
  # (rvs, not rev, to avoid shadowing base::rev)
  fwd <- SRA[grep(PATTERNF, SRA)]
  rvs <- SRA[grep(PATTERNR, SRA)]
  # get file paths
  fwdFiles <- paste0(PATH, fwd)
  revFiles <- paste0(PATH, rvs)
} else {
  # files are not split: build the expected paired file names
  fwdFiles <- paste0(PATH, SRA, PATTERNF)
  revFiles <- paste0(PATH, SRA, PATTERNR)
}

# error catch: file.exists() tests the path directly; the previous
# length(file(...)) == 0 check was always FALSE (a connection object has
# length 1), so it never fired and leaked an open connection
if (length(fwdFiles) == 0 || !file.exists(fwdFiles[1])) {
  stop("cannot read forward file path")
}
if (length(revFiles) == 0 || !file.exists(revFiles[1])) {
  stop("cannot read reverse file path")
}

# check to make sure that the lengths of both file sets are the same
if (length(fwdFiles) != length(revFiles)) {
  stop("There is an unequal number of forward and reverse files")
}

# derive sample names by stripping the read pattern from each file name
fwdNames <- sapply(strsplit(basename(fwdFiles), PATTERNF), `[`, 1)
revNames <- sapply(strsplit(basename(revFiles), PATTERNR), `[`, 1)

# error catch if unordered
if (any(!fwdNames %in% revNames)) {
  stop("forward and reverse files are out of order")
}

### ---- filterAndTrim ----
# destination paths in the ./filtered subdirectory (filterAndTrim creates
# missing output directories itself)
filtForward <- file.path("./filtered", paste0(fwdNames, "_F_filt.fastq.gz"))
filtReverse <- file.path("./filtered", paste0(revNames, "_R_filt.fastq.gz"))

# filter and trim with the parameters the user selected in the previous chunk
cleaned <- filterAndTrim(
  fwd = fwdFiles, rev = revFiles,
  filt = filtForward, filt.rev = filtReverse,
  truncQ = TRUNCQ, truncLen = TRUNCLEN,
  trimLeft = TRIMLEFT, trimRight = TRIMRIGHT,
  maxN = MAXN, minLen = MINLEN, minQ = MINQ
)

# track how many reads were removed during the filtering process
write.table(cleaned, file = "./filtered/filteredReads.txt", sep = "\t")

## ---- checkFilteredFiles ----
# path to filtered and cleaned reads
CLEANEDPATH <- "./filtered/"

# patterns that mark forward vs reverse filtered reads
# if single-read pairs, only specify forward
FILTEREDF <- "_F_filt.fastq.gz"
FILTEREDR <- "_R_filt.fastq.gz"

## ---- dada2 ----
# get forward and reverse reads
forward <-
sort(list.files(CLEANEDPATH, pattern = FILTEREDF, full.names = TRUE))
reverse <- sort(list.files(CLEANEDPATH, pattern = FILTEREDR, full.names = TRUE))

# check that the numbers of files match and that they pair up; split on the
# literal suffix (fixed = TRUE) rather than treating "." as a regex wildcard
fwdNames <- sapply(strsplit(basename(forward), FILTEREDF, fixed = TRUE), `[`, 1)
revNames <- sapply(strsplit(basename(reverse), FILTEREDR, fixed = TRUE), `[`, 1)
if (length(fwdNames) != length(revNames)) {
  stop("The number of forward and reverse files do not match.")
} else if (any(!fwdNames %in% revNames)) {
  stop("Forward and reverse reads are out of order.")
}

# perform error learning
errF <- learnErrors(forward, multithread = FALSE)
errR <- learnErrors(reverse, multithread = FALSE)

# perform denoising
dadaForward <- dada(forward, err = errF, multithread = FALSE)
dadaReverse <- dada(reverse, err = errR, multithread = FALSE)

# merge paired reads
mergers <- mergePairs(dadaF = dadaForward, derepF = forward,
                      dadaR = dadaReverse, derepR = reverse, verbose = TRUE)

# construct sequence table of ASVs
seqtab <- makeSequenceTable(mergers)

# remove chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus",
                                    multithread = FALSE, verbose = FALSE)

# assign taxonomy against the reference database chosen above
taxa <- assignTaxonomy(seqtab.nochim, DBPATH, multithread = FALSE)

# OUTPUT: write ASV table and TAXA table to file
write.table(seqtab.nochim, file = "AmpliconSequenceVariableTable.txt", sep = "\t")
write.table(taxa, file = "TaxonomyTable.txt", sep = "\t")

### ---- phyloseq ----
# create demo sample data; rep_len() recycles the demo treatment labels to
# however many samples are present (identical to the original vector for 5
# samples, but the hard-coded c("T","T","C","C","T") made data.frame() error
# whenever the sample count was not a multiple of 5)
sampleDF <- data.frame(
  sample = sapply(strsplit(rownames(seqtab.nochim), "_"), `[`, 1),
  treatment = rep_len(c("T", "T", "C", "C", "T"), nrow(seqtab.nochim))
)
rownames(sampleDF) <- rownames(seqtab.nochim)

# note that we do not have real sample data right now
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows = FALSE),
               sample_data(sampleDF),
               tax_table(taxa))

# replace strings of DNA with short ASV identifiers
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))

# plot alpha diversity
# open graphics device
pdf(file = "AlphaDiversity.pdf",
width = 4, height = 4)
# explicitly print() each ggplot object: top-level autoprinting does not
# happen when this script is source()d, which previously left the PDFs empty
print(plot_richness(ps, x = "treatment",
                    measures = c("Shannon", "Simpson"),
                    color = "treatment"))
# close graphics device
dev.off()

## create a beta diversity ordination
# transform data to proportions as appropriate for Bray-Curtis distances
ps.prop <- transform_sample_counts(ps, function(otu) otu / sum(otu))
ord.nmds.bray <- ordinate(ps.prop, method = "NMDS", distance = "bray")

# plot the ordination
# open graphics device
pdf(file = "BetaDiversity.pdf", width = 4, height = 4)
print(plot_ordination(ps.prop, ord.nmds.bray, color = "treatment",
                      title = "Bray-Curtis NMDS"))
# close graphics device
dev.off()

## Abundance barplot by Sample
# taxonomic rank used to fill the bars
tax <- "Phylum"
# open graphics device
pdf(file = "AbundanceBarplot.pdf", width = 4, height = 4)
print(plot_bar(ps, x = "sample", fill = tax,
               title = "Abundance Barplot by Sample"))
# close graphics device
dev.off()