Abstract

Despite the increasing popularity of DNA metabarcoding in the assessment of aquatic ecosystems using fish eDNA or ichthyoplankton, challenges have hampered its broader application in the Neotropical freshwaters. Using five mock communities composed of fish species from two Neotropical river basins, we evaluated the influence of DNA concentration and choice of mitochondrial 12S molecular markers (MiFish, NeoFish and Teleo) on species detection and Relative Read Abundance (RRA) using DNA metabarcoding. Of the three 12S markers analysed, only MiFish detected all species from all mock communities. The performance of a taxonomy-free approach using ASV/MOTUs was not as precise as assigning DNA reads to species using a curated 12S library that includes approximately 100 fish species, since more than one ASV/MOTU was observed for the same specimen. Thus, here we showcase the importance of a custom reference database to allow precise assignment of Neotropical fish species in metabarcoding studies and that the RRA is dependent on community composition, marker and DNA concentration. We highlight the importance of controlled experiments using known species communities before large investments are made in assessing biodiversity using non-invasive methods that apply DNA metabarcoding.

Bioinformatics

Data acquisition

Download demultiplexed samples from Base Space using the bs interface.

# Navigate to the raw-data folder of the run.
# The variables are quoted so that paths containing spaces do not break cd.
cd "$raw_data_folder/$run_folder";

# Authenticate to BaseSpace (only needed at first log in)
bs auth;

# List the datasets from the runs available on your BaseSpace account
bs list datasets;

# Create one folder per run to organize the fastq files.
# -p creates missing parent folders and does not fail if the folder already
# exists, so these commands can be re-run safely.
mkdir -p ~/runs/run_01mar21/fastq/;        #edna
mkdir -p ~/runs/run_09fev21/fastq/;        #edna
mkdir -p ~/runs/run_29jul20/fastq/;        #edna

# Download the runs from BaseSpace (only the fastq.gz files of each project)
bs download project -n fish_eDNA -o ~/runs/run_29jul20/fastq/ --extension=fastq.gz;          #first LGC run
bs download project -n eDNA_2run_B -o ~/runs/run_09fev21/fastq/ --extension=fastq.gz;        #second LGC run
bs download project -n iSeqRun2_Daniel -o ~/runs/run_01mar21/fastq/ --extension=fastq.gz;    #iSeq Ecomol samples

# Create, for each run, the single folder that will hold all of its fastq files
mkdir -p ~/runs/run_01mar21/fastq/all;
mkdir -p ~/runs/run_09fev21/fastq/all;
mkdir -p ~/runs/run_29jul20/fastq/all;

# Move the fastq files from the per-sample subfolders into the single "all" folder
mv ~/runs/run_01mar21/fastq/*/*fastq.gz ~/runs/run_01mar21/fastq/all;
mv ~/runs/run_09fev21/fastq/*/*fastq.gz ~/runs/run_09fev21/fastq/all;
mv ~/runs/run_29jul20/fastq/*/*fastq.gz ~/runs/run_29jul20/fastq/all;


Load R libs and system programs

# 0 - load libraries ----
{
  library(dplyr)
  library(tidyr)
  library(tibble)
  library(stringr)
  library(ggplot2)
  library(ggbreak)
  library(ggtree)
  library(phyloseq)
  library(Biostrings)
  library(Matrix)
  library(ShortRead)
  library(dada2)
  library(DECIPHER)
  library(future)
  library(vegan)
  library(ape)
  library(phangorn)
  library(adegenet)
}

# Absolute path to the cutadapt executable called later through system2()
cutadapt <- "/usr/local/bin/cutadapt"

# Project root: every other project folder below is derived from it
prjct_path <- "~/prjcts/fish_eDNA/sfjq"

# Derived project folders
notes_path   <- paste0(prjct_path, "/notes")
results_path <- paste0(prjct_path, "/results")
figs_path    <- paste0(results_path, "/figs")

# Project name radical
prcj_radical <- "SFJq_fish_metabarcoding"

# Project data folder where the processed reads will be stored
data_path <- paste0(prjct_path, "/data/reads")


Quality control

Check overall quality of sequencing runs for all samples


Demultiplex SFJQ sample (MiFish & NeoFish mixed)

# Demultiplex the SFJQ sample (MiFish & NeoFish amplicons mixed in one library).
# The same primer fasta is passed to -g (R1) and -G (R2); cutadapt replaces
# {name1}/{name2} in the output names by the names of the primers found on
# R1/R2. --no-indels allows only mismatches when matching the primers.
#
# "$HOME" is used inside file:... because the shell only expands "~" at the
# start of a word, so "file:~/..." would reach cutadapt unexpanded.

# samples MiniSeq LGC
# cutadapt does not create the output folder, so make sure it exists
mkdir -p ~/runs/run_09fev21/fastq/all/sfjq_dmx

cutadapt -j 79 --no-indels \
  -g "file:$HOME/prjcts/fish_eDNA/sfjq/data/primers_neo_mi.fasta" \
  -G "file:$HOME/prjcts/fish_eDNA/sfjq/data/primers_neo_mi.fasta" \
  -o ~/runs/run_09fev21/fastq/all/sfjq_dmx/SFJQ-{name1}-{name2}_R1_001.fastq.gz \
  -p ~/runs/run_09fev21/fastq/all/sfjq_dmx/SFJQ-{name1}-{name2}_R2_001.fastq.gz \
  ~/runs/run_09fev21/fastq/all/SFJQ-neo-mi_S23_L001_R1_001.fastq \
  ~/runs/run_09fev21/fastq/all/SFJQ-neo-mi_S23_L001_R2_001.fastq \
  2> ~/runs/run_09fev21/fastq/all/sfjq_dmx/cut_SFJQ_demux.txt

# Copy the correctly paired libraries (FWD on R1, REV on R2) back to the run folders.
# The glob ends in "_R*" because the demultiplexed files are named
# <prefix>_R1_001.fastq.gz / <prefix>_R2_001.fastq.gz (a ".*" suffix never matches).
# NOTE(review): the NeoFish files go to run_01mar21 while the MiFish files stay in
# run_09fev21 - kept as in the original notes; confirm this is intended.
cp ~/runs/run_09fev21/fastq/all/sfjq_dmx/SFJQ-neo_FWD-neo_REV_R* ~/runs/run_01mar21/fastq/all/
cp ~/runs/run_09fev21/fastq/all/sfjq_dmx/SFJQ-mif_FWD-mif_REV_R* ~/runs/run_09fev21/fastq/all/

# Move the original mixed library out of the "all" folder. It lives in
# run_09fev21 (it is the cutadapt input above), not in run_01mar21.
mv ~/runs/run_09fev21/fastq/all/SFJQ-neo-mi_S23_L001_R* ~/runs/run_09fev21/fastq/all/sfjq_dmx/


# samples iSeq Ecomol
mkdir -p ~/runs/run_01mar21/fastq/all/sfjq_dmx

cutadapt -j 79 --no-indels \
  -g "file:$HOME/prjcts/fish_eDNA/sfjq/data/primers_neo_mi.fasta" \
  -G "file:$HOME/prjcts/fish_eDNA/sfjq/data/primers_neo_mi.fasta" \
  -o ~/runs/run_01mar21/fastq/all/sfjq_dmx/Da23-{name1}-{name2}_R1_001.fastq.gz \
  -p ~/runs/run_01mar21/fastq/all/sfjq_dmx/Da23-{name1}-{name2}_R2_001.fastq.gz \
  ~/runs/run_01mar21/fastq/all/Da23_S72_L001_R1_001.fastq \
  ~/runs/run_01mar21/fastq/all/Da23_S72_L001_R2_001.fastq \
  2> ~/runs/run_01mar21/fastq/all/sfjq_dmx/cut_SFJQ_demux.txt

cp ~/runs/run_01mar21/fastq/all/sfjq_dmx/Da23-neo_FWD-neo_REV_R* ~/runs/run_01mar21/fastq/all/
cp ~/runs/run_01mar21/fastq/all/sfjq_dmx/Da23-mif_FWD-mif_REV_R* ~/runs/run_01mar21/fastq/all/

# Move the original mixed library out of the "all" folder
mv ~/runs/run_01mar21/fastq/all/Da23_S72_L001_R* ~/runs/run_01mar21/fastq/all/sfjq_dmx/

Set path to raw data

#1 - load runs raw data ----
## All libs are demultiplexed

# Folders holding the raw (unzipped) fastq files, one per sequencing run
libs_path1 <- "~/runs/run_29jul20/fastq/all"
libs_path2 <- "~/runs/run_09fev21/fastq/all"
libs_path3 <- "~/runs/run_01mar21/fastq/all"

# Show the fastq files found in each run folder
list.files(path = libs_path1, pattern = "fastq")
list.files(path = libs_path2, pattern = "fastq")
list.files(path = libs_path3, pattern = "fastq")

Identify sample names radicals

#2 - get sample names ----

# Forward and reverse fastq filenames have the format
# SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
{
  all_fnFs1 <- sort(list.files(libs_path1, pattern = "_R1_001.fastq", full.names = TRUE))
  all_fnRs1 <- sort(list.files(libs_path1, pattern = "_R2_001.fastq", full.names = TRUE))

  all_fnFs2 <- sort(list.files(libs_path2, pattern = "_R1_001.fastq", full.names = TRUE))
  all_fnRs2 <- sort(list.files(libs_path2, pattern = "_R2_001.fastq", full.names = TRUE))

  all_fnFs3 <- sort(list.files(libs_path3, pattern = "_R1_001.fastq", full.names = TRUE))
  all_fnRs3 <- sort(list.files(libs_path3, pattern = "_R2_001.fastq", full.names = TRUE))

  # forward / reverse read files of all runs together
  all_fnFs <- c(all_fnFs1, all_fnFs2, all_fnFs3)
  all_fnRs <- c(all_fnRs1, all_fnRs2, all_fnRs3)
}

# Load the csv with the primers and their respective samples
primers_n_samples <- read.csv(file = "~/prjcts/fish_eDNA/sfjq/data/primers_n_samples_sfjq.csv",
                              header = TRUE)


#3 - map sample names to reads files ----

# Find the read file of one sample among `candidates`.
#   sample_name: sample radical (value of the File_name column)
#   candidates:  character vector of full paths to fastq files
# A file matches when its path contains "/<sample_name>" (literal match).
# Stops with an informative message when no file matches; when several files
# match, warns and keeps the first one (the original assignment also kept the
# first match, with only a cryptic recycling warning).
find_read_file <- function(sample_name, candidates) {
  hits <- grep(pattern = paste0("/", sample_name), x = candidates,
               value = TRUE, fixed = TRUE)
  if (length(hits) == 0) {
    stop("No read file found for sample '", sample_name, "'", call. = FALSE)
  }
  if (length(hits) > 1) {
    warning("Sample '", sample_name, "' matches ", length(hits),
            " read files; using the first: ", hits[1], call. = FALSE)
  }
  hits[1]
}

sample_radicals <- as.character(primers_n_samples$File_name)

# R1/R2 columns: path of the forward/reverse raw read file of each sample
primers_n_samples$R1 <- vapply(sample_radicals, find_read_file, character(1),
                               candidates = all_fnFs, USE.NAMES = FALSE)
primers_n_samples$R2 <- vapply(sample_radicals, find_read_file, character(1),
                               candidates = all_fnRs, USE.NAMES = FALSE)


Define sample levels

#4 - set sample levels

# Print the sample radicals listed in the sample sheet
primers_n_samples$File_name

# Sample order (later used as the factor levels of File_name)
sample_levels <- c(
  "Da23-mif",
  "SFJQ-mif",
  "Da23-neo",
  "SFJQ-neo",
  "Da20",
  "SFnNorm-mi",
  "Da19",
  "SFnNorm-neo",
  "Da22",
  "SFNorm-mi",
  "Da21",
  "SFNorm-neo",
  "pJequei-N-norm-N",
  "pJequei-N-norm-M",
  "pJequei-N-norm-T",
  "pJequei-norm-N",
  "pJequei-norm-M",
  "pJequei-norm-T",
  "Cassaum",
  "neg-PCR2"
)

# Order of the primer sets (single markers first, then the mixed libraries)
primer_levels <- c(
  "NeoFish",
  "MiFish",
  "Teleo",
  "NeoFish/MiFish",
  "NeoFish/MiFish/Teleo"
)


### Remove primers from reads

As the primer-derived sequences are identical, they are not informative and thus must be removed for the following steps.


Load primer sequences

#4- identify primers ----

# Primer sequences used for each sample.
# Note on degenerate bases: inosine pairs with A, C and U; the corresponding
# bases T, G and A are IUPAC code D. cutadapt accepts IUPAC codes.
{
  # NeoFish (each primer is a length-1 character vector named after itself)
  neo_FWD <- c(neo_FWD = "CGCCGTCGCAAGCTTACCCT")
  neo_REV <- c(neo_REV = "AGTGACGGGCGGTGTGTGC")

  # MiFish
  mif_FWD <- c(mif_FWD = "GTCGGTAAAACTCGTGCCAGC")
  mif_REV <- c(mif_REV = "ACATAGTGGGGTATCTAATCCCAGTTTG")
  # mif_REV <- "CATAGTGGGGTATCTAATCCCAGTTTG" #original

  # Teleo
  tel_FWD <- c(tel_FWD = "ACACCGCCCGTCACTCT")
  tel_REV <- c(tel_REV = "ACTTCCGGTACACTTACCATG")

  # List of single-row tibbles, one per primer, with the columns
  # `Primers` (sequence) and `Primer names`
  primers <- tibble(
    Primers = c(
      neo_FWD, neo_REV,
      mif_FWD, mif_REV,
      tel_FWD, tel_REV
    )
  ) %>%
    mutate(`Primer names` = names(Primers)) %>%
    split(seq_len(nrow(.)))
}


Generate sequences for complement, reverse, and reverse complement for each primer

The function allOrients is used to generate all possible orientations for the FWD and REV primers.

#5 - check primer orientation ----

# Build the four orientations (forward, complement, reverse and reverse
# complement) of a single primer.
#
#   primers: single-row tibble with the columns `Primers` (primer sequence)
#            and `Primer names` (primer identifier).
#
# Returns a 4-row tibble with the columns:
#   Sequence             - the primer sequence in the given orientation
#   `Orientation name`   - "<primer name>_<orientation>"
#   Primer               - the primer name
#   `Primer orientation` - Forward / Complement / Reverse / RevComp
allOrients <- function(primers) {
  # fail loudly if Biostrings is missing (require() would only return FALSE)
  if (!requireNamespace("Biostrings", quietly = TRUE)) {
    stop("Package 'Biostrings' is required by allOrients()", call. = FALSE)
  }

  # Biostrings works with DNAString objects rather than character vectors
  dna <- Biostrings::DNAString(primers$Primers)
  orients <- list(Forward = dna,
                  Complement = Biostrings::complement(dna),
                  Reverse = Biostrings::reverse(dna),
                  RevComp = Biostrings::reverseComplement(dna))

  # back to a named character vector, one sequence per orientation
  sequences <- vapply(orients, as.character, character(1))

  dplyr::tibble(Sequence = sequences,
                `Primer orientation` = names(sequences)) %>%
    dplyr::mutate(Primer = primers$`Primer names`) %>%
    tidyr::unite(col = `Orientation name`, Primer, `Primer orientation`, remove = FALSE)
}


# Apply the function to every primer and stack the results in a single table
primers_all_orients <- purrr::map_dfr(primers, allOrients)

# Name each sequence after its primer and orientation
names(primers_all_orients$Sequence) <- primers_all_orients$`Orientation name`


Remove reads with undetermined bases (Ns) and unpaired reads

Reads with undetermined bases prevent proper primer identification and ASV determination. These sequences must be removed from the data.

#6 - pre filter reads with Ns for primer checking ----

# Names of the N-cleaned files, one R1/R2 pair per sample.
# paste0() is vectorised over File_name, so no row-by-row loop (and no
# placeholder column) is needed.
primers_n_samples <- primers_n_samples %>%
  mutate(
    `R1 N-cleaned` = paste0(data_path, "/N-cleaned/", File_name, "_R1_N-cleaned.fastq.gz"),
    `R2 N-cleaned` = paste0(data_path, "/N-cleaned/", File_name, "_R2_N-cleaned.fastq.gz")
  )

# Remove reads with Ns (maxN = 0) to make primer filtering more accurate.
# matchIDs = TRUE is passed so that only reads with paired IDs in R1 and R2 are written.
dada2::filterAndTrim(
  fwd = primers_n_samples$R1, filt = primers_n_samples$`R1 N-cleaned`,
  rev = primers_n_samples$R2, filt.rev = primers_n_samples$`R2 N-cleaned`,
  maxN = 0, multithread = TRUE, matchIDs = TRUE,
  verbose = TRUE, compress = TRUE)

# Pivot the table to long format: one row per sample and processing stage,
# with the stage name in `Stage` and the file path in `Read file`
primers_n_samples <- primers_n_samples %>%
  pivot_longer(cols = c(R1, R2, `R1 N-cleaned`, `R2 N-cleaned`),
               names_to = "Stage", values_to = "Read file")


Count primer presence on reads

Before primer removal it is possible to count their presence on the reads. This procedure is carried out independently for each sample.

#6 - count primer orientation hits ----

# Count the reads of one fastq file that contain a primer.
#   primer: primer sequence (matched with fixed = FALSE, so IUPAC ambiguity
#           codes in the primer are honoured)
#   fn:     path to the fastq file
# Returns the number of reads with at least one match.
primerHits <- function(primer, fn) {
  nhits <- Biostrings::vcountPattern(primer, ShortRead::sread(ShortRead::readFastq(fn)), fixed = FALSE)
  sum(nhits > 0)
}

# Count, for one fastq file, the reads containing each primer of a set.
#   Read_file: path to the fastq file
#   primers:   named character vector of primer sequences
# Returns a single-row tibble with one count column per primer (named after
# the primer) plus the `Read file` column.
# The fastq file is read only once and reused for every primer, instead of
# being re-read from disk for each of them.
multi_primerHits <- function(Read_file, primers) {
  reads <- ShortRead::sread(ShortRead::readFastq(Read_file))
  primer_counts <- purrr::map_df(primers, function(primer) {
    sum(Biostrings::vcountPattern(primer, reads, fixed = FALSE) > 0)
  })
  primer_counts %>% mutate(`Read file` = Read_file)
}
###########

# Read files to search for primers (the N-cleaned R1 and R2 files)
reads_seqs <- primers_n_samples %>%
  filter(Stage %in% c("R1 N-cleaned", "R2 N-cleaned")) %>%
  select(`Read file`) %>% as.list()


# Named vector of primer sequences (all orientations)
primers_seqs <- primers_all_orients$Sequence


# Use all cores but two (= 78 on the original machine), and never fewer than one
cores_to_be_used <- max(1, future::availableCores() - 2)

future::plan(future::multisession, workers = cores_to_be_used)


# Count primers, one read file per parallel task
primers_in_Nreads <- furrr::future_map_dfr(reads_seqs$`Read file`, .f = multi_primerHits, primers = primers_seqs, .options = furrr::furrr_options(seed = NULL))

# Add the sample information to the primers_in_Nreads table
primers_in_Nreads <- left_join(primers_in_Nreads, primers_n_samples, by = "Read file")

#
# primers_in_Nreads_bckp <- primers_in_Nreads
# primers_in_Nreads <- primers_in_Nreads_bckp

Prepare primer counts for plotting

#7- prepare primer counts for plots ----

# cat(paste0(colnames(primers_in_Nreads),"\n"))

# Keep the sample information and the 24 primer-orientation count columns;
# the `Read file` column is dropped.
primers_in_Nreads <-
  primers_in_Nreads %>%
  select(File_name, Type, Group, Library, Primer, Run, Stage,
         neo_FWD_Forward, neo_REV_Forward, neo_FWD_Complement, neo_REV_Complement,
         neo_FWD_Reverse, neo_REV_Reverse, neo_FWD_RevComp, neo_REV_RevComp,
         mif_FWD_Forward, mif_REV_Forward, mif_FWD_Complement, mif_REV_Complement,
         mif_FWD_Reverse, mif_REV_Reverse, mif_FWD_RevComp, mif_REV_RevComp,
         tel_FWD_Forward, tel_REV_Forward, tel_FWD_Complement, tel_REV_Complement,
         tel_FWD_Reverse, tel_REV_Reverse, tel_FWD_RevComp, tel_REV_RevComp)


# Quick inspection of the resulting table
str(primers_in_Nreads)
colnames(primers_in_Nreads)
rownames(primers_in_Nreads)

primers_in_Nreads$Primer
primers_in_Nreads$Library


#8- prepare primer counts for plots in ggplot----

# Convert the primer hits table to long format: one row per read file and
# primer orientation (`Sequences`), with the number of hits in `Count`.
# The factor levels set the plotting order: for each marker, Forward/RevComp
# of FWD and REV first, then the Complement/Reverse orientations.
primers_in_Nreads_long <- primers_in_Nreads %>%
  gather(key = Sequences,
         value = Count,
         neo_FWD_Forward, neo_FWD_Complement, neo_FWD_Reverse, neo_FWD_RevComp,
         neo_REV_Forward, neo_REV_Complement, neo_REV_Reverse, neo_REV_RevComp,
         mif_FWD_Forward, mif_FWD_Complement, mif_FWD_Reverse, mif_FWD_RevComp,
         mif_REV_Forward, mif_REV_Complement, mif_REV_Reverse, mif_REV_RevComp,
         tel_FWD_Forward, tel_FWD_Complement, tel_FWD_Reverse, tel_FWD_RevComp,
         tel_REV_Forward, tel_REV_Complement, tel_REV_Reverse, tel_REV_RevComp
         ) %>%
  mutate(Sequences = factor(Sequences,
                            levels = c("neo_FWD_Forward", "neo_FWD_RevComp",
                                       "neo_REV_Forward", "neo_REV_RevComp",
                                       "neo_FWD_Complement", "neo_FWD_Reverse",
                                       "neo_REV_Complement", "neo_REV_Reverse",

                                       "mif_FWD_Forward", "mif_FWD_RevComp",
                                       "mif_REV_Forward", "mif_REV_RevComp",
                                       "mif_FWD_Complement", "mif_FWD_Reverse",
                                       "mif_REV_Complement", "mif_REV_Reverse",

                                       "tel_FWD_Forward", "tel_FWD_RevComp",
                                       "tel_REV_Forward", "tel_REV_RevComp",
                                       "tel_FWD_Complement", "tel_FWD_Reverse",
                                       "tel_REV_Complement", "tel_REV_Reverse")),
         File_name = factor(File_name, levels = sample_levels),
         Run = as.factor(Run),
         Primer = factor(Primer, levels = c("NeoFish",
                                            "MiFish",
                                            "Teleo",
                                            "NeoFish/MiFish",
                                            "NeoFish/MiFish/Teleo")))



# PLOT 1: primer counts in reads, tile plot ----
# (File_name is already a factor ordered by sample_levels at this point)
primers_tile <-
  primers_in_Nreads_long %>%
  # Uncomment to keep only primers FWD & REV, forward & revcomp:
  # filter(Sequences  %in% c(
  #   "mif_REV_RevComp", "mif_REV_Forward", "mif_FWD_RevComp", "mif_FWD_Forward",
  #   "neo_REV_RevComp", "neo_REV_Forward", "neo_FWD_RevComp", "neo_FWD_Forward",
  #   "tel_REV_RevComp", "tel_REV_Forward", "tel_FWD_RevComp", "tel_FWD_Forward")) %>%
  ggplot2::ggplot(aes(y = File_name, x = Sequences, fill = log10(Count)
                      # ,col=Stage
                      )) +
  geom_tile() +
  geom_text(aes(label = Count), size = 1) +
  # scale_fill_gradient(low="white", high="darkgreen",trans="log10") +
  scale_fill_gradientn(name = "Primer counts",
                       colours = c("white", "darkgreen"),
                       values = c(0, 1),
                       na.value = "white") +
  theme_light(base_line_size = 1, base_size = 6) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # NOTE(review): these separator positions are hard-coded; confirm they match
  # the number of samples/orientations actually plotted
  geom_hline(yintercept = c(40.5, 82.5, 86.5, 116.5), color = "grey") +
  geom_vline(xintercept = c(4.5, 8.5, 12.5, 16.5), color = "grey") +
  # coord_fixed(ratio = 0.20) +
  xlab("Primers") +
  ylab("Amostra") +
  ggtitle(label = "eDNA 1st, 2nd & 3rd runs",
          subtitle = "Primer presence on sample reads")

primers_tile

# Save the figure as png and pdf (argument spelled out as `filename`
# instead of relying on partial matching of `file`)
ggsave(filename = "~/prjcts/fish_eDNA/sfjq/results/figs/1-primers_in_reads_all_FR.png",
       plot = primers_tile,
       device = "png",
       width = 27,
       height = 40,
       units = "cm",
       dpi = 600)

ggsave(filename = "~/prjcts/fish_eDNA/sfjq/results/figs/1-primers_in_reads_all_FR.pdf",
       plot = primers_tile,
       device = "pdf",
       width = 27,
       height = 40,
       units = "cm",
       dpi = 600)


#9- write csv file with primer hits counts per lib ----
# The table of hits per library is primers_in_Nreads; `primer_hits_tbl`, used
# here before, is not defined anywhere in this script.
write.csv(x = primers_in_Nreads, file = "~/prjcts/fish_eDNA/sfjq/results/primers_hits_tbl.csv",
          row.names = FALSE)

Remove primers from reads

Primer removal with Cutadapt

The cutadapt software (DOI:10.14806/ej.17.1.200) was used for primer removal on read sequences.

#10 - cutadapt ----

# Folder for the cutadapt-processed reads; create it when missing.
# recursive = TRUE also creates data_path itself if it does not exist yet.
path.cut <- file.path(data_path, "cutadapt")
if (!dir.exists(path.cut)) {
  dir.create(path.cut, recursive = TRUE)
}


Generate and execute primer-specific commands

The original DADA2 ITS protocol removes only FWD and REV reverse complement sequences. This protocol is adjusted for selecting reads only of the expected primer and removing the primer.

# optional: remove all primers from all reads and samples ----

#10 - map sample names to reads files ----

# Output names: take the rows of the N-cleaned files and swap "N-cleaned"
# for "cutadapt" in both the file path and the stage label
cutadapt_files <- primers_n_samples %>%
  filter(Stage %in% c("R1 N-cleaned", "R2 N-cleaned")) %>%
  mutate(
    `Read file` = str_replace_all(`Read file`, pattern = "N-cleaned", replacement = "cutadapt"),
    Stage = str_replace_all(Stage, pattern = "N-cleaned", replacement = "cutadapt")
  )

# Append the cutadapt rows to the sample table
primers_n_samples <- bind_rows(primers_n_samples, cutadapt_files)

# Sequences of primers_all_orients whose orientation name matches `tag`
.orient_seqs <- function(tag) {
  primers_all_orients$Sequence[grep(pattern = tag, x = primers_all_orients$`Orientation name`)]
}

#all ----
# Only the Forward and RevComp orientations are used: the XXX_Complement and
# XXX_Reverse orientations had no hits, so they were ignored at the last plot
# and from now on.
all_FWD.orients <- .orient_seqs("FWD_Forward")
all_FWD.RC      <- .orient_seqs("FWD_RevComp")
all_REV.orients <- .orient_seqs("REV_Forward")
all_REV.RC      <- .orient_seqs("REV_RevComp")

#remove primers and filter only the reads that contain the expected primer ----

#MiFish ----
mif_FWD.orients <- .orient_seqs("mif_FWD_Forward")
mif_FWD.RC      <- .orient_seqs("mif_FWD_RevComp")
mif_REV.orients <- .orient_seqs("mif_REV_Forward")
mif_REV.RC      <- .orient_seqs("mif_REV_RevComp")

#NeoFish ----
neo_FWD.orients <- .orient_seqs("neo_FWD_Forward")
neo_FWD.RC      <- .orient_seqs("neo_FWD_RevComp")
neo_REV.orients <- .orient_seqs("neo_REV_Forward")
neo_REV.RC      <- .orient_seqs("neo_REV_RevComp")

#Teleo ----
tel_FWD.orients <- .orient_seqs("tel_FWD_Forward")
tel_FWD.RC      <- .orient_seqs("tel_FWD_RevComp")
tel_REV.orients <- .orient_seqs("tel_REV_Forward")
tel_REV.RC      <- .orient_seqs("tel_REV_RevComp")

#create flags ----
# R1 (forward reads): trim FWD (-g) and the reverse-complement of REV (-a)
# R2 (reverse reads): trim REV (-G) and the reverse-complement of FWD (-A)

#all primers ----
all_R1.flags <- paste("-g", all_FWD.orients, "-a", all_REV.RC)
all_R2.flags <- paste("-G", all_REV.orients, "-A", all_FWD.RC)

#create primer-specific tags ----

#MiFish ----
mif_R1.flags <- paste("-g", mif_FWD.orients, "-a", mif_REV.RC)
mif_R2.flags <- paste("-G", mif_REV.orients, "-A", mif_FWD.RC)

#NeoFish ----
neo_R1.flags <- paste("-g", neo_FWD.orients, "-a", neo_REV.RC)
neo_R2.flags <- paste("-G", neo_REV.orients, "-A", neo_FWD.RC)

#Teleo ----
tel_R1.flags <- paste("-g", tel_FWD.orients, "-a", tel_REV.RC)
tel_R2.flags <- paste("-G", tel_REV.orients, "-A", tel_FWD.RC)

#cutadapt files path and names ----

# Read files of primers_n_samples at a given processing stage.
#   stage:       value of the Stage column to select
#   primer_mask: logical vector over the rows of primers_n_samples (or TRUE
#                for every row) restricting the selection to some primer sets
#   name_style:  "cutadapt" or "filtN" to name each file after its path with
#                the folder and the file suffix removed; "none" leaves the
#                vector as it is
.stage_reads <- function(stage, primer_mask = TRUE, name_style = "none") {
  picked <- primers_n_samples$`Read file`[primers_n_samples$Stage == stage & primer_mask]
  strip <- switch(name_style,
                  cutadapt = c("~/prjcts/fish_eDNA/sfjq/data/reads/cutadapt/", "_cutadapt.fastq.gz"),
                  filtN = c("~/prjcts/fish_eDNA/sfjq/data/reads/N-cleaned/", "_N-cleaned.fastq.gz"),
                  none = NULL)
  if (!is.null(strip)) {
    names(picked) <- picked %>%
      str_remove(pattern = strip[1]) %>%
      str_remove(pattern = strip[2])
  }
  picked
}

#all ----
all_fnFs.cut <- .stage_reads("R1 cutadapt", name_style = "cutadapt")
all_fnRs.cut <- .stage_reads("R2 cutadapt", name_style = "cutadapt")

# the N-cleaned inputs of the "all" set carry no names
all_fnFs.filtN <- .stage_reads("R1 N-cleaned")
all_fnRs.filtN <- .stage_reads("R2 N-cleaned")

#neo ----
neo_fnFs.cut   <- .stage_reads("R1 cutadapt", primers_n_samples$Primer == "NeoFish", "cutadapt")
neo_fnRs.cut   <- .stage_reads("R2 cutadapt", primers_n_samples$Primer == "NeoFish", "cutadapt")
neo_fnFs.filtN <- .stage_reads("R1 N-cleaned", primers_n_samples$Primer == "NeoFish", "filtN")
neo_fnRs.filtN <- .stage_reads("R2 N-cleaned", primers_n_samples$Primer == "NeoFish", "filtN")

#mif ----
mif_fnFs.cut   <- .stage_reads("R1 cutadapt", primers_n_samples$Primer == "MiFish", "cutadapt")
mif_fnRs.cut   <- .stage_reads("R2 cutadapt", primers_n_samples$Primer == "MiFish", "cutadapt")
mif_fnFs.filtN <- .stage_reads("R1 N-cleaned", primers_n_samples$Primer == "MiFish", "filtN")
mif_fnRs.filtN <- .stage_reads("R2 N-cleaned", primers_n_samples$Primer == "MiFish", "filtN")

#tel ----
tel_fnFs.cut   <- .stage_reads("R1 cutadapt", primers_n_samples$Primer == "Teleo", "cutadapt")
tel_fnRs.cut   <- .stage_reads("R2 cutadapt", primers_n_samples$Primer == "Teleo", "cutadapt")
tel_fnFs.filtN <- .stage_reads("R1 N-cleaned", primers_n_samples$Primer == "Teleo", "filtN")
tel_fnRs.filtN <- .stage_reads("R2 N-cleaned", primers_n_samples$Primer == "Teleo", "filtN")

#neo/mif & neo/mif/tel ----
nmt_fnFs.cut   <- .stage_reads("R1 cutadapt",
                               primers_n_samples$Primer %in% c("NeoFish/MiFish","NeoFish/MiFish/Teleo"),
                               "cutadapt")
nmt_fnRs.cut   <- .stage_reads("R2 cutadapt",
                               primers_n_samples$Primer %in% c("NeoFish/MiFish","NeoFish/MiFish/Teleo"),
                               "cutadapt")
nmt_fnFs.filtN <- .stage_reads("R1 N-cleaned",
                               primers_n_samples$Primer %in% c("NeoFish/MiFish","NeoFish/MiFish/Teleo"),
                               "filtN")
nmt_fnRs.filtN <- .stage_reads("R2 N-cleaned",
                               primers_n_samples$Primer %in% c("NeoFish/MiFish","NeoFish/MiFish/Teleo"),
                               "filtN")


# WARNING: relying on this ordering is dangerous
# Name every read file after the sample it belongs to
names(primers_n_samples$`Read file`) <- primers_n_samples$File_name


# Number of N-cleaned read files (R1 + R2)
primers_n_samples$`Read file`[primers_n_samples$Stage %in% c("R1 N-cleaned", "R2 N-cleaned")] %>%  length()

#TODO this loop should be moved to purrr...


# Run Cutadapt on every library, removing all primers
#   the output folder must exist
# The loop runs over the R1 output files instead of a hard-coded 1:40, so it
# processes exactly one R1/R2 pair per sample, whatever the number of samples.
for (i in seq_along(all_fnFs.cut)) {
  status <- system2(cutadapt, args = c(all_R1.flags, all_R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
                                       "-o", all_fnFs.cut[i], "-p", all_fnRs.cut[i], # output files
                                       all_fnFs.filtN[i], all_fnRs.filtN[i],  # input files
                                       "--minimum-length 10")) # guarantee no zero-length reads
  # system2() returns the exit status of cutadapt; do not let a failure pass silently
  if (!identical(status, 0L)) {
    warning("cutadapt exited with status ", status, " for ", all_fnFs.filtN[i], call. = FALSE)
  }
}


# Number of libraries per primer set
length(neo_fnFs.cut)
length(mif_fnFs.cut)
length(mif_fnRs.cut)
length(tel_fnFs.cut)
length(nmt_fnFs.cut)


#run cutadapt by primer ----

# Run cutadapt on a set of paired libraries, keeping only the read pairs in
# which a primer was found (--discard-untrimmed) and removing that primer.
#   r1_flags, r2_flags: cutadapt primer flags for the R1 / R2 reads
#   in_r1, in_r2:       input (N-cleaned) fastq files
#   out_r1, out_r2:     output (cutadapt) fastq files; the folder must exist
# seq_along() makes an empty set run nothing (1:length(x) would iterate over
# 1 and 0). A non-zero cutadapt exit status is reported as a warning.
run_cutadapt_pairs <- function(r1_flags, r2_flags, in_r1, in_r2, out_r1, out_r2) {
  for (i in seq_along(out_r1)) {
    status <- system2(cutadapt, args = c(r1_flags, r2_flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
                                         "-o", out_r1[i], "-p", out_r2[i], # output files
                                         in_r1[i], in_r2[i],  # input files
                                         "--minimum-length 10 --discard-untrimmed")) # guarantee no zero-length reads
    if (!identical(status, 0L)) {
      warning("cutadapt exited with status ", status, " for ", in_r1[i], call. = FALSE)
    }
  }
  invisible(NULL)
}

#MiFish ----
run_cutadapt_pairs(mif_R1.flags, mif_R2.flags,
                   mif_fnFs.filtN, mif_fnRs.filtN,
                   mif_fnFs.cut, mif_fnRs.cut)

#NeoFish ----
run_cutadapt_pairs(neo_R1.flags, neo_R2.flags,
                   neo_fnFs.filtN, neo_fnRs.filtN,
                   neo_fnFs.cut, neo_fnRs.cut)

#Teleo ----
run_cutadapt_pairs(tel_R1.flags, tel_R2.flags,
                   tel_fnFs.filtN, tel_fnRs.filtN,
                   tel_fnFs.cut, tel_fnRs.cut)

#neo/mif & neo/mif/tel ----
# mixed libraries: the flags of all primers are used
run_cutadapt_pairs(all_R1.flags, all_R2.flags,
                   nmt_fnFs.filtN, nmt_fnRs.filtN,
                   nmt_fnFs.cut, nmt_fnRs.cut)


# as  Da23-mif & neo & SFJQ-mif & neo were previously demultiplexed, they do not bear the primers, so all reads are filtered out


Check primer removal

Change the library number index in order to check the presence of remaining primer sequences on each lib data. It is expected that all removed orientation counts change to zero since the primer sequences are removed.

# only ofr didatic purposes (or logical error checking)
# 8 - check for remaining adapters ----
# As a sanity check, we will count the presence of primers in the first cutadapt-ed sample:


#vector of read files to look on for primers
reads_seqs_cut <- primers_n_samples %>% 
  filter(Stage %in% c("R2 cutadapt", "R2 cutadapt")) %>% 
  select(`Read file`) %>% as.list()


# Count primer occurrences in every cutadapt-ed read file, spreading the files
# over `cores_to_be_used` parallel workers.
future::plan(future::multisession(workers = cores_to_be_used))
primers_in_cut_reads <- furrr::future_map_dfr(
  reads_seqs_cut$`Read file`,
  .f = multi_primerHits,
  primers = primers_seqs,
  .options = furrr::furrr_options(seed = NULL)
)

# Attach the sample information to the primer hit counts
primers_in_cut_reads <- primers_in_cut_reads %>%
  left_join(primers_n_samples, by = "Read file")

# primers_in_cut_reads_bckp <- primers_in_cut_reads
# primers_in_cut_reads <- primers_in_cut_reads_bckp

# Keep the sample metadata followed by the primer hit counts, ordered by primer
# (neo, mif, tel) and, within each primer, by orientation with FWD before REV.
# `Read file` is intentionally left out.
hit_orientations <- c("Forward", "Complement", "Reverse", "RevComp")
primer_hit_columns <- unlist(lapply(
  c("neo", "mif", "tel"),
  function(primer) {
    paste0(primer, "_", c("FWD", "REV"), "_", rep(hit_orientations, each = 2))
  }
))

primers_in_cut_reads <- primers_in_cut_reads %>%
  select(
    File_name, Type, Group, Library, Primer, Run, Stage,
    dplyr::all_of(primer_hit_columns)
  )

# Reshape the primer hit table to long format: one row per read file and
# primer/orientation column, with the column name in `Sequences` and the hit
# count in `Count`.
primer_prefixes <- c("neo", "mif", "tel")

# columns to stack: per primer, the four FWD orientations then the four REV ones
stacked_hit_columns <- unlist(lapply(primer_prefixes, function(primer) {
  paste0(
    primer, "_", rep(c("FWD", "REV"), each = 4), "_",
    c("Forward", "Complement", "Reverse", "RevComp")
  )
}))

# display order: per primer, FWD/REV pairs for Forward, RevComp, Complement, Reverse
sequence_levels <- unlist(lapply(primer_prefixes, function(primer) {
  paste0(
    primer, "_", c("FWD", "REV"), "_",
    rep(c("Forward", "RevComp", "Complement", "Reverse"), each = 2)
  )
}))

# NOTE(review): `sample_levels` must already exist when this runs; in the part
# of the file visible here it is only assigned further down -- confirm.
primers_in_cut_reads_long <- primers_in_cut_reads %>%
  gather(key = Sequences, value = Count, dplyr::all_of(stacked_hit_columns)) %>%
  mutate(
    Sequences = factor(Sequences, levels = sequence_levels),
    File_name = factor(File_name, levels = sample_levels),
    Run = as.factor(Run),
    Primer = factor(
      Primer,
      levels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish", "NeoFish/MiFish/Teleo")
    )
  )



# PLOT 1: tile plot of primer counts in reads - FWD & REV primers, forward & revcomp only ----

# primer/orientation combinations shown in the plot
tile_sequences <- c(
  "mif_REV_RevComp", "mif_REV_Forward", "mif_FWD_RevComp", "mif_FWD_Forward",
  "neo_REV_RevComp", "neo_REV_Forward", "neo_FWD_RevComp", "neo_FWD_Forward",
  "tel_REV_RevComp", "tel_REV_Forward", "tel_FWD_RevComp", "tel_FWD_Forward"
)

# only the selected orientations, and only the first LGC MiniSeq run
tile_data <- primers_in_cut_reads_long %>%
  filter(Sequences %in% tile_sequences, Run %in% c("LGC_MiniSeq_1"))

# One tile per sample (y) and primer orientation (x), coloured by log10(count)
# and labelled with the raw count.
primers_tile_clean <- ggplot2::ggplot(
  data = tile_data,
  mapping = aes(y = File_name, x = Sequences, fill = log10(Count))
) +
  geom_tile() +
  geom_text(aes(label = Count), size = 1) +
  scale_fill_gradientn(
    name = "Primer counts",
    colours = c("white", "darkgreen"),
    values = c(0, 1),
    na.value = "white"
  ) +
  theme_light(base_line_size = 1, base_size = 6) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # grey separator lines at fixed axis positions
  geom_hline(yintercept = c(40.5, 82.5, 86.5, 116.5), color = "grey") +
  geom_vline(xintercept = c(4.5, 8.5, 12.5, 16.5), color = "grey") +
  xlab("Primers") +
  ylab("Amostra") +
  ggtitle(
    label = "eDNA 1st, 2nd & 3rd runs",
    subtitle = "Primer presence on sample reads"
  )


primers_tile_clean



Quality filtering

Here the DADA2 pipeline starts.


Set input libs paths

Define the paths to the libraries after cutadapt primer removal.

# 9 - load clean seqs to DADA2 pipe ----

# Pool the primer-trimmed files of every primer set. Forward and reverse
# vectors are concatenated in the same primer order (mif, neo, tel, nmt).
all_fnRs.cut <- c(mif_fnRs.cut, neo_fnRs.cut, tel_fnRs.cut, nmt_fnRs.cut)
all_fnFs.cut <- c(mif_fnFs.cut, neo_fnFs.cut, tel_fnFs.cut, nmt_fnFs.cut)


print(all_fnFs.cut)
print(all_fnRs.cut)


Set quality filtering output files names

# 11 - quality filter preparation ----


# Name the outputs: each N-cleaned entry gets a quality-filtered counterpart,
# obtained by swapping "N-cleaned" for "Qfiltered" in both the path and the stage.
Qfilter_files <- primers_n_samples %>%
  filter(Stage %in% c("R1 N-cleaned", "R2 N-cleaned")) %>%
  mutate(
    `Read file` = str_replace_all(`Read file`, pattern = "N-cleaned", replacement = "Qfiltered"),
    Stage = str_replace_all(Stage, pattern = "N-cleaned", replacement = "Qfiltered")
  )

primers_n_samples <- bind_rows(primers_n_samples, Qfilter_files)

# label every file path with its File_name so all can be traced
names(primers_n_samples$`Read file`) <- primers_n_samples$File_name


# Qfiltered file paths, each named by its path stripped of the Qfiltered
# directory prefix and of the "_Qfiltered.fastq.gz" suffix
qfiltered_label <- function(paths) {
  without_dir <- str_remove(paths, pattern = paste0(data_path, "/Qfiltered/"))
  str_remove(without_dir, pattern = "_Qfiltered.fastq.gz")
}

all_filtFs <- primers_n_samples$`Read file`[primers_n_samples$Stage == "R1 Qfiltered"]
names(all_filtFs) <- qfiltered_label(all_filtFs)

all_filtRs <- primers_n_samples$`Read file`[primers_n_samples$Stage == "R2 Qfiltered"]
names(all_filtRs) <- qfiltered_label(all_filtRs)


Quality filtering

On this step it is possible to filter by size but, as we have already removed primers from the beginning/end of the reads, it is expected that the remaining sequences are already trimmed to lengths compatible with their respective amplicons. Thus, no length trimming was conducted.

# 12 - dada filtering ----

# We'll use standard filtering parameters: maxN=0 (DADA2 requires no Ns), truncQ=2, rm.phix=TRUE and maxEE=2. The maxEE parameter sets the maximum number of "expected errors" allowed in a read, which is a better filter than simply averaging quality scores.

# The four vectors are expected to have the same length (one entry per library).
length(all_fnFs.cut)
length(all_filtFs)
length(all_fnRs.cut)
length(all_filtRs)

# Sort inputs and outputs so that the files are not mixed up in the next step.
# NOTE(review): each vector is sorted independently by its own values (paths),
# so input/output pairing relies on all four sorting into the same sample
# order -- confirm with the names printed below.
all_fnFs.cut <- base::sort(all_fnFs.cut)
all_filtFs <- base::sort(all_filtFs)
all_fnRs.cut <- base::sort(all_fnRs.cut)
all_filtRs <- base::sort(all_filtRs)
names(all_fnFs.cut)
names(all_filtFs)
names(all_fnRs.cut)
names(all_filtRs)

# Quality-filter all read pairs in one call. No length truncation is applied;
# the call sets maxN = 0, maxEE = c(2, 2) (forward, reverse) and rm.phix = TRUE.
all_filtered_out <- dada2::filterAndTrim(
  fwd = all_fnFs.cut, filt = all_filtFs,
  rev = all_fnRs.cut, filt.rev = all_filtRs,
  # truncLen = c(240, 160),
  # truncQ = 2,
  maxN = 0,
  maxEE = c(2, 2),
  rm.phix = TRUE,
  matchIDs = TRUE,
  compress = TRUE,
  verbose = TRUE,
  multithread = TRUE # On Windows set multithread=FALSE
)
head(all_filtered_out)


View post-filtering quality profiles

# Check the quality profiles after filtering and trimming, for a few
# hand-picked libraries (forward file 2, reverse files 2 to 8).
plotQualityProfile(all_filtFs[2])
plotQualityProfile(all_filtRs[2:8])

# NOTE(review): mif_filtFs / mif_filtRs are not assigned anywhere in the part
# of the file visible here (only all_filtFs / all_filtRs are) -- confirm they
# exist, otherwise these two calls fail with "object not found".
plotQualityProfile(mif_filtFs[])
plotQualityProfile(mif_filtRs[])


Identify error rates intrinsic to sequencing

# 13 - learn error rates ----

# Error rates are learned separately for each sequencing run and read
# direction, using that run's quality-filtered files.
primers_n_samples$Run %>% unique()

# Learn the error model from the files of one run (`run`) at one stage (`stage`).
# NOTE(review): randomize = TRUE is used and no seed is set in this section, so
# the learned rates may differ between executions -- confirm this is acceptable.
learn_run_errors <- function(run, stage) {
  run_files <- primers_n_samples$`Read file`[
    primers_n_samples$Run == run & primers_n_samples$Stage == stage
  ]
  learnErrors(run_files, multithread = TRUE, randomize = TRUE)
}

#run LGC_MiniSeq_1 ----
run1_errF <- learn_run_errors("LGC_MiniSeq_1", "R1 Qfiltered")
run1_errR <- learn_run_errors("LGC_MiniSeq_1", "R2 Qfiltered")

#run LGC_MiniSeq_2 ----
run2_errF <- learn_run_errors("LGC_MiniSeq_2", "R1 Qfiltered")
run2_errR <- learn_run_errors("LGC_MiniSeq_2", "R2 Qfiltered")

#run ecomol_iSeq ----
run3_errF <- learn_run_errors("ecomol_iSeq", "R1 Qfiltered")
run3_errR <- learn_run_errors("ecomol_iSeq", "R2 Qfiltered")


#   # plotErrors(run3_errF, nominalQ=TRUE)
#   # plotErrors(run3_errR, nominalQ=TRUE)


Dereplication: grouping into ASVs

On this step each library is reduced to its unique composing sequences and their counts.

# 14 - dada dereplication ----


# Everything must have unique names from here on
names(primers_n_samples$`Read file`) <- primers_n_samples$File_name

unique(primers_n_samples$`Read file`)
names(primers_n_samples$`Read file`)

# Quality-filtered files of one run (`run`) and read direction (`stage`)
qfiltered_reads <- function(run, stage) {
  primers_n_samples$`Read file`[
    primers_n_samples$Run == run & primers_n_samples$Stage == stage
  ]
}

# For each run: dereplicate forward and reverse reads, then denoise them with
# the error model learned for that same run and direction.

#run LGC_MiniSeq_1 ----
LGC_1_derep_forward <- derepFastq(qfiltered_reads("LGC_MiniSeq_1", "R1 Qfiltered"), verbose = TRUE)
LGC_1_derep_reverse <- derepFastq(qfiltered_reads("LGC_MiniSeq_1", "R2 Qfiltered"), verbose = TRUE)

LGC_1_dadaFs <- dada(LGC_1_derep_forward, err = run1_errF, multithread = TRUE)
LGC_1_dadaRs <- dada(LGC_1_derep_reverse, err = run1_errR, multithread = TRUE)

#run LGC_MiniSeq_2 ----
LGC_2_derep_forward <- derepFastq(qfiltered_reads("LGC_MiniSeq_2", "R1 Qfiltered"), verbose = TRUE)
LGC_2_derep_reverse <- derepFastq(qfiltered_reads("LGC_MiniSeq_2", "R2 Qfiltered"), verbose = TRUE)

LGC_2_dadaFs <- dada(LGC_2_derep_forward, err = run2_errF, multithread = TRUE)
LGC_2_dadaRs <- dada(LGC_2_derep_reverse, err = run2_errR, multithread = TRUE)

#run ecomol_iSeq ----
ecomol_derep_forward <- derepFastq(qfiltered_reads("ecomol_iSeq", "R1 Qfiltered"), verbose = TRUE)
ecomol_derep_reverse <- derepFastq(qfiltered_reads("ecomol_iSeq", "R2 Qfiltered"), verbose = TRUE)

ecomol_dadaFs <- dada(ecomol_derep_forward, err = run3_errF, multithread = TRUE)
ecomol_dadaRs <- dada(ecomol_derep_reverse, err = run3_errR, multithread = TRUE)


# Pool the denoised samples of all runs (ecomol first, then LGC 2, then LGC 1)
all_dadaFs <- c(ecomol_dadaFs, LGC_2_dadaFs, LGC_1_dadaFs)
all_dadaRs <- c(ecomol_dadaRs, LGC_2_dadaRs, LGC_1_dadaRs)

names(all_dadaFs)


Merge read pairs

In this step the forward and reverse reads are merged by overlap to reconstruct the full insert sequence. As we have samples from three runs, each run is processed independently.

# 15 - merge read pairs ----

# Merge the denoised forward/reverse reads of one run. All runs use the same
# settings: minOverlap = 20, maxMismatch = 0, returnRejects = TRUE.
# NOTE(review): maxMismatch is 0 here, although earlier notes mention raising
# it to 1 because many pairs were lost to a single mismatch -- confirm the
# intended value.
merge_run_pairs <- function(dada_fwd, derep_fwd, dada_rev, derep_rev) {
  mergePairs(
    dadaF = dada_fwd, derepF = derep_fwd,
    dadaR = dada_rev, derepR = derep_rev,
    minOverlap = 20,
    maxMismatch = 0,
    returnRejects = TRUE,
    verbose = TRUE
  )
}

#run1 ----
run1_mergers <- merge_run_pairs(
  LGC_1_dadaFs, LGC_1_derep_forward,
  LGC_1_dadaRs, LGC_1_derep_reverse
)

#run2 ----
run2_mergers <- merge_run_pairs(
  LGC_2_dadaFs, LGC_2_derep_forward,
  LGC_2_dadaRs, LGC_2_derep_reverse
)

#run3 ----
run3_mergers <- merge_run_pairs(
  ecomol_dadaFs, ecomol_derep_forward,
  ecomol_dadaRs, ecomol_derep_reverse
)

# Pool the merged pairs of the three runs (run 3, run 2, run 1 -- the same run
# order used for all_dadaFs / all_dadaRs).
all_mergers <- c(run3_mergers, run2_mergers, run1_mergers)

# quick inspection of the pooled objects
names(all_mergers)
length(all_dadaFs)
length(all_dadaRs)
head(all_mergers[[12]])
str(all_mergers)
class(all_mergers)


# Build the sample x sequence count table from the pooled mergers.
# NOTE(review): the mergers were produced with returnRejects = TRUE -- confirm
# that rejected pairs do not end up counted in this table.
all_seqtab <- makeSequenceTable(samples = all_mergers)
dim(all_seqtab)
if (interactive()) {
  View(all_seqtab) # the data viewer is only useful in an interactive session
}
str(all_seqtab)
# Inspect distribution of sequence lengths
table(nchar(getSequences(all_seqtab)))
table(nchar(getSequences(all_seqtab))) %>% plot()


# Compare sample names between denoised and dereplicated objects. The
# dereplicated reads only exist per run, so they are pooled here in the same
# run order before their names are printed.
# NOTE(review): c() assumes every per-run object is a list of samples; a run
# holding a single library may need wrapping in list() -- confirm.
all_derep_forward <- c(ecomol_derep_forward, LGC_2_derep_forward, LGC_1_derep_forward)
all_derep_reverse <- c(ecomol_derep_reverse, LGC_2_derep_reverse, LGC_1_derep_reverse)

names(all_dadaFs)
names(all_derep_forward)
names(all_dadaRs)
names(all_derep_reverse)


Remove chimeras

Chimeras are artificial read pairs that might have been generated erroneously during sequencing. The DADA2 package estimates the probability that a sequence is chimeric given the abundance of its parental sequences. After removal of the chimeric sequences, the length distribution of the remaining ASVs is assessed. In later steps it will be used to restrict the analysis to ASVs compatible with the amplicon length interval of each primer, in order to exclude unexpected ASVs.

# 16 - remove chimeras ----

# Drop chimeric sequences from the pooled table (consensus method)
all_seqtab.nochim <- removeBimeraDenovo(
  all_seqtab,
  method = "consensus",
  multithread = TRUE,
  verbose = TRUE
)
dim(all_seqtab.nochim)

# fraction of the read abundance kept after chimera removal
# (recorded value: 0.8404743, i.e. a 16% loss in abundance)
sum(all_seqtab.nochim) / sum(all_seqtab)

# number of ASVs of each length
nochim_length_counts <- table(nchar(getSequences(all_seqtab.nochim)))
nochim_length_counts
nochim_length_counts %>% plot()
rownames(all_seqtab.nochim)



View(all_seqtab.nochim)
str(all_seqtab.nochim)


Count reads and remaining ASVs

# 17 - count reads proportion throughout the pipeline ----

# Sum of the unique-sequence abundances of a dada2 object
getN <- function(x) {
  sum(getUniques(x))
}

# Sub-tables keyed by File_name, combined later into a single tracking table.
#raw files

names(primers_n_samples$`Read file`) <- primers_n_samples$Library

raw_reads <- primers_n_samples %>% filter(Stage %in% c("R1", "R2"))

# Count the records of every raw fastq, then bring back the sample metadata by
# matching on the file's base name.
raw_reads_counts <- ShortRead::countFastq(dirPath = raw_reads$`Read file`) %>%
  as_tibble(rownames = "Read file") %>%
  left_join(
    raw_reads %>% mutate(`Read file` = basename(`Read file`)),
    by = "Read file"
  )

tbl_raw_FWD <- raw_reads_counts %>%
  filter(Stage %in% c("R1")) %>%
  select(File_name, "Raw FWD" = records)
tbl_raw_REV <- raw_reads_counts %>%
  filter(Stage %in% c("R2")) %>%
  select(File_name, "Raw REV" = records)



# Turn a named vector of counts into a two-column table (File_name, <count_name>)
as_count_tbl <- function(counts, count_name) {
  counts %>%
    as_tibble(rownames = "File_name") %>%
    `colnames<-`(c("File_name", count_name))
}

tbl_Denoised_FWD <- as_count_tbl(sapply(all_dadaFs, getN), "Denoised FWD")
tbl_Denoised_REV <- as_count_tbl(sapply(all_dadaRs, getN), "Denoised REV")
tbl_Merged <- as_count_tbl(rowSums(all_seqtab), "Merged")
tbl_Non_chimeric <- as_count_tbl(rowSums(all_seqtab.nochim), "Non-chimeric")

# Combine all counts by sample to plot.
# Starts from the filterAndTrim summary (row names are the cutadapt R1 file
# names, reduced to File_name by removing the suffix), then joins every
# sub-table and the sample metadata by File_name.

all_track <- all_filtered_out %>%  as_tibble(rownames = "File_name") %>% 
  mutate(`File_name` = str_remove(string = `File_name`, pattern = "_R1_cutadapt.fastq.gz")) %>% 
  left_join(tbl_raw_FWD,by = "File_name") %>% 
  left_join(tbl_raw_REV,by = "File_name") %>% 
  left_join(tbl_Denoised_FWD,by = "File_name") %>% 
  left_join(tbl_Denoised_REV,by = "File_name") %>% 
  left_join(tbl_Merged,by = "File_name") %>% 
  left_join(tbl_Non_chimeric,by = "File_name") %>% 
  left_join(primers_n_samples[primers_n_samples$Stage == "R1",],by = "File_name") %>% 
  select(!c("Stage", "Read file"))



# Columns are renamed by POSITION, so this depends on the join order above and
# on the column order of primers_n_samples.
# NOTE(review): the first two count columns come from the filterAndTrim summary,
# whose row names are cutadapt files; "N-cleaned" may therefore hold the
# primer-trimmed read count rather than the N-filtered one -- confirm.
colnames(all_track) <- c("File_name","N-cleaned", "Filtered","Raw FWD", "Raw REV", "Denoised FWD", "Denoised REV", "Merged", "Non-Chimeric", "Type", "Group", "Library", "Primer", "Run")




# Combine tables together (if there is more than one)
track_tbl <- bind_rows(all_track)


# Display labels for the read-count table, keyed by the original File_name.
# IDs mapped to themselves are listed only to document the full set of samples.
all_track_labels <- c(
  "Cassaum" = "Positive Control\n(P.glauca)",
  "Da19" = "Da19",
  "Da20" = "Da20",
  "Da21" = "Da21",
  "Da22" = "Da22",
  "Da23-mif" = "Da23-mif",
  "Da23-neo" = "Da23-neo",
  "neg-PCR2" = "neg-PCR2",
  "pJequei-N-norm-M" = "Non-normalized JQmc\nMiFish",
  "pJequei-N-norm-N" = "Non-normalized JQmc\nNeoFish",
  "pJequei-N-norm-T" = "Non-normalized JQmc\nTeleo",
  "pJequei-norm-M" = "Normalized JQmc\nMiFish",
  "pJequei-norm-N" = "Normalized JQmc\nNeoFish",
  "pJequei-norm-T" = "Normalized JQmc\nTeleo",
  "SFJQ-mif" = "Normalized SFJQmc\nMiFish",
  "SFJQ-neo" = "Normalized SFJQmc\nNeoFish",
  "SFnNorm-mi" = "Non-normalized SFmc\nMiFish",
  "SFnNorm-neo" = "Non-normalized SFmc\nNeoFish",
  "SFNorm-mi" = "Normalized SFmc\nMiFish",
  "SFNorm-neo" = "Normalized SFmc\nNeoFish"
)

# apply the renames one ID at a time, in the order listed above
for (track_id in names(all_track_labels)) {
  all_track$File_name[all_track$File_name == track_id] <- all_track_labels[[track_id]]
}


# Save the read counts table (with column names and formatted headers)
writexl::write_xlsx(
  all_track,
  "~/prjcts/fish_eDNA/sfjq/results/sfjq_read_counts_along_quality_control.xlsx",
  col_names = TRUE,
  format_headers = TRUE
)






# plot reads proportion throughout the pipeline ----


# primers and samples present in the tracking table
unique(track_tbl$Primer)
unique(track_tbl$File_name)

#TODO
# https://bhaskarvk.github.io/colormap/
#https://www.thinkingondata.com/something-about-viridis-library/
#set colors here




# Ratio of the "N-cleaned" count to the raw forward count (loss from N filtering)
all_track %>% mutate(perda = `N-cleaned` / `Raw FWD`)


#18 - set colors for downstream plots ----

# colors
# The palettes are defined first and previewed afterwards: previewing colors5
# before it is assigned fails in a fresh session.

# five primer colours: neo, mi, tel, then the combined primer sets
colors5 <- c("#017504", "#000791", "#820000", "#780058", "#ff5500")
# the first three colours of colors5, each followed by a second shade
colors_norm <- c(
  "#017504", "#4fc952",
  "#000791", "#3862eb",
  "#820000", "#bf4b4b"
)

# preview both palettes
scales::show_col(colors_norm)
scales::show_col(colors5)

#PLOT2 - sample track plot ----

# # track_tbl$File_name %>% paste0('"\n"',collapse = "") %>% cat()
# track_tbl$File_name %>% unique() %>% base::sort() %>% paste0(collapse = '\n') %>%  cat()
# sample_levels


# Display labels for the plotting table, keyed by the original File_name.
# IDs mapped to themselves are listed only to document the set of samples;
# "neg-PCR2" has no entry and keeps its name.
track_tbl_labels <- c(
  "Cassaum" = "Positive Control\n(P.glauca)",
  "Da19" = "Da19",
  "Da20" = "Da20",
  "Da21" = "Da21",
  "Da22" = "Da22",
  "Da23-mif" = "Da23-mif",
  "Da23-neo" = "Da23-neo",
  "pJequei-N-norm-M" = "Non-normalized JQmc\nMiFish",
  "pJequei-N-norm-N" = "Non-normalized JQmc\nNeoFish",
  "pJequei-N-norm-T" = "Non-normalized JQmc\nTeleo",
  "pJequei-norm-M" = "Normalized JQmc\nMiFish",
  "pJequei-norm-N" = "Normalized JQmc\nNeoFish",
  "pJequei-norm-T" = "Normalized JQmc\nTeleo",
  "SFJQ-mif" = "Normalized SFJQmc\nMiFish",
  "SFJQ-neo" = "Normalized SFJQmc\nNeoFish",
  "SFnNorm-mi" = "Non-normalized SFmc\nMiFish",
  "SFnNorm-neo" = "Non-normalized SFmc\nNeoFish",
  "SFNorm-mi" = "Normalized SFmc\nMiFish",
  "SFNorm-neo" = "Normalized SFmc\nNeoFish"
)

# apply the renames one ID at a time, in the order listed above
for (plot_id in names(track_tbl_labels)) {
  track_tbl$File_name[track_tbl$File_name == plot_id] <- track_tbl_labels[[plot_id]]
}




# Order in which the samples appear in the facets of the track plot
sample_levels <- c(
  "Da23-mif", "Normalized SFJQmc\nMiFish",
  "Da23-neo", "Normalized SFJQmc\nNeoFish",
  "Da20", "Non-normalized SFmc\nMiFish",
  "Da19", "Non-normalized SFmc\nNeoFish",
  "Da22", "Normalized SFmc\nMiFish",
  "Da21", "Normalized SFmc\nNeoFish",
  "Non-normalized JQmc\nNeoFish", "Non-normalized JQmc\nMiFish", "Non-normalized JQmc\nTeleo",
  "Normalized JQmc\nNeoFish", "Normalized JQmc\nMiFish", "Normalized JQmc\nTeleo",
  "Positive Control\n(P.glauca)", "neg-PCR2"
)


# Prepare counts for plotting ----

# count columns to stack into Stage / `Read Number`
track_count_columns <- c(
  "Raw REV", "Raw FWD",
  "N-cleaned", "Filtered", "Denoised FWD",
  "Denoised REV", "Merged", "Non-Chimeric"
)
# level order used for Stage in the plot
track_stage_levels <- c(
  "Non-Chimeric", "Merged", "Denoised REV", "Denoised FWD",
  "Filtered", "N-cleaned", "Raw REV", "Raw FWD"
)

track_tbl <- track_tbl %>%
  gather(key = "Stage", value = "Read Number", dplyr::all_of(track_count_columns)) %>%
  mutate(
    Stage = factor(Stage, levels = track_stage_levels),
    Primer = factor(
      Primer,
      levels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish", "NeoFish/MiFish/Teleo")
    ),
    File_name = factor(File_name, levels = sample_levels)
  )

# avoid scientific notation when numbers are printed
options(scipen = 22)
  
  # track_tbl %>% base::sort(track_tbl$Sample) 

# plotting ----

# Horizontal bars of read counts per pipeline stage, one facet per sample,
# coloured by primer set.
track_plot <- track_tbl %>%
  # filter(Run %in% c("LGC_MiniSeq_1", "LGC_MiniSeq_2")) %>%
  # filter(Group %in% c("DNA pool")) %>%
  ggplot(aes(y = Stage, x = `Read Number`, fill = Primer, group = Stage)) +
  geom_bar(stat = "identity") +
  # Reference line at 300,000 reads. Read counts are mapped to x, so the line
  # must be vertical; a horizontal line at y = 300000 would fall on the
  # discrete Stage axis instead.
  geom_vline(xintercept = 300000, col = 1, linetype = 2) +
  scale_fill_manual(
    labels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish", "NeoFish/MiFish/Teleo"),
    values = alpha(colour = colors5, alpha = 0.75)
  ) +
  labs(
    title = "LGC eDNA 1st & 2nd runs",
    subtitle = "Read counts per library and filtering step",
    x = "Read counts",
    y = "Data filtering step"
  ) +
  facet_wrap(~File_name, ncol = 6) +
  coord_fixed(ratio = 60000) +
  theme_bw(base_size = 7) +
  theme(legend.position = "bottom") +
  theme(axis.title = ggtext::element_markdown())

track_plot







# Save the track plot as PNG and SVG (12 x 10 cm) and as a larger PDF
# (20 x 16 cm). The output path is passed as `filename`, the argument's full
# name, instead of relying on partial matching of `file`.
ggsave(
  filename = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_sample_track_plot.png",
  plot = track_plot,
  device = "png",
  width = 12,
  height = 10,
  units = "cm",
  dpi = 600
)

ggsave(
  filename = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_sample_track_plot.svg",
  plot = track_plot,
  device = "svg",
  width = 12,
  height = 10,
  units = "cm",
  dpi = 600
)

ggsave(
  filename = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_sample_track_plot.pdf",
  plot = track_plot,
  device = "pdf",
  width = 20,
  height = 16,
  units = "cm",
  dpi = 600
)

Read proportions are displayed below. The experimental design aimed at the same read yield for all libraries, between 200K and 250K reads. Deviations from this range are probably due to dosage/pipetting errors.



Classify taxonomy

In this step the ASVs identified by the DADA2 pipeline, jointly for all libraries of each primer, are associated (or not) with the sequences in the Reference 12S Sequences Database. DADA2 has two strategies to identify taxa. The first, assignSpecies, identifies perfect matches of the ASVs in the Reference Database. The second, assignTaxonomy, uses an RDP Naive Bayesian Classifier algorithm (Wang, 2007) with kmer size 8 and 100 bootstrap replicates to associate ASVs with the Reference Database sequences. In the latter, the taxonomic rank classification is proportional to the sequence similarity, although this relation is not yet clear to us.

#19 - classify taxonomy exactly ----

# Species-level assignment of the non-chimeric ASVs against the species
# reference fasta (reverse complements are also tried).
all_sps <- dada2::assignSpecies(
  seqs = all_seqtab.nochim,
  refFasta = "~/prjcts/fish_eDNA/data/refs/db/LGC/jul21/dada_tax_fullDB_order_SPs_jul21.fasta",
  allowMultiple = 10,
  tryRC = TRUE,
  n = 20000,
  verbose = TRUE
)


#check how many ASVs were exactly identified as species

View(all_sps)

# Table of the exact matches, with the ASV sequence (row name) as a column
all_csv_sp <- as_tibble(all_sps) %>%
  mutate(ASV = rownames(all_sps))
colnames(all_csv_sp) <- c("exact Genus", "exact Species", "ASV")

#20 - classify taxonomy ----

# Rank-by-rank classification against the full reference fasta, keeping the
# bootstrap support of every rank.
all_taxa <- dada2::assignTaxonomy(
  seqs = all_seqtab.nochim,
  refFasta = "~/prjcts/fish_eDNA/data/refs/db/LGC/jul21/dada_tax_fullDB_order_jul21.fasta",
  taxLevels = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Specimen", "Basin"),
  tryRC = TRUE,
  outputBootstraps = TRUE,
  multithread = TRUE
)


all_taxa$boot



Here the DADA2 pipeline ends.





Phyloseq

On this step the ASVs associated to taxonomic ranks by DADA2 and their respective counts by library, are combined using the Phyloseq package.


Generate sample metadata table

Here the experiment metadata is associated to each sample.

# 22 - create sample table ----

# One metadata row per sample: the distinct combinations of the first six
# columns of primers_n_samples.
# primers_n_samples$File_name
all_samdf <- unique(primers_n_samples[1:6])

# Row names must be assigned for the next step to work, so the table is
# converted to a plain data.frame and labelled by File_name.
samdf <- as.data.frame(all_samdf)
rownames(samdf) <- samdf$File_name


This sample metadata table was created with the information available for the samples analyzed on this first run. This table must be customized for each experiment.




Phyloseq data interpretation

#23 - interpret dada on phyloseq ----

# Assemble the phyloseq object from its three components: ASV counts (samples
# in rows), sample metadata and the assigned taxonomy.
ps_counts <- otu_table(all_seqtab.nochim, taxa_are_rows = FALSE)
ps_samples <- sample_data(samdf)
ps_taxonomy <- tax_table(all_taxa$tax) # alternative: tax_table(all_taxa)

all_ps <- phyloseq(ps_counts, ps_samples, ps_taxonomy)

rownames(all_seqtab.nochim)


Merge and Flex Phyloseq results

Many different graphics can be generated, together or in isolation, for all primers/libraries and taxonomic ranks.

#24 - merge ps analysis ----

# Melt the phyloseq object into a long table and keep only rows with at least
# one read.
melted_ps <- as_tibble(psmelt(all_ps))
all_ps_tbl <- melted_ps %>% filter(Abundance >= 1)

# the melted table calls the sequence column "OTU"; here it is named "ASV"
names(all_ps_tbl)[names(all_ps_tbl) == "OTU"] <- "ASV"

unique(all_ps_tbl$ASV)
# unique(neo_ps_tbl$ASV)
# unique(mif_ps_tbl$ASV)

unique(all_ps_tbl$Sample)
unique(all_ps_tbl$Primer)

# Add the exact species matches, joined by ASV sequence
all_ps_tbl <- all_ps_tbl %>% left_join(all_csv_sp, by = "ASV")

# backup table
# all_ps_tbl_bckp <- all_ps_tbl
# all_ps_tbl <- all_ps_tbl_bckp

metaBLASTEr - Identify ASVs with inhouse BLASTn

This package is currently under development, but fully functional. You will need a working NCBI-BLAST+ installation and a BLAST-formatted reference DB on the same Linux server your RStudio Server is running on (we are improving it to run on IOS).

# blastn ----

# Install BLASTr only when it is missing, instead of reinstalling on every run
if (!requireNamespace("BLASTr", quietly = TRUE)) {
  install.packages("BLASTr", repos = "https://heronoh.r-universe.dev")
}

# Annotate all ASVs by blastN: the unique ASV sequences, as character
asvs_blast <- all_ps_tbl$ASV %>%
  unique() %>%
  as.character()


# Identify using the BLASTr package ----
# parallel, with 2 threads ----
tictoc::tic("Parallel - Furrr 2 threads")

# `asvs` receives the vector built above (asvs_blast); no object named
# asvs_blast_all is created in this part of the script.
blast_res <- BLASTr::parallel_blast(
  db_path = "/data/databases/nt_jun2023/nt",
  asvs = asvs_blast,
  out_file = "~/prjcts/fish_eDNA/sfjq/results/blast/blast_out_res_1.csv",
  out_RDS = "~/prjcts/fish_eDNA/sfjq/results/blast/blast_out_res_1.RDS",
  total_cores = 80,
  perc_id = 80,
  num_threads = 2,
  perc_qcov_hsp = 80,
  num_alignments = 3,
  blast_type = "blastn"
)

tictoc::toc()


# Save the whole workspace after the BLAST step
base::save.image("~/prjcts/fish_eDNA/sfjq//env-canastra_posBLAST-25jul23.RData")

   



# Inspect the columns returned by the BLAST step
colnames(blast_res)

# blast_res <- blast_res %>% rename("OTU" ="Sequence")
# blast_res <- blast_res %>% rename("Sequence" ="OTU")

# Full result table without the "OTU" column, keeping only rows that have a
# first subject header.
# NOTE(review): this requires blast_res to contain a column named "OTU", while
# blast_res is later joined by "ASV" -- confirm which sequence column it holds.
blast_res_full <- bind_rows(blast_res) %>% 
  select(-c("OTU")) %>%
  filter(!is.na(`1_subject header`))

nrow(blast_res)
dim(blast_res)

# Keep only the rows whose `1_res` equals 1 (original note: remove whatever
# returned nothing).
blast_res <- blast_res %>%  filter(`1_res` == 1 ) # drop entries without a result

str(blast_res)

Rename samples for plots

primers_n_samples
sample_levels

# Display labels for all_ps_tbl$File_name, keyed by the original File_name.
# NOTE(review): "Da23-mif"/"Da23-neo" get the short labels "SFJQ-mif B" and
# "SFJQ-neo B" here, whereas the Sample column uses
# "Normalized SFJQmc\nMiFish B"/"Normalized SFJQmc\nNeoFish B" -- confirm the
# difference is intended.
file_name_labels <- c(
  "Cassaum" = "Positive Control\n(P.glauca)",
  "Da19" = "Non-normalized SFmc\nNeoFish B",
  "Da20" = "Non-normalized SFmc\nMiFish B",
  "Da21" = "Normalized SFmc\nNeoFish B",
  "Da22" = "Normalized SFmc\nMiFish B",
  "Da23-mif" = "SFJQ-mif B",
  "Da23-neo" = "SFJQ-neo B",
  "neg-PCR2" = "neg-PCR2",
  "pJequei-N-norm-M" = "Non-normalized JQmc\nMiFish",
  "pJequei-N-norm-N" = "Non-normalized JQmc\nNeoFish",
  "pJequei-N-norm-T" = "Non-normalized JQmc\nTeleo",
  "pJequei-norm-M" = "Normalized JQmc\nMiFish",
  "pJequei-norm-N" = "Normalized JQmc\nNeoFish",
  "pJequei-norm-T" = "Normalized JQmc\nTeleo",
  "SFJQ-mif" = "Normalized SFJQmc\nMiFish",
  "SFJQ-neo" = "Normalized SFJQmc\nNeoFish",
  "SFnNorm-mi" = "Non-normalized SFmc\nMiFish",
  "SFnNorm-neo" = "Non-normalized SFmc\nNeoFish",
  "SFNorm-mi" = "Normalized SFmc\nMiFish",
  "SFNorm-neo" = "Normalized SFmc\nNeoFish"
)

# apply the renames one ID at a time, in the order listed above
for (file_id in names(file_name_labels)) {
  all_ps_tbl$File_name[all_ps_tbl$File_name == file_id] <- file_name_labels[[file_id]]
}

# Display labels for all_ps_tbl$Sample, keyed by the original sample ID.
# The "B" suffix is given to the Da19-Da23 libraries.
sample_labels <- c(
  "Cassaum" = "Positive Control\n(P.glauca)",
  "Da19" = "Non-normalized SFmc\nNeoFish B",
  "Da20" = "Non-normalized SFmc\nMiFish B",
  "Da21" = "Normalized SFmc\nNeoFish B",
  "Da22" = "Normalized SFmc\nMiFish B",
  "Da23-mif" = "Normalized SFJQmc\nMiFish B",
  "Da23-neo" = "Normalized SFJQmc\nNeoFish B",
  "neg-PCR2" = "neg-PCR2",
  "pJequei-N-norm-M" = "Non-normalized JQmc\nMiFish",
  "pJequei-N-norm-N" = "Non-normalized JQmc\nNeoFish",
  "pJequei-N-norm-T" = "Non-normalized JQmc\nTeleo",
  "pJequei-norm-M" = "Normalized JQmc\nMiFish",
  "pJequei-norm-N" = "Normalized JQmc\nNeoFish",
  "pJequei-norm-T" = "Normalized JQmc\nTeleo",
  "SFJQ-mif" = "Normalized SFJQmc\nMiFish",
  "SFJQ-neo" = "Normalized SFJQmc\nNeoFish",
  "SFnNorm-mi" = "Non-normalized SFmc\nMiFish",
  "SFnNorm-neo" = "Non-normalized SFmc\nNeoFish",
  "SFNorm-mi" = "Normalized SFmc\nMiFish",
  "SFNorm-neo" = "Normalized SFmc\nNeoFish"
)

# apply the renames one ID at a time, in the order listed above
for (sample_id in names(sample_labels)) {
  all_ps_tbl$Sample[all_ps_tbl$Sample == sample_id] <- sample_labels[[sample_id]]
}




# groups present before renaming
all_ps_tbl$Group %>% unique() %>% base::sort()


# all_ps_tbl$Group <- all_ps_tbl$Group %>% unfactor()

# Display labels for the mock-community groups, keyed by the original group ID
group_labels <- c(
  "JqNnorm" = "Non-normalized JQmc",
  "JqNorm" = "Normalized JQmc",
  "SFnNorm" = "Non-normalized SFmc",
  "SFNorm" = "Normalized SFmc",
  "SFJQ" = "Normalized SFJQmc",
  "Positive control" = "Positive control\n(P. glauca)"
)

# apply the renames one group at a time, in the order listed above
for (group_id in names(group_labels)) {
  all_ps_tbl$Group[all_ps_tbl$Group %in% c(group_id)] <- group_labels[[group_id]]
}
all_ps_tbl$Group %>% unique()

# Fix the group order used downstream
all_ps_tbl$Group <- factor(
  all_ps_tbl$Group,
  levels = c(
    "Non-normalized JQmc",
    "Normalized JQmc",
    "Non-normalized SFmc",
    "Normalized SFmc",
    "Normalized SFJQmc",
    "Positive control\n(P. glauca)"
  )
)


# Sample order for the downstream plots: SFJQ pools, SF pools, JQ pools, then
# the controls.
sample_levels <- c(
  # SFJQ mock community
  "Normalized SFJQmc\nMiFish B", "Normalized SFJQmc\nMiFish",
  "Normalized SFJQmc\nNeoFish B", "Normalized SFJQmc\nNeoFish",
  # SF mock community
  "Normalized SFmc\nMiFish B", "Normalized SFmc\nMiFish",
  "Non-normalized SFmc\nMiFish B", "Non-normalized SFmc\nMiFish",
  "Normalized SFmc\nNeoFish B", "Normalized SFmc\nNeoFish",
  "Non-normalized SFmc\nNeoFish B", "Non-normalized SFmc\nNeoFish",
  # JQ mock community
  "Non-normalized JQmc\nMiFish", "Normalized JQmc\nMiFish",
  "Non-normalized JQmc\nNeoFish", "Normalized JQmc\nNeoFish",
  "Non-normalized JQmc\nTeleo", "Normalized JQmc\nTeleo",
  # controls
  "Positive Control\n(P.glauca)", "neg-PCR2"
)

# all_ps_tbl_blast$Sample[all_ps_tbl_blast$Sample %in% c("pool não-normalizado\nMiFish")]

all_ps_tbl$Sample <- factor(all_ps_tbl$Sample, levels = sample_levels)


class(asvs_blast)


# Add the BLAST results to the ASV table, joined by ASV sequence
all_ps_tbl_blast <- all_ps_tbl %>% left_join(blast_res, by = "ASV")

colnames(all_ps_tbl_blast)

#all_ps_tbl_blast_bckp <- all_ps_tbl_blast
#all_ps_tbl_blast <- all_ps_tbl_blast_bckp

ASVs seqs

#25 - recover all ASVs sequences to prepare fasta ----


#all ----
# One row per ASV sequence, with its length in bp.
all_asv_seqs <- tibble("ASV" = asvs_blast) %>%
  mutate("ASV length" = nchar(ASV))

# Sort by length so that header numbers follow increasing ASV size.
all_asv_seqs <- all_asv_seqs[base::order(all_asv_seqs$`ASV length`), ]

# Give the sequences manageable names (>ASV_1_<len>bp, >ASV_2_<len>bp, ...).
# Built in one vectorised call; sprintf() returns character(0) for an empty
# table, where the former `for (i in 1:nrow(...))` loop would have failed.
all_asv_seqs$`ASV header` <- sprintf(
  ">ASV_%d_%dbp",
  seq_len(nrow(all_asv_seqs)),
  all_asv_seqs$`ASV length`
)


# Combine ASV headers and all_ps_tbl_blast. This join happens BEFORE the
# taxonomy is appended below, so the main table keeps the short headers.
all_ps_tbl_blast <- dplyr::left_join(x = all_ps_tbl_blast,
                                     y = all_asv_seqs,
                                     by = "ASV")


# Append the taxonomy of each ASV to its fasta header.
for (asv in seq_len(nrow(all_asv_seqs))) {

  # Distinct taxonomy rows for this ASV, pasted column by column with "|".
  # NOTE(review): if an ASV has more than one distinct taxonomy row, paste0()
  # on the tibble deparses each column as "c(...)" - confirm this cannot happen.
  tax <- all_ps_tbl_blast %>%
    filter(ASV == all_asv_seqs$ASV[asv]) %>%
    select("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Specimen") %>%
    unique() %>%
    paste0(collapse = "|")

  all_asv_seqs$`ASV header`[asv] <- paste0(all_asv_seqs$`ASV header`[asv], "_", tax)

  # TODO: add a sanity check here to confirm the header is correct.
}

# Write fasta file with ASVs and taxonomy: rbind() + c() interleaves
# header / sequence / header / sequence ...
all_asv_fasta <- c(rbind(all_asv_seqs$`ASV header`, all_asv_seqs$ASV))

write(all_asv_fasta, "~/prjcts/fish_eDNA/sfjq/results/sfjq_all_ASVs_all_primers.fasta")

SWARM - ASVs to OTUs

# swarm input: one record per ASV, with the total read count appended to the
# header as "_<abundance>".

asvs_abd <- all_ps_tbl_blast %>%
  group_by(`ASV`, `ASV header`) %>%
  mutate("ASV total abundance" = sum(Abundance)) %>%
  # Ungroup before touching `ASV header`: it is a grouping key above, and
  # leaving the result grouped would leak the grouping into later steps.
  ungroup() %>%
  # distinct() keeps the first occurrence of each ASV, so the record order of
  # the fasta is the same as with select() + unique().
  distinct(`ASV`, `ASV header`, `ASV total abundance`) %>%
  mutate(`ASV header` = paste0(`ASV header`, "_", `ASV total abundance`))


# Quick look at the ASVs and their new headers.
asvs_abd$ASV %>% unique()
asvs_abd$`ASV header` %>% unique()


# Write fasta file with ASVs and abundance (header / sequence interleaved).
all_asv_fasta_abd <- c(rbind(asvs_abd$`ASV header`, asvs_abd$`ASV`))

write(all_asv_fasta_abd, paste0(results_path, "/sfjq_ASVs_abd.fasta"))

# Clustering is run outside R on the fasta written above:
# ~/prjcts/fish_eDNA/sfjq/swarm$ swarm -t 50 ~/prjcts/fish_eDNA/sfjq/results/sfjq_ASVs_abd.fasta -s sfjq_swarm.stats -o sfjq_swarm.out -w sfjq_representative_OTUs.fasta -i sfjq_swarm.structure -f


# One element per line of the swarm output file.
swarm_clust <- readr::read_lines("~/prjcts/fish_eDNA/sfjq/swarm/sfjq_swarm.out")


# Identifier of each ASV as written to the fasta, without the leading ">".
asv_ids <- str_remove(asvs_abd$`ASV header`, pattern = ">")

# OTU = number of the swarm output line that contains the ASV identifier.
# - fixed() makes the identifier a literal string instead of a regex;
# - an ASV found on no line keeps OTU = 0 (the former default);
# - if several lines match, the last one wins, as in the former nested loop.
asvs_abd$OTU <- vapply(
  asv_ids,
  function(asv_id) {
    hits <- which(str_detect(string = swarm_clust, pattern = fixed(asv_id)))
    if (length(hits) == 0) 0 else as.numeric(max(hits))
  },
  FUN.VALUE = numeric(1),
  USE.NAMES = FALSE
)



# Bring the per-ASV total abundance and OTU number into the main table.
# Columns are picked by name (positions 1, 3 and 4 of asvs_abd); the
# abundance-tagged `ASV header` is left out so it does not clash.
all_ps_tbl_blast <- all_ps_tbl_blast %>%
  left_join(asvs_abd[, c("ASV", "ASV total abundance", "OTU")], by = "ASV")


# Quick checks of the OTUs and groups now present.
unique(all_ps_tbl_blast$OTU)
unique(all_ps_tbl_blast$Group)

Calculate sample abundances —-

# Read counts relative to the whole data set and to each sample.

# Total number of reads over all rows of the table.
abd_total <- sum(all_ps_tbl_blast$Abundance)

all_ps_tbl_blast <- all_ps_tbl_blast %>%
  # The three columns are created here, in this order; the two per-sample
  # ones are placeholders that are filled in the grouped step below.
  mutate(`Relative abundance to all samples` = Abundance / abd_total,
         `Relative abundance on sample` = 0,
         `Sample total abundance` = 0) %>%
  group_by(Sample) %>%
  mutate(`Sample total abundance` = sum(Abundance),
         `Relative abundance on sample` = Abundance / `Sample total abundance`) %>%
  ungroup()

Set final identification from all possibilities

# Pick the final identification, from most to least specific:
#   1. `exact GenSp` ("<exact Genus> <exact Species>") when an exact species exists;
#   2. otherwise Species;
#   3. otherwise Genus;
#   4. otherwise the first 30 characters of the first subject header.
# A value counts as missing when it is NA or the string "NA".
all_ps_tbl_blast <- all_ps_tbl_blast %>%
  mutate(
    `exact GenSp` = paste(`exact Genus`, `exact Species`),
    `final ID` = if_else(
      !(`exact Species` %in% c(NA, "NA", "NA NA")),
      as.character(`exact GenSp`),
      if_else(
        !(Species %in% c(NA, "NA")),
        Species,
        if_else(
          !(Genus %in% c(NA, "NA")),
          Genus,
          substr(as.character(`1_subject header`), 1, 30)
        )
      )
    )
  )

# Group / correct species names for plotting

# Species detected
base::sort(unique(all_ps_tbl_blast$`final ID`))

# Name as identified -> name used in the figures.
id_synonyms <- c(
  "Astyanax aff_fasciatus" = "Astyanax fasciatus",
  "Astyanax cf_fasciatus" = "Astyanax fasciatus",
  "Astyanax cf_lacustris" = "Astyanax lacustris",
  "Characidium sp" = "Characidium",
  "Hypostomus sp" = "Hypostomus",
  "Hoplias malabaricus/sp" = "Hoplias malabaricus",
  "Rhamdia aff_quelen" = "Rhamdia quelen",
  "Coptodon zillii KAUM:I:90126 m" = "Coptodon zillii",
  "Trachelyopterus cf_galeatus/galeatus" = "Trachelyopterus galeatus"
)
# None of the replacement names is itself a key, so a single lookup pass
# gives the same result as replacing the names one after another.
needs_fix <- all_ps_tbl_blast$`final ID` %in% names(id_synonyms)
all_ps_tbl_blast$`final ID`[needs_fix] <-
  unname(id_synonyms[all_ps_tbl_blast$`final ID`[needs_fix]])

Identify each primer's expected ASV length range

# Flag every row by whether its ASV length falls inside the insert-size range
# expected for the primer (set) of its sample:
#   NeoFish 185-200 bp, MiFish 165-180 bp, Teleo 60-75 bp,
#   NeoFish/MiFish 165-200 bp, NeoFish/MiFish/Teleo any of the three ranges.
# Rows whose Primer is none of these (or NA) keep the placeholder "FALSE".
# Vectorised: the previous row-by-row loop over 1:nrow() stopped with an error
# on an NA Primer and rewrote the tibble once per row.
all_ps_tbl_blast <- all_ps_tbl_blast %>%
  mutate(
    `Expected length` = case_when(
      !(Primer %in% c("NeoFish", "MiFish", "Teleo",
                      "NeoFish/MiFish", "NeoFish/MiFish/Teleo")) ~ "FALSE",
      Primer == "NeoFish" &
        `ASV length` >= 185 & `ASV length` <= 200 ~ "in range",
      Primer == "MiFish" &
        `ASV length` >= 165 & `ASV length` <= 180 ~ "in range",
      Primer == "Teleo" &
        `ASV length` >= 60 & `ASV length` <= 75 ~ "in range",
      Primer == "NeoFish/MiFish" &
        `ASV length` >= 165 & `ASV length` <= 200 ~ "in range",
      Primer == "NeoFish/MiFish/Teleo" &
        `ASV length` %in% c(60:75, 165:180, 185:200) ~ "in range",
      # known primer, length outside its range
      TRUE ~ "out of range"
    ),
    # factorize column
    `Expected length` = as.factor(`Expected length`)
  )

# Reorder table

# Print the current column names, one per line.
cat(paste0(colnames(all_ps_tbl_blast), "\n"))


# Keep the columns below, in this order. For each of the three subject hits
# only header, subject, identity and length are kept; mismatches, gaps,
# query/subject coordinates, e-value and bitscore are left out.
all_ps_tbl_blast <- all_ps_tbl_blast %>%
  select(
    # sample metadata
    "Sample", "Group", "Type", "Primer", "File_name", "Library", "Run",
    # identification and abundances
    "final ID",
    "Abundance",
    "Relative abundance to all samples",
    "Relative abundance on sample",
    "Sample total abundance",
    # taxonomy
    "Kingdom", "Phylum", "Class", "Order", "Family",
    "Genus", "Species", "Specimen", "Basin",
    "exact Genus", "exact Species",
    "exact GenSp",
    # subject hits 1 to 3
    "1_subject header", "1_subject",
    "1_indentity", "1_length",
    "2_subject header", "2_subject",
    "2_indentity", "2_length",
    "3_subject header", "3_subject",
    "3_indentity", "3_length",
    # ASV information
    "ASV", "ASV length", "ASV header", "Expected length", "OTU"
  ) %>%
  # Final column names used in the exported tables.
  rename(`ASV (Sequence)` = "ASV",
         `ASV size (pb)` = "ASV length")

### Save complete table

# Keep rows with reads, ordered by decreasing abundance.
smp_abd_ID <- all_ps_tbl_blast %>%
  filter(Abundance > 0) %>%
  slice(rev(base::order(Abundance)))

dim(smp_abd_ID)

writexl::write_xlsx(
  x = smp_abd_ID,
  path = "~/prjcts/fish_eDNA/sfjq/results/sfjq_all_analysis_info_06-03-22.xlsx",
  col_names = TRUE,
  format_headers = TRUE
)





# Per-sample summary: number of ASVs (all / in range / out of range) and
# number of distinct final IDs among in-range ASVs. Only rows with reads
# (Abundance != 0) and a non-missing final ID are counted.
ASVs_per_sample <- all_ps_tbl_blast %>%
  filter(!(`final ID` %in% c(NA, "NA"))) %>%
  mutate(Sample = factor(Sample, levels = sample_levels)) %>%
  group_by(Sample) %>%
  summarize(
    Library = unique(Library),
    Group = unique(Group),
    Primer = unique(Primer),
    `Total ASV` = n_distinct(`ASV (Sequence)`[Abundance != 0]),
    `ASVs out of range` = n_distinct(
      `ASV (Sequence)`[Abundance != 0 & `Expected length` == "out of range"]
    ),
    `ASVs in range` = n_distinct(
      `ASV (Sequence)`[Abundance != 0 & `Expected length` == "in range"]
    ),
    `Identified Species` = n_distinct(
      `final ID`[Abundance != 0 & `Expected length` == "in range"]
    )
  )


writexl::write_xlsx(
  x = ASVs_per_sample,
  path = "~/prjcts/fish_eDNA/sfjq/results/sfjq_ASVs_per_sample_06-07-21.xlsx",
  col_names = TRUE,
  format_headers = TRUE
)

Curation: manual checking of the species assignments

# Working copy for manual curation: `revised final ID` starts as a copy of
# `final ID` and is edited afterwards, leaving the original column intact.
all_ps_tbl_bl_cur <- smp_abd_ID %>%
  mutate(`revised final ID` = `final ID`)


# Names present before curation.
sort(unique(all_ps_tbl_bl_cur$`revised final ID`))




#correct misidentifications one by one
# Each statement rewrites `revised final ID` in place. The statements run in
# order, so a later rule also applies to names produced by an earlier rule.
{
# Genus-level or ambiguous IDs resolved to the name used for the mock communities.
all_ps_tbl_bl_cur$`revised final ID`[all_ps_tbl_bl_cur$`revised final ID` %in% c("Prochilodus")] <- "Prochilodus argenteus/hartii"
all_ps_tbl_bl_cur$`revised final ID`[all_ps_tbl_bl_cur$`revised final ID` %in% c("Prochilodus argenteus")] <- "Prochilodus argenteus/hartii"
all_ps_tbl_bl_cur$`revised final ID`[all_ps_tbl_bl_cur$`revised final ID` %in% c("Pimelodus")] <- "Pimelodus pohli"
all_ps_tbl_bl_cur$`revised final ID`[all_ps_tbl_bl_cur$`revised final ID` %in% c("Astyanax")] <- "Astyanax lacustris"
all_ps_tbl_bl_cur$`revised final ID`[all_ps_tbl_bl_cur$`revised final ID` %in% c("Hypostomus")] <- "Hypostomus alatus"
# IDs whose exact genus is missing ("NA <species>").
all_ps_tbl_bl_cur$`revised final ID`[all_ps_tbl_bl_cur$`revised final ID` %in% c("NA elegans/gilbert")] <- "Cyphocharax gilbert"
all_ps_tbl_bl_cur$`revised final ID`[all_ps_tbl_bl_cur$`revised final ID` %in% c("NA lepidura/xenodon")] <- "Curimatella lepidura"
# NOTE(review): the next two statements undo each other. After both have run,
# every row that was "NA lepidura/xenodon", "Roeboides xenodon" or
# "Curimatella lepidura" is "Roeboides xenodon", and no "Curimatella lepidura"
# remains. If a swap of the two names was intended, this does not perform it
# - confirm the intended outcome.
all_ps_tbl_bl_cur$`revised final ID`[all_ps_tbl_bl_cur$`revised final ID` %in% c("Roeboides xenodon")] <- "Curimatella lepidura"
all_ps_tbl_bl_cur$`revised final ID`[all_ps_tbl_bl_cur$`revised final ID` %in% c("Curimatella lepidura")] <- "Roeboides xenodon"





# Hoplias: the ambiguous ID is resolved according to the mock community (Group).
# SF groups: "Hoplias brasiliensis/intermedius" -> "Hoplias intermedius".
all_ps_tbl_bl_cur$`revised final ID`[(all_ps_tbl_bl_cur$`revised final ID` %in% c("Hoplias brasiliensis/intermedius")) & 
                                       all_ps_tbl_bl_cur$Group %in% c("Non-normalized SFmc","Normalized SFmc")] <- "Hoplias intermedius"


# JQ groups: "Hoplias brasiliensis/intermedius" -> "Hoplias brasiliensis".
all_ps_tbl_bl_cur$`revised final ID`[(all_ps_tbl_bl_cur$`revised final ID` %in% c("Hoplias brasiliensis/intermedius")) & 
                                       all_ps_tbl_bl_cur$Group %in% c("Non-normalized JQmc","Normalized JQmc")] <- "Hoplias brasiliensis"



# SFJQ group: "Hoplias intermedius" is widened to the ambiguous name.
all_ps_tbl_bl_cur$`revised final ID`[(all_ps_tbl_bl_cur$`revised final ID` %in% c("Hoplias intermedius")) & 
                                       all_ps_tbl_bl_cur$Group %in% c("Normalized SFJQmc")] <- "Hoplias brasiliensis/intermedius"




# JQ groups: "Hoplias intermedius" -> "Hoplias brasiliensis".
all_ps_tbl_bl_cur$`revised final ID`[(all_ps_tbl_bl_cur$`revised final ID` %in% c("Hoplias intermedius")) & 
                                       all_ps_tbl_bl_cur$Group %in% c("Non-normalized JQmc","Normalized JQmc")] <- "Hoplias brasiliensis"

}

Identify expected species

As this was a controlled experiment, we must identify the species that were expected and those that were not.

# Table of the species expected in each mock community.
# Earlier versions of the file: sfjq_species.csv, sfjq_species_06mar22.csv,
# sfjq_species_10mar22.csv.
# check.names = FALSE keeps the CSV column names exactly as written.
expctd_sps_tbl <- as_tibble(
  read.csv(file = "~/prjcts/fish_eDNA/sfjq/data/sfjq_species_02jun22.csv",
           header = TRUE,
           check.names = FALSE)
)

# T. chalceus was changed from 0.0073 to 0.0299 in the SF pool.

# Rename the species column to the key used when joining with the curated
# detections table.
names(expctd_sps_tbl)[names(expctd_sps_tbl) == "Species"] <- "revised final ID"

Proportions table

As this was a controlled experiment, we must identify the species that were expected and those that were not.

# Proportion tables ----
# The three mock communities (JQ, SF, SFJQ) go through the same two steps,
# so each step is a helper instead of three copy-pasted pipelines.

# Join the curated detections of `samples` with the expected species of
# `pool` and flag each row as "expected" / "not expected".
#   samples: values of all_ps_tbl_bl_cur$Sample to keep.
#   pool:    value of expctd_sps_tbl$Pool to keep.
# full_join() rather than left_join(): expected species that were never
# detected (e.g. P. hartii) would otherwise not show up.
# A row is "expected" when the join found it in the expected-species table,
# i.e. `Full name` is not NA.
join_expected_species <- function(samples, pool) {
  full_join(
    all_ps_tbl_bl_cur[all_ps_tbl_bl_cur$Sample %in% samples, ],
    expctd_sps_tbl[expctd_sps_tbl$Pool == pool, c(2:9)],
    by = "revised final ID"
  ) %>%
    mutate(
      `Expected Species` = if_else(!is.na(`Full name`), "expected", "not expected")
    )
}

# Collapse a joined table to one row per Sample x `revised final ID`, pairing
# the summed relative read abundance with the DNA input proportion.
#   joined_tbl:   output of join_expected_species().
#   sample_order: sample names, in the order wanted for the Sample factor.
# Samples whose name starts with "Normalized" are paired with the
# "... Norm pool" percentage, all others with the "... Non-norm pool" one.
# Rows coming only from the expected-species table have Sample = NA and are
# dropped by the filter.
summarise_proportions <- function(joined_tbl, sample_order) {
  norm_samples <- sample_order[startsWith(sample_order, "Normalized")]
  nonnorm_samples <- setdiff(sample_order, norm_samples)

  joined_tbl %>%
    # positional selection kept as in the original analysis
    select(c(1:24, 37:50)) %>%
    pivot_longer(cols = c("Percentage on respective Norm pool",
                          "Percentage on respective Non-norm pool"),
                 names_to = "Pool", values_to = "Proportion") %>%
    mutate(Sample = factor(Sample, levels = sample_order)) %>%
    filter(
      (Sample %in% nonnorm_samples &
         Pool %in% c("Percentage on respective Non-norm pool")) |
        (Sample %in% norm_samples &
           Pool %in% c("Percentage on respective Norm pool"))
    ) %>%
    group_by(Sample, `revised final ID`) %>%
    # The grouping columns come first in the result, so Sample and
    # `revised final ID` need not be re-assigned here.
    summarize(Proportion = unique(Proportion),
              `Relative abundance on sample` = sum(`Relative abundance on sample`),
              Primer = unique(Primer),
              `Expected Species` = unique(`Expected Species`),
              Pool = unique(Pool),
              `Num ASVs` = length(`ASV (Sequence)`),
              `Num OTUs` = length(unique(OTU))) %>%
    ungroup()
}


# JQmc ----
jq_samples <- c(
  "Normalized JQmc\nNeoFish", "Non-normalized JQmc\nNeoFish",
  "Normalized JQmc\nMiFish", "Non-normalized JQmc\nMiFish",
  "Normalized JQmc\nTeleo", "Non-normalized JQmc\nTeleo"
)

all_ps_tbl_jq <- join_expected_species(jq_samples, pool = "JQ")

all_ps_tbl_jq$`revised final ID` %>% unique()
colnames(all_ps_tbl_jq)

jq_proportions <- summarise_proportions(all_ps_tbl_jq, sample_order = jq_samples)


# SFmc ----
sf_samples <- c(
  "Normalized SFmc\nNeoFish", "Non-normalized SFmc\nNeoFish",
  "Normalized SFmc\nMiFish", "Non-normalized SFmc\nMiFish"
)

all_ps_tbl_sf <- join_expected_species(sf_samples, pool = "SF")

all_ps_tbl_sf$`revised final ID` %>% unique()
colnames(all_ps_tbl_sf)

sf_proportions <- summarise_proportions(all_ps_tbl_sf, sample_order = sf_samples)


# SFJQmc ----
# Only normalized samples exist for this mock community.
sfjq_samples <- c("Normalized SFJQmc\nNeoFish", "Normalized SFJQmc\nMiFish")

all_ps_tbl_sfjq <- join_expected_species(sfjq_samples, pool = "SFJQ")

all_ps_tbl_sfjq$`revised final ID` %>% unique()
colnames(all_ps_tbl_sfjq)

sfjq_proportions <- summarise_proportions(all_ps_tbl_sfjq, sample_order = sfjq_samples)



# Stack the three joined tables (SFJQ, SF, JQ); this is the table used
# from here on.
all_ps_tbl_sfjq_full <- dplyr::bind_rows(
  all_ps_tbl_sfjq,
  all_ps_tbl_sf,
  all_ps_tbl_jq
)


# Print the column names as a ready-to-paste quoted list.
cat(paste0(unique(colnames(all_ps_tbl_sfjq_full)), collapse = '",\n"'))

Pearson correlations between DNA input and sequence yield

# Correlation between DNA input and read abundance -----

# Rows of one sample from a proportions table, reduced to the three columns
# needed for the correlation and with incomplete rows removed.
#   proportions: jq_proportions, sf_proportions or sfjq_proportions.
#   sample_name: value of the Sample column to keep.
# Columns are selected by name (they are columns 2, 3 and 4 of the
# proportions tables) so the helper does not depend on column order.
proportion_pairs <- function(proportions, sample_name) {
  proportions %>%
    ungroup() %>%
    filter(Sample %in% c(sample_name)) %>%
    select(c("revised final ID", "Proportion", "Relative abundance on sample")) %>%
    as.data.frame() %>%
    na.exclude()
}

# JQmc ("norm" = normalized pool, "skew" = non-normalized pool)
jq_df_neo_norm <- proportion_pairs(jq_proportions, "Normalized JQmc\nNeoFish")
jq_df_mif_norm <- proportion_pairs(jq_proportions, "Normalized JQmc\nMiFish")
jq_df_tel_norm <- proportion_pairs(jq_proportions, "Normalized JQmc\nTeleo")
jq_df_neo_skew <- proportion_pairs(jq_proportions, "Non-normalized JQmc\nNeoFish")
jq_df_mif_skew <- proportion_pairs(jq_proportions, "Non-normalized JQmc\nMiFish")
jq_df_tel_skew <- proportion_pairs(jq_proportions, "Non-normalized JQmc\nTeleo")

# SF
sf_df_neo_norm <- proportion_pairs(sf_proportions, "Normalized SFmc\nNeoFish")
sf_df_mif_norm <- proportion_pairs(sf_proportions, "Normalized SFmc\nMiFish")
sf_df_neo_skew <- proportion_pairs(sf_proportions, "Non-normalized SFmc\nNeoFish")
sf_df_mif_skew <- proportion_pairs(sf_proportions, "Non-normalized SFmc\nMiFish")

# SFJQ
sfjq_df_neo_norm <- proportion_pairs(sfjq_proportions, "Normalized SFJQmc\nNeoFish")
sfjq_df_mif_norm <- proportion_pairs(sfjq_proportions, "Normalized SFJQmc\nMiFish")




# Pearson correlations: DNA input proportion vs relative read abundance,
# one test per sample.

# JQmc - NeoFish
cor.test(jq_df_neo_norm$Proportion,
         jq_df_neo_norm$`Relative abundance on sample`,
         method = "pearson")

cor.test(jq_df_neo_skew$Proportion,
         jq_df_neo_skew$`Relative abundance on sample`,
         method = "pearson")

# JQmc - MiFish
cor.test(jq_df_mif_norm$Proportion,
         jq_df_mif_norm$`Relative abundance on sample`,
         method = "pearson")

cor.test(jq_df_mif_skew$Proportion,
         jq_df_mif_skew$`Relative abundance on sample`,
         method = "pearson")

# JQmc - Teleo
cor.test(jq_df_tel_norm$Proportion,
         jq_df_tel_norm$`Relative abundance on sample`,
         method = "pearson")

cor.test(jq_df_tel_skew$Proportion,
         jq_df_tel_skew$`Relative abundance on sample`,
         method = "pearson")

# SF - NeoFish
cor.test(sf_df_neo_norm$Proportion,
         sf_df_neo_norm$`Relative abundance on sample`,
         method = "pearson")

cor.test(sf_df_neo_skew$Proportion,
         sf_df_neo_skew$`Relative abundance on sample`,
         method = "pearson")

# SF - MiFish
cor.test(sf_df_mif_norm$Proportion,
         sf_df_mif_norm$`Relative abundance on sample`,
         method = "pearson")

cor.test(sf_df_mif_skew$Proportion,
         sf_df_mif_skew$`Relative abundance on sample`,
         method = "pearson")

Richness analysis on Vegan

library(vegan)
# data(dune)
# decorana(dune)

# class(dune)
#1- prepare data for entry in vegan ----
# Wide table: one row per sample, one column per `final ID`, holding the
# summed relative read abundance of in-range ASVs (0 when not detected).

all_ps_blst_vegan <- all_ps_tbl_bl_cur %>%
  filter(`Expected length` %in% c("in range")) %>%
  # Group is "<Normalization> <Mock Community>": split at the first space.
  mutate("Normalization" = str_split(Group, pattern = " ", 2, simplify = TRUE)[, 1],
         "Mock Community" = str_split(Group, pattern = " ", 2, simplify = TRUE)[, 2]) %>%
  filter(!(Sample %in% c("Positive Control\n(P.glauca)", "neg-PCR2"))) %>%   # remove control samples
  select(c(Sample, Group, Normalization, `Mock Community`, Type, Primer, File_name, Library, Run, `final ID`, `Relative abundance on sample`)) %>%
  group_by(Sample, `final ID`, Group, Type, Primer, File_name, Library, Run, Normalization, `Mock Community`) %>%
  summarise(`Relative abundance on sample` = sum(`Relative abundance on sample`)) %>%
  # id_cols is named explicitly: passing it by position is deprecated in
  # recent tidyr releases.
  pivot_wider(id_cols = c(Sample, Group, Type, Primer, File_name, Library, Run, Normalization, `Mock Community`),
              names_from = `final ID`,
              values_from = `Relative abundance on sample`) %>%
  # species absent from a sample: NA -> 0
  mutate_if(is.numeric, ~replace(., is.na(.), 0)) %>%
  ungroup() %>%
  #2- associate sample numbers to sample names ----
  # Row index used as sample number (numeric, placed first); replaces the
  # former placeholder column plus `for (sample in 1:nrow(...))` loop.
  mutate("Sample number" = as.numeric(row_number())) %>%
  select(`Sample number`, everything()) %>%
  mutate(Normalization = factor(Normalization))

# Removing the Ecomol samples to simplify: keep only the two LGC MiniSeq runs.
all_ps_blst_vegan <- all_ps_blst_vegan %>%
  filter(Run %in% c("LGC_MiniSeq_1", "LGC_MiniSeq_2"))


# Quick look at the species columns (everything after the first 10 columns).
colnames(all_ps_blst_vegan)
hist(colSums(all_ps_blst_vegan[,-c(1:10)]))
hist(rowSums(all_ps_blst_vegan[,-c(1:10)]))
all_ps_blst_vegan[,-c(1:10)]

all_ps_blst_vegan %>% select(Sample, `Sample number`)

#3- create data.frame of species counts: rownames are Sample numbers ----

# Drop the metadata columns; `Sample number` stays for the row names.
all_ps_blst_vegan_df <- all_ps_blst_vegan %>%
  select(-c("Sample", "Group", "Type", "Primer", "File_name", "Library", "Run", "Normalization", "Mock Community")) %>%
  as.data.frame()

# Columns in alphabetical order.
all_ps_blst_vegan_df <-
  all_ps_blst_vegan_df[, base::sort(colnames(all_ps_blst_vegan_df)), drop = FALSE]

#4- name rows as Sample numbers and remove column ----
row.names(all_ps_blst_vegan_df) <- all_ps_blst_vegan_df$`Sample number`
all_ps_blst_vegan_df$`Sample number` <- NULL

library(vegan)
#5- ordination of the species table with vegan::decorana

all_ps_ord <- decorana(veg = all_ps_blst_vegan_df)

# Inspect the result.
summary(all_ps_ord)

str(all_ps_ord)

all_ps_ord$cproj


# Default plot, then points only, then an empty frame ("c") to which site
# points and site / species labels are added.
plot(all_ps_ord)
plot(all_ps_ord, type = "p")
plot(all_ps_ord, type = "c")

points(all_ps_ord, display = "sites", cex = 0.8, pch = 21, col = "red", bg = "yellow")
text(all_ps_ord, display = "sites", cex = 0.7, col = "blue")
text(all_ps_ord, display = "spec", cex = 0.7, col = "blue")



#6- NMDS analysis ----

#6a- Calculate distances ----
# Bray-Curtis distances between samples, all detected species included.
all_ps_vg_dist <- vegdist(all_ps_blst_vegan_df, method = "bray")

vegan::scores(all_ps_vg_dist)

# Keep only the expected species.
# drop = FALSE keeps a data.frame even if a single column matches; without
# it the table would collapse to a plain vector.
all_ps_blst_vegan_df %>% ncol()
all_ps_blst_vegan_df <- all_ps_blst_vegan_df[, (colnames(all_ps_blst_vegan_df) %in% expected_sps), drop = FALSE]


# Recompute distances and ordination on the reduced table.
all_ps_blst_vegan_df %>% ncol()
all_ps_vg_dist <- vegdist(all_ps_blst_vegan_df, method = "bray")

all_ps_ord <- decorana(veg = all_ps_blst_vegan_df)

all_ps_ord %>% summary()

all_ps_ord %>% str()

all_ps_ord$cproj
all_ps_ord


plot(all_ps_ord)
plot(all_ps_ord, type = "p")
plot(all_ps_ord, type = "c")
vegan::scores(all_ps_vg_dist)


# NMDS on the precomputed distances. The former `veg =` argument is dropped:
# metaMDS() has no such parameter.
# (earlier note: autotransform = FALSE doesn't seem to change the results)
# NOTE(review): no random seed is set in this section; if metaMDS() uses
# random starts here, scores and stress may differ between runs - confirm
# whether a seed is set elsewhere in the file.
all_ps_vegan_ord_meta <- metaMDS(comm = all_ps_vg_dist)
plot(all_ps_vegan_ord_meta, type = "t")


all_ps_vegan_ord_meta %>% str()
all_ps_vegan_ord_meta$stress


  
#6b- extract NMDS scores from results

# Site (sample) scores as a tibble; row names are the sample numbers.
# display = "sites" is requested explicitly so the result is always the
# matrix of site scores, whatever scores() returns by default in the
# installed vegan version.
all_vegan_meta <- vegan::scores(all_ps_vegan_ord_meta, display = "sites") %>%
  tidyr::as_tibble(rownames = "Sample number") %>%
  mutate(`Sample number` = as.numeric(`Sample number`))

#7- bring NMDS scores to complete table

# The first 10 columns of all_ps_blst_vegan are the sample number and the
# sample metadata; scores are matched on the sample number.
all_vegan_meta_tbl <- left_join(x = unique(all_ps_blst_vegan[, c(1:10)]),
                                y = all_vegan_meta,
                                by = "Sample number") %>%
  mutate(Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo")),
         `Mock Community` = factor(`Mock Community`))





library(factoextra)
library(ggforce)



nmds_PLOT <- all_vegan_meta_tbl %>% 
  # filter(Run %in% c("LGC_MiniSeq_1", "LGC_MiniSeq_2")) %>% 
  ggplot(aes(x = NMDS1,y = NMDS2, col = Primer,shape = Normalization,label = Sample,Group = `Mock Community`))+
    # stat_ellipse()+ 
  geom_point(size = 11)+
  theme_light(base_size = 18) +
  theme(legend.position="bottom") +
  coord_fixed(ratio = 1) +
  # ggrepel::geom_label_repel(label.size = 0.8,size = 3,min.segment.length = 2) +
  # ggrepel::geom_text_repel(col="black",size = 3,min.segment.length = 2) +
  # scale_shape_manual() %>% 
  scale_color_manual(
    # labels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish", "NeoFish/MiFish/Teleo"),
    labels = c("NeoFish", "MiFish", "Teleo"),
                     values = alpha(colour = colors_norm[c(1,3,5)] ))+
  annotate(geom = "text",
           x=c(0.275),y=c(-0.275),label=paste0("Stress: ",round(all_ps_vegan_ord_meta$stress,digits = 4)),size=5) +

    # ADD ggforce's ellipses
  ggforce::geom_mark_ellipse(inherit.aes = FALSE,
                             aes(x = NMDS1,y = NMDS2,
                                 group=`Mock Community`,
                                 label=`Mock Community`),
                             n = 100,
                             expand = 0.03,
                             label.fontsize = 20,con.cap = 0.1) 
  
    # facet_wrap(~`Mock Community`,ncol = 2)
  
#   
nmds_PLOT
  # 
# ggsave(file = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_NMDS.pdf",
ggsave(file = "~/outros/sfjq_temp/SFJQ_NMDS.pdf",
     plot = nmds_PLOT,
     device = "pdf",
     width = 40,
     height =25,
     units = "cm",
     dpi = 300)

ggsave(file = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_NMDS.png",
     plot = nmds_PLOT,
     device = "png",
     width = 31,
     height =20,
     units = "cm",
     dpi = 300)




# anosim----

################################# anosin function###############################
# Run an ANOSIM test (Bray-Curtis, 9999 permutations) on the community columns
# of a wide sample table.
#
# Arguments:
#   tbl      data frame / tibble containing a `Sample number` column (expected
#            in position 1), `cols_out` leading metadata columns in total, and
#            the community (abundance) columns after them.
#   cols_out number of leading metadata columns to leave out of the community
#            matrix; must be >= 1 and smaller than ncol(tbl).
#   Coluna   name (string) of the column of `tbl` used as the grouping factor.
#
# Returns the object produced by `anosim()`.
anosin_auto <- function(tbl, cols_out, Coluna) {
  # Fail early with explicit messages instead of cryptic downstream errors.
  if (!is.data.frame(tbl)) {
    stop("`tbl` must be a data frame or tibble", call. = FALSE)
  }
  if (!is.numeric(cols_out) || length(cols_out) != 1L || is.na(cols_out) ||
      cols_out < 1 || cols_out >= ncol(tbl)) {
    # Otherwise `(1 + cols_out):ncol(tbl)` would run backwards or out of range.
    stop("`cols_out` must be a single number >= 1 and < ncol(tbl)",
         call. = FALSE)
  }
  if (!"Sample number" %in% names(tbl)) {
    stop("`tbl` must contain a `Sample number` column", call. = FALSE)
  }
  if (!is.character(Coluna) || length(Coluna) != 1L ||
      !Coluna %in% names(tbl)) {
    stop("`Coluna` must name an existing column of `tbl`; got: ",
         paste(Coluna, collapse = ", "), call. = FALSE)
  }

  # Keep the first column plus the community columns, move the sample
  # numbers to the row names, then drop that column from the matrix.
  df <- tbl[, c(1, (1 + cols_out):ncol(tbl))] %>%
    as.data.frame() %>%
    `rownames<-`(.$`Sample number`) %>%
    select(-c("Sample number"))

  anosim(df, grouping = tbl[[Coluna]],
         permutations = 9999, distance = "bray", strata = NULL)
}
################################################################################


#Primers ----
#Todas MC juntas
all_ps_blst_vegan %>% 
  # filter(`Mock Community` %in% c("SFmc")) %>% 
anosin_auto(cols_out = 10,Coluna = "Primer")

#Apenas JQ
all_ps_blst_vegan %>% 
  filter(`Mock Community` %in% c("JQmc")) %>% 
anosin_auto(cols_out = 10,Coluna = "Primer")

#Apenas SF
all_ps_blst_vegan %>% 
  filter(`Mock Community` %in% c("SFmc")) %>% 
anosin_auto(cols_out = 10,Coluna = "Primer")

#Apenas SFJQ
all_ps_blst_vegan %>% 
  filter(`Mock Community` %in% c("SFJQmc")) %>% 
anosin_auto(cols_out = 10,Coluna = "Primer")


#Normalization ----
#Todas MC juntas
all_ps_blst_vegan %>% 
anosin_auto(cols_out = 10,Coluna = "Normalization")

#Apenas JQ
all_ps_blst_vegan %>% 
  filter(`Mock Community` %in% c("JQmc")) %>% 
anosin_auto(cols_out = 10,Coluna = "Normalization")

#Apenas SF
all_ps_blst_vegan %>% 
  filter(`Mock Community` %in% c("SFmc")) %>% 
anosin_auto(cols_out = 10,Coluna = "Normalization")

#Apenas SFJQ
all_ps_blst_vegan %>% 
  filter(`Mock Community` %in% c("SFJQmc")) %>% 
anosin_auto(cols_out = 10,Coluna = "Normalization")


all_vegan_meta_tbl %>% 
  filter(Run %in% c("LGC_MiniSeq_1","LGC_MiniSeq_2")) %>% 
  select(Sample,`Sample number`,Primer)

Plotting ASVs

# all_ps_tbl_blast_bckp6 <- all_ps_tbl_blast

all_ps_tbl_blast <- all_ps_tbl_bl_cur 




#28- ASVs plots by sample and species ----

# Fix the RNG seed (used e.g. by geom_jitter below) so plots are reproducible.
# `set.seed()` is called directly: wrapping it in `options()` only worked by
# accident, because `set.seed()` returns NULL.
set.seed(13)

#28a- ASV size distribution - alphabetical----
all_ps_tbl_blast$Run %>% unique()
  
ASV_size_by_Sample <- all_ps_tbl_blast %>%
  filter(!Sample %in% c("Positive Control\n(P.glauca)")) %>% 
  filter(!Run %in% c("ecomol_iSeq")) %>% 
  mutate(Sample = factor(Sample,levels = sample_levels)) %>% 
  mutate(Primer = factor(Primer,levels = c("NeoFish", "MiFish", 
                                           "Teleo", "NeoFish/MiFish/Teleo"))) %>% 
  ggplot(aes(y=Sample,
             x=`ASV size (pb)`,
             colour = Primer,
             size=`Relative abundance on sample`,
             shape=`Expected length`
             )) +
  geom_jitter(height = 0.2,
              width = 0) +
  ggplot2::scale_colour_manual(
                     values = ggplot2::alpha(colour = colors5[1:4] ,alpha =  0.3)) +
  coord_fixed(ratio = 8) +
  scale_x_continuous(breaks = c(20,60,80,100,120,140,160,180,200,220,240,260,280,300,320,340),expand = c(0.02,0.02)) +
  xlab("ASV length (bp)") +
  ylab("Sample") +
  ggtitle(label = "SFJQ mock communities ",
          subtitle = "All ASVs found in samples, by length and abundance") +
  theme_bw(base_size = 15) +
  theme(legend.position = "right") 

ASV_size_by_Sample


# Export the ASV-length plot as PNG, SVG and PDF.
# No `units` is given, so width/height use ggsave()'s default unit.
# The argument is spelled `filename` in full rather than relying on partial
# matching of `file`.
ggsave(filename = "~/prjcts/fish_eDNA/sfjq/results/figs/3-ASV_size_by_sample.png",
       plot = ASV_size_by_Sample,
       device = "png",
       width = 18,
       height = 10,
       dpi = 600)

ggsave(filename = "~/prjcts/fish_eDNA/sfjq/results/figs/3-ASV_size_by_sample.svg",
       plot = ASV_size_by_Sample,
       device = "svg",
       width = 18,
       height = 10,
       dpi = 600)

ggsave(filename = "~/prjcts/fish_eDNA/sfjq/results/figs/3-ASV_size_by_sample.pdf",
       plot = ASV_size_by_Sample,
       device = "pdf",
       width = 18,
       height = 10,
       dpi = 600)

# ggsave() opens and closes its own device. Only close a device when one is
# actually open; a bare dev.off() on the null device stops the script.
if (grDevices::dev.cur() > 1L) {
  grDevices::dev.off()
}



##################### paper ################################# ----

#29 - ASVS - sample X species (12S) - distribution: blast---- 
all_ps_tbl_blast$Group %>%unique()

#Jq & SF DNA pools - ASVs size and species by sample----

all_ps_tbl_blast$Sample %>% unfactor() %>% unique() %>% base::sort()
all_ps_tbl_blast$`revised final ID` %>% unique() %>% base::sort() %>% paste0(collapse = '", \n"') %>% cat()
all_ps_tbl_blast$`final ID` %>% unique() %>% base::sort() %>% paste0(collapse = '", \n"') %>% cat()


pool_labels <- c(
  "Normalized JQmc" = "Jequitinhonha\nNormalized DNA pool",
  "Non-normalized JQmc" = "Jequitinhonha\nNon-Normalized DNA pool",
  "Normalized SFJQmc" = "São Francisco & Jequitinhonha\nNormalized DNA pool",
  "Non-normalized SFmc" = "São Francisco\nNon-Normalized DNA pool",
  "Normalized SFmc" = "São Francisco\nNormalized DNA pool"
)

pools_levels <- c(
"Normalized JQmc",
"Non-normalized JQmc",
"Normalized SFJQmc",
"Normalized SFmc",
"Non-normalized SFmc"
)

#final ID levels
{
finalID_levels<- c(
#jq
"Astyanax lacustris", 
"Australoheros sp", 
"Delturus brevis", 
"Eugerres brasilianus", 
"Hypomasticus steindachneri", 

"Hypostomus nigrolineatus", 
"Megaleporinus elongatus", 
"Megaleporinus garmani", 
"Moenkhausia costae", 
"Rhamdia quelen", 
"Steindachneridion amblyurum", 
"Wertheimeria maculata",
#ambos
"Gymnotus carapo", 
"Hoplias", 
"Hoplias brasiliensis", 

"Hoplias intermedius", 
"Hoplias malabaricus", 
"Prochilodus", 
"Prochilodus argenteus", 
 
"Prochilodus costatus", 
"Steindachnerina elegans", 
"Trachelyopterus galeatus",
#sf

"Astyanax fasciatus", 
"Brycon orthotaenia", 
"Characidium lagosantense", 
"Crenicichla lepidota", 
# "Curimatella lepidura", esse é na vdd roeboides
"Roeboides xenodon",
"Eigenmannia virescens", 
"Franciscodoras marmoratus", 
"Hypostomus alatus", 
"Imparfinis minutus", 
"Leporinus reinhardti", 
"Microglanis leptostriatus", 
"Moenkhausia sanctaefilomenae", 
"Myleus micans", 
"Pamphorichthys hollandi", 
"Phalloceros uai", 

"Pimelodus maculatus", 
"Pimelodus pohli", 
"Pseudoplatystoma corruscans", 
"Pterygoplichthys etentaculatus", 
"Serrasalmus brandtii", 
"Tetragonopterus chalceus", 

#partial
"Astyanax", 
"Hoplias brasiliensis/intermedius",
"Hypostomus", 
"Pimelodus", 
"Prochilodus argenteus/hartii",
#trash


"NA elegans/gilbert", 
"NA lepidura/xenodon", 
"Acestrorhynchus lacustris",
"Coptodon zillii",
"Cyphocharax gilbert",
"Geophagus brasiliensis",
"Planaltina myersi", 
"Poecilia reticulata mitochondr"
)
}
finalID_levels[finalID_levels %>% duplicated()]



# final ID levels 2
# Ordered species levels for the `final ID` factors (supersedes the list
# defined above). Factor levels must be unique: "Roeboides xenodon" used to
# appear twice (once in the slot formerly held by "Curimatella lepidura" and
# once in alphabetical position), which makes factor(levels = ...) fail.
# The entry in the former Curimatella slot is the one that is kept.
{
finalID_levels <- c(
"Acestrorhynchus lacustris", 
"Acinocheirodon melanogramma", 
"Astyanax fasciatus", 
"Astyanax lacustris", 
"Australoheros sp", 
"Bos taurus", 
"Brycon orthotaenia", 
"Characidium lagosantense", 
"Coptodon zillii", 
"Crenicichla lepidota", 
# "Curimatella lepidura", now identified as Roeboides
"Roeboides xenodon",
"Cyphocharax gilbert", 
"Delturus brevis", 
"Eigenmannia virescens", 
"Eugerres brasilianus", 
"Franciscodoras marmoratus", 
"Geophagus brasiliensis", 
"Gymnotus carapo", 
"Hoplias brasiliensis", 
"Hoplias intermedius", 
"Hoplias malabaricus", 
"Hypomasticus steindachneri", 
"Hypostomus alatus", 
"Hypostomus nigrolineatus", 
"Imparfinis minutus", 
"Leporinus reinhardti", 
"Megaleporinus elongatus", 
"Megaleporinus garmani", 
"Microglanis leptostriatus", 
"Moenkhausia costae", 
"Moenkhausia sanctaefilomenae", 
"Myleus micans", 
"Pamphorichthys hollandi", 
"Phalloceros uai", 
"Pimelodus maculatus", 
"Pimelodus pohli", 
"Planaltina myersi", 
"Prionace glauca", 
"Prochilodus argenteus", 
"Prochilodus costatus", 
"Prochilodus hartii",
"Pseudoplatystoma corruscans", 
"Pterygoplichthys etentaculatus", 
"Rhamdia quelen", 
"Serrasalmus brandtii", 
"Steindachneridion amblyurum", 
"Tetragonopterus chalceus", 
"Trachelyopterus galeatus", 
"Wertheimeria maculata",

# partial (genus-level or ambiguous) identifications
"Astyanax", 
"Hoplias brasiliensis/intermedius", 
"Hypostomus", 
"Pimelodus", 
"Prochilodus", 
"Prochilodus argenteus/hartii")
}

sps_remove <- c(
NA,"NA",
"Acestrorhynchus lacustris", 
"Acinocheirodon melanogramma", 
"Bos taurus", 
"Coptodon zillii", 
"Curimatella lepidura", 
"Eugerres brasilianus", 
"Geophagus brasiliensis", 
"Leporinus reinhardti", 
"Moenkhausia costae", 
"Planaltina myersi", 
"Prionace glauca", 
"Pseudoplatystoma corruscans"
)

Article images

Fold-change bar plots

##Venn diagram

##Upset plot

DNA/RRA correlation plots

colnames(all_ps_tbl_sfjq_full_uniq)
all_ps_tbl_sfjq_full_uniq$Group
all_ps_tbl_sfjq_full_uniq$MC
all_ps_tbl_sfjq_full_uniq$Normalization




######function to plot lm

# lm_eqn <- function(df){
#     m <- lm(y ~ x, df);
#     eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, 
#          list(a = format(unname(coef(m)[1]), digits = 2),
#               b = format(unname(coef(m)[2]), digits = 2),
#              r2 = format(summary(m)$r.squared, digits = 3)))
#     as.character(as.expression(eq));
# }





#slope of the regression lines ----

#LM
 models_corr <- all_ps_tbl_sfjq_full_uniq %>% 
   filter(Normalization %in% c("Non-normalized")) %>% 
   # filter(Primer %in% c("NeoFish")) %>% 
  group_by(Primer,MC) %>% 
   select(c("Percentage on respective Non-norm pool", "Relative abundance on sample")) %>% 
  do(model = lm(formula = `Percentage on respective Non-norm pool` ~ `Relative abundance on sample`, data = .))
   # lm(formula = `Percentage on respective Non-norm pool` ~ `Relative abundance on sample`)
   # aov(formula = `Percentage on respective Non-norm pool` ~ `Relative abundance on sample`)

models_corr$model
models_corr$Primer
models_corr$MC



# One-row model summaries for each fitted lm.
# The stray `model` argument was removed: it is not defined at this point and
# glance() needs only the fitted model object itself.
# NOTE(review): the Primer/MC labels below assume the row order of
# `models_corr`; confirm against models_corr$Primer and models_corr$MC.
models_corr$model[[1]] %>% broom::glance() #NeoFish JQmc
models_corr$model[[2]] %>% broom::glance() #NeoFish SFmc
models_corr$model[[3]] %>% broom::glance() #MiFish JQmc
models_corr$model[[4]] %>% broom::glance() #MiFish SFmc
models_corr$model[[5]] %>% broom::glance() #Teleo JQmc

#AOV
models_corr <- all_ps_tbl_sfjq_full_uniq %>% 
   filter(Normalization %in% c("Non-normalized")) %>% 
   # filter(Primer %in% c("NeoFish")) %>% 
  group_by(Primer) %>% 
   select(c("Percentage on respective Non-norm pool", "Relative abundance on sample")) %>% 
  do(model = aov(formula = `Percentage on respective Non-norm pool` ~ `Relative abundance on sample`, data = .))
   # lm(formula = `Percentage on respective Non-norm pool` ~ `Relative abundance on sample`)
   # aov(formula = `Percentage on respective Non-norm pool` ~ `Relative abundance on sample`)

models_corr$model
models_corr$Primer

models_corr$model[[1]] %>% broom::tidy()
models_corr$model[[2]] %>% broom::tidy()
models_corr$model[[3]] %>% broom::tidy()



#JQmc ----
# jq_coef <- lm(`Percentage on respective Non-norm pool` ~ `Relative abundance on sample`,
#    (all_ps_tbl_sfjq_full_uniq %>% 
#       filter(MC %in% c("JQmc")) %>% 
#       filter(Normalization %in% c("Non-normalized"))))




############################ tentando com a tib com norm e n norm na mesma coluna
tab_curated_SFJQ_all_pools %>% colnames()


sfjq_sp_corr <-
  tab_curated_SFJQ_all_pools %>% 
  ggplot(aes(x=`input DNA (%)`,
             y=`RRA (%)`,
             col=Primer,
             shape=Pool))+
  geom_point() + 
  # geom_smooth(method=lm) +
  # coord_fixed()+
  scale_colour_manual(values = alpha(colour = colors_norm[c(1,3,5)] ,alpha =  0.8)) +
  # scale_shape_manual(drop=TRUE) +
  xlab("Input DNA (%)") +
  ylab("Relative Read Abundance (%)") +
  ggtitle("SFmc & JQmc: Correlation between\nInput DNA and RRA") +
  scale_x_sqrt(breaks=c(0,0.0001,0.001,0.01,0.025,0.05,0.1,0.2,0.3,0.4,0.6,0.8,1)*100) +
  scale_y_sqrt(breaks=c(0,0.0001,0.001,0.01,0.025,0.05,0.1,0.2,0.3,0.4,0.6,0.8,1)*100) +
  coord_fixed(ratio = 1)+
  geom_smooth(method=lm) +
  theme_bw(base_size = 10) +
  # facet_wrap(MC~Primer,ncol = 3) 
  facet_wrap(Primer~Group,ncol = 5) 
# +
#   scale_x_log10() +
#   scale_y_log10()
# minor_breaks = mb
# minor_breaks = mb

ggsave(file = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_RRA_DNA_bySp2.png", plot = sfjq_sp_corr, device = "png", width = 30, height = 15, units = "cm", dpi = 600)









###########################




sfjq_sp_corr <- all_ps_tbl_sfjq_full_uniq %>% 
  # filter(MC %in% c("JQmc")) %>%
  filter(Normalization %in% c("Non-normalized")) %>%
  # mutate(`Percentage on respective Non-norm pool`= if_else(`Percentage on respective Non-norm pool` %in% c(NA,"NA"),0,`Percentage on respective Non-norm pool`)) %>% 
  # mutate(`Relative abundance on sample`= if_else(`Relative abundance on sample` %in% c(NA,"NA"),0,`Relative abundance on sample`)) %>% View()
  
  ggplot(aes(x=`Percentage on respective Non-norm pool`*100,
             y=`Relative abundance on sample`*100,
             col=Primer,
             shape=MC
             ))+
  geom_point() + 
  # geom_smooth(method=lm) +
  # coord_fixed()+
  scale_colour_manual(values = alpha(colour = colors_norm[c(1,3,5)] ,alpha =  0.8)) +
  # scale_shape_manual(drop=TRUE) +
  xlab("Input DNA (%)") +
  ylab("Relative Read Abundance (%)") +
  ggtitle("SFmc & JQmc: Correlation between\nInput DNA and RRA") +
  scale_x_sqrt(breaks=c(0,0.0001,0.001,0.01,0.025,0.05,0.1,0.2,0.3,0.4,0.6,0.8,1)*100) +
  scale_y_sqrt(breaks=c(0,0.0001,0.001,0.01,0.025,0.05,0.1,0.2,0.3,0.4,0.6,0.8,1)*100) +
  coord_fixed(ratio = 1)+
  geom_smooth(method=lm) +
  theme_bw(base_size = 10) +
  # facet_wrap(MC~Primer,ncol = 3) 
  facet_wrap(Primer~Normalization,ncol = 3) 
# +
#   scale_x_log10() +
#   scale_y_log10()
# minor_breaks = mb
# minor_breaks = mb

ggsave(file = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_RRA_DNA_bySp.png", plot = sfjq_sp_corr, device = "png", width = 30, height = 15, units = "cm", dpi = 600)




#SFmc ----


library("plotly")

sf_sp_corr <- all_ps_tbl_sfjq_full_uniq %>% 
  filter(MC %in% c("SFmc")) %>% 
  filter(Normalization %in% c("Non-normalized")) %>% 
  # mutate(`Percentage on respective Non-norm pool`= if_else(`Percentage on respective Non-norm pool` %in% c(NA,"NA"),0,`Percentage on respective Non-norm pool`)) %>% 
  # mutate(`Relative abundance on sample`= if_else(`Relative abundance on sample` %in% c(NA,"NA"),0,`Relative abundance on sample`)) %>% View()
  
  ggplot(aes(x=`Percentage on respective Non-norm pool`*100,
             y=`Relative abundance on sample`*100,
             col=Primer,
             shape=MC))+
  geom_point() + 
  # geom_smooth(method=lm) +
  # coord_fixed()+
  scale_colour_manual(values = alpha(colour = colors_norm[c(1,3,5)] ,alpha =  0.8)) +
  xlab("input DNA (%)") +
  ylab("Relative Read Abundance (%)") +
  ggtitle("SFmc: Correlation between\nInput DNA and RRA") +
  scale_x_sqrt(breaks=c(0,0.0001,0.001,0.01,0.025,0.05,0.1,0.2,0.3,0.4,0.6,0.8,1)*100) +
  scale_y_sqrt(breaks=c(0,0.0001,0.001,0.01,0.025,0.05,0.1,0.2,0.3,0.4,0.6,0.8,1)*100) +
  coord_fixed(ratio = 1)+
  geom_smooth(method=lm) +
  theme_bw(base_size = 10) +
  facet_wrap(~Primer,ncol = 3) 
    
    
    
    ggplotly(sf_sp_corr)


ggsave(file = "~/prjcts/fish_eDNA/sfjq/results/figs/SF_RRA_DNA_bySp.png", plot = sf_sp_corr, device = "png", width = 20, height = 15, units = "cm", dpi = 600)

Additional Graphics

# set colors ----

colors6 <- c("#19821c","#8dbc8f", #neo norm & non norm
             "#171d9a","#8d90c7", #mif norm & non norm
             "#8c1717","#c18d8d"  #tel norm & non norm
             )

scales::show_col(colors6)






# carregando tabela 
# https://docs.google.com/spreadsheets/d/1sTsOaI999E9Py_f4ll5j838r7CVc991a8mPmd8LvykU/edit#gid=0

tab_curated_SFJQ_all_pools <- read.csv("~/prjcts/fish_eDNA/sfjq/data/SFJQ_all_pools-tabelas_curadas_para_o_R-SFJQmc-copy.csv",check.names = F) %>% 
  as_tibble()

# Tidy the curated table read from CSV:
#   1. keep only rows where both `input DNA (%)` and `RRA (%)` are non-zero;
#   2. group by Pool / Normalization / Primer / Species and carry the listed
#      columns through;
#   3. set the Primer factor levels and add the RRA / input-DNA fold change;
#   4. drop duplicated rows.
tab_curated_SFJQ_all_pools <- tab_curated_SFJQ_all_pools %>% 
    filter(`input DNA (%)` != 0 & `RRA (%)` !=0) %>%
  group_by(Pool,Normalization,Primer,Species) %>% 
  # NOTE(review): `Num ASVs`, `Num OTUs`, `input DNA (%)` and `RRA (%)` are
  # passed through without aggregation, so a group can return several rows.
  # Recent dplyr versions deprecate multi-row summarise() in favour of
  # reframe() -- confirm against the installed dplyr version.
  summarize(Pool = unique(Pool),
            Normalization = unique(Normalization),
            Status = unique(Status),
            Species = unique(Species),
            Primer = unique(Primer),
            `Num ASVs` = `Num ASVs`,
            `Num OTUs` = `Num OTUs`,
            `input DNA (%)` = `input DNA (%)`,
            `RRA (%)` = `RRA (%)`
            # `Expected species` = length(unique(`revised final ID`[`Expected Species` %in% c("expected")])),
            # `Expected species list` = list(unique(base::sort(`revised final ID`[`Expected Species` %in% c("expected")]))),
            # `revised final ID`= unique(`revised final ID`),
            # `RRA (%)` = sum(`Relative abundance on sample`),
            # `Percentage on respective Norm pool` = unique(`Percentage on respective Norm pool`),
            # `Percentage on respective Non-norm pool` = unique(`Percentage on respective Non-norm pool`),
            # `Relative abundance on sample` = unique(`Relative abundance on sample`)
            ) %>%
  ungroup() %>% 
  # mutate(`revised final ID`=factor(`revised final ID`
  #                                  # , levels = rev(finalID_levels)
  #                                  )) %>% 
  # Fix the primer order used in plots and legends.
  mutate(Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish","NeoFish/MiFish/Teleo"))) %>% 
  # Fold change of RRA (numerator) relative to input DNA (denominator).
  mutate(`Fold change (RRA/DNA input)` = gtools::foldchange(denom = `input DNA (%)`,num = `RRA (%)`)) %>% 
  # Remove rows that are exact duplicates.
  unique()
  # mutate(Group = factor(Group,levels = c("Normalized SFJQmc", "Non-normalized JQmc", "Normalized JQmc", "Non-normalized SFmc", "Normalized SFmc" ))) %>% 
  # mutate(MC = str_remove(Group,"Normalized |Non-normalized ")) %>% 
  # mutate(Normalization = str_remove(Group," SFmc| JQmc| SFJQmc")) 


tab_curated_SFJQ_all_pools$`Fold change (RRA/DNA input)` %>% sort() %>% duplicated()

tab_curated_SFJQ_all_pools[c(which(tab_curated_SFJQ_all_pools$`Fold change (RRA/DNA input)` %>% as.character() %>% duplicated()),
                             which(tab_curated_SFJQ_all_pools$`Fold change (RRA/DNA input)` %>% as.character() %>% duplicated())-1),] %>% View()




# SFJQmc ----
## build tree ----
### read 12S db seqs for the species present in pools
SFJQ_Sps_seqs <- Biostrings::readDNAStringSet(filepath = "~/prjcts/fish_eDNA/sfjq/data/trees/SFJQmc-38_SPs_fused-unique.fas") %>%
# SFJQ_Sps_seqs <- Biostrings::readDNAStringSet(filepath = "~/outros/sfjq_temp/trees/SFJQmc-38_SPs_fused-unique.fas") %>% 
  DECIPHER::RemoveGaps()

### align seqs
SFJQ_Sps_algn <- DECIPHER::AlignSeqs(myXStringSet = SFJQ_Sps_seqs, 
                                      refinements = 100,
                                      iterations = 100,
                                      verbose = TRUE)

### generate distance matrix
SFJQ_Sps_dist <- DECIPHER::DistanceMatrix(myXStringSet = SFJQ_Sps_algn,
                                            includeTerminalGaps = FALSE,
                                            correction = "Jukes-Cantor",
                                            processors = 20,
                                            verbose = TRUE)

### generate dendrogram/tree from alignment and distance matrix
SFJQmc_tree <- ape::nj(SFJQ_Sps_dist)
        # tree <- phangorn::NJ(SFJQ_Sps_dist)
class(SFJQmc_tree)

### save tree as newick
ape::write.tree(phy = SFJQmc_tree,file = "~/prjcts/fish_eDNA/sfjq/data/trees/SFJQmc-38_SPs_fused-unique_APEtree.nwk")
# ape::write.tree(phy = SFJQmc_tree,file = "~/outros/sfjq_temp/trees/SFJQmc-38_SPs_fused-unique_APEtree.nwk")

### read tree from file (or stay with the same object)
# `read.tree` is namespace-qualified like the ape::nj()/ape::write.tree()
# calls above, so this line does not depend on which packages are attached.
# SFJQmc_tree <- ape::read.tree("~/outros/sfjq_temp/trees/SFJQmc-38_SPs_fused-unique_APEtree.nwk")
SFJQmc_tree <- ape::read.tree("~/prjcts/fish_eDNA/sfjq/data/trees/SFJQmc-38_SPs_fused-unique_APEtree.nwk")


## species metadata ----
### read table with pools species, input DNA and RRA
# tab_curated_SFJQ_all_pools <- read.csv("~/outros/sfjq_temp/SFJQ_all_pools-tabelas_curadas_para_o_R-SFJQmc.csv",check.names = F)
# tab_curated_SFJQ_all_pools <- read.csv("~/prjcts/fish_eDNA/sfjq/data/SFJQ_all_pools-tabelas_curadas_para_o_R-SFJQmc.csv",check.names = F) %>% 
#   as_tibble()
# 
# tab_curated_SFJQ_all_pools




tab_curated_SFJQ_all_pools %>% colnames()
tab_curated_SFJQ_all_pools %>% unique()
all_ps_tbl_sfjq_full_uniq %>% colnames()





# tab_curated_SFJQ_all_pools  <- all_ps_tbl_sfjq_full_uniq



#converting the corrected table to the format required for next steps


# Reshape `all_ps_tbl_sfjq_full_uniq` into the long format used by the tree
# and bar plots below: each row keeps the `input DNA (%)` and `Fold Change`
# value that matches its own normalization (Normalized vs Non-normalized).
#
# The pipeline no longer ends in `%>% View()`. View() returns NULL, so the
# assignment stored NULL in `tab_curated_SFJQ_all_pools` and every later step
# failed. The table is now assigned first and viewed separately.
#
# NOTE(review): `RRA (%)` must already exist in `all_ps_tbl_sfjq_full_uniq`;
# the rename from `Relative abundance on sample` is disabled below -- confirm.
tab_curated_SFJQ_all_pools <-
  all_ps_tbl_sfjq_full_uniq %>%
  pivot_longer(c("Percentage on respective Norm pool", "Percentage on respective Non-norm pool"),
               names_to = "DNA Originary Pool", values_to = "input DNA (%)") %>%
  pivot_longer(c("Recovered proportion norm", "Recovered proportion non norm"),
               names_to = "Fold Change Originary Pool", values_to = "Fold Change") %>%
  select(-c(
    # "DNA Originary Pool", "RRA Originary Pool",
    # "Sample,"
    # "Group",
    "Run"
  )) %>%
  rename(
    Species = `revised final ID`,
    `Num OTUs` = `OTUs`,
    `Num ASVs` = `ASVs`,
    `Num IDs` = `IDs`,
    # `RRA (%)` = `Relative abundance on sample`,
    # `Expected species`,
    # `Expected species list`,
    # `Fold Change` = `Ratio`,
    `Pool` = `MC`) %>%
  # Each pivot_longer duplicated every row; keep only the copy whose source
  # column matches the row's normalization.
  filter((`DNA Originary Pool` %in% c("Percentage on respective Norm pool") & `Normalization` %in% c("Normalized")) |
           (`DNA Originary Pool` %in% c("Percentage on respective Non-norm pool") & `Normalization` %in% c("Non-normalized")) ) %>%
  filter((`Fold Change Originary Pool` %in% c("Recovered proportion norm") & `Normalization` %in% c("Normalized")) |
           (`Fold Change Originary Pool` %in% c("Recovered proportion non norm") & `Normalization` %in% c("Non-normalized")) ) %>%
  select(-c("DNA Originary Pool", "Fold Change Originary Pool")) %>%
  mutate(Status = if_else(`Expected species` == 0, "Contamination","Expected")) %>%
  # Proportions -> percentages.
  mutate(`RRA (%)` = `RRA (%)` * 100,
         `input DNA (%)` = `input DNA (%)` * 100) %>%
  unite(Normalization, Primer, col = "Primer_norm", remove = FALSE, sep = " ") %>%
  mutate(Primer_norm = factor(Primer_norm, levels = c("Normalized NeoFish","Normalized MiFish","Normalized Teleo","Non-normalized NeoFish","Non-normalized MiFish","Non-normalized Teleo")))

# Inspect the result only in interactive sessions.
if (interactive()) {
  View(tab_curated_SFJQ_all_pools)
}

# all_ps_tbl_sfjq_full_uniq %>% colnames()


 

# tab_curated_SFJQ_all_pools[sort(c(which(tab_curated_SFJQ_all_pools$`Fold Change` %>% as.character() %>% duplicated()),which(tab_curated_SFJQ_all_pools$`Fold Change`%>% as.character() %>% duplicated())-1)),] %>% View()
# tab_curated_SFJQ_all_pools[tab_curated_SFJQ_all_pools$`Fold Change` %>% duplicated(),] %>% View()

### tidy table
tab_curated_SFJQ <- tab_curated_SFJQ_all_pools %>%
  as_tibble() %>%
  filter(Status == "Expected") %>%
  filter(Pool == "SFJQmc") %>%
  # filter(MC == "SFJQmc") %>% 
  # filter(Pool == "SFJQmc") %>% 
  mutate(Primer = factor(Primer, levels = c("NeoFish","MiFish","Teleo")),
         Species = factor(Species),
         # `revised final ID` = factor(`revised final ID`),
         Normalization = factor(Normalization),
         Pool = factor(Pool),
         Status = factor(Status),
         # `RRA (%)` = as.numeric(`Relative abundance on sample`)*100,
         `RRA (%)` = as.numeric(`RRA (%)`),
         `input DNA (%)` = as.numeric(`input DNA (%)`))


## correct tips labels ----
### check tip names in tree
SFJQmc_tree$tip.label


### rename tree tips to mach table
SFJQmc_tree$tip.label
tab_curated_SFJQ$Species %>% unique() %>% sort()

# Rename the raw tree tip labels to the species names used in the curated
# table. The lookup maps old label -> new label; tip labels that are not in
# the lookup are left untouched.
SFJQmc_tree$tip.label <- local({
  tip_renames <- c(
    "SF_1339_Prochilodus_costatus_JQ_2860" = "Prochilodus costatus",
    "JQ_5612_Prochilodus_argenteus_hartii" = "Prochilodus argenteus/hartii",
    "JQ_6586_Steindachnerina_elegans-Cyphocharax_gilbert" = "Cyphocharax gilbert",
    "SF_1230_Serrasalmus_brandtii" = "Serrasalmus brandtii",
    "SF_1162_Myleus_micans" = "Myleus micans",
    "JQ_2984_Hypomasticus_steindachneri" = "Hypomasticus steindachneri",
    "JQ_4290_Megaleporinus_garmani" = "Megaleporinus garmani",
    "JQ_1762_Megaleporinus_elongatus" = "Megaleporinus elongatus",
    "SF_1037_Brycon_orthotaenia" = "Brycon orthotaenia",
    "SF_1381_Characidium_lagosantense" = "Characidium lagosantense",
    "1136_Acestrorhynchus_lacustris" = "Acestrorhynchus lacustris",
    "JQ_7822_Hoplias_malabaricus" = "Hoplias malabaricus",
    "SF_1377_Hoplias_intermedius_brasiliensis" = "Hoplias brasiliensis/intermedius",
    "SF_0916_Roeboides_xenodon" = "Roeboides xenodon",
    "SF_0901_Tetragonopterus_chalceus" = "Tetragonopterus chalceus",
    "SF_1113_Moenkhausia_sanctaefilomenae" = "Moenkhausia sanctaefilomenae",
    "SFJQ_7812_Moenkhausia_costae" = "Moenkhausia costae",
    "SF_1153_Astyanax_cf_fasciatus" = "Astyanax fasciatus",
    "JQ_7880_Astyanax_lacustris" = "Astyanax lacustris",
    "SF_1141_Pterygoplichthys_etentaculatus" = "Pterygoplichthys etentaculatus",
    "SF_1041_Hypostomus_alatus" = "Hypostomus alatus",
    "JQ_7893_Hypostomus_nigrolineatus" = "Hypostomus nigrolineatus",
    "SF_1248_Gymnotus_carapo_JQ_1631" = "Gymnotus carapo",
    "SF_1117_Eigenmannia_virescens" = "Eigenmannia virescens",
    "JQ_1936_Delturus_brevis" = "Delturus brevis",
    "SF_0708_Franciscodoras_marmoratus" = "Franciscodoras marmoratus",
    "JQ_7817_Wertheimeria_maculata" = "Wertheimeria maculata",
    "SF_1243_Trachelyopterus_galeatus_JQ_5675" = "Trachelyopterus galeatus",
    "SF_1403_Pimelodus_pohli" = "Pimelodus pohli",
    "SF_1280_Pimelodus_maculatus" = "Pimelodus maculatus",
    "1135_Pseudoplatystoma_corruscans" = "Pseudoplatystoma corruscans",
    "JQ_2996_Steindachneridion_amblyurum" = "Steindachneridion amblyurum",
    "SF_1104_Microglanis_leptostriatus" = "Microglanis leptostriatus",
    "SF_1161_Imparfinis_minutus" = "Imparfinis minutus",
    "JQ_5645_Rhamdia_aff_quelen" = "Rhamdia quelen",
    "SFJQ_2943_Eugerres_brasilianus" = "Eugerres brasilianus",
    "SF_1368_Phalloceros_uai" = "Phalloceros uai",
    "SF_1099_Pamphorichthys_hollandi" = "Pamphorichthys hollandi",
    "SF_1264_Crenicichla_lepidota" = "Crenicichla lepidota",
    "JQ_6584_Australoheros_sp" = "Australoheros sp",
    "304_Geophagus_brasiliensis" = "Geophagus brasiliensis"
  )

  labels_now <- SFJQmc_tree$tip.label
  # Positions whose current label has an entry in the lookup.
  needs_rename <- labels_now %in% names(tip_renames)
  labels_now[needs_rename] <- unname(tip_renames[labels_now[needs_rename]])
  labels_now
})

### check if all names have correspondences
SFJQmc_tree$tip.label %in% tab_curated_SFJQ$Species
tab_curated_SFJQ$Species %in% SFJQmc_tree$tip.label
tab_curated_SFJQ$Species[!tab_curated_SFJQ$Species %in% SFJQmc_tree$tip.label]





### convert ape tree to prettier ggtree object
# Draw the NJ tree with a scale axis (theme_tree2) and right-aligned tip labels.
# `align = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
# NOTE(review): the x-limit of 0.42 presumably leaves room for the aligned
# labels -- adjust if the tree depth changes.
SFJQmc_tree_plot <- ggtree(SFJQmc_tree) +
  theme_tree2() +
  geom_tiplab(offset = 0, align = TRUE) +
  xlim(0, 0.42)


### save tree plot
# ggsave(file = "~/prjcts/fish_eDNA/sfjq/data/trees/SFJQmc-38_SPs_fused-unique-tree_plot.pdf", plot = SFJQmc_tree_plot, device = "pdf", width = 24, height = 24, units = "cm", dpi = 600)
ggsave(file = "~/outros/sfjq_temp/trees/SFJQmc-38_SPs_fused-unique-tree_plot.pdf", plot = SFJQmc_tree_plot, device = "pdf", width = 24, height = 24, units = "cm", dpi = 600)



### extract tips order from tree to reproduce on plots
SPs_order_in_SFJQ_tree <- ggtree::get_taxa_name(SFJQmc_tree_plot) %>% rev()

            # SPs_order_in_SFJQ_tree <- extract_tree_data(tree_plot) %>% 
            #     dplyr::filter(isTip) %>% 
            #     dplyr::pull(label)



## plots ----
### RRA histogram ----

# library(ggdist)



SFJQmc_RRA_plot <-
  tab_curated_SFJQ %>%
  filter(Pool %in% c("SFJQmc")) %>% 
  mutate(Species = factor(Species,levels = SPs_order_in_SFJQ_tree)) %>% 
  ggplot(aes(y = Species,
             x = `RRA (%)`,
             fill = Primer, 
             col = Normalization
             ))+
  geom_bar(stat = "identity", size = 0.3,width = .75,
           # alpha = 0.7,
           position = position_dodge(preserve = "single" ,width = 1.1)) +
           # position = position_dodgejust(preserve = "single" ,width = 1.2)) +
  scale_color_manual(values = c("#000000","#747474")) +
  scale_fill_manual(values = colors6[c(1,3)]) +
  geom_point(aes(y = Species,
                 x = `input DNA (%)`),
             shape = "|",
             size = 3,
             colour = "#000000") +
  scale_x_break(c(13, 30),
                scales = "fixed") +
  scale_x_continuous(breaks=c(0,5,10,13,30,32)) +
  
  xlab("Relative read abundance (%)")+ 
  # opts(axis.title.y = theme_text(vjust=-0.5))
 theme(axis.text = element_text(vjust = -0.5))

## save plot 

# ggsave(file = "~/prjcts/fish_eDNA/sfjq/data/trees/SFJQmc-38_SPs_fused-unique-RRA_barplot.pdf", plot = SFJQmc_RRA_plot, device = "pdf", width = 18, height = 24, units = "cm", dpi = 600)
# Close a leftover graphics device only if one is open; a bare dev.off() on
# the null device raises an error and stops the script before the plot is saved.
if (grDevices::dev.cur() > 1L) {
  grDevices::dev.off()
}
# `filename` is spelled out in full rather than relying on partial matching
# of `file`.
ggsave(filename = "~/outros/sfjq_temp/trees/SFJQmc-38_SPs_fused-unique-RRA_barplot.pdf", plot = SFJQmc_RRA_plot, device = "pdf", width = 30, height = 24, units = "cm", dpi = 600)




# SFmc ----
## build tree ----
### read 12S db seqs for the species present in pools and select only pool species
### print every sequence name of the 12S set (sorted, one per line) to help
### choosing the identifiers of the pool species
cat(paste0(sort(names(SFJQ_Sps_seqs)), collapse = '",\n"'))

### keep only the sequences of the species listed for the SF pool
SF_Sps_seqs <- SFJQ_Sps_seqs[c(
  "SF_0708_Franciscodoras_marmoratus",
  "SF_0901_Tetragonopterus_chalceus",
  "SF_0916_Roeboides_xenodon",
  "SF_1037_Brycon_orthotaenia",
  "SF_1041_Hypostomus_alatus",
  "SF_1099_Pamphorichthys_hollandi",
  "SF_1104_Microglanis_leptostriatus",
  "SF_1113_Moenkhausia_sanctaefilomenae",
  "SF_1117_Eigenmannia_virescens",
  "SF_1141_Pterygoplichthys_etentaculatus",
  "SF_1153_Astyanax_cf_fasciatus",
  "SF_1161_Imparfinis_minutus",
  "SF_1162_Myleus_micans",
  "SF_1230_Serrasalmus_brandtii",
  "SF_1243_Trachelyopterus_galeatus_JQ_5675",
  "SF_1248_Gymnotus_carapo_JQ_1631",
  "SF_1264_Crenicichla_lepidota",
  "SF_1280_Pimelodus_maculatus",
  "SF_1339_Prochilodus_costatus_JQ_2860",
  "SF_1368_Phalloceros_uai",
  "SF_1377_Hoplias_intermedius_brasiliensis",
  "SF_1381_Characidium_lagosantense",
  "SF_1403_Pimelodus_pohli"
)]

### multiple sequence alignment of the selected sequences
SF_Sps_algn <- DECIPHER::AlignSeqs(
  myXStringSet = SF_Sps_seqs,
  refinements = 100,
  iterations = 100,
  verbose = TRUE
)

### pairwise distances (Jukes-Cantor correction, terminal gaps ignored)
SF_Sps_dist <- DECIPHER::DistanceMatrix(
  myXStringSet = SF_Sps_algn,
  includeTerminalGaps = FALSE,
  correction = "Jukes-Cantor",
  processors = 20,
  verbose = TRUE
)

### neighbor-joining tree built from the distance matrix
SFmc_tree <- ape::nj(SF_Sps_dist)
# tree <- phangorn::NJ(SFJQ_Sps_dist)
class(SFmc_tree)

### store the tree in newick format
# ape::write.tree(phy = SFmc_tree,file = "~/prjcts/fish_eDNA/sfjq/data/trees/SFmc-23_SPs_fused-unique_APEtree.nwk")
ape::write.tree(
  phy = SFmc_tree,
  file = "~/outros/sfjq_temp/trees/SFmc-23_SPs_fused-unique_APEtree.nwk"
)

### read tree from file (or stay with the same object)
# Only the copy written just above (under "~/outros/sfjq_temp") is read back.
# The project-folder copy is not written in this section (its write.tree call
# is commented out), so reading it first could fail on a missing file, and its
# result was overwritten by the next line anyway.
# SFmc_tree <- read.tree("~/prjcts/fish_eDNA/sfjq/data/trees/SFmc-23_SPs_fused-unique_APEtree.nwk")
SFmc_tree <- read.tree("~/outros/sfjq_temp/trees/SFmc-23_SPs_fused-unique_APEtree.nwk")

## species metadata ----
### read table with pools species, input DNA and RRA

### tidy table
# Expected species of the SFmc pool, with grouping columns stored as factors
# and the abundance columns stored as numbers.
tab_curated_SF <- tab_curated_SFJQ_all_pools %>%
  filter(
    Pool %in% "SFmc",
    Status %in% "Expected"
  ) %>%
  mutate(
    Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo")),
    Species = factor(Species),
    Normalization = factor(Normalization),
    Pool = factor(Pool),
    Status = factor(Status),
    `RRA (%)` = as.numeric(`RRA (%)`),
    `input DNA (%)` = as.numeric(`input DNA (%)`)
  )


## correct tips labels ----
### check tip names in tree
SFmc_tree$tip.label


### rename tree tips to match table
SFmc_tree$tip.label
sort(unique(tab_curated_SF$Species))

# Replace each database identifier by the species name used in the table;
# labels that are not listed keep their current value.
SFmc_tree$tip.label <- dplyr::recode(
  SFmc_tree$tip.label,
  "SF_1339_Prochilodus_costatus_JQ_2860" = "Prochilodus costatus",
  "SF_1230_Serrasalmus_brandtii" = "Serrasalmus brandtii",
  "SF_1162_Myleus_micans" = "Myleus micans",
  "SF_1037_Brycon_orthotaenia" = "Brycon orthotaenia",
  "SF_1381_Characidium_lagosantense" = "Characidium lagosantense",
  "SF_1377_Hoplias_intermedius_brasiliensis" = "Hoplias intermedius",
  "SF_0916_Roeboides_xenodon" = "Roeboides xenodon",
  "SF_0901_Tetragonopterus_chalceus" = "Tetragonopterus chalceus",
  "SF_1113_Moenkhausia_sanctaefilomenae" = "Moenkhausia sanctaefilomenae",
  "SF_1153_Astyanax_cf_fasciatus" = "Astyanax fasciatus",
  "SF_1141_Pterygoplichthys_etentaculatus" = "Pterygoplichthys etentaculatus",
  "SF_1041_Hypostomus_alatus" = "Hypostomus alatus",
  "SF_1248_Gymnotus_carapo_JQ_1631" = "Gymnotus carapo",
  "SF_1117_Eigenmannia_virescens" = "Eigenmannia virescens",
  "SF_0708_Franciscodoras_marmoratus" = "Franciscodoras marmoratus",
  "SF_1243_Trachelyopterus_galeatus_JQ_5675" = "Trachelyopterus galeatus",
  "SF_1403_Pimelodus_pohli" = "Pimelodus pohli",
  "SF_1280_Pimelodus_maculatus" = "Pimelodus maculatus",
  "SF_1104_Microglanis_leptostriatus" = "Microglanis leptostriatus",
  "SF_1161_Imparfinis_minutus" = "Imparfinis minutus",
  "SF_1368_Phalloceros_uai" = "Phalloceros uai",
  "SF_1099_Pamphorichthys_hollandi" = "Pamphorichthys hollandi",
  "SF_1264_Crenicichla_lepidota" = "Crenicichla lepidota"
)

### check that tree tips and table species correspond to each other
SFmc_tree$tip.label %in% tab_curated_SF$Species
tab_curated_SF$Species %in% SFmc_tree$tip.label
tab_curated_SF$Species[!(tab_curated_SF$Species %in% SFmc_tree$tip.label)]





### convert ape tree to prettier ggtree object
# ggtree drawing of the SF tree with aligned tip labels and an x axis
# limited to 0-0.42.
SFmc_tree_plot <- ggtree(SFmc_tree) +
  theme_tree2() +
  geom_tiplab(offset = 0, align = TRUE) +
  xlim(0, 0.42)

### write the tree figure as a 24 x 20 cm PDF
# ggsave(file = "~/prjcts/fish_eDNA/sfjq/data/trees/SFmc-23_SPs_fused-unique-tree_plot.pdf",
ggsave(
  filename = "~/outros/sfjq_temp/trees/SFmc-23_SPs_fused-unique-tree_plot.pdf",
  plot = SFmc_tree_plot,
  device = "pdf",
  width = 24,
  height = 20,
  units = "cm",
  dpi = 600
)

### tip labels reported by get_taxa_name(), reversed, to order species on plots
SPs_order_in_SF_tree <- rev(get_taxa_name(SFmc_tree_plot))

            # SPs_order_in_SF_tree <- extract_tree_data(tree_plot) %>% 
            #     dplyr::filter(isTip) %>% 
            #     dplyr::pull(label)




## plots ----
### RRA histogram ----

# Horizontal bar plot of relative read abundance per species for the SF pool.
# Bars are filled by the normalization/primer combination and outlined by
# normalization; "|" marks show the input DNA percentage.
SFmc_RRA_plot <- tab_curated_SF %>%
  # filter(Normalization %in% c("Normalized")) %>%
  unite(col = "Primer_norm", Normalization, Primer, sep = " ", remove = FALSE) %>%
  mutate(
    Primer_norm = factor(
      Primer_norm,
      levels = c(
        "Normalized NeoFish",
        "Normalized MiFish",
        "Non-normalized NeoFish",
        "Non-normalized MiFish"
      )
    ),
    # species follow the tip order extracted from the tree
    Species = factor(Species, levels = SPs_order_in_SF_tree)
  ) %>%
  ggplot(
    aes(
      y = Species,
      x = `RRA (%)`,
      fill = Primer_norm,
      colour = Normalization
    )
  ) +
  geom_bar(
    stat = "identity",
    size = 0.3,
    width = 1,
    position = position_dodge(preserve = "single", width = 1.5)
  ) +
  scale_fill_manual(values = colors6[c(1, 3, 2, 4)], name = "") +
  geom_point(
    aes(y = Species, x = `input DNA (%)`, colour = Normalization),
    shape = "|",
    size = 3
  ) +
  scale_color_manual(values = c("#747474", "#000000")) +
  xlab("Relative read abundance (%)")
# +
#   scale_color_manual(values = c("#000000","#848484"))

## save plot (30 x 30 cm PDF)

# ggsave(file = "~/prjcts/fish_eDNA/sfjq/data/trees/SFmc-23_SPs_fused-unique-RRA_barplot.pdf",
#      plot = SFmc_RRA_plot, device = "pdf", width = 24, height = 20, units = "cm", dpi = 600)
ggsave(
  filename = "~/outros/sfjq_temp/trees/SFmc-23_SPs_fused-unique-RRA_barplot.pdf",
  plot = SFmc_RRA_plot,
  device = "pdf",
  width = 30,
  height = 30,
  units = "cm",
  dpi = 600
)



# JQmc ----

## build tree ----
### read 12S db seqs for the species present in pools and select only pool species
### print every sequence name of the 12S set (sorted, one per line) to help
### choosing the identifiers of the pool species
cat(paste0(sort(names(SFJQ_Sps_seqs)), collapse = '",\n"'))

### keep only the sequences of the species listed for the JQ pool
JQ_Sps_seqs <- SFJQ_Sps_seqs[c(
  "JQ_1762_Megaleporinus_elongatus",
  "JQ_1936_Delturus_brevis",
  "JQ_2984_Hypomasticus_steindachneri",
  "JQ_2996_Steindachneridion_amblyurum",
  "JQ_4290_Megaleporinus_garmani",
  "JQ_5612_Prochilodus_argenteus_hartii",
  "JQ_5645_Rhamdia_aff_quelen",
  "JQ_6584_Australoheros_sp",
  "JQ_6586_Steindachnerina_elegans-Cyphocharax_gilbert",
  "JQ_7817_Wertheimeria_maculata",
  "JQ_7822_Hoplias_malabaricus",
  "JQ_7880_Astyanax_lacustris",
  "JQ_7893_Hypostomus_nigrolineatus",
  "SF_1243_Trachelyopterus_galeatus_JQ_5675",
  "SF_1248_Gymnotus_carapo_JQ_1631",
  "SF_1339_Prochilodus_costatus_JQ_2860",
  "SF_1377_Hoplias_intermedius_brasiliensis"
)]

### multiple sequence alignment of the selected sequences
JQ_Sps_algn <- DECIPHER::AlignSeqs(
  myXStringSet = JQ_Sps_seqs,
  refinements = 100,
  iterations = 100,
  verbose = TRUE
)

### pairwise distances (Jukes-Cantor correction, terminal gaps ignored)
JQ_Sps_dist <- DECIPHER::DistanceMatrix(
  myXStringSet = JQ_Sps_algn,
  includeTerminalGaps = FALSE,
  correction = "Jukes-Cantor",
  processors = 20,
  verbose = TRUE
)

### neighbor-joining tree built from the distance matrix
JQmc_tree <- ape::nj(JQ_Sps_dist)
# tree <- phangorn::NJ(SFJQ_Sps_dist)
class(JQmc_tree)

### store the tree in newick format
# ape::write.tree(phy = JQmc_tree,file = "~/prjcts/fish_eDNA/sfjq/data/trees/JQmc-23_SPs_fused-unique_APEtree.nwk")
ape::write.tree(
  phy = JQmc_tree,
  file = "~/outros/sfjq_temp/trees/JQmc-23_SPs_fused-unique_APEtree.nwk"
)

### read tree from file (or stay with the same object)
# Read back the copy written just above. The previous active line pointed to
# the project folder, where the tree is not written in this section, and used
# a truncated ".nw" extension, so the file could not be found.
# JQmc_tree <- read.tree("~/prjcts/fish_eDNA/sfjq/data/trees/JQmc-23_SPs_fused-unique_APEtree.nwk")
JQmc_tree <- read.tree("~/outros/sfjq_temp/trees/JQmc-23_SPs_fused-unique_APEtree.nwk")

## species metadata ----
### read table with pools species, input DNA and RRA

### tidy table
# Expected species of the JQmc pool, with grouping columns stored as factors
# and the abundance columns stored as numbers.
tab_curated_JQ <- tab_curated_SFJQ_all_pools %>%
  filter(
    Pool %in% "JQmc",
    Status %in% "Expected"
  ) %>%
  mutate(
    Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo")),
    Species = factor(Species),
    Normalization = factor(Normalization),
    Pool = factor(Pool),
    Status = factor(Status),
    `RRA (%)` = as.numeric(`RRA (%)`),
    `input DNA (%)` = as.numeric(`input DNA (%)`)
  )


## correct tips labels ----
### check tip names in tree
JQmc_tree$tip.label


### rename tree tips to match table
JQmc_tree$tip.label
sort(unique(tab_curated_JQ$Species))

# Replace each database identifier by the species name used in the table;
# labels that are not listed keep their current value.
JQmc_tree$tip.label <- dplyr::recode(
  JQmc_tree$tip.label,
  "SF_1339_Prochilodus_costatus_JQ_2860" = "Prochilodus costatus",
  "JQ_5612_Prochilodus_argenteus_hartii" = "Prochilodus argenteus/hartii",
  "JQ_6586_Steindachnerina_elegans-Cyphocharax_gilbert" = "Cyphocharax gilbert",
  "JQ_2984_Hypomasticus_steindachneri" = "Hypomasticus steindachneri",
  "JQ_4290_Megaleporinus_garmani" = "Megaleporinus garmani",
  "JQ_1762_Megaleporinus_elongatus" = "Megaleporinus elongatus",
  "JQ_7822_Hoplias_malabaricus" = "Hoplias malabaricus",
  "SF_1377_Hoplias_intermedius_brasiliensis" = "Hoplias brasiliensis",
  "JQ_7880_Astyanax_lacustris" = "Astyanax lacustris",
  "JQ_7893_Hypostomus_nigrolineatus" = "Hypostomus nigrolineatus",
  "SF_1248_Gymnotus_carapo_JQ_1631" = "Gymnotus carapo",
  "JQ_1936_Delturus_brevis" = "Delturus brevis",
  "JQ_7817_Wertheimeria_maculata" = "Wertheimeria maculata",
  "SF_1243_Trachelyopterus_galeatus_JQ_5675" = "Trachelyopterus galeatus",
  "JQ_2996_Steindachneridion_amblyurum" = "Steindachneridion amblyurum",
  "JQ_5645_Rhamdia_aff_quelen" = "Rhamdia quelen",
  "JQ_6584_Australoheros_sp" = "Australoheros sp"
)

### check that tree tips and table species correspond to each other
JQmc_tree$tip.label %in% tab_curated_JQ$Species
tab_curated_JQ$Species %in% JQmc_tree$tip.label
tab_curated_JQ$Species[!(tab_curated_JQ$Species %in% JQmc_tree$tip.label)]




#invert branches to match Siluriformes position on the top

# inspect the edge matrix and node table to find the node numbers to flip
JQmc_tree$edge
JQmc_tree %>% as_tibble() %>% View()

# Draw the tree and swap the positions of nodes 21 and 20.
# The plot is wrapped in parentheses because %>% binds tighter than +:
# without them only the xlim() object would be passed to flip(), not the
# assembled tree plot.
(
  ggtree(JQmc_tree) +
    # geom_text(aes(label=node)) +
    theme_tree2() +
    geom_tiplab(offset = 0, align = TRUE) +
    xlim(0, 0.42)
) %>%
  ggtree::flip(node1 = 21, node2 = 20)


# Alternative: rotate all nodes before plotting.
# NOTE(review): rotateNodes() is not provided by the packages listed in the
# library chunk of this file (phytools has a function with this name) -
# confirm that the providing package is loaded.
rotateNodes(tree = JQmc_tree, "all") %>%
  ggtree() +
  theme_tree2() +
  geom_tiplab(offset = 0, align = TRUE) +
  xlim(0, 0.42)



### convert ape tree to prettier ggtree object
# ggtree drawing of the JQ tree with aligned tip labels and an x axis
# limited to 0-0.42.
JQmc_tree_plot <- ggtree(JQmc_tree) +
  theme_tree2() +
  geom_tiplab(offset = 0, align = TRUE) +
  xlim(0, 0.42)

### write the tree figure as a 24 x 24 cm PDF
# ggsave(file = "~/prjcts/fish_eDNA/sfjq/data/trees/JQmc-17_SPs_fused-unique-tree_plot.pdf",
ggsave(
  filename = "~/outros/sfjq_temp/trees/JQmc-17_SPs_fused-unique-tree_plot.pdf",
  plot = JQmc_tree_plot,
  device = "pdf",
  width = 24,
  height = 24,
  units = "cm",
  dpi = 600
)

### tip labels reported by get_taxa_name(), reversed, to order species on plots
SPs_order_in_JQ_tree <- rev(get_taxa_name(JQmc_tree_plot))

            # SPs_order_in_JQ_tree <- extract_tree_data(tree_plot) %>% 
            #     dplyr::filter(isTip) %>% 
            #     dplyr::pull(label)




## plots ----
### RRA histogram ----

# Horizontal bar plot of relative read abundance per species for the JQ pool.
# Bars are filled by the normalization/primer combination and outlined by
# normalization; "|" marks show the input DNA percentage.
JQmc_RRA_plot <- tab_curated_JQ %>%
  unite(col = "Primer_norm", Normalization, Primer, sep = " ", remove = FALSE) %>%
  mutate(
    Primer_norm = factor(
      Primer_norm,
      levels = c(
        "Normalized NeoFish",
        "Normalized MiFish",
        "Normalized Teleo",
        "Non-normalized NeoFish",
        "Non-normalized MiFish",
        "Non-normalized Teleo"
      )
    ),
    # species follow the tip order extracted from the tree
    Species = factor(Species, levels = SPs_order_in_JQ_tree)
  ) %>%
  ggplot(
    aes(
      y = Species,
      x = `RRA (%)`,
      fill = Primer_norm,
      colour = Normalization
    )
  ) +
  geom_bar(
    stat = "identity",
    size = 0.3,
    width = 1,
    position = position_dodge(preserve = "single", width = 1.5)
  ) +
  scale_fill_manual(values = colors6[c(1, 3, 5, 2, 4, 6)], name = "") +
  geom_point(
    aes(y = Species, x = `input DNA (%)`, colour = Normalization),
    shape = "|",
    size = 3
  ) +
  scale_color_manual(values = c("#747474", "#000000")) +
  xlab("Relative read abundance (%)")


## save plot (30 x 30 cm PDF)

# ggsave(file = "~/prjcts/fish_eDNA/sfjq/data/trees/JQmc-23_SPs_fused-unique-RRA_barplot.pdf", plot = JQmc_RRA_plot, device = "pdf", width = 24, height = 20, units = "cm", dpi = 600)
ggsave(
  filename = "~/outros/sfjq_temp/trees/JQmc-17_SPs_fused-unique-RRA_barplot.pdf",
  plot = JQmc_RRA_plot,
  device = "pdf",
  width = 30,
  height = 30,
  units = "cm",
  dpi = 600
)




#tree ----


# SFJQmc_tree <- read.tree("~/prjcts/fish_eDNA/sfjq/data/12S_full_SFJQmc_fused_SPs_e_3_contams.nwk")




# Exploratory notes for combining the SFJQmc tree with the RRA bar plot.
# Quick look at the tree with the generic ggplot interface.
ggplot(SFJQmc_tree) + geom_tree() + theme_tree()

# This is convenient shorthand

# # tree_plot <- 
#   ggtree(SFJQmc_tree) + 
#   theme_tree2() +
#   geom_tiplab(offset = 0,align = T)+ 
#   xlim(0, 0.42) 

  
  # NOTE(review): tree_plot is only assigned in commented-out code in this
  # section - confirm the object already exists in the session.
  tree_plot
  
  
  
    
#TODO create function to build double graph of tree and bars

  
  
# newick tree to plot along the graph
  tree4plot <- SFJQmc_tree

  
# table for ggplot to go alongside the tree (long format, can have factors)
  tbl4plot <- tab_curated_SFJQ
  
# plot to put by side (y axis must be the species in tree (column name == Species))
  plot4tree <- SFJQmc_RRA_plot
  
  
  
### generate plot from phylo object (.nwk read by ape::read.tree)
  

class(SFJQmc_tree)

# sum of all branch lengths, and structure of the tree object
tree4plot$edge.length %>% sort() %>% sum() 
tree4plot %>% str()


# data frame stored inside the ggtree plot object
tree_plot$data

# The plot below is displayed but not stored: the assignment is commented out.
# tree_plot <-
  ggtree(tr = tree4plot,layout = "rectangular") +
  theme_tree2() +
  # geom_tiplab(offset = 0,align = T)+
  geom_tiplab(align = T)+
  xlim(0, 0.42)
  
  
### extract tips order from tree to reproduce on plots
SPs_order_in_tree <- ggtree::get_taxa_name(SFJQmc_tree_plot) %>% rev()
  
  
  
  # data behind the bar plot with Species re-levelled to the tree order
  # (the result is displayed, not stored)
  plot4tree$data %>% 
    dplyr::mutate(Species = factor(Species, levels = SPs_order_in_tree))
    
  
  
    
    
    ## check tip names in tree
# print the current tip labels of the JQ tree
JQmc_tree$tip.label







### convert ape tree to prettier ggtree object
# (same construction as the JQmc_tree_plot built earlier in this file)
JQmc_tree_plot <- ggtree(JQmc_tree) + 
  theme_tree2() +
  geom_tiplab(offset = 0,align = T)+ 
  xlim(0, 0.42)
    
    
# Placeholder for the combined tree + bar plot function mentioned in the TODO
# above; the body is still empty, so calling it returns NULL.
treeNbar_plot  <- function(){
    
  }  
  

# https://www.r-bloggers.com/2016/12/add-layer-to-specific-panel-of-facet_plot-output-2/
# facet_plot(tree_plot, panel = 'Stacked Barplot', 
#            data = tab_curated_SFJQ, geom = geom_histogram,
#            mapping = aes(x = Species,y =`RRA (%)`,  fill = as.factor(Primer)),
#            stat='identity',position = 'dodge' )
# 
# # 
# p3 <- facet_plot(tree_plot, panel='bar', data=tab_curated_SFJQ, geom=geom_bar, 
#                  aes(x=`RRA (%)`, y=Species,fill = Primer),
#                  stat = "identity",
#                  position = "dodge")


#https://oup.silverchair-cdn.com/oup/backfile/Content_public/Journal/mbe/35/12/10.1093_molbev_msy194/3/msy194_supp.pdf?Expires=1648204036&Signature=gZC6A1vfyaUWOyL~fKn8wxgY~3fbTBI3jPOGbVtwZSzv3jlXISjCahA37gwR3QTr6oN0SK-bdwAlHQyaPpkdj2~yi5scNQXAQrUi0EQNOqkOo3HUvFvCr-Dir2y7N03vIo5urr1n2idrPclTXtTRtiu7avn255T5eg~cXv0NBNUgiVFcwHHnZ81qQUrSdiA54wIvEs~RF18DYkp-Gla1CJT0eUGuYF8LfFXG5Dq1CgcZV0qGs0fKgfIKRlAT~AP25Xxkdh20RzAkqgBFvxp0JazrVOz5uvdok3uSu3023etErTxhaW7rm67VkCUBVxRgtG8GdFT3fOFJAsPg26Wagw__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA

# 
# tree_plot %<+% tab_curated_SFJQ + 
#   geom_histogram(aes(y = Species,
#                      x = `RRA (%)`,
#                      fill = Primer),
#                  stat = "identity",
#                  position = "dodge")

# Rename the Species column to tip.label in place.
# NOTE(review): this modifies tab_curated_SFJQ itself, so code that refers to
# a Species column of this table will no longer find it after this line.
colnames(tab_curated_SFJQ)[colnames(tab_curated_SFJQ) == "Species"] <- "tip.label"



# First attempt: add a panel of RRA bars next to the tree with facet_plot().
# NOTE(review): tree_plot is not assigned in the visible code of this section -
# confirm it exists in the session.
p2 <- 
  facet_plot(p = tree_plot, panel = "SNP", data = tab_curated_SFJQ, geom = geom_histogram,
                 mapping=aes(y = tip.label, x = `RRA (%)`, fill = Primer),
                 stat = "identity",
                 position = "dodge") +
# %>%
#   facet_plot("Trait", bar_data, ggstance::geom_barh,
#              aes(x = dummy_bar_value, color = location, fill = location),
#              stat = "identity", width = .6) +
  theme_tree2(legend.position=c(.05, .85))
print(p2)



# Second attempt: same idea with geom_facet(); this overwrites p2.
p2 <- tree_plot + geom_facet(panel = "RRA (%)",
                       data = tab_curated_SFJQ,
                       geom = geom_bar,
                       mapping = aes(x=`RRA (%)`, fill = Primer),orientation = 'y',stat="identity")





# Return the data frame stored in the `data` element of a tree plot object.
#   tree_disp    : object whose `data` element holds the plotted tree data
#   displayorder : when TRUE (default) rows are sorted by their `y` column
extract_tree_data <- function(tree_disp, displayorder = TRUE) {
  tree_tbl <- tree_disp$data
  if (displayorder) dplyr::arrange(tree_tbl, y) else tree_tbl
}

# labels of the tip rows of the tree plot data, in display order
SPs_order_in_tree <- dplyr::pull(
  dplyr::filter(extract_tree_data(tree_plot), isTip),
  label
)

dendrograms of ASVs & db species

fold change standard deviation

References

These analyses were based on and inspired by other incredible resources, listed below:


#Bonus

NeoFish redesign

This is a dynamic report, intended to show the current state of the analyses. Many procedures and conclusions might change as the pipeline evolves. If you notice errors/mistakes/typos, or have any suggestions, we would be glad to know.

---
title: "DNA metabarcoding of mock communities highlights potential biases when assessing Neotropical fish diversity"
author: 
  - "Hilário, OH; Mendes, IS; Sales, NG; Carvalho, DC"
date: "27/07/2023"
output: 
  html_document:
    code_download: yes
    theme: flatly
    toc: true
    toc_depth: 4
    toc_float: true
  bibliography: references.bib 
editor_options:
  chunk_output_type: console
---

```{r setup, include=FALSE}
# knitr::opts_chunk$set(echo = TRUE)
```

***

\pagebreak


# Abstract

Despite the increasing popularity of DNA metabarcoding in the
assessment of aquatic ecosystems using fish eDNA or ichthyoplankton,
challenges have hampered its broader application in the Neotropical
freshwaters. Using five mock communities composed of fish species from
two Neotropical river basins, we evaluated the influence of DNA
concentration and choice of mitochondrial 12S molecular markers
(MiFish, NeoFish and Teleo) on species detection and Relative Read
Abundance (RRA) using DNA metabarcoding. Of the three 12S markers
analysed, only MiFish detected all species from all mock communities.
The performance of a taxonomy-free approach using ASV/MOTUs was
not as precise as assigning DNA reads to species using a curated 12S
library that includes approximately 100 fish species, since more than one
ASV/MOTU was observed for the same specimen. Thus, here we
showcase the importance of a custom reference database to allow
precise assignment of Neotropical fish species in metabarcoding studies
and that the RRA is dependent on community composition, marker and
DNA concentration. We highlight the importance of controlled
experiments using known species communities before large investments
are made in assessing biodiversity using non-invasive methods that
apply DNA metabarcoding.



# Bioinformatics

## Data acquisition  

Download demultiplexed samples from _Base Space_ using the _bs_ interface.  

```{bash, eval=FALSE}
#navigate to raw-data folder (quoted so that paths with spaces still work)
cd "$raw_data_folder/$run_folder";

#authenticate to BaseSpace (only at first log in)
bs auth;

#list datasets from runs on your BaseSpace
bs list datasets;

#create folders to organize fastq files
#(-p also creates missing parent folders and does not fail if the folder already exists)
mkdir -p ~/runs/run_01mar21/fastq/;        #edna
mkdir -p ~/runs/run_09fev21/fastq/;        #edna
mkdir -p ~/runs/run_29jul20/fastq/;        #edna

#download runs from BaseSpace
bs download project -n fish_eDNA -o ~/runs/run_29jul20/fastq/ --extension=fastq.gz;          #first LGC run
bs download project -n eDNA_2run_B -o ~/runs/run_09fev21/fastq/ --extension=fastq.gz;        #second LGC run
bs download project -n iSeqRun2_Daniel -o ~/runs/run_01mar21/fastq/ --extension=fastq.gz;    #iSeq samples (Ecomol)

#organize all fastq files of each run in a single folder
mkdir -p ~/runs/run_01mar21/fastq/all;
mkdir -p ~/runs/run_09fev21/fastq/all;
mkdir -p ~/runs/run_29jul20/fastq/all;

#move all fastq files to a single folder
mv ~/runs/run_01mar21/fastq/*/*fastq.gz ~/runs/run_01mar21/fastq/all;
mv ~/runs/run_09fev21/fastq/*/*fastq.gz ~/runs/run_09fev21/fastq/all;
mv ~/runs/run_29jul20/fastq/*/*fastq.gz ~/runs/run_29jul20/fastq/all;

```

<br>

## Load _R libs_ and system programs

```{r, eval=FALSE,echo=TRUE}
# 0 - load libraries ----
# Packages are attached in this exact order.
library(dplyr)
library(tidyr)
library(tibble)
library(stringr)
library(ggplot2)
library(ggbreak)
library(ggtree)
library(phyloseq)
library(Biostrings)
library(Matrix)
library(ShortRead)
library(dada2)
library(DECIPHER)
library(future)
library(vegan)
library(ape)
library(phangorn)
library(adegenet)

# complete path to the cutadapt executable
cutadapt <- "/usr/local/bin/cutadapt"

# project root; the folders below are derived from it
prjct_path <- "~/prjcts/fish_eDNA/sfjq"

notes_path <- file.path(prjct_path, "notes")

results_path <- file.path(prjct_path, "results")

figs_path <- file.path(results_path, "figs")

prcj_radical <- "SFJq_fish_metabarcoding"

# project data folder where the processed reads will be stored
data_path <- file.path(prjct_path, "data", "reads")

```

<br>

## Quality control

Check the overall quality of the sequencing runs for all samples.

```{bash, eval=FALSE,echo=TRUE}


```

<br>


## Demultiplex SFJQ sample (MiFish & NeoFish mixed)

```{bash, eval=FALSE,echo=TRUE}
#Demultiplex SFJQ sample (MiFish & NeoFish mixed)
#
#Notes on this block:
# - the primer fasta is referenced through $HOME because the shell does not
#   expand "~" when it appears after the "file:" prefix
# - the output folder is created first, otherwise cutadapt and the "2>"
#   redirection fail
# - demultiplexed files are named <sample>-<fwd>-<rev>_R[12]_001.fastq.gz, so
#   the copy globs match "_R[12]_001.fastq.gz" after the primer pair name

#samples MiniSeq LGC
mkdir -p ~/runs/run_09fev21/fastq/all/sfjq_dmx

cutadapt -j 79 --no-indels \
  -g "file:$HOME/prjcts/fish_eDNA/sfjq/data/primers_neo_mi.fasta" \
  -G "file:$HOME/prjcts/fish_eDNA/sfjq/data/primers_neo_mi.fasta" \
  -o ~/runs/run_09fev21/fastq/all/sfjq_dmx/SFJQ-{name1}-{name2}_R1_001.fastq.gz \
  -p ~/runs/run_09fev21/fastq/all/sfjq_dmx/SFJQ-{name1}-{name2}_R2_001.fastq.gz \
  ~/runs/run_09fev21/fastq/all/SFJQ-neo-mi_S23_L001_R1_001.fastq \
  ~/runs/run_09fev21/fastq/all/SFJQ-neo-mi_S23_L001_R2_001.fastq \
  2> ~/runs/run_09fev21/fastq/all/sfjq_dmx/cut_SFJQ_demux.txt

#copy the correctly paired primer combinations back to the folder of the same run
cp ~/runs/run_09fev21/fastq/all/sfjq_dmx/SFJQ-neo_FWD-neo_REV_R[12]_001.fastq.gz ~/runs/run_09fev21/fastq/all/
cp ~/runs/run_09fev21/fastq/all/sfjq_dmx/SFJQ-mif_FWD-mif_REV_R[12]_001.fastq.gz ~/runs/run_09fev21/fastq/all/

#move the original mixed files out of the run folder
mv ~/runs/run_09fev21/fastq/all/SFJQ-neo-mi_S23_L001_R* ~/runs/run_09fev21/fastq/all/sfjq_dmx/


#samples iSeq Ecomol
mkdir -p ~/runs/run_01mar21/fastq/all/sfjq_dmx

cutadapt -j 79 --no-indels \
  -g "file:$HOME/prjcts/fish_eDNA/sfjq/data/primers_neo_mi.fasta" \
  -G "file:$HOME/prjcts/fish_eDNA/sfjq/data/primers_neo_mi.fasta" \
  -o ~/runs/run_01mar21/fastq/all/sfjq_dmx/Da23-{name1}-{name2}_R1_001.fastq.gz \
  -p ~/runs/run_01mar21/fastq/all/sfjq_dmx/Da23-{name1}-{name2}_R2_001.fastq.gz \
  ~/runs/run_01mar21/fastq/all/Da23_S72_L001_R1_001.fastq \
  ~/runs/run_01mar21/fastq/all/Da23_S72_L001_R2_001.fastq \
  2> ~/runs/run_01mar21/fastq/all/sfjq_dmx/cut_SFJQ_demux.txt

cp ~/runs/run_01mar21/fastq/all/sfjq_dmx/Da23-neo_FWD-neo_REV_R[12]_001.fastq.gz ~/runs/run_01mar21/fastq/all/
cp ~/runs/run_01mar21/fastq/all/sfjq_dmx/Da23-mif_FWD-mif_REV_R[12]_001.fastq.gz ~/runs/run_01mar21/fastq/all/

mv ~/runs/run_01mar21/fastq/all/Da23_S72_L001_R* ~/runs/run_01mar21/fastq/all/sfjq_dmx/
```

## Set path to raw data

```{r, eval=FALSE,echo=TRUE}
#1 - load runs raw data ----
## All libs are demultiplexed
# Folders that hold the raw fastq files of each sequencing run (after unzipping).
libs_path1 <- "~/runs/run_29jul20/fastq/all"
libs_path2 <- "~/runs/run_09fev21/fastq/all"
libs_path3 <- "~/runs/run_01mar21/fastq/all"

# show the fastq files found in each folder
list.files(libs_path1, pattern = "fastq")
list.files(libs_path2, pattern = "fastq")
list.files(libs_path3, pattern = "fastq")

```

### Identify sample names radicals

```{r, eval=FALSE,echo=TRUE}
#2 - get sample names ----

# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
{
  all_fnFs1 <- sort(list.files(libs_path1, pattern = "_R1_001.fastq", full.names = TRUE))
  all_fnRs1 <- sort(list.files(libs_path1, pattern = "_R2_001.fastq", full.names = TRUE))

  all_fnFs2 <- sort(list.files(libs_path2, pattern = "_R1_001.fastq", full.names = TRUE))
  all_fnRs2 <- sort(list.files(libs_path2, pattern = "_R2_001.fastq", full.names = TRUE))

  all_fnFs3 <- sort(list.files(libs_path3, pattern = "_R1_001.fastq", full.names = TRUE))
  all_fnRs3 <- sort(list.files(libs_path3, pattern = "_R2_001.fastq", full.names = TRUE))

  all_fnFs <- c(all_fnFs1, all_fnFs2, all_fnFs3)
  all_fnRs <- c(all_fnRs1, all_fnRs2, all_fnRs3)
}
#load csv with primers and respective samples
primers_n_samples <- read.csv(file = "~/prjcts/fish_eDNA/sfjq/data/primers_n_samples_sfjq.csv",
         header = TRUE)


#3 - map sample names to reads files ----
# placeholder columns, filled sample by sample in the loop below
primers_n_samples <- primers_n_samples %>%
  mutate("R1" = "R1",
         "R2" = "R2")

# seq_len() keeps the loop from running on an empty table (1:0 would iterate)
for (sample in seq_len(nrow(primers_n_samples))) {

  # a read file belongs to the sample when its path contains "/<File_name>"
  sample_pattern <- paste0("/", primers_n_samples$File_name[sample])
  r1_hits <- grep(pattern = sample_pattern, x = all_fnFs, value = TRUE)
  r2_hits <- grep(pattern = sample_pattern, x = all_fnRs, value = TRUE)

  # fail with an explicit message instead of "replacement has length zero"
  if (length(r1_hits) == 0 || length(r2_hits) == 0) {
    stop("No R1/R2 fastq file found for sample '",
         primers_n_samples$File_name[sample], "'", call. = FALSE)
  }

  # as before, the first match is used when several files match
  if (length(r1_hits) > 1 || length(r2_hits) > 1) {
    warning("More than one fastq file matches sample '",
            primers_n_samples$File_name[sample],
            "'; using the first match", call. = FALSE)
  }

  primers_n_samples$R1[sample] <- r1_hits[1]
  primers_n_samples$R2[sample] <- r2_hits[1]

}

```

<br>

### Define sample levels
```{r, eval=FALSE,echo=TRUE}
#4 - set sample levels
# show the file name radicals available in the sample table
primers_n_samples$File_name

# display order of the samples
sample_levels <- c(
  "Da23-mif",
  "SFJQ-mif",
  "Da23-neo",
  "SFJQ-neo",
  "Da20",
  "SFnNorm-mi",
  "Da19",
  "SFnNorm-neo",
  "Da22",
  "SFNorm-mi",
  "Da21",
  "SFNorm-neo",
  "pJequei-N-norm-N",
  "pJequei-N-norm-M",
  "pJequei-N-norm-T",
  "pJequei-norm-N",
  "pJequei-norm-M",
  "pJequei-norm-T",
  "Cassaum",
  "neg-PCR2"
)

# display order of the primer sets
primer_levels <- c(
  "NeoFish",
  "MiFish",
  "Teleo",
  "NeoFish/MiFish",
  "NeoFish/MiFish/Teleo"
)

```
<br>

### Remove primers from reads
  
As the primer-derived sequences are identical, they are not informative and thus must be removed for the following steps.

<br>

#### Load primer sequences

```{r, eval=FALSE,echo=TRUE}
#4- identify primers ----

#primers sequences used for each sample
# inosine pairs with A, C, U
#                    T, G, A = IUPAC code:  D
#cutadapt  accepts IUPAC code !!!!!!!!
# Each primer is stored as a one-element character vector named after itself.

# NeoFish
neo_FWD <- c(neo_FWD = "CGCCGTCGCAAGCTTACCCT")
neo_REV <- c(neo_REV = "AGTGACGGGCGGTGTGTGC")

# MiFish
mif_FWD <- c(mif_FWD = "GTCGGTAAAACTCGTGCCAGC")
mif_REV <- c(mif_REV = "ACATAGTGGGGTATCTAATCCCAGTTTG")
# mif_REV <- "CATAGTGGGGTATCTAATCCCAGTTTG" #original

# Teleo
tel_FWD <- c(tel_FWD = "ACACCGCCCGTCACTCT")
tel_REV <- c(tel_REV = "ACTTCCGGTACACTTACCATG")

# list of single-row tibbles, one per primer (columns: Primers, `Primer names`)
primers <- tibble(
  Primers = c(
    neo_FWD, neo_REV,
    mif_FWD, mif_REV,
    tel_FWD, tel_REV
  )
) %>%
  mutate(`Primer names` = names(Primers)) %>%
  split(seq_len(nrow(.)))
```

<br>

#### Generate sequences for complement, reverse, and reverse complement for each primer
  
The function _allOrients_ is used to generate all possible orientations for the FWD and REV primers.

```{r eval=FALSE,echo=TRUE}
#5 - check primer orientation ----

#function to get all possible primer orientations
# Build the four orientations (Forward, Complement, Reverse, RevComp) of one
# primer.
#   primers : single-row tibble with columns `Primers` (the sequence) and
#             `Primer names` (the primer identifier)
# Returns a tibble with one row per orientation and the columns `Sequence`,
# `Orientation name`, `Primer` and `Primer orientation`.
allOrients <- function(primers) {
   # Create all orientations of the input sequence
    # Must be a tibble with cols = c(Primers,`Primer names`)
  
   # NOTE(review): attaching Biostrings here may be what makes c() and
   # toString() below dispatch on DNAString objects - confirm before removing.
   require(Biostrings)
   dna <- Biostrings::DNAString(primers$Primers)  # The Biostrings works w/ DNAString objects rather than character vectors
   orients <- c(Forward = dna, 
                Complement = Biostrings::complement(dna), 
                Reverse = Biostrings::reverse(dna),
                RevComp = Biostrings::reverseComplement(dna))
   # paste0() with a single argument leaves the names unchanged
   names(orients) <- paste0(names(orients))
   
   # convert each orientation back to a character string
   primer_tbl <- sapply(orients, toString)
   
   # one row per orientation; `Orientation name` combines the primer name and
   # the orientation with unite()'s default separator
   primer_tbl <- dplyr::tibble(Sequence = primer_tbl,
                        `Primer orientation` = names(primer_tbl)) %>% 
     dplyr::mutate(Primer = primers$`Primer names`) %>%
     unite(col=`Orientation name`, Primer ,`Primer orientation`,remove = FALSE)
   
   return(primer_tbl)  # tibble with the four orientations of this primer
}


#apply function 
# one table with the four orientations of every primer (row-bound)
primers_all_orients <- purrr::map_dfr(.x = primers, .f = allOrients)

# label each sequence with its orientation name
primers_all_orients$Sequence <- setNames(
  primers_all_orients$Sequence,
  primers_all_orients$`Orientation name`
)

```

<br>

#### Remove reads with undetermined bases **(Ns)** and unpaired reads

Reads with undetermined bases prevent proper primer identification and ASV determination. These sequences must be removed from the data.

```{r eval=FALSE}
#6 - pre filter reads with Ns for primer checking ----
# Create the output file names for the N-cleaned R1/R2 reads of each sample.
# paste0() is vectorised over File_name, so no row-by-row loop is needed
# (a `1:nrow()` loop would also iterate over rows 1 and 0 of an empty table).
primers_n_samples <- primers_n_samples %>%
  mutate(
    `R1 N-cleaned` = paste0(data_path, "/N-cleaned/", File_name,
                            "_R1_N-cleaned.fastq.gz"),
    `R2 N-cleaned` = paste0(data_path, "/N-cleaned/", File_name,
                            "_R2_N-cleaned.fastq.gz")
  )

# Remove reads with Ns to make primer filtering more accurate.
# maxN = 0 drops any read containing an undetermined base; matchIDs = TRUE
# keeps only read pairs whose identifiers match in R1 and R2.
dada2::filterAndTrim(
  fwd = primers_n_samples$R1, filt = primers_n_samples$`R1 N-cleaned`,
  rev = primers_n_samples$R2, filt.rev = primers_n_samples$`R2 N-cleaned`,
  maxN = 0, multithread = TRUE, matchIDs = TRUE,
  verbose = TRUE, compress = TRUE)

# Pivot to long format: one row per sample and processing stage, with the
# stage name in `Stage` and the corresponding path in `Read file`.
primers_n_samples <- primers_n_samples %>%
  pivot_longer(cols = c(R1, R2, `R1 N-cleaned`, `R2 N-cleaned`),
               names_to = "Stage", values_to = "Read file")

```

<br>

#### Count primer presence on reads

Before primer removal, it is possible to count primer occurrences in the reads. This procedure is carried out independently for each sample. 

```{r echo=TRUE,eval=FALSE}
#6 - count primer orientation hits ----

#function to count primer on each specific library
primerHits <- function(primer, fn) {
  # Count the reads of FASTQ file `fn` in which `primer` occurs at least once.
  # fixed = FALSE is passed to vcountPattern() as in the DADA2 ITS workflow.
  reads <- ShortRead::sread(ShortRead::readFastq(fn))
  hits_per_read <- Biostrings::vcountPattern(primer, reads, fixed = FALSE)
  n_reads_with_primer <- sum(hits_per_read > 0)
  return(n_reads_with_primer)
}

#function to count multiple primers on one read file
multi_primerHits <- function(Read_file, primers) {
  # Count, for every primer in `primers`, the reads of `Read_file` that
  # contain it.
  #
  # Args:
  #   Read_file: path to a FASTQ file.
  #   primers: named character vector of primer sequences; the names become
  #     the column names of the result.
  # Returns:
  #   A one-row table with one count column per primer plus a `Read file`
  #   column holding `Read_file`.

  # Read the FASTQ once and reuse it for all primers, instead of re-reading
  # the file for every primer.
  reads <- ShortRead::sread(ShortRead::readFastq(Read_file))
  count_hits <- function(primer) {
    sum(Biostrings::vcountPattern(primer, reads, fixed = FALSE) > 0)
  }

  primer_counts <- purrr::map_df(primers, .f = count_hits)
  # explicit namespace: this function also runs on parallel workers where
  # dplyr may not be attached
  dplyr::mutate(primer_counts, `Read file` = Read_file)
}
###########

#vector of read files to look on for primers
reads_seqs <- primers_n_samples %>% 
  filter(Stage %in% c("R1 N-cleaned", "R2 N-cleaned")) %>% 
  select(`Read file`) %>% as.list()
 

#named vector of primer sequences
primers_seqs <- primers_all_orients$Sequence


# Use all available cores minus two, but never fewer than one worker
# (availableCores() - 2 is zero or negative on machines with <= 2 cores).
cores_to_be_used <- max(1L, future::availableCores() - 2L)

# Documented form: strategy function plus its arguments
future::plan(future::multisession, workers = cores_to_be_used)


#count primers: one row per read file, one column per primer orientation
primers_in_Nreads <- furrr::future_map_dfr(reads_seqs$`Read file`, .f = multi_primerHits, primers = primers_seqs, .options = furrr::furrr_options(seed = NULL))

#get sample information into primers_in_Nreads table
primers_in_Nreads <- left_join(primers_in_Nreads,primers_n_samples,by = "Read file")

# 
# primers_in_Nreads_bckp <- primers_in_Nreads
# primers_in_Nreads <- primers_in_Nreads_bckp

```

#### Prepare primer counts for plotting

```{r, echo=TRUE,eval=FALSE}
#7- prepare primer counts for plots ----

# cat(paste0(colnames(primers_in_Nreads),"\n"))

# Hit-count columns: for each primer set (neo, mif, tel) the FWD and REV
# counts of the Forward, Complement, Reverse and RevComp orientations.
orientation_cols <- c("FWD_Forward", "REV_Forward",
                      "FWD_Complement", "REV_Complement",
                      "FWD_Reverse", "REV_Reverse",
                      "FWD_RevComp", "REV_RevComp")
hit_count_cols <- unlist(lapply(
  c("neo", "mif", "tel"),
  function(primer_set) paste0(primer_set, "_", orientation_cols)
))

# Keep the sample metadata and the hit counts; `Read file` is dropped.
primers_in_Nreads <- primers_in_Nreads %>%
  select(File_name, Type, Group, Library, Primer, Run, Stage,
         all_of(hit_count_cols))


#write.csv(x = primer_hits_tbl, file = "~/prjcts/fish_eDNA/notes/jequiDNApool/csv/primers_hits_in_reads.csv")

# Interactive inspection of the resulting table
str(primers_in_Nreads)
colnames(primers_in_Nreads)
rownames(primers_in_Nreads)

primers_in_Nreads$Primer
primers_in_Nreads$Library

#8- prepare primer counts for plots in ggplot----

# Plot order of the primer orientations within each primer set: orientations
# named as kept from here on (Forward/RevComp) first, then Complement/Reverse.
orientation_levels <- c("FWD_Forward", "FWD_RevComp",
                        "REV_Forward", "REV_RevComp",
                        "FWD_Complement", "FWD_Reverse",
                        "REV_Complement", "REV_Reverse")
sequence_levels <- unlist(lapply(
  c("neo", "mif", "tel"),
  function(primer_set) paste0(primer_set, "_", orientation_levels)
))

#convert primer hits table to long format
# pivot_longer() replaces the superseded gather(): one row per sample and
# primer orientation, with the orientation in `Sequences` and its hit count
# in `Count`.
primers_in_Nreads_long <- primers_in_Nreads %>% 
  pivot_longer(cols = all_of(sequence_levels),
               names_to = "Sequences",
               values_to = "Count") %>% 
  mutate(Sequences = factor(Sequences, levels = sequence_levels),
         File_name = factor(File_name, levels = sample_levels),
         Run = as.factor(Run),
         Primer = factor(Primer, levels = c("NeoFish",
                                            "MiFish",
                                            "Teleo",
                                            "NeoFish/MiFish",
                                            "NeoFish/MiFish/Teleo"))) 



# PLOT 1: primers counts in reads tile plot - only primers FWD & REV, foward & revcomp ----
# Samples on the y axis, primer orientations on the x axis, tiles shaded by
# log10 of the hit count and labelled with the raw count.
tile_data <- primers_in_Nreads_long %>% 
  # optional: restrict to the Forward/RevComp orientations
  # filter(Sequences  %in% c(
  #   "mif_REV_RevComp", "mif_REV_Forward", "mif_FWD_RevComp", "mif_FWD_Forward",
  #   "neo_REV_RevComp", "neo_REV_Forward", "neo_FWD_RevComp", "neo_FWD_Forward",
  #   "tel_REV_RevComp", "tel_REV_Forward", "tel_FWD_RevComp", "tel_FWD_Forward")) %>% 
  mutate(File_name = factor(File_name, levels = sample_levels))

primers_tile <- ggplot2::ggplot(
  data = tile_data,
  mapping = aes(x = Sequences, y = File_name, fill = log10(Count))
) +
  geom_tile() +
  geom_text(aes(label = Count), size = 1) +
  # scale_fill_gradient(low="white", high="darkgreen",trans="log10") +
  scale_fill_gradientn(name = "Primer counts",
                       colours = c("white", "darkgreen"),
                       values = c(0, 1),
                       na.value = "white") +
  theme_light(base_size = 6, base_line_size = 1) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # grey separator lines at fixed row/column positions
  geom_hline(yintercept = c(40.5, 82.5, 86.5, 116.5), color = "grey") +
  geom_vline(xintercept = c(4.5, 8.5, 12.5, 16.5), color = "grey") +
  # coord_fixed(ratio = 0.20) +
  labs(x = "Primers",
       y = "Amostra",
       title = "eDNA 1st, 2nd & 3rd runs",
       subtitle = "Primer presence on sample reads")

primers_tile

# Save the tile plot as PNG and PDF with identical dimensions.
# `filename` is spelled out in full: `file =` only worked through partial
# argument matching.
ggsave(filename = "~/prjcts/fish_eDNA/sfjq/results/figs/1-primers_in_reads_all_FR.png",
       plot = primers_tile,
       device = "png",
       width = 27,
       height = 40,
       units = "cm",
       dpi = 600)

ggsave(filename = "~/prjcts/fish_eDNA/sfjq/results/figs/1-primers_in_reads_all_FR.pdf",
       plot = primers_tile,
       device = "pdf",
       width = 27,
       height = 40,
       units = "cm",
       dpi = 600)


#9- write csv file with primer hits counts per lib ----
# Writes the per-library primer hit table prepared in this section.
# NOTE(review): this previously wrote `primer_hits_tbl`, an object that is not
# created anywhere in this section; `primers_in_Nreads` is the table holding
# the counts here - confirm that no other table was intended.
write.csv(x = primers_in_Nreads,file = "~/prjcts/fish_eDNA/sfjq/results/primers_hits_tbl.csv",
          row.names = FALSE)

```


### Remove primers from reads
#### Primer removal with **_Cutadapt_** 

The **_cutadapt_** software ([DOI:10.14806/ej.17.1.200](http://journal.embnet.org/index.php/embnetjournal/article/view/200)) was used for primer removal on read sequences.

```{r eval=FALSE}
#10 - cutadapt ----

# Directory that receives the cutadapt-processed reads; create it if it is
# not there yet.
path.cut <- file.path(data_path, "cutadapt")
if (!dir.exists(path.cut)) {
  dir.create(path.cut)
}

```

<br>

#### Generate and execute primer-specific commands

The original DADA2 ITS protocol removes only _FWD_ and _REV reverse complement_ sequences. This protocol is adjusted for selecting reads only of the expected primer and removing the primer. 

```{r eval=FALSE}
# optional: remove all primers from all reads and samples ----

#10 - map sample names to reads files ----

#name outputs
# Start from the N-cleaned rows and turn them into the matching cutadapt
# rows: "N-cleaned" is replaced by "cutadapt" in both the file path and the
# stage label.
cutadapt_files <- primers_n_samples %>% 
  filter(Stage %in% c("R1 N-cleaned", "R2 N-cleaned"))

cutadapt_files$`Read file` <- str_replace_all(cutadapt_files$`Read file`,
                                              pattern = "N-cleaned",
                                              replacement = "cutadapt")
cutadapt_files$Stage <- str_replace_all(cutadapt_files$Stage,
                                        pattern = "N-cleaned",
                                        replacement = "cutadapt")

# Append the new rows to the sample table
primers_n_samples <- bind_rows(primers_n_samples, cutadapt_files)

#all ----
# Return the primer sequences whose orientation name matches `tag`
orient_seqs <- function(tag) {
  matches <- grep(pattern = tag, x = primers_all_orients$`Orientation name`)
  primers_all_orients$Sequence[matches]
}

{
  # Sequences of every primer set at once (the tags match neo, mif and tel).
  # The XXX_Complement and XXX_Reverse orientations had no hits, so they were
  # ignored in the last plot and from now on.
  all_FWD.orients <- orient_seqs("FWD_Forward")
  all_FWD.RC <- orient_seqs("FWD_RevComp")
  all_REV.orients <- orient_seqs("REV_Forward")
  all_REV.RC <- orient_seqs("REV_RevComp")
}

#remove primers and filter only the reads that contain the expected primer ----
{
  #MiFish ----
  mif_FWD.orients <- orient_seqs("mif_FWD_Forward")
  mif_FWD.RC <- orient_seqs("mif_FWD_RevComp")
  mif_REV.orients <- orient_seqs("mif_REV_Forward")
  mif_REV.RC <- orient_seqs("mif_REV_RevComp")

  #NeoFish ----
  neo_FWD.orients <- orient_seqs("neo_FWD_Forward")
  neo_FWD.RC <- orient_seqs("neo_FWD_RevComp")
  neo_REV.orients <- orient_seqs("neo_REV_Forward")
  neo_REV.RC <- orient_seqs("neo_REV_RevComp")

  #Teleo ----
  tel_FWD.orients <- orient_seqs("tel_FWD_Forward")
  tel_FWD.RC <- orient_seqs("tel_FWD_RevComp")
  tel_REV.orients <- orient_seqs("tel_REV_Forward")
  tel_REV.RC <- orient_seqs("tel_REV_RevComp")
}
  
                #create flags
                # cutadapt options for the run that uses every primer set.
                # R1 (forward reads): trim FWD (-g) and the reverse complement of REV (-a)
                all_R1.flags <- paste0("-g ", all_FWD.orients, " -a ", all_REV.RC)
                # R2 (reverse reads): trim REV (-G) and the reverse complement of FWD (-A)
                all_R2.flags <- paste0("-G ", all_REV.orients, " -A ", all_FWD.RC)
                
#create primer-specific tags ----
  {
  # Same option layout as above, one pair of flag strings per primer set.

  #MiFish ----
  mif_R1.flags <- paste0("-g ", mif_FWD.orients, " -a ", mif_REV.RC)
  mif_R2.flags <- paste0("-G ", mif_REV.orients, " -A ", mif_FWD.RC)
  
  #NeoFish ----
  neo_R1.flags <- paste0("-g ", neo_FWD.orients, " -a ", neo_REV.RC)
  neo_R2.flags <- paste0("-G ", neo_REV.orients, " -A ", neo_FWD.RC)
  
  #Teleo ----
  tel_R1.flags <- paste0("-g ", tel_FWD.orients, " -a ", tel_REV.RC)
  tel_R2.flags <- paste0("-G ", tel_REV.orients, " -A ", tel_FWD.RC)
  }

#cutadapt files path and names ----

# Read files of `primers_n_samples` at a given stage, optionally restricted
# to libraries amplified with one of `primer_set`. `%in%` is used so that
# rows with a missing Stage/Primer are skipped instead of yielding NA paths.
files_at_stage <- function(stage, primer_set = NULL) {
  keep <- primers_n_samples$Stage %in% stage
  if (!is.null(primer_set)) {
    keep <- keep & primers_n_samples$Primer %in% primer_set
  }
  primers_n_samples$`Read file`[keep]
}

# Name each path by its file name without directory and without `suffix`.
# basename() avoids depending on a hard-coded absolute directory prefix.
name_by_library <- function(files, suffix) {
  names(files) <- sub(suffix, "", basename(files), fixed = TRUE)
  files
}

cut_suffix <- "_cutadapt.fastq.gz"
filtN_suffix <- "_N-cleaned.fastq.gz"
# libraries amplified with more than one primer set
nmt_primers <- c("NeoFish/MiFish", "NeoFish/MiFish/Teleo")

{
  # all libraries: cutadapt outputs (named) and N-cleaned inputs
  all_fnFs.cut <- name_by_library(files_at_stage("R1 cutadapt"), cut_suffix)
  all_fnRs.cut <- name_by_library(files_at_stage("R2 cutadapt"), cut_suffix)

  all_fnFs.filtN <- files_at_stage("R1 N-cleaned")
  all_fnRs.filtN <- files_at_stage("R2 N-cleaned")
}

{
  #neo ----
  neo_fnFs.cut <- name_by_library(files_at_stage("R1 cutadapt", "NeoFish"), cut_suffix)
  neo_fnRs.cut <- name_by_library(files_at_stage("R2 cutadapt", "NeoFish"), cut_suffix)
  neo_fnFs.filtN <- name_by_library(files_at_stage("R1 N-cleaned", "NeoFish"), filtN_suffix)
  neo_fnRs.filtN <- name_by_library(files_at_stage("R2 N-cleaned", "NeoFish"), filtN_suffix)

  #mif ----
  mif_fnFs.cut <- name_by_library(files_at_stage("R1 cutadapt", "MiFish"), cut_suffix)
  mif_fnRs.cut <- name_by_library(files_at_stage("R2 cutadapt", "MiFish"), cut_suffix)
  mif_fnFs.filtN <- name_by_library(files_at_stage("R1 N-cleaned", "MiFish"), filtN_suffix)
  mif_fnRs.filtN <- name_by_library(files_at_stage("R2 N-cleaned", "MiFish"), filtN_suffix)

  #tel ----
  tel_fnFs.cut <- name_by_library(files_at_stage("R1 cutadapt", "Teleo"), cut_suffix)
  tel_fnRs.cut <- name_by_library(files_at_stage("R2 cutadapt", "Teleo"), cut_suffix)
  tel_fnFs.filtN <- name_by_library(files_at_stage("R1 N-cleaned", "Teleo"), filtN_suffix)
  tel_fnRs.filtN <- name_by_library(files_at_stage("R2 N-cleaned", "Teleo"), filtN_suffix)

  #neo/mif & neo/mif/tel ----
  nmt_fnFs.cut <- name_by_library(files_at_stage("R1 cutadapt", nmt_primers), cut_suffix)
  nmt_fnRs.cut <- name_by_library(files_at_stage("R2 cutadapt", nmt_primers), cut_suffix)
  nmt_fnFs.filtN <- name_by_library(files_at_stage("R1 N-cleaned", nmt_primers), filtN_suffix)
  nmt_fnRs.filtN <- name_by_library(files_at_stage("R2 N-cleaned", nmt_primers), filtN_suffix)
}


# NOTE (original author): relying on the row order here is dangerous
names(primers_n_samples$`Read file`)<- primers_n_samples$File_name


primers_n_samples$`Read file`[primers_n_samples$Stage %in% c("R1 N-cleaned", "R2 N-cleaned")] %>%  length()

#TODO this should be moved to purrr...


# Run cutadapt once per library (pair of R1/R2 files).
#   output folder must exist
#
# Args:
#   r1_flags, r2_flags: primer-trimming options for R1 and R2.
#   in_r1, in_r2: input (N-cleaned) R1/R2 files.
#   out_r1, out_r2: output R1/R2 files, in the same order as the inputs.
#   extra_args: further cutadapt options appended to the command line.
# `cutadapt` (the program to call) is taken from the enclosing environment;
# it is defined elsewhere in this document.
run_cutadapt <- function(r1_flags, r2_flags, in_r1, in_r2, out_r1, out_r2,
                         extra_args) {
  # inputs and outputs are paired by position, so lengths must agree
  stopifnot(length(in_r1) == length(out_r1),
            length(in_r2) == length(out_r2),
            length(out_r1) == length(out_r2))

  # seq_along() makes an empty file set a no-op instead of a call on NA files
  for (i in seq_along(out_r1)) {
    status <- system2(cutadapt, args = c(r1_flags, r2_flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
                                         "-o", out_r1[i], "-p", out_r2[i], # output files
                                         in_r1[i], in_r2[i],  # input files
                                         extra_args))
    if (status != 0) {
      warning("cutadapt exited with status ", status, " for ", in_r1[i],
              call. = FALSE)
    }
  }
  invisible(NULL)
}

# All primers on the first 40 libraries (fewer if fewer are available)
run_cutadapt(all_R1.flags, all_R2.flags,
             in_r1 = head(all_fnFs.filtN, 40), in_r2 = head(all_fnRs.filtN, 40),
             out_r1 = head(all_fnFs.cut, 40), out_r2 = head(all_fnRs.cut, 40),
             extra_args = "--minimum-length 10") # guarantee no zerolength reads


length(neo_fnFs.cut)
length(mif_fnFs.cut)
length(mif_fnRs.cut)
length(tel_fnFs.cut)
length(nmt_fnFs.cut)


#run cutadapt by primer ----
# --discard-untrimmed is added to these runs together with the minimum length

#MiFish ----
run_cutadapt(mif_R1.flags, mif_R2.flags,
             in_r1 = mif_fnFs.filtN, in_r2 = mif_fnRs.filtN,
             out_r1 = mif_fnFs.cut, out_r2 = mif_fnRs.cut,
             extra_args = "--minimum-length 10 --discard-untrimmed")

#NeoFish ----
run_cutadapt(neo_R1.flags, neo_R2.flags,
             in_r1 = neo_fnFs.filtN, in_r2 = neo_fnRs.filtN,
             out_r1 = neo_fnFs.cut, out_r2 = neo_fnRs.cut,
             extra_args = "--minimum-length 10 --discard-untrimmed")


#Teleo ----
run_cutadapt(tel_R1.flags, tel_R2.flags,
             in_r1 = tel_fnFs.filtN, in_r2 = tel_fnRs.filtN,
             out_r1 = tel_fnFs.cut, out_r2 = tel_fnRs.cut,
             extra_args = "--minimum-length 10 --discard-untrimmed")


#neo/mif & neo/mif/tel ----
# multi-primer libraries are trimmed with the flags of all primer sets
run_cutadapt(all_R1.flags, all_R2.flags,
             in_r1 = nmt_fnFs.filtN, in_r2 = nmt_fnRs.filtN,
             out_r1 = nmt_fnFs.cut, out_r2 = nmt_fnRs.cut,
             extra_args = "--minimum-length 10 --discard-untrimmed")


# as  Da23-mif & neo & SFJQ-mif & neo were previously demultiplexed, they do not bear the primers, so all reads are filtered out


```

<br>

#### Check primer removal 

Change the library number index in order to check for remaining primer sequences in each library's data.
All removed orientation counts are expected to drop to zero, since the primer sequences have been removed.

```{r eval=FALSE}
# only for didactic purposes (or logical error checking)
# 8 - check for remaining adapters ----
# As a sanity check, we count the presence of primers in the cutadapt-ed reads.


#vector of read files to look on for primers
# Both mates are checked: the filter previously listed "R2 cutadapt" twice,
# so the R1 files were never inspected.
reads_seqs_cut <- primers_n_samples %>% 
  filter(Stage %in% c("R1 cutadapt", "R2 cutadapt")) %>% 
  select(`Read file`) %>% as.list()


#count primers
future::plan(future::multisession, workers = cores_to_be_used)
primers_in_cut_reads <- furrr::future_map_dfr(reads_seqs_cut$`Read file`, .f = multi_primerHits, primers = primers_seqs, .options = furrr::furrr_options(seed = NULL))

 # 
# primers_in_Nreads <- purrr::map_df(reads_seqs,.f = multi_primerHits, primers = primers_seqs)

#get sample information into primers_in_cut_reads table
primers_in_cut_reads <- left_join(primers_in_cut_reads,primers_n_samples,by = "Read file")

# primers_in_cut_reads_bckp <- primers_in_cut_reads
# primers_in_cut_reads <- primers_in_cut_reads_bckp

# Hit-count columns: for each primer set (neo, mif, tel) the FWD and REV
# counts of the Forward, Complement, Reverse and RevComp orientations.
cut_orientation_cols <- c("FWD_Forward", "REV_Forward",
                          "FWD_Complement", "REV_Complement",
                          "FWD_Reverse", "REV_Reverse",
                          "FWD_RevComp", "REV_RevComp")
cut_hit_cols <- unlist(lapply(
  c("neo", "mif", "tel"),
  function(primer_set) paste0(primer_set, "_", cut_orientation_cols)
))

# Keep the sample metadata and the hit counts; `Read file` is dropped.
primers_in_cut_reads <- primers_in_cut_reads %>% 
  select(File_name, Type, Group, Library, Primer, Run, Stage,
         all_of(cut_hit_cols))

# Plot order of the orientations within each primer set
cut_orientation_levels <- c("FWD_Forward", "REV_Forward",
                            "FWD_RevComp", "REV_RevComp",
                            "FWD_Complement", "REV_Complement",
                            "FWD_Reverse", "REV_Reverse")
cut_sequence_levels <- unlist(lapply(
  c("neo", "mif", "tel"),
  function(primer_set) paste0(primer_set, "_", cut_orientation_levels)
))

#convert primer hits table to long format
# pivot_longer() replaces the superseded gather(): one row per sample and
# primer orientation, with the orientation in `Sequences` and its hit count
# in `Count`.
primers_in_cut_reads_long <- primers_in_cut_reads %>% 
  pivot_longer(cols = all_of(cut_sequence_levels),
               names_to = "Sequences",
               values_to = "Count") %>% 
  mutate(Sequences = factor(Sequences, levels = cut_sequence_levels),
         File_name = factor(File_name, levels = sample_levels),
         Run = as.factor(Run),
         Primer = factor(Primer, levels = c("NeoFish","MiFish","Teleo","NeoFish/MiFish","NeoFish/MiFish/Teleo"))) 



# PLOT 1: primers counts in reads tile plot - only primers FWD & REV, foward & revcomp ----
# Same tile plot as before primer removal, restricted to the Forward/RevComp
# orientations and to the libraries of run LGC_MiniSeq_1.
kept_orientations <- c(
  "mif_REV_RevComp", "mif_REV_Forward", "mif_FWD_RevComp", "mif_FWD_Forward",
  "neo_REV_RevComp", "neo_REV_Forward", "neo_FWD_RevComp", "neo_FWD_Forward",
  "tel_REV_RevComp", "tel_REV_Forward", "tel_FWD_RevComp", "tel_FWD_Forward")

clean_tile_data <- primers_in_cut_reads_long %>% 
  filter(Sequences %in% kept_orientations) %>% 
  filter(Run %in% c("LGC_MiniSeq_1"))

primers_tile_clean <- ggplot2::ggplot(
  data = clean_tile_data,
  mapping = aes(x = Sequences, y = File_name, fill = log10(Count))
) +
  geom_tile() +
  geom_text(aes(label = Count), size = 1) +
  # scale_fill_gradient(low="white", high="darkgreen",trans="log10") +
  scale_fill_gradientn(name = "Primer counts",
                       colours = c("white", "darkgreen"),
                       values = c(0, 1),
                       na.value = "white") +
  theme_light(base_size = 6, base_line_size = 1) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # grey separator lines at fixed row/column positions
  geom_hline(yintercept = c(40.5, 82.5, 86.5, 116.5), color = "grey") +
  geom_vline(xintercept = c(4.5, 8.5, 12.5, 16.5), color = "grey") +
  # coord_fixed(ratio = 0.20) +
  labs(x = "Primers",
       y = "Amostra",
       title = "eDNA 1st, 2nd & 3rd runs",
       subtitle = "Primer presence on sample reads")
# +
#   facet_wrap(~Run, drop = TRUE)


primers_tile_clean

```

<br><br>

### Quality filtering

Here the **DADA2** pipeline starts.

<br>

#### Set input libs paths

Define the paths to the libraries after _cutadapt_ primer removal.

```{r, eval=FALSE}
# 9 - load clean seqs to DADA2 pipe ----

# Rebuild the "all" vectors from the primer-specific ones, in the order
# MiFish, NeoFish, Teleo, multi-primer libraries.
all_fnFs.cut <- c(mif_fnFs.cut, neo_fnFs.cut, tel_fnFs.cut, nmt_fnFs.cut)
all_fnRs.cut <- c(mif_fnRs.cut, neo_fnRs.cut, tel_fnRs.cut, nmt_fnRs.cut)

# show the resulting forward and reverse file lists
print(all_fnFs.cut)
print(all_fnRs.cut)

```

<br>

#### Set quality filtering output files names

```{r, eval=FALSE}
# 11 - quality filter preparation ----


#name outputs
# Derive the quality-filtered rows from the N-cleaned ones: "N-cleaned" is
# replaced by "Qfiltered" in both the file path and the stage label.
Qfilter_files <- primers_n_samples %>% 
  filter(Stage %in% c("R1 N-cleaned","R2 N-cleaned")) %>% 
  mutate(`Read file` = str_replace_all(.$`Read file`,pattern = "N-cleaned",replacement = "Qfiltered")) %>% 
  mutate(Stage = str_replace_all(.$Stage,pattern = "N-cleaned",replacement = "Qfiltered"))

primers_n_samples <- bind_rows(primers_n_samples,Qfilter_files)

#rename files so all can be traceable
names(primers_n_samples$`Read file`) <- primers_n_samples$File_name


# Qfiltered files path and names
# Names are the file names without directory and without the
# "_Qfiltered.fastq.gz" suffix. basename() and a fixed-string match are used
# so that `data_path` is never interpreted as a regular expression.
qfiltered_id <- function(files) {
  sub("_Qfiltered.fastq.gz", "", basename(files), fixed = TRUE)
}

all_filtFs <- primers_n_samples$`Read file`[primers_n_samples$Stage == "R1 Qfiltered"]
names(all_filtFs) <- qfiltered_id(all_filtFs)
all_filtRs <- primers_n_samples$`Read file`[primers_n_samples$Stage == "R2 Qfiltered"]
names(all_filtRs) <- qfiltered_id(all_filtRs)



```

<br>

#### Quality filtering

On this step it is possible to filter by size but, as we have already removed primers from the beginning/end of the reads, it is expected that the remaining sequences are already trimmed to lengths compatible with their respective amplicons. Thus, no length trimming was conducted.


```{r, eval=FALSE}
# 12 - dada filtering ----

# Filtering parameters used below: maxN = 0 (DADA2 requires no Ns),
# rm.phix = TRUE and maxEE = 2 for both reads. maxEE sets the maximum number
# of "expected errors" allowed in a read, which is a better filter than simply
# averaging quality scores. truncQ and truncLen are not set explicitly.

length(all_fnFs.cut)
length(all_filtFs)
length(all_fnRs.cut)
length(all_filtRs)

#sorting for next step not to mix files
all_fnFs.cut <- base::sort(all_fnFs.cut)
all_filtFs <- base::sort(all_filtFs)
all_fnRs.cut <- base::sort(all_fnRs.cut)
all_filtRs <- base::sort(all_filtRs)
names(all_fnFs.cut)
names(all_filtFs)
names(all_fnRs.cut)
names(all_filtRs)

# Inputs and outputs are paired by position, so after the independent sorts
# verify that the four vectors list the same libraries in the same order.
# Stop here rather than silently filtering one library into another's file.
library_id <- function(files, suffix) {
  unname(sub(suffix, "", basename(files), fixed = TRUE))
}
stopifnot(
  identical(library_id(all_fnFs.cut, "_R1_cutadapt.fastq.gz"),
            library_id(all_filtFs, "_R1_Qfiltered.fastq.gz")),
  identical(library_id(all_fnRs.cut, "_R2_cutadapt.fastq.gz"),
            library_id(all_filtRs, "_R2_Qfiltered.fastq.gz")),
  identical(library_id(all_fnFs.cut, "_R1_cutadapt.fastq.gz"),
            library_id(all_fnRs.cut, "_R2_cutadapt.fastq.gz"))
)

#all

all_filtered_out <- dada2::filterAndTrim(fwd = all_fnFs.cut,
                                         filt = all_filtFs, 
                                         rev = all_fnRs.cut,
                                         filt.rev = all_filtRs,
                                         # truncLen=c(240,160),
                                         maxN=0,
                                         maxEE=c(2,2),
                                         # truncQ=2,
                                         rm.phix=TRUE,
                                         compress=TRUE, multithread=TRUE,verbose = TRUE,matchIDs = TRUE) # On Windows set multithread=FALSE
head(all_filtered_out)

```

<br>

#### View post-filtering quality profiles

```{r, eval=FALSE}
#check quality profile after filtering and trimming
# one forward library, then a range of reverse libraries
dada2::plotQualityProfile(all_filtFs[2])
dada2::plotQualityProfile(all_filtRs[2:8])

# all MiFish libraries
# NOTE(review): mif_filtFs / mif_filtRs are not created in the steps shown
# here - confirm they exist before running these two calls.
dada2::plotQualityProfile(mif_filtFs[])
dada2::plotQualityProfile(mif_filtRs[])


```
<br>

### Identify error rates intrinsic to sequencing

```{r, eval=FALSE}
# 13 - learn error rates ----

#Learn the Error Rates
# list the sequencing runs present in the sample table
primers_n_samples$Run %>%  unique()

# Read files of one sequencing run at one processing stage
run_stage_files <- function(run_id, stage) {
  in_run <- primers_n_samples$Run == run_id
  at_stage <- primers_n_samples$Stage == stage
  primers_n_samples$`Read file`[in_run & at_stage]
}

# An error model is learned per run and per read direction (F = R1, R = R2)
# from the quality-filtered reads.

#run LGC_MiniSeq_1 ----
run1_errF <- learnErrors(run_stage_files("LGC_MiniSeq_1", "R1 Qfiltered"),
                         multithread = TRUE, randomize = TRUE)
run1_errR <- learnErrors(run_stage_files("LGC_MiniSeq_1", "R2 Qfiltered"),
                         multithread = TRUE, randomize = TRUE)

#run LGC_MiniSeq_2 ----
run2_errF <- learnErrors(run_stage_files("LGC_MiniSeq_2", "R1 Qfiltered"),
                         multithread = TRUE, randomize = TRUE)
run2_errR <- learnErrors(run_stage_files("LGC_MiniSeq_2", "R2 Qfiltered"),
                         multithread = TRUE, randomize = TRUE)

#run ecomol_iSeq ----
run3_errF <- learnErrors(run_stage_files("ecomol_iSeq", "R1 Qfiltered"),
                         multithread = TRUE, randomize = TRUE)
run3_errR <- learnErrors(run_stage_files("ecomol_iSeq", "R2 Qfiltered"),
                         multithread = TRUE, randomize = TRUE)


#   # plotErrors(run3_errF, nominalQ=TRUE)
#   # plotErrors(run3_errR, nominalQ=TRUE)
```

<br>

### Dereplication: grouping into ASVs

In this step each library is reduced to its unique sequences and their counts.

```{r, eval=FALSE}
# 14 - dada dereplication ----
# For each run: dereplicate the quality-filtered forward (R1) and reverse (R2)
# files, then run dada() on them with the error model learned for that run.


# Name every read file after its File_name entry. The derep/dada lists created
# below are built from this named vector.
# Original warning: everything must have unique names from here on.
names(primers_n_samples$`Read file`) <- primers_n_samples$File_name ###!!!!!!!!!!!!! Everything must have unique names from here

# Interactive check of the file paths and of the names just assigned.
unique(primers_n_samples$`Read file`)
names(primers_n_samples$`Read file`)


#run LGC_MiniSeq_1 ----
LGC_1_derep_forward <- derepFastq(primers_n_samples$`Read file`[
  primers_n_samples$Run == "LGC_MiniSeq_1" & 
    primers_n_samples$Stage == "R1 Qfiltered"], verbose=TRUE)
# names(all_derep_forward) <- all_sample.names

LGC_1_derep_reverse <- derepFastq(primers_n_samples$`Read file`[
  primers_n_samples$Run == "LGC_MiniSeq_1" & 
    primers_n_samples$Stage == "R2 Qfiltered"], verbose=TRUE)
# names(all_derep_reverse) <- all_sample.names

# Denoise with the run-1 error models (run1_errF / run1_errR).
LGC_1_dadaFs <- dada(LGC_1_derep_forward, err=run1_errF, multithread=TRUE)
LGC_1_dadaRs <- dada(LGC_1_derep_reverse, err=run1_errR, multithread=TRUE)

#run LGC_MiniSeq_2 ----
LGC_2_derep_forward <- derepFastq(primers_n_samples$`Read file`[
  primers_n_samples$Run == "LGC_MiniSeq_2" & 
    primers_n_samples$Stage == "R1 Qfiltered"], verbose=TRUE)
# names(all_derep_forward) <- all_sample.names

LGC_2_derep_reverse <- derepFastq(primers_n_samples$`Read file`[
  primers_n_samples$Run == "LGC_MiniSeq_2" & 
    primers_n_samples$Stage == "R2 Qfiltered"], verbose=TRUE)
# names(all_derep_reverse) <- all_sample.names

# Denoise with the run-2 error models (run2_errF / run2_errR).
LGC_2_dadaFs <- dada(LGC_2_derep_forward, err=run2_errF, multithread=TRUE)
LGC_2_dadaRs <- dada(LGC_2_derep_reverse, err=run2_errR, multithread=TRUE)

#run ecomol_iSeq ----
ecomol_derep_forward <- derepFastq(primers_n_samples$`Read file`[
  primers_n_samples$Run == "ecomol_iSeq" & 
    primers_n_samples$Stage == "R1 Qfiltered"], verbose=TRUE)
# names(all_derep_forward) <- all_sample.names

ecomol_derep_reverse <- derepFastq(primers_n_samples$`Read file`[
  primers_n_samples$Run == "ecomol_iSeq" & 
    primers_n_samples$Stage == "R2 Qfiltered"], verbose=TRUE)
# names(all_derep_reverse) <- all_sample.names

# Denoise with the run-3 error models (run3_errF / run3_errR).
ecomol_dadaFs <- dada(ecomol_derep_forward, err=run3_errF, multithread=TRUE)
ecomol_dadaRs <- dada(ecomol_derep_reverse, err=run3_errR, multithread=TRUE)


# Concatenate the per-run results in the order ecomol, LGC run 2, LGC run 1.
all_dadaFs <- c(ecomol_dadaFs,LGC_2_dadaFs,LGC_1_dadaFs)
all_dadaRs <- c(ecomol_dadaRs,LGC_2_dadaRs,LGC_1_dadaRs)

names(all_dadaFs)

```

<br>

### Merge read pairs 

In this step the forward and reverse reads are merged, by overlap, in order to reconstruct the full insert sequence.
As we have samples from three runs, each run is processed independently.

```{r, eval=FALSE}
# 15 - merge read pairs ----
# Each run is merged separately: the denoised forward/reverse reads are paired
# with the dereplicated objects they were inferred from. Rejected pairs are
# kept in the output (returnRejects = TRUE).
# NOTE(review): the previous inline comment on maxMismatch read "changed from
# 0 to 1 since a lot was being left out for single mismatch", but the value in
# effect is 0 in all three calls — confirm which setting is intended.

#run1 ----
run1_mergers <- mergePairs(dadaF = LGC_1_dadaFs,
                          derepF = LGC_1_derep_forward,
                          dadaR = LGC_1_dadaRs,
                          derepR = LGC_1_derep_reverse,
                          minOverlap = 20,
                          maxMismatch = 0,
                          returnRejects = TRUE,
                          verbose=TRUE)

#run2 ----
run2_mergers <- mergePairs(dadaF = LGC_2_dadaFs,
                          derepF = LGC_2_derep_forward,
                          dadaR = LGC_2_dadaRs,
                          derepR = LGC_2_derep_reverse,
                          minOverlap = 20,
                          maxMismatch = 0,
                          returnRejects = TRUE,
                          verbose=TRUE)

#run3 ----
run3_mergers <- mergePairs(dadaF = ecomol_dadaFs,
                          derepF = ecomol_derep_forward,
                          dadaR = ecomol_dadaRs,
                          derepR = ecomol_derep_reverse,
                          minOverlap = 20,
                          maxMismatch = 0,
                          returnRejects = TRUE,
                          verbose=TRUE)

# Same run order as all_dadaFs / all_dadaRs (ecomol, LGC run 2, LGC run 1).
all_mergers <- c(run3_mergers,run2_mergers,run1_mergers)

# Interactive sanity checks on the combined mergers.
names(all_mergers)

length(all_dadaFs)
length(all_dadaRs)
head(all_mergers[[12]])
length(all_dadaFs)
names(all_mergers)
str(all_mergers)
class(all_mergers)


# all_seqtab <- makeSequenceTable(samples = c(run3_mergers,run2_mergers,run1_mergers))   # maybe this function accepts several mergers
all_seqtab <- makeSequenceTable(samples = all_mergers)
dim(all_seqtab)
View(all_seqtab)
str(all_seqtab)
# Inspect distribution of sequence lengths
table(nchar(getSequences(all_seqtab)))
table(nchar(getSequences(all_seqtab))) %>% plot()


# Compare sample names across the denoised and dereplicated objects.
# There is no combined derep object, so the per-run names are concatenated in
# the same run order used for all_dadaFs / all_dadaRs.
names(all_dadaFs)
c(names(ecomol_derep_forward), names(LGC_2_derep_forward), names(LGC_1_derep_forward))
names(all_dadaRs)
c(names(ecomol_derep_reverse), names(LGC_2_derep_reverse), names(LGC_1_derep_reverse))

```

<br>

### Remove _chimeras_ 

_Chimeras_ are artificial read pairs that might have been generated erroneously during sequencing. The **DADA2** package estimates the probability of a sequence being chimeric given the abundance of its parental sequences. After removal of chimeric sequences, the length distribution of the remaining ASVs is assessed. In later steps it will be used to restrict the analysis to ASVs compatible with each primer's expected amplicon length interval, in order to keep out unexpected ASVs.

```{r, eval=FALSE}
# 16 - remove chimeras ----
# Remove chimeric sequences from the merged sequence table, then inspect how
# much abundance is kept and the length distribution of what remains.


# any(colnames(C1conc_seqtab) %in% colnames(all_seqtab))

all_seqtab.nochim <- removeBimeraDenovo(all_seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
dim(all_seqtab.nochim)
# Fraction of total read abundance kept after chimera removal.
sum(all_seqtab.nochim)/sum(all_seqtab) # =  0.8404743 , i.e. a 16% loss in abundance
#count proportion of ASVs of a given length
table(nchar(getSequences(all_seqtab.nochim)))
table(nchar(getSequences(all_seqtab.nochim))) %>% plot()
rownames(all_seqtab.nochim)



# Interactive inspection of the chimera-free table.
View(all_seqtab.nochim)
str(all_seqtab.nochim)

```
<br>

### Count reads and remaining ASVs

```{r, eval=FALSE}
# 17 - count reads proportion throughout the pipeline ----
# Build one small table per pipeline stage (raw, denoised, merged,
# non-chimeric), each keyed by File_name, then join them all to the
# filterAndTrim output so every library has one row of read counts.

# Total number of reads represented in a dada2 object (sum of unique counts).
getN <- function(x) sum(getUniques(x))

#preparing subtables with named rows to combine later
#raw files

# From here on the read-file vector is named by Library (it was named by
# File_name in the dereplication step).
names(primers_n_samples$`Read file`) <- primers_n_samples$Library 

# Raw (unprocessed) R1/R2 files only.
raw_reads <- primers_n_samples %>% filter(Stage %in% c("R1","R2")) 

# Count reads per raw fastq file; the row names become the "Read file" column,
# which is then matched against the basename of the sample-sheet paths.
raw_reads_counts <- ShortRead::countFastq(dirPath = raw_reads$`Read file`) %>% as_tibble(rownames = "Read file")
raw_reads_counts <- left_join(x = raw_reads_counts, y = (raw_reads %>%  mutate(`Read file` = basename(`Read file`)) 
                                                         ),by = "Read file")

tbl_raw_FWD <- raw_reads_counts[raw_reads_counts$Stage %in% c("R1"),] %>% select(File_name, records) %>% `colnames<-`(c("File_name", "Raw FWD"))
tbl_raw_REV <- raw_reads_counts[raw_reads_counts$Stage %in% c("R2"),] %>% select(File_name, records) %>% `colnames<-`(c("File_name", "Raw REV"))



# Per-library read totals after denoising, merging and chimera removal.
tbl_Denoised_FWD <- (sapply(all_dadaFs, getN) %>% as_tibble(rownames = "File_name")) %>% `colnames<-`(c("File_name", "Denoised FWD"))
tbl_Denoised_REV <- (sapply(all_dadaRs, getN) %>% as_tibble(rownames = "File_name")) %>% `colnames<-`(c("File_name", "Denoised REV"))
tbl_Merged <- (rowSums(all_seqtab) %>% as_tibble(rownames = "File_name")) %>% `colnames<-`(c("File_name", "Merged"))
tbl_Non_chimeric <- (rowSums(all_seqtab.nochim) %>% as_tibble(rownames = "File_name")) %>% `colnames<-`(c("File_name", "Non-chimeric"))

# combine all counts by sample to plot

# Row names of all_filtered_out are file names; stripping the cutadapt suffix
# turns them into the File_name key used by the other tables.
all_track <- all_filtered_out %>%  as_tibble(rownames = "File_name") %>% 
  mutate(`File_name` = str_remove(string = `File_name`, pattern = "_R1_cutadapt.fastq.gz")) %>% 
  left_join(tbl_raw_FWD,by = "File_name") %>% 
  left_join(tbl_raw_REV,by = "File_name") %>% 
  left_join(tbl_Denoised_FWD,by = "File_name") %>% 
  left_join(tbl_Denoised_REV,by = "File_name") %>% 
  left_join(tbl_Merged,by = "File_name") %>% 
  left_join(tbl_Non_chimeric,by = "File_name") %>% 
  left_join(primers_n_samples[primers_n_samples$Stage == "R1",],by = "File_name") %>% 
  select(!c("Stage", "Read file"))



# Column names are assigned by position.
# NOTE(review): this assumes all_track has exactly these 14 columns in this
# order (two filterAndTrim columns, the six joined counts, then the remaining
# sample-sheet columns) — verify against the column order of primers_n_samples.
colnames(all_track) <- c("File_name","N-cleaned", "Filtered","Raw FWD", "Raw REV", "Denoised FWD", "Denoised REV", "Merged", "Non-Chimeric", "Type", "Group", "Library", "Primer", "Run")




# Combine tables together (if there is more than one)
track_tbl <- bind_rows(all_track)


# Display labels for the libraries in the exported read-count table.
# Identifiers that map to themselves are listed too, so the full set of known
# libraries is visible in one place.
track_file_labels <- c(
  "Cassaum"          = "Positive Control\n(P.glauca)",
  "Da19"             = "Da19",
  "Da20"             = "Da20",
  "Da21"             = "Da21",
  "Da22"             = "Da22",
  "Da23-mif"         = "Da23-mif",
  "Da23-neo"         = "Da23-neo",
  "neg-PCR2"         = "neg-PCR2",
  "pJequei-N-norm-M" = "Non-normalized JQmc\nMiFish",
  "pJequei-N-norm-N" = "Non-normalized JQmc\nNeoFish",
  "pJequei-N-norm-T" = "Non-normalized JQmc\nTeleo",
  "pJequei-norm-M"   = "Normalized JQmc\nMiFish",
  "pJequei-norm-N"   = "Normalized JQmc\nNeoFish",
  "pJequei-norm-T"   = "Normalized JQmc\nTeleo",
  "SFJQ-mif"         = "Normalized SFJQmc\nMiFish",
  "SFJQ-neo"         = "Normalized SFJQmc\nNeoFish",
  "SFnNorm-mi"       = "Non-normalized SFmc\nMiFish",
  "SFnNorm-neo"      = "Non-normalized SFmc\nNeoFish",
  "SFNorm-mi"        = "Normalized SFmc\nMiFish",
  "SFNorm-neo"       = "Normalized SFmc\nNeoFish"
)

# Only rows whose File_name is a known identifier are relabelled; anything
# else is left exactly as it is.
track_known <- all_track$File_name %in% names(track_file_labels)
all_track$File_name[track_known] <-
  unname(track_file_labels[as.character(all_track$File_name[track_known])])


# save reads counts table
writexl::write_xlsx(
  x = all_track,
  path = "~/prjcts/fish_eDNA/sfjq/results/sfjq_read_counts_along_quality_control.xlsx",
  col_names = TRUE,
  format_headers = TRUE
)






# plot reads proportion throughout the pipeline ----


# Interactive check of the primers and libraries present in the tracking table.
track_tbl$Primer %>% unique()
track_tbl$File_name %>% unique()

#TODO
# https://bhaskarvk.github.io/colormap/
#https://www.thinkingondata.com/something-about-viridis-library/
#set colors here ss


# Ratio of reads entering quality filtering ("N-cleaned") to raw forward
# reads, per library (original note: loss from filtering out Ns).
all_track %>% mutate(perda = `N-cleaned`/`Raw FWD`)


#18 - set colors for downstream plots ----

# The palettes must be defined before they are previewed: previewing colors5
# ahead of its definition fails in a fresh session.
colors5 <- c("#017504","#000791","#820000","#780058","#ff5500") #neo,mi,tel,all
colors_norm <- c("#017504","#4fc952",
                 "#000791","#3862eb",
                 "#820000","#bf4b4b")

# Preview both palettes.
scales::show_col(colors_norm)
scales::show_col(colors5)

#PLOT2 - sample track plot ----

# Helpers kept for reference: print the current library names, one per line.
# # track_tbl$File_name %>% paste0('"\n"',collapse = "") %>% cat()
# track_tbl$File_name %>% unique() %>% base::sort() %>% paste0(collapse = '\n') %>%  cat()
# sample_levels


# Display labels for the facets of the tracking plot. "neg-PCR2" is
# deliberately absent from the table (it was disabled in the original list):
#track_tbl$File_name[track_tbl$File_name == "neg-PCR2"] <-
plot_file_labels <- c(
  "Cassaum"          = "Positive Control\n(P.glauca)",
  "Da19"             = "Da19",
  "Da20"             = "Da20",
  "Da21"             = "Da21",
  "Da22"             = "Da22",
  "Da23-mif"         = "Da23-mif",
  "Da23-neo"         = "Da23-neo",
  "pJequei-N-norm-M" = "Non-normalized JQmc\nMiFish",
  "pJequei-N-norm-N" = "Non-normalized JQmc\nNeoFish",
  "pJequei-N-norm-T" = "Non-normalized JQmc\nTeleo",
  "pJequei-norm-M"   = "Normalized JQmc\nMiFish",
  "pJequei-norm-N"   = "Normalized JQmc\nNeoFish",
  "pJequei-norm-T"   = "Normalized JQmc\nTeleo",
  "SFJQ-mif"         = "Normalized SFJQmc\nMiFish",
  "SFJQ-neo"         = "Normalized SFJQmc\nNeoFish",
  "SFnNorm-mi"       = "Non-normalized SFmc\nMiFish",
  "SFnNorm-neo"      = "Non-normalized SFmc\nNeoFish",
  "SFNorm-mi"        = "Normalized SFmc\nMiFish",
  "SFNorm-neo"       = "Normalized SFmc\nNeoFish"
)

# Relabel only the rows whose File_name is a known identifier.
plot_known <- track_tbl$File_name %in% names(plot_file_labels)
track_tbl$File_name[plot_known] <-
  unname(plot_file_labels[as.character(track_tbl$File_name[plot_known])])


# Facet order for the tracking plot.
sample_levels <- c(
  "Da23-mif", "Normalized SFJQmc\nMiFish",
  "Da23-neo", "Normalized SFJQmc\nNeoFish",
  "Da20", "Non-normalized SFmc\nMiFish",
  "Da19", "Non-normalized SFmc\nNeoFish",
  "Da22", "Normalized SFmc\nMiFish",
  "Da21", "Normalized SFmc\nNeoFish",
  "Non-normalized JQmc\nNeoFish", "Non-normalized JQmc\nMiFish", "Non-normalized JQmc\nTeleo",
  "Normalized JQmc\nNeoFish", "Normalized JQmc\nMiFish", "Normalized JQmc\nTeleo",
  "Positive Control\n(P.glauca)", "neg-PCR2"
)


# Prepare counts for plotting ----

# Reshape to long format: one row per library and pipeline stage, with that
# stage's read count in `Read Number`. pivot_longer() replaces the superseded
# gather(); the plot does not depend on row order.
track_tbl <- track_tbl %>%
  tidyr::pivot_longer(
    cols = tidyr::all_of(c("Raw REV", "Raw FWD",
                           "N-cleaned", "Filtered", "Denoised FWD",
                           "Denoised REV", "Merged", "Non-Chimeric")),
    names_to = "Stage",
    values_to = "Read Number") %>%
  # Stage levels are listed last-to-first so the earliest stage ends up at the
  # top of the (horizontal) bar chart.
  mutate(Stage = factor(Stage, levels = c("Non-Chimeric", "Merged", "Denoised REV", "Denoised FWD", "Filtered","N-cleaned", "Raw REV","Raw FWD"))) %>%
  mutate(
    Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo","NeoFish/MiFish",  "NeoFish/MiFish/Teleo")),
    File_name = factor(File_name,levels = sample_levels))

# Print read counts in full rather than in scientific notation.
options(scipen = 22)

# track_tbl %>% base::sort(track_tbl$Sample)

# plotting ----

track_plot <- track_tbl %>%
  # filter(Run %in% c("LGC_MiniSeq_1", "LGC_MiniSeq_2")) %>%
  # filter(Group %in% c("DNA pool")) %>%
  ggplot(aes(y = Stage, x = `Read Number`, fill = Primer, group = Stage)) +
  geom_col() +
  # Reference line at 300,000 reads. Read counts are on the x axis (Stage is
  # the discrete y axis), so the line has to be vertical.
  geom_vline(xintercept = 300000, col = 1, linetype = 2) +
  scale_fill_manual(labels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish","NeoFish/MiFish/Teleo"),
                    values = alpha(colour = colors5,
                                   alpha =  0.75)) +
  labs(title = "LGC eDNA 1st & 2nd runs",
       subtitle = "Read counts per library and filtering step",
       x = "Read counts",
       y = "Data filtering step") +
  facet_wrap(~File_name, ncol = 6) +
  coord_fixed(ratio = 60000) +
  theme_bw(base_size = 7) +
  theme(legend.position = "bottom") +
  theme(axis.title = ggtext::element_markdown())

track_plot







# save plot
# The same figure is written in three formats; PNG and SVG share one size,
# the PDF is larger.
track_plot_outputs <- list(
  list(path = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_sample_track_plot.png",
       device = "png", width = 12, height = 10),
  list(path = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_sample_track_plot.svg",
       device = "svg", width = 12, height = 10),
  list(path = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_sample_track_plot.pdf",
       device = "pdf", width = 20, height = 16)
)

for (out_spec in track_plot_outputs) {
  ggsave(filename = out_spec$path,
         plot = track_plot,
         device = out_spec$device,
         width = out_spec$width,
         height = out_spec$height,
         units = "cm",
         dpi = 600)
}


```

Read proportions are displayed below. The experimental design intended the same read yield for all libraries, between 200K and 250K reads. The deviations from this range are probably due to dosage/pipetting errors.

<br>

<br>

## Classify taxonomy

In this step the ASVs identified by the **DADA2** pipeline, jointly for all libraries of each primer, are matched (or not) to the sequences in the Reference 12S Sequences Database. DADA2 has two strategies to identify taxa. The first, _assignSpecies_, identifies perfect matches of the ASVs in the Reference Database. The second, _assignTaxonomy_, uses an RDP Naive Bayesian Classifier algorithm (Wang, 2007) with kmer size 8 and 100 bootstrap replicates to associate ASVs with the Reference Database sequences. In the latter, the taxonomic rank reached by the classification is proportional to the sequence similarity, although this relation is not yet clear to us.

```{r, eval=FALSE}

#19 - classify taxonomy exactly ----
# Exact matching of every non-chimeric ASV against the species-level
# reference FASTA.
# NOTE(review): per the dada2 documentation, allowMultiple = 10 caps the
# number of multiply-matched species reported per ASV and tryRC = TRUE also
# tests the reverse complement — confirm these are the intended settings.

all_sps <- dada2::assignSpecies(seqs = all_seqtab.nochim,allowMultiple = 10,
                         refFasta =  "~/prjcts/fish_eDNA/data/refs/db/LGC/jul21/dada_tax_fullDB_order_SPs_jul21.fasta",
                         tryRC=TRUE,
                         n = 20000,
                         verbose = TRUE)


#check how many ASVs were exactly identified as species

View(all_sps)


      # Tibble version of the result, with the ASV sequence (the row names of
      # all_sps) as a third column; the columns are then renamed by position.
      all_csv_sp <- all_sps %>% as_tibble() %>% mutate(ASV = rownames(all_sps))
      colnames(all_csv_sp) <- c("exact Genus", "exact Species", "ASV")

```

```{r, eval=FALSE}
#20 - classify taxonomy ----
# Classify every non-chimeric ASV against the full reference FASTA. The
# reference headers are parsed into the nine ranks given in taxLevels, and
# outputBootstraps = TRUE makes the result a list holding both the taxonomy
# ($tax, used when building the phyloseq object) and the bootstrap values ($boot).

all_taxa <- dada2::assignTaxonomy(seqs = all_seqtab.nochim,
                     refFasta =  "~/prjcts/fish_eDNA/data/refs/db/LGC/jul21/dada_tax_fullDB_order_jul21.fasta",
                           multithread=TRUE, tryRC=TRUE,taxLevels = c("Kingdom","Phylum","Class","Order","Family", "Genus", "Species","Specimen","Basin"),outputBootstraps = TRUE)


# Bootstrap support per ASV and rank.
all_taxa$boot

```

```{r, eval=FALSE,echo=FALSE}
# Classify taxonomy with DECIPHER
# NOTE(review): this chunk is hidden and not evaluated (echo=FALSE, eval=FALSE).
# It looks like exploratory scratch code: the first half follows the DECIPHER
# example data, and the tail references objects (asv_tax, tax_info) that are
# not created anywhere in the visible code.

# import training sequences
fas <- system.file("extdata", "50S_ribosomal_protein_L2.fas", package="DECIPHER")
dna <- readDNAStringSet(fas)

# parse the headers to obtain a taxonomy
s <- strsplit(names(dna), " ")
genus <- sapply(s, `[`, 1)
species <- sapply(s, `[`, 2)
taxonomy <- paste("Root", genus, species, sep="; ")
head(taxonomy)

# train the classifier
## Not run: 
trainingSet <- LearnTaxa(dna, taxonomy)
trainingSet


# Load the in-house 12S reference; the FASTA headers are used as the taxonomy.
LGC12Sdb <- Biostrings::readDNAStringSet(filepath ="~/prjcts/fish_eDNA/data/refs/db/LGC/fev21/decipher_tax_fullDB_order_fev21.fasta" )
##prepare DB
LGC12Sdb_trainingSet <- DECIPHER::LearnTaxa(train = LGC12Sdb,taxonomy = names(LGC12Sdb))
LGC12Sdb_trainingSet$problemGroups


## creating DNAStringSet object of our ASVs
# This overwrites the `dna` object loaded from the example file above.
dna <- DNAStringSet(getSequences(all_seqtab.nochim))

# NOTE(review): asv_tax is not defined in the visible code — this line would
# fail if the chunk were evaluated.
write.table(asv_tax, "ASVs_taxonomy.tsv", sep = "\t", quote=F, col.names=NA)

# NOTE(review): tax_info is likewise undefined in the visible code.
tax_info[[2]]

str(tax_info)
View(asv_tax)



library(decontam)


```

<br><br>

Here the **DADA2** pipeline ends.

<br><br><br><br>

## Phyloseq

On this step the ASVs associated to taxonomic ranks by **DADA2** and their respective counts by library, are combined using the **Phyloseq** package.

<br>

### Generate sample metadata table

Here the experiment metadata is associated to each sample.

```{r, eval=FALSE}
# 22 - create sample table ----

# Sample metadata: the distinct combinations of the first six columns of the
# sample sheet.
# NOTE(review): the columns are selected by position (1:6) — confirm that
# these are the per-sample metadata columns and include File_name.
# primers_n_samples$File_name
all_samdf <- unique(primers_n_samples[1:6])

# phyloseq matches samples through row names, so the metadata is converted to
# a plain data.frame whose row names are the File_name values.
samdf <- as.data.frame(all_samdf)
rownames(samdf) <- samdf$File_name
```

<br>

This sample metadata table was created with the information available for the samples analyzed on this first run. This table must be customized for each experiment.

<br>

```{r, eval=FALSE, echo=FALSE}
# knitr::kable(samdf,align = 'c')
#esta ruim

```

<br><br>


### **Phyloseq** data interpretation

```{r, eval=FALSE}
#23 - interpret dada on phyloseq ----
# Bundle the ASV count table (samples in rows, hence taxa_are_rows = FALSE),
# the sample metadata and the assigned taxonomy into one phyloseq object.

all_ps <- phyloseq(otu_table(all_seqtab.nochim, taxa_are_rows = FALSE),
                   sample_data(samdf),
                   # all_taxa is a list (taxonomy + bootstraps); only the
                   # taxonomy matrix is passed here.
                   tax_table(all_taxa$tax))
                   # tax_table(all_taxa))

# Sample names of the count table, for comparison with the metadata row names.
rownames(all_seqtab.nochim)

```

<br>

### Merge and Flex Phyloseq results 

Many different graphics can be generated, together or in isolation, for all primers/libraries and taxonomic ranks.

```{r, eval=FALSE}
#24 - merge ps analysis ----

# Flatten the phyloseq object into one row per ASV x sample, keep only the
# combinations with at least one read, and call the sequence column "ASV"
# (psmelt names it "OTU").
all_ps_tbl <- all_ps %>%
  psmelt() %>%
  as_tibble() %>%
  filter(Abundance >= 1) %>%
  dplyr::rename(ASV = OTU)

# Interactive checks: ASVs, samples and primers present in the table.
unique(all_ps_tbl$ASV)
# unique(neo_ps_tbl$ASV)
# unique(mif_ps_tbl$ASV)

all_ps_tbl$Sample %>% unique()
all_ps_tbl$Primer %>% unique()

# Attach the exact-match species assignment to every row of its ASV.
all_ps_tbl <- left_join(x = all_ps_tbl, y = all_csv_sp, by = "ASV")

# backup table
# all_ps_tbl_bckp <- all_ps_tbl
# all_ps_tbl <- all_ps_tbl_bckp

```

## metaBLASTEr - Identify ASVs with inhouse BLASTn


This package is currently under development, but fully functional. You will need a working NCBI-BLAST+ installation and a BLAST-formatted reference DB on the same Linux server your RStudio Server is running on (but we are improving it to run on IOS).

```{r, echo=TRUE,eval=FALSE}
# blastn ----

# Install BLASTr only when it is missing, so re-running this chunk does not
# reinstall the package every time.
if (!requireNamespace("BLASTr", quietly = TRUE)) {
  install.packages('BLASTr', repos = "https://heronoh.r-universe.dev")
}

# Annotate all ASVs by blastN
# Unique ASV sequences, as plain character strings, to be queried.
asvs_blast <- all_ps_tbl$ASV %>% unique() %>% as.character()


#Identify using metaBLASTr package ----
# parallel run with 2 threads ----
tictoc::tic("Parallel - Furrr 2 threads")

blast_res <- BLASTr::parallel_blast(
  db_path = "/data/databases/nt_jun2023/nt",
  # The query vector built above; the previous name (asvs_blast_all) is not
  # defined anywhere in this document.
  asvs = asvs_blast,
  out_file = "~/prjcts/fish_eDNA/sfjq/results/blast/blast_out_res_1.csv",
  out_RDS = "~/prjcts/fish_eDNA/sfjq/results/blast/blast_out_res_1.RDS",
  total_cores = 80,
  perc_id = 80,
  num_threads = 2,
  perc_qcov_hsp = 80,
  num_alignments = 3,
  blast_type = "blastn"
)

tictoc::toc()#


# #Save env
base::save.image("~/prjcts/fish_eDNA/sfjq//env-canastra_posBLAST-25jul23.RData")


colnames(blast_res)

# blast_res <- blast_res %>% rename("OTU" ="Sequence")
# blast_res <- blast_res %>% rename("Sequence" ="OTU")

# Copy of the results without the "OTU" column, keeping only rows that have a
# first subject header.
# NOTE(review): blast_res_full is not used again in the visible code, and the
# column names of blast_res depend on the BLASTr version — confirm that "OTU"
# and "ASV" both exist before relying on this and on the later join by "ASV".
blast_res_full <- bind_rows(blast_res) %>%
  select(-c("OTU")) %>%
  filter(!is.na(`1_subject header`))

nrow(blast_res)
dim(blast_res)

# Keep only rows where `1_res` equals 1 (original note: remove what returned
# nothing).
blast_res <- blast_res %>%  filter(`1_res` == 1 )

str(blast_res)
 
```

### Rename samples for plots

```{r, echo=TRUE,eval=FALSE}
# Interactive check of the sample sheet and the current facet order.
primers_n_samples
sample_levels

# Replace every value found among the names of `labels` by its label; values
# that are not listed are returned unchanged.
relabel_values <- function(values, labels) {
  known <- values %in% names(labels)
  values[known] <- unname(labels[as.character(values[known])])
  values
}

# Display labels for the Sample column. Libraries Da19-Da23 get the " B"
# suffix on top of the label of their counterpart.
sample_plot_labels <- c(
  "Cassaum"          = "Positive Control\n(P.glauca)",
  "Da19"             = "Non-normalized SFmc\nNeoFish B",
  "Da20"             = "Non-normalized SFmc\nMiFish B",
  "Da21"             = "Normalized SFmc\nNeoFish B",
  "Da22"             = "Normalized SFmc\nMiFish B",
  "Da23-mif"         = "Normalized SFJQmc\nMiFish B",
  "Da23-neo"         = "Normalized SFJQmc\nNeoFish B",
  "neg-PCR2"         = "neg-PCR2",
  "pJequei-N-norm-M" = "Non-normalized JQmc\nMiFish",
  "pJequei-N-norm-N" = "Non-normalized JQmc\nNeoFish",
  "pJequei-N-norm-T" = "Non-normalized JQmc\nTeleo",
  "pJequei-norm-M"   = "Normalized JQmc\nMiFish",
  "pJequei-norm-N"   = "Normalized JQmc\nNeoFish",
  "pJequei-norm-T"   = "Normalized JQmc\nTeleo",
  "SFJQ-mif"         = "Normalized SFJQmc\nMiFish",
  "SFJQ-neo"         = "Normalized SFJQmc\nNeoFish",
  "SFnNorm-mi"       = "Non-normalized SFmc\nMiFish",
  "SFnNorm-neo"      = "Non-normalized SFmc\nNeoFish",
  "SFNorm-mi"        = "Normalized SFmc\nMiFish",
  "SFNorm-neo"       = "Normalized SFmc\nNeoFish"
)

# File_name uses the same labels except for the two Da23 libraries.
# NOTE(review): "SFJQ-mif B" / "SFJQ-neo B" differ from the Sample labels for
# the same libraries and are not part of sample_levels — confirm whether this
# difference is intentional.
file_name_plot_labels <- sample_plot_labels
file_name_plot_labels[c("Da23-mif", "Da23-neo")] <- c("SFJQ-mif B", "SFJQ-neo B")

all_ps_tbl$File_name <- relabel_values(all_ps_tbl$File_name, file_name_plot_labels)
all_ps_tbl$Sample <- relabel_values(all_ps_tbl$Sample, sample_plot_labels)


all_ps_tbl$Group %>% unique() %>% base::sort()


# all_ps_tbl$Group <- all_ps_tbl$Group %>% unfactor()
# Display labels for the mock-community groups.
group_plot_labels <- c(
  "JqNnorm"          = "Non-normalized JQmc",
  "JqNorm"           = "Normalized JQmc",
  "SFnNorm"          = "Non-normalized SFmc",
  "SFNorm"           = "Normalized SFmc",
  "SFJQ"             = "Normalized SFJQmc",
  "Positive control" = "Positive control\n(P. glauca)"
)
all_ps_tbl$Group <- relabel_values(all_ps_tbl$Group, group_plot_labels)
all_ps_tbl$Group %>% unique()

# Fix the plotting order of the groups.
all_ps_tbl$Group <- factor(
  all_ps_tbl$Group,
  levels = c("Non-normalized JQmc",
             "Normalized JQmc",
             "Non-normalized SFmc",
             "Normalized SFmc",
             "Normalized SFJQmc",
             "Positive control\n(P. glauca)")
)


# Plotting order of the samples (replaces the order used for the read-count
# tracking plot).
sample_levels <- c(
  "Normalized SFJQmc\nMiFish B", "Normalized SFJQmc\nMiFish",
  "Normalized SFJQmc\nNeoFish B", "Normalized SFJQmc\nNeoFish",

  "Normalized SFmc\nMiFish B", "Normalized SFmc\nMiFish",
  "Non-normalized SFmc\nMiFish B", "Non-normalized SFmc\nMiFish",
  "Normalized SFmc\nNeoFish B", "Normalized SFmc\nNeoFish",
  "Non-normalized SFmc\nNeoFish B", "Non-normalized SFmc\nNeoFish",

  "Non-normalized JQmc\nMiFish", "Normalized JQmc\nMiFish",
  "Non-normalized JQmc\nNeoFish", "Normalized JQmc\nNeoFish",
  "Non-normalized JQmc\nTeleo", "Normalized JQmc\nTeleo",

  "Positive Control\n(P.glauca)", "neg-PCR2"
)

# all_ps_tbl_blast$Sample[all_ps_tbl_blast$Sample %in% c("pool não-normalizado\nMiFish")]

all_ps_tbl$Sample <- factor(all_ps_tbl$Sample, levels = sample_levels)


class(asvs_blast)


# Attach the BLAST results to every row of the matching ASV.
all_ps_tbl_blast <- left_join(x = all_ps_tbl, y = blast_res, by = "ASV")

colnames(all_ps_tbl_blast)

#all_ps_tbl_blast_bckp <- all_ps_tbl_blast
#all_ps_tbl_blast <- all_ps_tbl_blast_bckp
```


## ASVs seqs 

```{r,echo=TRUE, eval=FALSE}
#25 - recover all ASVs sequences to prepare fasta ----


#all ----
# giving our seq headers more manageable names (ASV_1, ASV_2...)
# all_asv_seqs <- tibble("ASV" = colnames(seqtab.nochim))
# One row per unique ASV sequence, with its length.
all_asv_seqs <- tibble("ASV" = asvs_blast) %>%
  mutate("ASV length" = nchar(ASV))

# Shortest ASVs first; the row position after sorting becomes the ASV number.
all_asv_seqs <- all_asv_seqs[base::order(all_asv_seqs$`ASV length`), ]

# Header format: ">ASV_<rank>_<length>bp". Built in one vectorised step, which
# also yields an empty column (instead of an error) when there are no ASVs.
all_asv_seqs <- all_asv_seqs %>%
  mutate("ASV header" = sprintf(">ASV_%d_%dbp", seq_len(n()), `ASV length`))


#combine ASV headers and all_ps_tbl
# The headers joined here do NOT carry the taxonomy suffix added further down.
all_ps_tbl_blast <- dplyr::left_join(x = all_ps_tbl_blast,
                                     y = all_asv_seqs,
                                     by = "ASV")


# making and writing out a fasta of our final ASV seqs with tax
# seq_len() keeps the loop from running at all when the table is empty.
for (asv in seq_len(nrow(all_asv_seqs))) {

  # Distinct taxonomy of this ASV, ranks pasted together with "|".
  tax <- all_ps_tbl_blast %>%
    filter(ASV == all_asv_seqs$ASV[asv]) %>%
    select("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Specimen") %>%
    unique() %>%
    paste0(collapse = "|")

  all_asv_seqs$`ASV header`[asv] <- paste0(all_asv_seqs$`ASV header`[asv], "_", tax)

  # if (condition) {
  # TODO: add a sanity check that the taxonomy string is as expected
  # }
}

#write fasta file with ASVs and Taxonomy
# rbind() + c() interleaves header and sequence lines.
all_asv_fasta <- c(rbind(all_asv_seqs$`ASV header`, all_asv_seqs$ASV))

write(all_asv_fasta, "~/prjcts/fish_eDNA/sfjq/results/sfjq_all_ASVs_all_primers.fasta")

```


### SWARM - ASVs to OTUs

```{r,echo=TRUE, eval=FALSE}
# #swarm

# One row per ASV with its total abundance over all samples; the abundance is
# appended to the header ("<header>_<abundance>") for the swarm input file.
# The table is ungrouped before the header is rewritten, so the rest of the
# chunk works on a plain tibble and no grouping column is modified.
asvs_abd <- all_ps_tbl_blast %>%
  group_by(`ASV`,`ASV header`) %>%
  mutate("ASV total abundance" = sum(Abundance)) %>%
  select(c(`ASV`,`ASV header`,`ASV total abundance`)) %>%
  unique() %>%
  ungroup() %>%
  mutate(`ASV header` = paste0(`ASV header`,"_",`ASV total abundance`))


asvs_abd$ASV %>% unique()
asvs_abd$`ASV header` %>% unique()


#write fasta file with ASVs and  abundance
# all_asv_fasta_abd <- c(rbind(asvs_abd$`ASV header primer`, asvs_abd$`ASV`))
all_asv_fasta_abd <- c(rbind(asvs_abd$`ASV header`, asvs_abd$`ASV`))

# write(all_asv_fasta_abd, "~/prjcts/fish_eDNA/sfjq/results/sfjq_ASVs_abd_primer.fasta")
# NOTE(review): results_path is not defined in the visible code — presumably
# "~/prjcts/fish_eDNA/sfjq/results", given the swarm command below; confirm.
write(all_asv_fasta_abd, paste0(results_path,"/sfjq_ASVs_abd.fasta"))

# swarm is run outside R, from the swarm folder:
# ~/prjcts/fish_eDNA/sfjq/swarm$ swarm -t 50 ~/prjcts/fish_eDNA/sfjq/results/sfjq_ASVs_abd.fasta -s sfjq_swarm.stats -o sfjq_swarm.out -w sfjq_representative_OTUs.fasta -i sfjq_swarm.structure -f


# Each line of the swarm output is treated as one cluster (OTU).
swarm_clust <- readr::read_lines("~/prjcts/fish_eDNA/sfjq/swarm/sfjq_swarm.out")


# OTU = line number of the swarm output that contains the ASV's header
# (without the leading ">"); 0 means the ASV was not found in any line.
asvs_abd <- asvs_abd %>% mutate("OTU"= 0)

for (asv in seq_len(nrow(asvs_abd))) {
  asv_id <- str_remove(asvs_abd$`ASV header`[asv], pattern = ">")
  cluster_lines <- which(str_detect(string = swarm_clust, pattern = asv_id))
  if (length(cluster_lines) > 0) {
    # If several lines match, the last one wins.
    asvs_abd$OTU[asv] <- max(cluster_lines)
  }
}


# Bring total abundance and OTU number back to the main table (columns picked
# by name rather than by position).
all_ps_tbl_blast <- left_join(x = all_ps_tbl_blast,
                              y = asvs_abd %>% select(ASV, `ASV total abundance`, OTU),
                              by = "ASV")

# all_ps_tbl_blast %>% select(`final ID`,OTU) %>% View()
# all_ps_tbl_blast %>% select(`final ID`,OTU) %>% select(OTU) %>% unique()
# all_ps_tbl_blast %>% select(ASV,`final ID`,OTU) %>% select(ASV) %>% unique()


all_ps_tbl_blast$OTU %>% unique()
all_ps_tbl_blast$Group %>% unique()

```

## Calculate sample abundances ----

```{r, eval=FALSE}

# Compute, for every ASV x sample row, the sample's total read count and the
# row's relative abundance (within its sample and across all samples).
# all_ps_tbl_blast_bckp2 <- all_ps_tbl_blast
# all_ps_tbl_blast <- all_ps_tbl_blast_bckp2


# Create the three columns up front; this fixes their position in the table
# before they are filled in below.
all_ps_tbl_blast <- all_ps_tbl_blast %>% 
  mutate("Relative abundance to all samples" = 0,
         "Relative abundance on sample" = 0,
         "Sample total abundance" = 0)

# Total number of reads across every sample.
abd_total <- sum(all_ps_tbl_blast$Abundance)




# Per-sample totals and the two relative abundances; the grouping is dropped
# again at the end.
all_ps_tbl_blast <- all_ps_tbl_blast %>%
  group_by(Sample) %>%
  mutate("Sample total abundance" = sum(Abundance),
         "Relative abundance to all samples" = Abundance/abd_total,
         "Relative abundance on sample" = Abundance/`Sample total abundance`) %>%
  ungroup()

```

### Set final identification from all possibilities

```{r, eval=FALSE}


# Exact-match binomial ("Genus species"); it reads "NA NA" when there is no
# exact match.
all_ps_tbl_blast <- all_ps_tbl_blast %>%
  mutate(`exact GenSp` = paste(`exact Genus`, `exact Species`, sep = " "))

# Final identification, taken from the first source that has a value:
#   1. exact species match,
#   2. classifier Species,
#   3. classifier Genus,
#   4. first 30 characters of the top BLAST subject header.
all_ps_tbl_blast <- all_ps_tbl_blast %>%
  mutate(
    "final ID" = case_when(
      !(`exact Species` %in% c(NA, "NA", "NA NA")) ~ as.character(`exact GenSp`),
      !(Species %in% c(NA, "NA")) ~ Species,
      !(Genus %in% c(NA, "NA")) ~ Genus,
      TRUE ~ substr(as.character(`1_subject header`), 1, 30)
    )
  )


```

### Group/correct species for plotting

```{r,echo=TRUE, eval=FALSE}
# all_ps_tbl_blast_bckp3 <- all_ps_tbl_blast

# Species detected
base::sort(unique(all_ps_tbl_blast$`final ID`))

# Label variants (name = label as it appears in the table) and the single
# name each one is collapsed to (value). Labels not listed are left untouched.
final_id_synonyms <- c(
  "Astyanax aff_fasciatus" = "Astyanax fasciatus",
  "Astyanax cf_fasciatus" = "Astyanax fasciatus",
  "Astyanax cf_lacustris" = "Astyanax lacustris",
  "Characidium sp" = "Characidium",
  "Hypostomus sp" = "Hypostomus",
  "Hoplias malabaricus/sp" = "Hoplias malabaricus",
  "Rhamdia aff_quelen" = "Rhamdia quelen",
  "Coptodon zillii KAUM:I:90126 m" = "Coptodon zillii",
  "Trachelyopterus cf_galeatus/galeatus" = "Trachelyopterus galeatus"
)

# Look every label up in the synonym table; unmatched labels (lookup gives NA)
# keep their current value.
all_ps_tbl_blast <- all_ps_tbl_blast %>%
  mutate(`final ID` = coalesce(unname(final_id_synonyms[`final ID`]),
                               `final ID`))

```


### Identify the expected ASV length range for each primer

```{r, eval=FALSE}

# Flag every ASV by whether its length falls inside the insert-length window
# expected for the primer (set) of its sample:
#   NeoFish               185-200 bp
#   MiFish                165-180 bp
#   Teleo                  60-75  bp
#   NeoFish/MiFish        165-200 bp
#   NeoFish/MiFish/Teleo  any of the three single-primer windows
# NOTE(review): the NeoFish/MiFish window is the contiguous 165-200 span,
# while the three-primer case excludes 181-184 bp - confirm this asymmetry is
# intended.
#
# The result is the factor column `Expected length` with values "in range",
# "out of range", or "FALSE" for rows that cannot be assessed (primer not in
# the list above, or primer/length missing). The computation is vectorised,
# so it also works on an empty table and does not stop on NA values the way a
# row-by-row `if ()` does.
all_ps_tbl_blast <- all_ps_tbl_blast %>%
  mutate(
    # Step 1: logical "inside the window?" (NA = cannot be assessed).
    `Expected length` = case_when(
      Primer == "NeoFish" ~ `ASV length` >= 185 & `ASV length` <= 200,
      Primer == "MiFish" ~ `ASV length` >= 165 & `ASV length` <= 180,
      Primer == "Teleo" ~ `ASV length` >= 60 & `ASV length` <= 75,
      Primer == "NeoFish/MiFish" ~ `ASV length` >= 165 & `ASV length` <= 200,
      Primer == "NeoFish/MiFish/Teleo" ~
        `ASV length` %in% c(60:75, 165:180, 185:200),
      TRUE ~ NA
    ),
    # Step 2: translate to the labels used downstream and factorize the column.
    `Expected length` = as.factor(case_when(
      is.na(`Expected length`) ~ "FALSE",
      `Expected length` ~ "in range",
      TRUE ~ "out of range"
    ))
  )

```


### Reorder table

```{r, eval=FALSE}

# Print the current column names, one per line, as a reference when editing
# the selection below.
cat(paste0(colnames(all_ps_tbl_blast), "\n"))


# all_ps_tbl_blast_bckp4 <- all_ps_tbl_blast
# all_ps_tbl_blast <- all_ps_tbl_blast_bckp4


# Keep only the columns listed here, in this order, then give the two ASV
# columns their final display names. For each of the three BLAST hits only
# subject header, subject, identity and length are kept; the remaining BLAST
# statistics (mismatches, gaps, query/subject start and end, e-value,
# bitscore) are deliberately left out.
all_ps_tbl_blast <- all_ps_tbl_blast %>%
  select(c(
    # sample metadata
    "Sample", "Group", "Type", "Primer", "File_name", "Library", "Run",
    # identification and abundances
    "final ID",
    "Abundance",
    "Relative abundance to all samples",
    "Relative abundance on sample",
    "Sample total abundance",
    # taxonomy
    "Kingdom", "Phylum", "Class", "Order", "Family",
    "Genus", "Species", "Specimen", "Basin",
    "exact Genus", "exact Species",
    "exact GenSp",
    # BLAST hits 1 to 3
    "1_subject header", "1_subject", "1_indentity", "1_length",
    "2_subject header", "2_subject", "2_indentity", "2_length",
    "3_subject header", "3_subject", "3_indentity", "3_length",
    # ASV information
    "ASV", "ASV length", "ASV header", "Expected length", "OTU"
  )) %>%
  rename(`ASV (Sequence)` = ASV,
         `ASV size (pb)` = `ASV length`)

# paste0(colnames(all_ps_tbl_blast),"\n") %>%  cat()

```

### Save complete table
```{r, eval=FALSE}

# Rows ordered from most to least abundant; records with zero reads dropped.
smp_abd_ID <- filter(
  all_ps_tbl_blast[rev(base::order(all_ps_tbl_blast$Abundance)), ],
  Abundance > 0
)

dim(smp_abd_ID)

# Export the complete per-ASV table.
writexl::write_xlsx(
  smp_abd_ID,
  path = "~/prjcts/fish_eDNA/sfjq/results/sfjq_all_analysis_info_06-03-22.xlsx",
  col_names = TRUE,
  format_headers = TRUE
)


# One summary row per sample: number of distinct ASVs with reads (all, out of
# the expected length range, in range) and number of distinct identifications
# among the in-range ASVs. Rows without a final ID are ignored.
ASVs_per_sample <- all_ps_tbl_blast %>%
  # filter(Run %in% c("LGC_MiniSeq_1", "LGC_MiniSeq_2")) %>%
  filter(!(`final ID` %in% c(NA, "NA"))) %>%
  mutate(Sample = factor(Sample, levels = sample_levels)) %>%
  group_by(Sample) %>%
  summarize(
    Library = unique(Library),
    Group = unique(Group),
    Primer = unique(Primer),
    `Total ASV` = n_distinct(`ASV (Sequence)`[Abundance != 0]),
    `ASVs out of range` = n_distinct(
      `ASV (Sequence)`[Abundance != 0 & `Expected length` == "out of range"]
    ),
    `ASVs in range` = n_distinct(
      `ASV (Sequence)`[Abundance != 0 & `Expected length` == "in range"]
    ),
    `Identified Species` = n_distinct(
      `final ID`[Abundance != 0 & `Expected length` == "in range"]
    )
  )


# Export the per-sample summary.
writexl::write_xlsx(
  ASVs_per_sample,
  path = "~/prjcts/fish_eDNA/sfjq/results/sfjq_ASVs_per_sample_06-07-21.xlsx",
  col_names = TRUE,
  format_headers = TRUE
)



```

## Curation: manual checking of the species assignments
```{r, eval=FALSE}


# Start the curated table from the abundance-ordered table and seed the
# curated identification with the automatic one.
all_ps_tbl_bl_cur <- smp_abd_ID %>%
  mutate(`revised final ID` = `final ID`)


# Labels currently present (reference for the corrections below).
sort(unique(all_ps_tbl_bl_cur$`revised final ID`))


# Manual corrections, applied in one pass. Every rule tests the label as it
# is BEFORE any correction; labels matching no rule are kept unchanged.
# - "Curimatella lepidura" and "NA lepidura/xenodon" both end up as
#   "Roeboides xenodon" (rows already labelled "Roeboides xenodon" stay so).
# - The Hoplias rules depend on the mock community (Group) of the row.
all_ps_tbl_bl_cur <- all_ps_tbl_bl_cur %>%
  mutate(`revised final ID` = case_when(
    `revised final ID` %in% c("Prochilodus", "Prochilodus argenteus") ~
      "Prochilodus argenteus/hartii",
    `revised final ID` %in% c("Pimelodus") ~ "Pimelodus pohli",
    `revised final ID` %in% c("Astyanax") ~ "Astyanax lacustris",
    `revised final ID` %in% c("Hypostomus") ~ "Hypostomus alatus",
    `revised final ID` %in% c("NA elegans/gilbert") ~ "Cyphocharax gilbert",
    `revised final ID` %in% c("NA lepidura/xenodon", "Curimatella lepidura") ~
      "Roeboides xenodon",

    # ambiguous Hoplias hit, resolved per pool
    `revised final ID` %in% c("Hoplias brasiliensis/intermedius") &
      Group %in% c("Non-normalized SFmc", "Normalized SFmc") ~
      "Hoplias intermedius",
    `revised final ID` %in% c("Hoplias brasiliensis/intermedius") &
      Group %in% c("Non-normalized JQmc", "Normalized JQmc") ~
      "Hoplias brasiliensis",

    # "Hoplias intermedius" hit outside the SF pools
    `revised final ID` %in% c("Hoplias intermedius") &
      Group %in% c("Normalized SFJQmc") ~
      "Hoplias brasiliensis/intermedius",
    `revised final ID` %in% c("Hoplias intermedius") &
      Group %in% c("Non-normalized JQmc", "Normalized JQmc") ~
      "Hoplias brasiliensis",

    TRUE ~ `revised final ID`
  ))

```

### Identify expected species

As this was a controlled experiment, we must identify the species that were expected and those that were not.

```{r, eval=FALSE}

# Load the table of species expected in each mock community.

# all_ps_tbl_blast_bckp5 <- all_ps_tbl_blast 
# all_ps_tbl_blast <- all_ps_tbl_blast_bckp5
# 
#                           all_ps_tbl_blast <- all_ps_tbl_bl_cur %>%
#                             mutate(`revised final ID` = `final ID`)



# Earlier versions of the species file:
# expctd_sps_tbl <- read.csv(file = "~/prjcts/fish_eDNA/sfjq/data/sfjq_species.csv",
# expctd_sps_tbl <- read.csv(file = "~/prjcts/fish_eDNA/sfjq/data/sfjq_species_06mar22.csv",
# expctd_sps_tbl <- read.csv(file = "~/prjcts/fish_eDNA/sfjq/data/sfjq_species_10mar22.csv",
# check.names = FALSE keeps the column headers exactly as written in the file.
expctd_sps_tbl <- as_tibble(
  read.csv(file = "~/prjcts/fish_eDNA/sfjq/data/sfjq_species_02jun22.csv",
           header = TRUE, check.names = FALSE)
)

# T. chalceus was changed from 0.0073 to 0.0299 in the SF pool

# Name the species column like the curated ID column of the results table so
# the two tables can be joined on it.
names(expctd_sps_tbl)[names(expctd_sps_tbl) == "Species"] <- "revised final ID"

```




### Proportions table

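As this was a controlled experiment, we must identify the species that were expected and those that were not. Here the detections of each mock community (JQ, SF and SFJQ) are joined to the list of species expected in that pool, so that the input DNA proportions can be compared with the relative read abundances.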

```{r, eval=FALSE}

# Proportion tables ----
#
# The same two steps are run for each mock-community pool (JQ, SF, SFJQ):
#   1. join the curated detections of the pool's samples with the species
#      expected in that pool and flag each row as expected / not expected;
#   2. summarise, per sample and species, the input proportion and the summed
#      relative read abundance.
# Both steps live in helpers so the three pools cannot drift apart.

# Step 1. Join detections with the expected species of one pool.
#   detections   curated results table (needs `Sample`, `revised final ID`)
#   expected     expected-species table (needs `Pool`, `revised final ID`,
#                `Full name`); its columns 2:9 are taken BY POSITION
#   pool         value of `expected$Pool` to keep ("JQ", "SF", "SFJQ")
#   pool_samples names of the samples belonging to this pool
# A full join is used on purpose: with a left join, expected species that were
# never detected (e.g. P. hartii) would not show up.
# Returns the joined table plus the column `Expected Species`, which is
# "expected" when the row matched the expected table (`Full name` not NA).
join_expected_species <- function(detections, expected, pool, pool_samples) {
  joined <- full_join(
    detections[detections$Sample %in% pool_samples, ],
    expected[expected$Pool == pool, c(2:9)],
    by = "revised final ID"
  )

  joined %>%
    mutate(`Expected Species` = if_else(!is.na(`Full name`),
                                        "expected", "not expected"))
}

# Step 2. Per-sample, per-species proportions for one pool.
#   joined       output of join_expected_species()
#   pool_samples sample names of the pool, in the desired factor-level order;
#                names starting with "Normalized " are paired with the
#                "Percentage on respective Norm pool" column, names starting
#                with "Non-normalized " with the "... Non-norm pool" column
#   keep_cols    column positions kept before reshaping. NOTE: positional, so
#                it silently depends on the column layout of `joined`.
# Rows whose Sample is not in `pool_samples` (including the NA samples of
# expected-but-undetected species) are dropped by the filter.
# The grouping keys (Sample, `revised final ID`) are carried by group_by();
# they are not re-declared inside summarize().
summarise_pool_proportions <- function(joined, pool_samples,
                                       keep_cols = c(1:24, 37:50)) {
  norm_ids <- pool_samples[startsWith(pool_samples, "Normalized ")]
  nonnorm_ids <- pool_samples[startsWith(pool_samples, "Non-normalized ")]

  joined %>%
    select(all_of(keep_cols)) %>%
    pivot_longer(cols = c("Percentage on respective Norm pool",
                          "Percentage on respective Non-norm pool"),
                 names_to = "Pool", values_to = "Proportion") %>%
    # mutate(Proportion = Proportion*100) %>%
    mutate(Sample = factor(Sample, levels = pool_samples)) %>%
    # keep, for each sample, only the proportion column of its own pool type
    filter((Sample %in% nonnorm_ids &
              Pool %in% c("Percentage on respective Non-norm pool")) |
             (Sample %in% norm_ids &
                Pool %in% c("Percentage on respective Norm pool"))) %>%
    group_by(Sample, `revised final ID`) %>%
    summarize(Proportion = unique(Proportion),
              `Relative abundance on sample` = sum(`Relative abundance on sample`),
              Primer = unique(Primer),
              `Expected Species` = unique(`Expected Species`),
              Pool = unique(Pool),
              `Num ASVs` = length(`ASV (Sequence)`),
              `Num OTUs` = length(unique(OTU))) %>%
    ungroup()
}


# JQmc ----
jq_samples <- c(
  "Normalized JQmc\nNeoFish", "Non-normalized JQmc\nNeoFish",
  "Normalized JQmc\nMiFish", "Non-normalized JQmc\nMiFish",
  "Normalized JQmc\nTeleo", "Non-normalized JQmc\nTeleo"
)

all_ps_tbl_jq <- join_expected_species(all_ps_tbl_bl_cur, expctd_sps_tbl,
                                       pool = "JQ", pool_samples = jq_samples)

# all_ps_tbl_jq %>% select(`revised final ID`,Abundance,`Relative abundance on sample`, 64:73) %>% View()

# interactive checks
unique(all_ps_tbl_jq$`revised final ID`)
colnames(all_ps_tbl_jq)

jq_proportions <- summarise_pool_proportions(all_ps_tbl_jq, jq_samples)


# SFmc ----
sf_samples <- c(
  "Normalized SFmc\nNeoFish", "Non-normalized SFmc\nNeoFish",
  "Normalized SFmc\nMiFish", "Non-normalized SFmc\nMiFish"
)

all_ps_tbl_sf <- join_expected_species(all_ps_tbl_bl_cur, expctd_sps_tbl,
                                       pool = "SF", pool_samples = sf_samples)

unique(all_ps_tbl_sf$`revised final ID`)
colnames(all_ps_tbl_sf)

sf_proportions <- summarise_pool_proportions(all_ps_tbl_sf, sf_samples)


# SFJQmc ----
# View(expctd_sps_tbl[expctd_sps_tbl$Pool=="SFJQ",c(2:9)])
# Only normalized samples exist for this pool.
sfjq_samples <- c("Normalized SFJQmc\nNeoFish", "Normalized SFJQmc\nMiFish")

all_ps_tbl_sfjq <- join_expected_species(all_ps_tbl_bl_cur, expctd_sps_tbl,
                                         pool = "SFJQ",
                                         pool_samples = sfjq_samples)

unique(all_ps_tbl_sfjq$`revised final ID`)
colnames(all_ps_tbl_sfjq)

sfjq_proportions <- summarise_pool_proportions(all_ps_tbl_sfjq, sfjq_samples)



# this table will be used
all_ps_tbl_sfjq_full <- dplyr::bind_rows(all_ps_tbl_sfjq, all_ps_tbl_sf,
                                         all_ps_tbl_jq)



# Print its column names in a copy-paste friendly form.
cat(paste0(unique(colnames(all_ps_tbl_sfjq_full)), collapse = '",\n"'))


```


### Pearson correlations between DNA input and sequence yield

```{r,echo=TRUE, eval=FALSE}

# Correlation between DNA input and read abundance -----

# Pull the rows of one sample out of a proportions table and keep columns
# 2, 3 and 4 BY POSITION. The cor.test() calls below use the columns
# `Proportion` and `Relative abundance on sample` of the result, so both are
# expected to be among those three. Rows containing NA are dropped.
#   proportions  one of jq_proportions / sf_proportions / sfjq_proportions
#   sample_name  value(s) of `Sample` to keep
# Returns a plain data.frame.
proportions_for_sample <- function(proportions, sample_name) {
  proportions %>%
    ungroup() %>%
    filter(Sample %in% sample_name) %>%
    select(c(2, 3, 4)) %>%
    as.data.frame() %>%
    na.exclude()
}

# JQmc
jq_df_neo_norm <- proportions_for_sample(jq_proportions, "Normalized JQmc\nNeoFish")
jq_df_mif_norm <- proportions_for_sample(jq_proportions, "Normalized JQmc\nMiFish")
jq_df_tel_norm <- proportions_for_sample(jq_proportions, "Normalized JQmc\nTeleo")
jq_df_neo_skew <- proportions_for_sample(jq_proportions, "Non-normalized JQmc\nNeoFish")
jq_df_mif_skew <- proportions_for_sample(jq_proportions, "Non-normalized JQmc\nMiFish")
jq_df_tel_skew <- proportions_for_sample(jq_proportions, "Non-normalized JQmc\nTeleo")

# SF
sf_df_neo_norm <- proportions_for_sample(sf_proportions, "Normalized SFmc\nNeoFish")
sf_df_mif_norm <- proportions_for_sample(sf_proportions, "Normalized SFmc\nMiFish")
sf_df_neo_skew <- proportions_for_sample(sf_proportions, "Non-normalized SFmc\nNeoFish")
sf_df_mif_skew <- proportions_for_sample(sf_proportions, "Non-normalized SFmc\nMiFish")

# SFJQ (prepared here; no correlation test is run on them in this chunk)
sfjq_df_neo_norm <- proportions_for_sample(sfjq_proportions, "Normalized SFJQmc\nNeoFish")
sfjq_df_mif_norm <- proportions_for_sample(sfjq_proportions, "Normalized SFJQmc\nMiFish")




# Pearson correlations: input proportion vs. relative read abundance.
# The calls are kept explicit so each printed result names its data frame.
# JQmc
cor.test(x = jq_df_neo_norm$Proportion,
         y = jq_df_neo_norm$`Relative abundance on sample`,
         method = "pearson")


cor.test(x = jq_df_neo_skew$Proportion,
         y = jq_df_neo_skew$`Relative abundance on sample`,
         method = "pearson")



cor.test(x = jq_df_mif_norm$Proportion,
         y = jq_df_mif_norm$`Relative abundance on sample`,
         method = "pearson")


cor.test(x = jq_df_mif_skew$Proportion,
         y = jq_df_mif_skew$`Relative abundance on sample`,
         method = "pearson")


cor.test(x = jq_df_tel_norm$Proportion,
         y = jq_df_tel_norm$`Relative abundance on sample`,
         method = "pearson")


cor.test(x = jq_df_tel_skew$Proportion,
         y = jq_df_tel_skew$`Relative abundance on sample`,
         method = "pearson")

# SF
cor.test(x = sf_df_neo_norm$Proportion,
         y = sf_df_neo_norm$`Relative abundance on sample`,
         method = "pearson")


cor.test(x = sf_df_neo_skew$Proportion,
         y = sf_df_neo_skew$`Relative abundance on sample`,
         method = "pearson")



cor.test(x = sf_df_mif_norm$Proportion,
         y = sf_df_mif_norm$`Relative abundance on sample`,
         method = "pearson")


cor.test(x = sf_df_mif_skew$Proportion,
         y = sf_df_mif_skew$`Relative abundance on sample`,
         method = "pearson")

```


## Richness analysis on Vegan
```{r,echo=TRUE, eval=FALSE}
library(vegan)
# data(dune)
# decorana(dune)

# class(dune)
#1- prepare data for entry in vegan ----
# One row per sample, one column per identification (`final ID`), holding the
# summed relative read abundance of in-range ASVs; absent species become 0.
# `Group` is split at its first space into Normalization and Mock Community.
# NOTE(review): species columns come from `final ID`, not from the curated
# `revised final ID` - confirm this is intended.
all_ps_blst_vegan <- all_ps_tbl_bl_cur %>%
  filter(`Expected length` %in% c("in range")) %>%
  mutate("Normalization" = str_split(Group, pattern = " ", n = 2, simplify = TRUE)[, 1],
         "Mock Community" = str_split(Group, pattern = " ", n = 2, simplify = TRUE)[, 2]) %>%
  filter(!(Sample %in% c("Positive Control\n(P.glauca)", "neg-PCR2"))) %>%   #remove control samples
  select(c(Sample, Group, Normalization, `Mock Community`, Type, Primer,
           File_name, Library, Run, `final ID`, `Relative abundance on sample`)) %>%
  group_by(Sample, `final ID`, Group, Type, Primer, File_name, Library, Run,
           Normalization, `Mock Community`) %>%
  summarise(`Relative abundance on sample` = sum(`Relative abundance on sample`)) %>%
  # id_cols is passed by name: newer tidyr places `...` right after `data`,
  # so an unnamed second argument is no longer taken as id_cols.
  pivot_wider(id_cols = c(Sample, Group, Type, Primer, File_name, Library, Run,
                          Normalization, `Mock Community`),
              names_from = `final ID`,
              values_from = `Relative abundance on sample`) %>%
  mutate_if(is.numeric, ~replace(., is.na(.), 0)) %>%
  # mutate(Library = unfactor(Library)) %>%
  ungroup() %>%
  #2- associate sample numbers to sample names ----
  # consecutive row numbers (stored as doubles), assigned BEFORE the run
  # filter below, so numbers may have gaps afterwards
  mutate(`Sample number` = as.numeric(seq_len(n()))) %>%
  select(`Sample number`, everything()) %>%
  mutate(Normalization = factor(Normalization))

# drop the ecomol samples to keep things simple: only the two LGC MiniSeq
# runs are kept

all_ps_blst_vegan <- all_ps_blst_vegan[all_ps_blst_vegan$Run %in% c("LGC_MiniSeq_1", "LGC_MiniSeq_2"),] 




# Interactive checks. Columns 1:10 are `Sample number` plus the nine metadata
# columns; everything after them is species abundances.
colnames(all_ps_blst_vegan)
hist(colSums(all_ps_blst_vegan[,-c(1:10)]))
hist(rowSums(all_ps_blst_vegan[,-c(1:10)]))
all_ps_blst_vegan[,-c(1:10)]

all_ps_blst_vegan %>% select(Sample, `Sample number`)
# all_ps_blst_vegan %>% select(`Sample number`, 1:(ncol(.)-1))

#3- create data.frame of species counts: rownames are Sample numbers ----

all_ps_blst_vegan_df <- all_ps_blst_vegan %>% 
  # filter(Run %in% c("LGC_MiniSeq_1", "LGC_MiniSeq_2")) %>% 
  select(-c("Sample", "Group", "Type", "Primer", "File_name", "Library", "Run","Normalization","Mock Community")) %>% 
  select(base::sort(colnames(.))) %>% 
  as.data.frame() 

#4- name rows as Sample numbers and remove column ----
row.names(all_ps_blst_vegan_df) <- all_ps_blst_vegan_df$`Sample number`
all_ps_blst_vegan_df <- all_ps_blst_vegan_df %>% 
  select(-c(`Sample number`))

library(vegan)
#5- Detrended correspondence analysis (decorana) on the sample x species table

all_ps_ord <- decorana(veg = all_ps_blst_vegan_df)

# Inspect the result.
summary(all_ps_ord)

str(all_ps_ord)

all_ps_ord$cproj


# Default plot, then points only, then text with crowding avoided.
plot(all_ps_ord)
plot(all_ps_ord, type = "p")
plot(all_ps_ord, type = "c")

# Overlays drawn on the last plot: site points, site labels, species labels.
points(all_ps_ord, display = "sites", cex = 0.8, pch = 21, col = "red", bg = "yellow")
text(all_ps_ord, display = "sites", cex = 0.7, col = "blue")
text(all_ps_ord, display = "spec", cex = 0.7, col = "blue")



#6- NMDS analysis ----



# library(vegan)
# data(varespec)
#6a- Calculate distances ----
# Bray-Curtis dissimilarities between samples, all detected species included.
all_ps_vg_dist <- vegdist(all_ps_blst_vegan_df, method="bray")

vegan::scores(all_ps_vg_dist)

# all_ps_vg_dist_metaMDS <- metaMDS(comm = all_ps_vg_dist, autotransform = FALSE) 
# actually autotransform = FALSE doesn't seem to change the results

# plot(all_ps_vg_dist_metaMDS)

# all_ps_vg_dist_metaMDS_2 <- metaMDS(comm = all_ps_vg_dist, distance = "bray", k =2)

# plot(all_ps_vg_dist_metaMDS_2)

# select only the expected species?
# `expected_sps` must already exist at this point (it is not created in this
# chunk). drop = FALSE keeps a data.frame even if a single column matches.

all_ps_blst_vegan_df %>% ncol()
all_ps_blst_vegan_df <- all_ps_blst_vegan_df[, colnames(all_ps_blst_vegan_df) %in% expected_sps, drop = FALSE]


all_ps_blst_vegan_df %>% ncol()

# Recompute distances and the DCA on the expected species only.
all_ps_vg_dist <- vegdist(all_ps_blst_vegan_df, method="bray")

all_ps_ord <- decorana(veg = all_ps_blst_vegan_df)

all_ps_ord %>% summary()

all_ps_ord %>% str()

all_ps_ord$cproj
all_ps_ord


plot(all_ps_ord)
plot(all_ps_ord,type = "p")
plot(all_ps_ord,type = "c") 
vegan::scores(all_ps_vg_dist)




# all_ps_blst_vegan_df[,(colnames(all_ps_blst_vegan_df) %in% expected_sps)] %>% colnames()
# all_ps_blst_vegan_df%>% colnames()



# NMDS on the precomputed Bray-Curtis distances. metaMDS() has no `veg`
# argument, so only `comm` is passed; because `comm` is a dist object, the
# result carries site (sample) scores only.
all_ps_vegan_ord_meta <- metaMDS(comm = all_ps_vg_dist)
# actually autotransform = FALSE doesn't seem to change the results
plot(all_ps_vegan_ord_meta, type = "t")


all_ps_vegan_ord_meta %>% str()
all_ps_vegan_ord_meta$stress


  
#6b- extract NMDS scores from results

# Site (sample) scores as a tibble. display = "sites" is requested explicitly
# so that scores() returns a single matrix; its row names are the sample
# numbers, which become the numeric join key `Sample number`.
all_vegan_meta <- vegan::scores(all_ps_vegan_ord_meta, display = "sites") %>%
  tidyr::as_tibble(rownames = "Sample number") %>%
  mutate(`Sample number` = as.numeric(`Sample number`))
# (earlier alternative, kept for reference)
# all_vegan_meta <- as.data.frame(vegan::scores(all_ps_vegan_ord_meta))
# all_vegan_meta$`Sample number` <- rownames(all_vegan_meta) %>% as.numeric()

#7- bring NMDS scores to complete table
# Columns 1:10 of all_ps_blst_vegan are `Sample number` plus the sample
# metadata; the NMDS axes are attached to them by sample number.

all_vegan_meta_tbl <- left_join(x = unique(all_ps_blst_vegan[, c(1:10)]),
                                y = all_vegan_meta,
                                by = "Sample number") %>%
  mutate(Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo")),
         `Mock Community` = factor(`Mock Community`))





library(factoextra)
library(ggforce)



# NMDS scatter plot: one point per sample, coloured by primer, shaped by
# normalization, with one labelled ellipse per mock community and the NMDS
# stress printed at a fixed position.
nmds_PLOT <- all_vegan_meta_tbl %>%
  # filter(Run %in% c("LGC_MiniSeq_1", "LGC_MiniSeq_2")) %>%
  # `group` is lower-case: aesthetic names are case-sensitive.
  ggplot(aes(x = NMDS1, y = NMDS2, col = Primer, shape = Normalization,
             label = Sample, group = `Mock Community`)) +
  # stat_ellipse()+
  geom_point(size = 11) +
  theme_light(base_size = 18) +
  theme(legend.position = "bottom") +
  coord_fixed(ratio = 1) +
  # ggrepel::geom_label_repel(label.size = 0.8,size = 3,min.segment.length = 2) +
  # ggrepel::geom_text_repel(col="black",size = 3,min.segment.length = 2) +
  scale_color_manual(
    # labels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish", "NeoFish/MiFish/Teleo"),
    labels = c("NeoFish", "MiFish", "Teleo"),
    values = alpha(colour = colors_norm[c(1, 3, 5)])) +
  annotate(geom = "text",
           x = c(0.275), y = c(-0.275),
           label = paste0("Stress: ", round(all_ps_vegan_ord_meta$stress, digits = 4)),
           size = 5) +
  # ggforce ellipses, one per mock community (own aesthetics, nothing inherited)
  ggforce::geom_mark_ellipse(inherit.aes = FALSE,
                             aes(x = NMDS1, y = NMDS2,
                                 group = `Mock Community`,
                                 label = `Mock Community`),
                             n = 100,
                             expand = 0.03,
                             label.fontsize = 20, con.cap = 0.1)

    # facet_wrap(~`Mock Community`,ncol = 2)

nmds_PLOT

# Save as PDF and PNG. The argument is spelled out as `filename` instead of
# relying on partial matching of `file`.
# ggsave(filename = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_NMDS.pdf",
ggsave(filename = "~/outros/sfjq_temp/SFJQ_NMDS.pdf",
       plot = nmds_PLOT,
       device = "pdf",
       width = 40,
       height = 25,
       units = "cm",
       dpi = 300)

ggsave(filename = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_NMDS.png",
       plot = nmds_PLOT,
       device = "png",
       width = 31,
       height = 20,
       units = "cm",
       dpi = 300)




# anosim----

################################# anosin function###############################
# Run vegan::anosim() on a wide sample x species table.
#
# Arguments
#   tbl           table with a `Sample number` column in position 1, followed
#                 by `cols_out - 1` further metadata columns and then the
#                 species columns
#   cols_out      number of leading non-species columns (including
#                 `Sample number`); species start at column cols_out + 1
#   Coluna        name (string) of the column of `tbl` used as grouping
#   permutations  number of permutations passed to anosim() (default 9999)
#   distance      dissimilarity index passed to anosim() (default "bray")
#
# Returns the object produced by anosim().
#
# Stops with an explicit message when a required column is missing or when
# `cols_out` leaves no species column. Warns when `Coluna` holds fewer than
# two distinct observed values (e.g. after filtering to a single mock
# community), since the comparison then has nothing to contrast.
anosin_auto <- function(tbl, cols_out, Coluna,
                        permutations = 9999, distance = "bray") {
  if (!("Sample number" %in% colnames(tbl))) {
    stop("`tbl` must contain a 'Sample number' column.", call. = FALSE)
  }
  if (!(Coluna %in% colnames(tbl))) {
    stop("Grouping column '", Coluna, "' not found in `tbl`.", call. = FALSE)
  }
  if (cols_out >= ncol(tbl)) {
    stop("`cols_out` must be smaller than the number of columns of `tbl`.",
         call. = FALSE)
  }

  grouping <- tbl[[Coluna]]
  if (length(unique(grouping[!is.na(grouping)])) < 2) {
    warning("'", Coluna, "' has fewer than two observed groups; ",
            "the ANOSIM result is not meaningful.", call. = FALSE)
  }

  # species matrix: sample numbers become row names, then the id column goes
  community <- as.data.frame(tbl[, c(1, (1 + cols_out):ncol(tbl))])
  rownames(community) <- community$`Sample number`
  community$`Sample number` <- NULL

  anosim(community, grouping = grouping,
         permutations = permutations, distance = distance, strata = NULL)
}
################################################################################


# Primers ----
# Each call tests whether community composition differs between primers.
# cols_out = 10: the first ten columns of all_ps_blst_vegan are not species.

# All mock communities together
anosin_auto(all_ps_blst_vegan, cols_out = 10, Coluna = "Primer")

# JQ only
anosin_auto(filter(all_ps_blst_vegan, `Mock Community` %in% c("JQmc")),
            cols_out = 10, Coluna = "Primer")

# SF only
anosin_auto(filter(all_ps_blst_vegan, `Mock Community` %in% c("SFmc")),
            cols_out = 10, Coluna = "Primer")

# SFJQ only
anosin_auto(filter(all_ps_blst_vegan, `Mock Community` %in% c("SFJQmc")),
            cols_out = 10, Coluna = "Primer")


# Normalization ----
# Same comparisons, grouping by normalized vs. non-normalized pools.

# All mock communities together
anosin_auto(all_ps_blst_vegan, cols_out = 10, Coluna = "Normalization")

# JQ only
anosin_auto(filter(all_ps_blst_vegan, `Mock Community` %in% c("JQmc")),
            cols_out = 10, Coluna = "Normalization")

# SF only
anosin_auto(filter(all_ps_blst_vegan, `Mock Community` %in% c("SFmc")),
            cols_out = 10, Coluna = "Normalization")

# SFJQ only
anosin_auto(filter(all_ps_blst_vegan, `Mock Community` %in% c("SFJQmc")),
            cols_out = 10, Coluna = "Normalization")


# Which sample number belongs to which sample and primer (LGC runs only).
select(
  filter(all_vegan_meta_tbl, Run %in% c("LGC_MiniSeq_1", "LGC_MiniSeq_2")),
  Sample, `Sample number`, Primer
)


```

### Plotting ASVs

```{r,echo=TRUE, eval=FALSE}

# all_ps_tbl_blast_bckp6 <- all_ps_tbl_blast

# From here on the plots use the curated table.
all_ps_tbl_blast <- all_ps_tbl_bl_cur 




#28- ASVs plots by sample and species ----

# Fix the random seed so the jitter below is reproducible. set.seed() is
# called directly; it returns NULL, so wrapping it in options() set nothing.
set.seed(13)

#28a- ASV size distribution - alphabetical----
all_ps_tbl_blast$Run %>% unique()

# One point per ASV record: x = ASV length, y = sample (jittered vertically
# only), colour = primer, size = relative abundance in the sample,
# shape = inside/outside the expected length range. The positive control and
# the ecomol iSeq run are left out.
ASV_size_by_Sample <- all_ps_tbl_blast %>%
  filter(!Sample %in% c("Positive Control\n(P.glauca)")) %>% 
  filter(!Run %in% c("ecomol_iSeq")) %>% 
  mutate(Sample = factor(Sample,levels = sample_levels)) %>% 
  mutate(Primer = factor(Primer,levels = c("NeoFish", "MiFish", 
                                           "Teleo", "NeoFish/MiFish/Teleo"))) %>% 
  ggplot(aes(y=Sample,
             x=`ASV size (pb)`,
             colour = Primer,
             size=`Relative abundance on sample`,
             shape=`Expected length`
             )) +
  geom_jitter(height = 0.2,
              width = 0) +
  ggplot2::scale_colour_manual(
                     values = ggplot2::alpha(colour = colors5[1:4] ,alpha =  0.3)) +
  coord_fixed(ratio = 8) +
  scale_x_continuous(breaks = c(20,60,80,100,120,140,160,180,200,220,240,260,280,300,320,340),expand = c(0.02,0.02)) +
  xlab("ASV length (bp)") +
  ylab("Sample") +
  ggtitle(label = "SFJQ mock communities ",
          subtitle = "All ASVs found in samples, by length and abundance") +
  theme_bw(base_size = 15) +
  theme(legend.position = "right") 

ASV_size_by_Sample


# Save the figure as png, svg and pdf with identical settings. `filename` is
# spelled out instead of relying on partial matching of `file`.
for (fig_ext in c("png", "svg", "pdf")) {
  ggsave(filename = paste0("~/prjcts/fish_eDNA/sfjq/results/figs/3-ASV_size_by_sample.",
                           fig_ext),
         plot = ASV_size_by_Sample,
         device = fig_ext,
         width = 18,
         height = 10,
         dpi = 600)
}

# Close the on-screen device left open by printing the plot, if there is one;
# calling dev.off() with no open device is an error.
if (dev.cur() > 1) {
  dev.off()
}



##################### paper ################################# ----

#29 - ASVS - sample X species (12S) - distribution: blast---- 
# Groups present in the table.
unique(all_ps_tbl_blast$Group)

#Jq & SF DNA pools - ASVs size and species by sample----

# Sorted sample names, then both ID columns printed in a form that can be
# pasted straight into a c("...", "...") vector of factor levels.
base::sort(unique(unfactor(all_ps_tbl_blast$Sample)))
cat(paste0(base::sort(unique(all_ps_tbl_blast$`revised final ID`)), collapse = '", \n"'))
cat(paste0(base::sort(unique(all_ps_tbl_blast$`final ID`)), collapse = '", \n"'))


# Two-line display label for each pool, keyed by the pool identifier.
pool_labels <- c(
  "Normalized JQmc"     = "Jequitinhonha\nNormalized DNA pool",
  "Non-normalized JQmc" = "Jequitinhonha\nNon-Normalized DNA pool",
  "Normalized SFJQmc"   = "São Francisco & Jequitinhonha\nNormalized DNA pool",
  "Non-normalized SFmc" = "São Francisco\nNon-Normalized DNA pool",
  "Normalized SFmc"     = "São Francisco\nNormalized DNA pool"
)

# Pool identifiers in display order (JQ pools, combined pool, SF pools).
pools_levels <- c(
  "Normalized JQmc", "Non-normalized JQmc",
  "Normalized SFJQmc",
  "Normalized SFmc", "Non-normalized SFmc"
)

#final ID levels
# Ordering of the `final ID` labels, grouped by where the taxon belongs.
# NOTE(review): the "final ID levels 2" definition further down assigns
# `finalID_levels` again, replacing this vector.
finalID_levels <- c(
  # Jequitinhonha (jq)
  "Astyanax lacustris", "Australoheros sp", "Delturus brevis",
  "Eugerres brasilianus", "Hypomasticus steindachneri",
  "Hypostomus nigrolineatus", "Megaleporinus elongatus",
  "Megaleporinus garmani", "Moenkhausia costae", "Rhamdia quelen",
  "Steindachneridion amblyurum", "Wertheimeria maculata",
  # both basins
  "Gymnotus carapo", "Hoplias", "Hoplias brasiliensis",
  "Hoplias intermedius", "Hoplias malabaricus", "Prochilodus",
  "Prochilodus argenteus", "Prochilodus costatus",
  "Steindachnerina elegans", "Trachelyopterus galeatus",
  # Sao Francisco (sf)
  "Astyanax fasciatus", "Brycon orthotaenia", "Characidium lagosantense",
  "Crenicichla lepidota",
  # "Curimatella lepidura", this one is actually Roeboides
  "Roeboides xenodon",
  "Eigenmannia virescens", "Franciscodoras marmoratus", "Hypostomus alatus",
  "Imparfinis minutus", "Leporinus reinhardti", "Microglanis leptostriatus",
  "Moenkhausia sanctaefilomenae", "Myleus micans", "Pamphorichthys hollandi",
  "Phalloceros uai", "Pimelodus maculatus", "Pimelodus pohli",
  "Pseudoplatystoma corruscans", "Pterygoplichthys etentaculatus",
  "Serrasalmus brandtii", "Tetragonopterus chalceus",
  # partial (genus-level or ambiguous) identifications
  "Astyanax", "Hoplias brasiliensis/intermedius", "Hypostomus", "Pimelodus",
  "Prochilodus argenteus/hartii",
  # trash
  "NA elegans/gilbert", "NA lepidura/xenodon", "Acestrorhynchus lacustris",
  "Coptodon zillii", "Cyphocharax gilbert", "Geophagus brasiliensis",
  "Planaltina myersi", "Poecilia reticulata mitochondr"
)

# Sanity check: prints any label listed more than once (expected: none).
finalID_levels[duplicated(finalID_levels)]



# final ID levels 2
# Alphabetical list of species-level `final ID` labels, followed by the
# partial (genus-level or ambiguous) ones.
# "Roeboides xenodon" used to appear twice: once in its alphabetical position
# and once where "Curimatella lepidura" had been replaced in place. Duplicated
# values are rejected by factor(levels = ...), so only the alphabetical entry
# is kept.
{
finalID_levels <- c(
"Acestrorhynchus lacustris", 
"Acinocheirodon melanogramma", 
"Astyanax fasciatus", 
"Astyanax lacustris", 
"Australoheros sp", 
"Bos taurus", 
"Brycon orthotaenia", 
"Characidium lagosantense", 
"Coptodon zillii", 
"Crenicichla lepidota", 
# "Curimatella lepidura" is now Roeboides xenodon (listed below)
"Cyphocharax gilbert", 
"Delturus brevis", 
"Eigenmannia virescens", 
"Eugerres brasilianus", 
"Franciscodoras marmoratus", 
"Geophagus brasiliensis", 
"Gymnotus carapo", 
"Hoplias brasiliensis", 
"Hoplias intermedius", 
"Hoplias malabaricus", 
"Hypomasticus steindachneri", 
"Hypostomus alatus", 
"Hypostomus nigrolineatus", 
"Imparfinis minutus", 
"Leporinus reinhardti", 
"Megaleporinus elongatus", 
"Megaleporinus garmani", 
"Microglanis leptostriatus", 
"Moenkhausia costae", 
"Moenkhausia sanctaefilomenae", 
"Myleus micans", 
"Pamphorichthys hollandi", 
"Phalloceros uai", 
"Pimelodus maculatus", 
"Pimelodus pohli", 
"Planaltina myersi", 
"Prionace glauca", 
"Prochilodus argenteus", 
"Prochilodus costatus", 
"Prochilodus hartii",
"Pseudoplatystoma corruscans", 
"Pterygoplichthys etentaculatus", 
"Rhamdia quelen", 
"Roeboides xenodon", 
"Serrasalmus brandtii", 
"Steindachneridion amblyurum", 
"Tetragonopterus chalceus", 
"Trachelyopterus galeatus", 
"Wertheimeria maculata",

#partial
"Astyanax", 
"Hoplias brasiliensis/intermedius", 
"Hypostomus", 
"Pimelodus", 
"Prochilodus", 
"Prochilodus argenteus/hartii")
}

# IDs collected for removal: missing values (a real NA and the literal string
# "NA") followed by twelve taxon names.
sps_remove <- c(
  NA, "NA",
  "Acestrorhynchus lacustris", "Acinocheirodon melanogramma", "Bos taurus",
  "Coptodon zillii", "Curimatella lepidura", "Eugerres brasilianus",
  "Geophagus brasiliensis", "Leporinus reinhardti", "Moenkhausia costae",
  "Planaltina myersi", "Prionace glauca", "Pseudoplatystoma corruscans"
)


```

# Images article

```{r,echo=FALSE,eval=FALSE}
############################################ Naira's paper ----

# `final ID` values assigned to ASVs whose length is within the expected range.
all_ps_tbl_blast$`final ID`[all_ps_tbl_blast$`Expected length` %in% c("in range")] %>% unique()

#now we are going to edit 
# all_ps_tbl_sfjq_full_bckp  <- all_ps_tbl_sfjq_full  

all_ps_tbl_sfjq_full$`revised final ID` %>% unique()

# ASVs Vs. OTUs

all_ps_tbl_sfjq$Sample %>% unique()

# 
# all_ps_tbl_sfjq_full %>%
#   # filter(`Expected length` %in% c("in range")) %>% 
#   # filter(`Expected Species` %in% c("expected")) %>% 
#   filter(Run %in% c("LGC_MiniSeq_1", "LGC_MiniSeq_2")) %>% 
#   # subset(!(Sample %in% c("Positive Control\n(P.glauca)"))) %>% 
#   group_by(Sample) %>% 
#   select(c(1:8,18:24,61:74)) %>% View()

# The masks below use %in% rather than ==, so rows with a missing
# `Expected Species`, Primer or Group are simply not selected instead of
# producing NA subscripts.

# Revised IDs flagged as "not expected".
all_ps_tbl_sfjq_full$`revised final ID`[
  all_ps_tbl_sfjq_full$`Expected Species` %in% "not expected"
] %>% unique() %>% base::sort()

# Expected revised IDs found in the normalized SFJQ pool with NeoFish ...
all_ps_tbl_sfjq_full$`revised final ID`[
  all_ps_tbl_sfjq_full$`Expected Species` %in% "expected" &
    all_ps_tbl_sfjq_full$Primer %in% "NeoFish" &
    all_ps_tbl_sfjq_full$Group %in% "Normalized SFJQmc"
] %>% unique() %>% base::sort()

# ... and with MiFish.
all_ps_tbl_sfjq_full$`revised final ID`[
  all_ps_tbl_sfjq_full$`Expected Species` %in% "expected" &
    all_ps_tbl_sfjq_full$Primer %in% "MiFish" &
    all_ps_tbl_sfjq_full$Group %in% "Normalized SFJQmc"
] %>% unique() %>% base::sort()


# Per-sample counts of distinct OTUs, ASVs and revised IDs, restricted to
# in-range ASVs from the two LGC MiniSeq runs, reshaped to one row per
# (sample, ID type).
# `Species` counts every distinct `revised final ID`; an earlier version
# counted only the `final ID`s flagged as expected.
ASVs_n_OTUs_tbl <- all_ps_tbl_sfjq_full %>%
  filter(
    `Expected length` %in% c("in range"),
    Run %in% c("LGC_MiniSeq_1", "LGC_MiniSeq_2")
  ) %>%
  group_by(Sample) %>%
  summarize(
    Sample = unique(Sample),
    Run = unique(Run),
    Group = unique(Group),
    OTUs = length(unique(OTU)),
    ASVs = length(unique(`ASV (Sequence)`)),
    IDs = length(unique(`revised final ID`)),
    Primer = unique(Primer),
    `Species` = length(unique(`revised final ID`)),
    `Expected species list` = list(unique(base::sort(`revised final ID`)))
  ) %>%
  mutate(Sample = factor(Sample, levels = sample_levels)) %>%
  tidyr::pivot_longer(
    cols = c(OTUs, ASVs, IDs, `Species`),
    names_to = "ID type",
    values_to = "Counts"
  ) %>%
  mutate(
    `ID type` = factor(`ID type`, levels = c("ASVs", "OTUs", "IDs", "Species")),
    Group = factor(
      Group,
      levels = c("Non-normalized JQmc", "Normalized JQmc",
                 "Non-normalized SFmc", "Normalized SFmc",
                 "Normalized SFJQmc")
    ),
    # Normalization status derived from the pool name.
    Norm = factor(ifelse(
      Group %in% c("Non-normalized JQmc", "Non-normalized SFmc"),
      "Non-Normalized",
      "Normalized"
    )),
    # Mock-community name without the normalization prefix.
    MC = str_remove_all(string = Group, pattern = "Normalized |Non-normalized ")
  )

# Species lists: every row, then only the rows of the "Species" ID type.
ASVs_n_OTUs_tbl$`Expected species list`

ASVs_n_OTUs_tbl$`Expected species list`[ASVs_n_OTUs_tbl$`ID type` %in% c("Species")]



# ASVs, OTUs and Species plot ----
ASVs_n_OTUs_tbl$Sample %>% unique()

# As agreed with Naiara:
# dodged bars with the number of ASVs, OTUs and species per sample, for the
# normalized pools only. Dotted segments mark the original number of species
# in each pool, vertical lines separate the three mock communities and the
# coloured strips under the bars identify the primer set.
# Fixes: the plot is no longer printed before it is created, the redundant
# ylab("Counts") (overridden further down) is gone, and the y-axis title typo
# "Alpha divesity" is corrected.
ASVs_n_OTUS_plot <- ASVs_n_OTUs_tbl %>% 
  filter(!`ID type` %in% c("IDs")) %>% 
  filter(Norm %in% c("Normalized")) %>% 
  mutate(Sample = str_remove(Sample,"Normalized ")) %>% 
  # Explicit order: SFJQmc, SFmc, JQmc; the x positions hard-coded in the
  # segments, strips and labels below rely on it.
  mutate(Sample = factor(Sample, levels = c("SFJQmc\nMiFish","SFJQmc\nNeoFish",
                                            "SFmc\nMiFish","SFmc\nNeoFish",
                                            "JQmc\nMiFish","JQmc\nNeoFish","JQmc\nTeleo"))) %>% 
  ggplot2::ggplot(aes(x = Sample,
                      y = Counts ,
                      fill = `ID type`, 
                      group = `ID type`)) +
  geom_bar(stat = "identity",position = "dodge")+
  xlab("Sample") +
  ylab(label = "Alpha diversity") +
  theme_light(base_size = 16) +
  theme(legend.position = "right",
        axis.text.x = element_text(angle = 0)) +
  scale_fill_manual(values = c("#a8afb8", "#6a7481", "#353a41", "#25282d")) +
  # Reference lines at y = 38, 23 and 17; mapping linetype to a constant label
  # gives them a legend entry.
  geom_segment(aes(x=0.5, xend=2.5, y=38, yend=38,linetype="Original number\nof species\nin each pool"), colour="black",size=0.75) +
  geom_segment(aes(x=2.5, xend=4.5, y=23, yend=23,linetype="Original number\nof species\nin each pool"), colour="black",size=0.75) +
  geom_segment(aes(x=4.5, xend=7.5, y=17, yend=17,linetype="Original number\nof species\nin each pool"), colour="black",size=0.75) +
  scale_linetype_manual("",values=c("Original number\nof species\nin each pool"=3)) +
  geom_vline(xintercept = c(2.5,4.5)) +
  # ggtitle(label = "Normalized SFJQ mock communities",
  #             subtitle = "Alpha diversity estimation using different evidences") +
  guides(fill = guide_legend(title = "Identification level")) +
  # Count printed in white inside the top of each bar.
  geom_text(aes(label=Counts), 
            position = position_dodge(width = 0.9), 
            vjust = 1.4,
            colour = "#ffffff", size = 7)+
  annotate(geom = "rect",
  xmin=c(0.6,2.6,4.6), xmax=c(1.4,3.4,5.4),#mifish
  ymin=c(-1.0), ymax=c(0),
  alpha=1,fill="#000791")+
  annotate(geom = "rect",
  xmin=c(1.6,3.6,5.6), xmax=c(2.4,4.4,6.4),#neofish
  ymin=c(-1.0), ymax=c(0),
  alpha=1,fill="#017504")+
  annotate(geom = "rect",
  xmin=c(6.6), xmax=c(7.4),#teleo
  ymin=c(-1.0), ymax=c(0),
  alpha=1,fill="#820000")+
  annotate(geom = "text",
  x=c(1.5,3.5,6.5),y=c(51,51,51),label=c("SFJQmc","SFmc","JQmc"),size=8) +
  scale_y_continuous(limits = c(-1.5,53), expand = c(0, 0)) +
  scale_x_discrete(labels = c("MiFish","NeoFish","MiFish","NeoFish","MiFish","NeoFish","Teleo"))

ASVs_n_OTUS_plot
scales::show_col(colors5)


# Export the ASVs/OTUs/species bar plot (height 10, 600 dpi) in three formats.
# The PDF goes to a temporary folder rather than results/figs (its previous
# target was "~/prjcts/fish_eDNA/sfjq/results/figs/sfjq-ASVs_vs_OTUs.pdf"),
# and the PNG is wider (16) than the PDF and SVG (14).
invisible(lapply(
  list(
    list(path = "~/outros/sfjq_temp/trees/sfjq-ASVs_vs_OTUs.pdf",
         device = "pdf", width = 14),
    list(path = "~/prjcts/fish_eDNA/sfjq/results/figs/sfjq-ASVs_vs_OTUs.svg",
         device = "svg", width = 14),
    list(path = "~/prjcts/fish_eDNA/sfjq/results/figs/sfjq-ASVs_vs_OTUs.png",
         device = "png", width = 16)
  ),
  function(target) {
    ggsave(
      filename = target$path,
      plot = ASVs_n_OTUS_plot,
      device = target$device,
      width = target$width,
      height = 10,
      dpi = 600
    )
  }
))


```

## Fold-change bar plots

```{r, eval = FALSE, echo=FALSE}
# Fold-change plots ----

#######################fold change plots ----
#plots with mirrored x axis, with log values
library("ggallin")
#JQmc ----
# Fold change between the share of reads (`Relative abundance on sample`) and
# the share of input DNA (`Proportion`) for every expected species of the JQ
# mock community, one facet per primer set, on a mirrored pseudo-log axis.
# The assignment target was missing here (a bare `<-`, which does not parse);
# the ggsave() call below expects the plot in `jq_porp_plot`.
jq_porp_plot <-
  jq_proportions %>% 
  mutate(Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish", "NeoFish/MiFish/Teleo"))) %>% 
  filter(`Expected Species` %in% c("expected")) %>%
  mutate(fold_change = gtools::foldchange(denom = Proportion,num = `Relative abundance on sample`)) %>%
  # mutate(fold_change = gtools::foldchange2logratio(foldchange = gtools::foldchange(denom = Proportion,num = `Relative abundance on sample`)),base=2) %>% 
  # mutate(`final ID` = factor(`final ID`)) %>% 
  mutate(`revised final ID` = factor(`revised final ID`)) %>% 
  ggplot(aes(x= fold_change,
             y= `revised final ID`, fill = Primer, alpha = Pool)) +
  geom_bar(stat = "identity",
           position =position_dodge2(preserve = "single"))+
  scale_fill_manual(values = colors_norm[c(1,3,5)])+
  scale_alpha_manual(values = c(0.4, 0.9),labels = c("Non-normalized pool","Normalized pool"))+
  ylab("Species") +
  xlab("Proportion of reads divided by proportion of input DNA in JQmc") + 
  scale_x_continuous(trans = ggallin::pseudolog10_trans,
  breaks = c(-3000,-2000,-1000,-500,-100,-50,-10,-5,-1,0,1,5,10,50,100,500,1000,2000)) +
  theme_linedraw(base_size = 15) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    geom_vline(xintercept = c(-1,1),linetype=2,size=0.25)  +
  facet_wrap(~Primer,ncol = 1) +
  # Grey band over the interval between -1 and 1; ymax = 17.5 spans 17
  # positions on the discrete y axis.
  annotate(geom = "rect",
  xmin=c(-1), xmax=c(1),
  ymin=c(0.5), ymax=c(17.5),
  alpha=0.1,fill="#000000")


ggsave(file = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_prop_jq_foldchange.png",
       plot = jq_porp_plot,
       device = "png",
       width = 14,
       height = 12,
       dpi = 300)
jq_proportions$`final ID` %>% unique()

#SFmc ----
# Same fold-change plot for the SF mock community. The x-axis title used to
# read "... in JQmc" (copied from the JQ plot); it now names SFmc.
# NOTE(review): this plot uses `final ID` on the y axis while the JQmc
# fold-change plot uses `revised final ID` - confirm which one is intended.
sf_porp_plot <-
  sf_proportions %>% 
  mutate(Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish", "NeoFish/MiFish/Teleo"))) %>% 
  filter(`Expected Species` %in% c("expected")) %>%
  mutate(fold_change = gtools::foldchange(denom = Proportion,num = `Relative abundance on sample`)) %>% 
  mutate(`final ID` = factor(`final ID`)) %>% 
  ggplot(aes(x= fold_change,
             y= `final ID`, fill = Primer, alpha = Pool)) +
  geom_bar(stat = "identity",
           position =position_dodge2(preserve = "single"))+
  scale_fill_manual(values = colors_norm[c(1,3,5)])+
  scale_alpha_manual(values = c(0.4, 0.9),labels = c("Non-normalized pool","Normalized pool"))+
  ylab("Species") +
  xlab("Proportion of reads divided by proportion of input DNA in SFmc") + 
  scale_x_continuous(trans = ggallin::pseudolog10_trans,
  breaks = c(-3000,-2000,-1000,-500,-100,-50,-10,-5,-1,0,1,5,10,50,100,500,1000,2000)) +
  theme_linedraw(base_size = 15) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    geom_vline(xintercept = c(-1,1),linetype=2,size=0.25)  +
  facet_wrap(~Primer,ncol = 1) +
  # Grey band over the interval between -1 and 1; ymax = 23.5 spans 23
  # positions on the discrete y axis.
  annotate(geom = "rect",
  xmin=c(-1), xmax=c(1),
  ymin=c(0.5), ymax=c(23.5),
  alpha=0.1,fill="#000000")


ggsave(file = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_prop_sf_foldchange.png",
     plot = sf_porp_plot,
     device = "png",
     width = 14,
     height = 10,
     dpi = 300)
#old ones without mirrored axis ----
# JQmc ----
#proportions
# Ratio of read share to input-DNA share for the expected species of the JQ
# mock community, on a square-root x axis, one facet per primer set. The
# dashed line at 1 marks reads recovered in proportion to the input.
# This re-assigns `jq_porp_plot`.
jq_porp_plot <- jq_proportions %>%
  filter(`Expected Species` %in% c("expected")) %>%
  mutate(
    Primer = factor(
      Primer,
      levels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish", "NeoFish/MiFish/Teleo")
    )
  ) %>%
  ggplot(aes(
    x = `Relative abundance on sample`/Proportion,
    y = `final ID`,
    fill = Primer,
    alpha = Pool
  )) +
  geom_bar(stat = "identity", position = position_dodge2(preserve = "single")) +
  geom_vline(xintercept = 1, linetype = 2, size = 0.5) +
  scale_fill_manual(values = colors_norm[c(1, 3, 5)]) +
  scale_alpha_manual(
    values = c(0.4, 0.9),
    labels = c("Non-normalized pool", "Normalized pool")
  ) +
  scale_x_sqrt(breaks = c(0.025, 0.1, 0.25, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14)) +
  labs(
    x = "Proportion of reads divided by proportion of input DNA in JQmc",
    y = "Species"
  ) +
  facet_wrap(~Primer, ncol = 1)

# Same figure as PNG, PDF and SVG (12 x 8, 300 dpi).
invisible(lapply(c("png", "pdf", "svg"), function(fig_device) {
  ggsave(
    filename = paste0("~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_prop_jq.", fig_device),
    plot = jq_porp_plot,
    device = fig_device,
    width = 12,
    height = 8,
    dpi = 300
  )
}))





# SFmc ----
#proportions
# Ratio of read share to input-DNA share for the expected species of the SF
# mock community, on a square-root x axis, one facet per primer set.
# This re-assigns `sf_porp_plot`.
sf_porp_plot <- sf_proportions %>%
  filter(`Expected Species` %in% c("expected")) %>%
  mutate(
    Primer = factor(
      Primer,
      levels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish", "NeoFish/MiFish/Teleo")
    )
  ) %>%
  ggplot(aes(
    x = as.numeric((`Relative abundance on sample`/Proportion)),
    y = `final ID`,
    fill = Primer,
    alpha = Pool
  )) +
  geom_bar(stat = "identity", position = position_dodge2(preserve = "single")) +
  geom_vline(xintercept = 1, linetype = 2, size = 0.5) +
  scale_fill_manual(values = colors_norm[c(1, 3, 5)]) +
  scale_alpha_manual(
    values = c(0.4, 0.9),
    labels = c("Non-normalized pool", "Normalized pool")
  ) +
  scale_x_sqrt(breaks = c(0.025, 0.1, 0.25, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12)) +
  labs(
    x = "Proportion of reads divided by proportion of input DNA in SFmc",
    y = "Species"
  ) +
  facet_wrap(~Primer, ncol = 1)

# Same figure as PNG, PDF and SVG (width 12, 300 dpi).
# NOTE(review): the SVG is saved with height 8 while PNG and PDF use 6 -
# confirm whether the different aspect ratio is intended.
invisible(lapply(
  list(
    list(device = "png", height = 6),
    list(device = "pdf", height = 6),
    list(device = "svg", height = 8)
  ),
  function(target) {
    ggsave(
      filename = paste0("~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_prop_sf.", target$device),
      plot = sf_porp_plot,
      device = target$device,
      width = 12,
      height = target$height,
      dpi = 300
    )
  }
))

# all_ps_tbl_blast$OTU[all_ps_tbl_blast$`final ID` =="Tetragonopterus chalceus"] %>% unique()
# 
# all_ps_tbl_blast$`final ID`[all_ps_tbl_blast$OTU %in% c(7,9,87,88,93,132)]


# SFJQmc ----
#proportions
# Ratio of read share to input-DNA share for the expected species of the
# combined SFJQ mock community, one facet per primer set.
# The plot used to be assigned to `sf_porp_plot`, which overwrote the SFmc
# plot and left `sfjq_porp_plot` - the object the ggsave() calls below write
# out - undefined. It is now stored under the name that is saved.
sfjq_porp_plot <-
  sfjq_proportions %>% 
  mutate(Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish", "NeoFish/MiFish/Teleo"))) %>% 
    # mutate(Proportion = 4.35) %>% 
  filter(`Expected Species` %in% c("expected")) %>%
  ggplot(aes(x= as.numeric(`Relative abundance on sample`/Proportion),
             y= `final ID`, fill = Primer,alpha = Pool
             )) +
  geom_bar(stat = "identity",
    # position =position_dodge2(preserve = "single")
    position = "dodge"
    )+
  # scale_color_manual(values = c("#00000f","#969595"),
  #                    labels = c("Non-normalized","Normalized"))+
  # scale_fill_manual(values = alpha(colour = colors_norm ,alpha =  0.8))+
  scale_fill_manual(values = colors_norm[c(1,3,5)])+
  # A single alpha value and no alpha legend.
  scale_alpha_manual(name = NULL, values = c(0.9),labels = NULL,guide = NULL)+
  # scale_alpha_manual(values = c(0.4, 0.9),
  #                    labels = c("Non-normalized pool","Normalized pool"))+
  ylab("Species") +
  # scale_x_break(c(7.5, 40)) +
  # xlim(c(0,40))+
  xlab("Proportion of ASVs in sample divided by proportion of DNA input in SFJQmc") +
  geom_vline(xintercept = 1,linetype=2)  +
  facet_wrap(~Primer,ncol = 1)


ggsave(file = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_prop_sfjq.png",
     plot = sfjq_porp_plot,
     device = "png",
     width = 12,
     height = 8,
     dpi = 300)

ggsave(file = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_prop_sfjq.pdf",
     plot = sfjq_porp_plot,
     device = "pdf",
     width = 12,
     height = 8,
     dpi = 300)

ggsave(file = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_prop_sfjq.svg",
     plot = sfjq_porp_plot,
     device = "svg",
     width = 12,
     height = 8,
     dpi = 300)


# Interactive check of the distinct Primer / OTU / revised ID combinations.
all_ps_tbl_blast %>% 
  select(Primer,OTU,`revised final ID`) %>% unique() %>% View()


```





## Venn diagram

```{r echo=FALSE,eval=FALSE}
{

  # Distinct `revised final ID`s recovered by one primer set in one pool,
  # returned as a one-element named list.
  #   proportions: table with the columns `Pool`, `Proportion`, `Primer` and
  #                `revised final ID` (sf_/jq_/sfjq_proportions)
  #   pool:        `Pool` value(s) to keep
  #   primer:      `Primer` value(s) to keep
  #   label:       name given to the single list element
  # Rows whose `Proportion` is NA are ignored.
  # This replaces twelve copies of the same subsetting expression.
  detected_sps_list <- function(proportions, pool, primer, label) {
    keep <- proportions$Pool %in% pool &
      !is.na(proportions$Proportion) &
      proportions$Primer %in% primer
    stats::setNames(
      list(unique(proportions$`revised final ID`[keep])),
      label
    )
  }

  # Values of the `Pool` column.
  norm_pool <- "Percentage on respective Norm pool"
  non_norm_pool <- "Percentage on respective Non-norm pool"

  #NeoFish
  neo_list_SF_norm <- detected_sps_list(sf_proportions, norm_pool, "NeoFish", "NeoFish - pool SF - Normalized")
  neo_list_JQ_norm <- detected_sps_list(jq_proportions, norm_pool, "NeoFish", "NeoFish - pool JQ - Normalized")
  neo_list_SFJQ_norm <- detected_sps_list(sfjq_proportions, norm_pool, "NeoFish", "NeoFish - pool SFJQ - Normalized")

  neo_list_SF_Nnorm <- detected_sps_list(sf_proportions, non_norm_pool, "NeoFish", "NeoFish - pool SF - Non-normalized")
  neo_list_JQ_Nnorm <- detected_sps_list(jq_proportions, non_norm_pool, "NeoFish", "NeoFish - pool JQ - Non-normalized")

  #MiFish
  mif_list_SF_norm <- detected_sps_list(sf_proportions, norm_pool, "MiFish", "MiFish - pool SF - Normalized")
  mif_list_JQ_norm <- detected_sps_list(jq_proportions, norm_pool, "MiFish", "MiFish - pool JQ - Normalized")
  mif_list_SFJQ_norm <- detected_sps_list(sfjq_proportions, norm_pool, "MiFish", "MiFish - pool SFJQ - Normalized")

  mif_list_SF_Nnorm <- detected_sps_list(sf_proportions, non_norm_pool, "MiFish", "MiFish - pool SF - Non-normalized")
  mif_list_JQ_Nnorm <- detected_sps_list(jq_proportions, non_norm_pool, "MiFish", "MiFish - pool JQ - Non-normalized")

  #teleo
  # The labels used to read "poool" (typo); they now read "pool".
  tel_list_JQ_norm <- detected_sps_list(jq_proportions, norm_pool, "Teleo", "Teleo - pool JQ - Normalized")
  tel_list_JQ_Nnorm <- detected_sps_list(jq_proportions, non_norm_pool, "Teleo", "Teleo - pool JQ - Non-normalized")

}

# Every per-primer / per-pool species list gathered in one named list.
# This definition had been commented out although `sps_per_pool` is still
# used at the end of this block.
sps_per_pool <- c(neo_list_SF_norm, neo_list_JQ_norm, neo_list_SFJQ_norm, neo_list_SF_Nnorm, neo_list_JQ_Nnorm,
  mif_list_SF_norm, mif_list_JQ_norm, mif_list_SFJQ_norm, mif_list_SF_Nnorm, mif_list_JQ_Nnorm,
  tel_list_JQ_norm, tel_list_JQ_Nnorm)

# Union of the species recovered by any primer set in each basin's pools.
# "Prochilodus hartii" is appended by hand to the JQ and SFJQ sets
# (presumably present in the pool but not recovered under that ID - confirm).

#JQ
  all_sps_JQ <- c(c(neo_list_JQ_norm,neo_list_JQ_Nnorm, mif_list_JQ_norm, mif_list_JQ_Nnorm, tel_list_JQ_norm, tel_list_JQ_Nnorm) %>% unlist() %>% unique(),"Prochilodus hartii") %>% sort() %>% list()
  names(all_sps_JQ) <- "JQ original species"
#SF
  all_sps_SF <- c(neo_list_SF_norm, neo_list_SF_Nnorm, mif_list_SF_norm, mif_list_SF_Nnorm) %>% unlist() %>% unique() %>% sort() %>% list()

  names(all_sps_SF) <- "SF original species"
  
 #SFJQ 
  all_sps_SFJQ <- c(c(neo_list_SF_norm, neo_list_JQ_norm, neo_list_SFJQ_norm, neo_list_SF_Nnorm, neo_list_JQ_Nnorm, 
  mif_list_SF_norm, mif_list_JQ_norm, mif_list_SFJQ_norm, mif_list_SF_Nnorm, mif_list_JQ_Nnorm, 
  tel_list_JQ_norm, tel_list_JQ_Nnorm) %>% unlist() %>% unique(),"Prochilodus hartii") %>% sort() %>% list()

  names(all_sps_SFJQ) <- "SFJQ original species"
  
#SF_norm


# Number of species in each per-primer / per-pool set.
lengths(sps_per_pool)


#venn diagram ----
library(made4)
library(ggvenn)

# Sets compared in the diagram: all SF species vs. the species recovered by
# NeoFish and by MiFish in the normalized SF pool.
sf_norm_venn <- c(all_sps_SF,neo_list_SF_norm,mif_list_SF_norm)

all_sps_SFJQ %>% unlist() %>% paste0(collapse = "\n") %>% cat()
neo_list_SF_norm %>% unlist()  %>% paste0(collapse = ",")
mif_list_SF_norm %>% unlist()  %>% paste0(collapse = ",")


# Pairwise list comparisons; the second call overwrites `z`.
z <- comparelists(unlist(all_sps_JQ),unlist(tel_list_JQ_norm))

z <- comparelists(unlist(all_sps_SFJQ),unlist(mif_list_SFJQ_norm),x="Set.Diff")
z$Set.Diff


# Both objects are one-element lists wrapping a species vector. intersect() on
# the lists themselves compares the two whole vectors as single elements, so
# they are unlisted first to intersect the species names.
intersect(unlist(all_sps_SF), unlist(neo_list_SF_norm))


ggvenn(sf_norm_venn,stroke_color="Red",
       stroke_linetype="solid",show_elements = TRUE,  label_sep = "\n")



```





## UpSet plot

```{r echo=FALSE,eval=FALSE}


# group_split() returns the groups ordered by their key, whereas unique()
# returns values in order of first appearance, so naming the pieces with
# unique(<column>) can attach a name to the wrong element. Each name is now
# read from the piece it labels.

# `final ID`s found in each sample.
sfjq_by_sample <- all_ps_tbl_sfjq_full %>%
  dplyr::group_split(Sample)

sfjq_sps_list <- sfjq_by_sample %>%
  purrr::map(~ .x %>% dplyr::pull(`final ID`))

names(sfjq_sps_list) <- sfjq_by_sample %>%
  purrr::map_chr(~ as.character(.x$Sample[[1]]))


# Samples in which each `final ID` was found. This overwrites the list above
# and is the one passed to the UpSet plot.
sfjq_by_species <- all_ps_tbl_sfjq_full %>%
  dplyr::group_split(`final ID`)

sfjq_sps_list <- sfjq_by_species %>%
  purrr::map(~ .x %>% dplyr::pull(Sample))

names(sfjq_sps_list) <- sfjq_by_species %>%
  purrr::map_chr(~ as.character(.x$`final ID`[[1]]))


sfjq_sps_list %>% str()

library("UpSetR")


length(sfjq_sps_list)

# One set per list element, intersections ordered by size (the identical
# second call was dropped).
UpSetR::upset(UpSetR::fromList(sfjq_sps_list), order.by = "freq",nsets = length(sfjq_sps_list),nintersects = 100)

# NOTE(review): `genomes_orths_df` is not created in this part of the file -
# confirm these two lines belong to this analysis.
genomes_orths_df %>% pheatmap::pheatmap()

genomes_orths_df %>% colSums() %>% table() %>% plot()

```



```{r echo=FALSE,eval=FALSE}
#plots of identified species detected for each pool 
#JQ levels----

# Sorted unique `final ID`s, printed in a form that can be pasted into the
# c("...", "...") levels vector below.
cat(paste0(unique(base::sort(all_ps_tbl_sfjq_full$`final ID`)), collapse = '",\n"'))

# Ordering of the `final ID` labels, grouped under the headings the list was
# written with: JQ, JQ & SF, SF, sfjq, other.
finalID_levels_SFandJQ <- c(
  # JQ
  "Astyanax lacustris", "Australoheros sp", "Cyphocharax gilbert",
  "Delturus brevis", "Hoplias brasiliensis", "Hoplias malabaricus",
  "Hypomasticus steindachneri", "Hypostomus nigrolineatus",
  "Megaleporinus elongatus", "Megaleporinus garmani",
  "Prochilodus argenteus", "Rhamdia quelen", "Steindachneridion amblyurum",
  "Wertheimeria maculata",
  # JQ&SF
  "Gymnotus carapo", "Prochilodus costatus", "Trachelyopterus galeatus",
  # SF
  "Astyanax fasciatus", "Brycon orthotaenia", "Characidium lagosantense",
  "Crenicichla lepidota", "Eigenmannia virescens",
  "Franciscodoras marmoratus", "Hoplias intermedius", "Hypostomus alatus",
  "Imparfinis minutus", "Microglanis leptostriatus",
  "Moenkhausia sanctaefilomenae", "Myleus micans",
  "Pamphorichthys hollandi", "Phalloceros uai", "Pimelodus maculatus",
  "Pimelodus pohli", "Pterygoplichthys etentaculatus", "Roeboides xenodon",
  "Serrasalmus brandtii", "Tetragonopterus chalceus",
  # sfjq
  "Moenkhausia costae", "Eugerres brasilianus",
  # other
  "Acestrorhynchus lacustris", "Acinocheirodon melanogramma", "Astyanax",
  "Coptodon zillii", "Curimatella lepidura", "Geophagus brasiliensis",
  "Hoplias brasiliensis/intermedius", "Hypostomus", "Leporinus reinhardti",
  "Pimelodus", "Planaltina myersi", "Prochilodus",
  "Prochilodus argenteus/hartii", "Pseudoplatystoma corruscans"
)
# correcting complete table ----

colnames(all_ps_tbl_sfjq_full)

# One summary per (Sample, revised final ID) for in-range ASVs of the two LGC
# MiniSeq runs: OTU/ASV counts, summed relative abundance (`RRA (%)`) and the
# input percentages, followed by the fold change between recovered and input
# proportion.
# NOTE(review): unique(`Relative abundance on sample`) yields one row per
# distinct value, so a group with several ASVs may produce several rows, and
# the fold changes use that per-ASV value rather than `RRA (%)` - confirm
# this is intended.
all_ps_tbl_sfjq_full_uniq <- all_ps_tbl_sfjq_full %>%
  filter(
    `Expected length` %in% c("in range"),
    Run %in% c("LGC_MiniSeq_1", "LGC_MiniSeq_2")
  ) %>%
  group_by(Sample, `revised final ID`) %>%
  summarize(
    Sample = unique(Sample),
    Run = unique(Run),
    Group = unique(Group),
    OTUs = length(unique(OTU)),
    ASVs = length(unique(`ASV (Sequence)`)),
    IDs = length(unique(`revised final ID`)),
    Primer = unique(Primer),
    `Expected species` = length(unique(`revised final ID`[`Expected Species` %in% c("expected")])),
    `Expected species list` = list(unique(base::sort(`revised final ID`[`Expected Species` %in% c("expected")]))),
    `RRA (%)` = sum(`Relative abundance on sample`),
    `Percentage on respective Norm pool` = unique(`Percentage on respective Norm pool`),
    `Percentage on respective Non-norm pool` = unique(`Percentage on respective Non-norm pool`),
    `Relative abundance on sample` = unique(`Relative abundance on sample`)
  ) %>%
  ungroup() %>%
  mutate(
    # Levels left in default order (rev(finalID_levels) was tried before).
    `revised final ID` = factor(`revised final ID`),
    Primer = factor(
      Primer,
      levels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish", "NeoFish/MiFish/Teleo")
    ),
    `Recovered proportion norm` = gtools::foldchange(
      denom = `Percentage on respective Norm pool`,
      num = `Relative abundance on sample`
    ),
    `Recovered proportion non norm` = gtools::foldchange(
      denom = `Percentage on respective Non-norm pool`,
      num = `Relative abundance on sample`
    ),
    # Pick the fold change that matches the pool the sample came from.
    Ratio = if_else(
      Group %in% c("Non-normalized JQmc", "Non-normalized SFmc"),
      `Recovered proportion non norm`,
      `Recovered proportion norm`
    )
  ) %>%
  mutate(
    # Floor Ratio at -20.
    Ratio = if_else(Ratio <= -20, -20, Ratio),
    Group = factor(
      Group,
      levels = c("Normalized SFJQmc", "Non-normalized JQmc", "Normalized JQmc",
                 "Non-normalized SFmc", "Normalized SFmc")
    ),
    MC = str_remove(Group, "Normalized |Non-normalized "),
    Normalization = str_remove(Group, " SFmc| JQmc| SFJQmc")
  )
  
  
  #  mutate(`Recovered proportion norm` = `Relative abundance on sample`/`Percentage on respective Norm pool`) %>% 
  # mutate(`Recovered proportion non norm` = `Relative abundance on sample`/`Percentage on respective Non-norm pool`)
  # 
  




#novos graficos de proporções ----
# all_ps_tbl_sfjq_full_uniq %>% 
#  mutate(Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish", "NeoFish/MiFish/Teleo"))) %>% 
#   filter(`Expected Species` == 1) %>%
#   ggplot(aes(x= Ratio),
#              y= `revised final ID`, fill = Primer, alpha = Pool
#              )) +
#   geom_bar(stat = "identity",
#     position =position_dodge2(preserve = "single")
#            )+
#   # scale_color_manual(values = c("#00000f","#969595"),
#   #                    labels = c("Non-normalized","Normalized"))+
#   # scale_fill_manual(values = alpha(colour = colors_norm ,alpha =  0.8))+
#   scale_fill_manual(values = colors_norm[c(1,3,5)])+
#   scale_alpha_manual(values = c(0.4, 0.9),labels = c("Non-normalized pool","Normalized pool"))+
#   ylab("Species") +
#   # scale_x_break(c(20, 80)) +
#   # xlim(c(0,20))+
#   xlab("Proportion of ASVs in sample divided by proportion of DNA input in JQmc") +
#   geom_vline(xintercept = 1,linetype=2)  +
#   facet_wrap(~Primer,ncol = 1)



#----






# JQ norm ----
# Presence heat map for the "Normalized JQmc" group: one raster cell per
# taxon (y) and sample (x), filled by primer, with opacity mapped to
# `Recovered proportion norm` and the ASV count printed on each cell.
jq_norm_presence_plot <- all_ps_tbl_sfjq_full_uniq %>%
  filter(
    Group %in% "Normalized JQmc",
    # drop unassigned rows and the taxa listed here
    !`revised final ID` %in% c(
      NA, "NA",
      "Acinocheirodon melanogramma",
      "Brycon orthotaenia",
      "Hoplias intermedius",
      "Moenkhausia costae",
      "Serrasalmus brandtii"
    )
  ) %>%
  ggplot(
    aes(
      x = Sample,
      y = `revised final ID`,
      fill = Primer,
      alpha = `Recovered proportion norm`
    )
  ) +
  # yellow rectangle is added first so the raster is drawn on top of it;
  # it spans the first three x positions and the first four y positions
  annotate(
    "rect",
    xmin = 0.45, xmax = 3.55,
    ymin = 0.5, ymax = 4.5,
    fill = "yellow", alpha = 0.5
  ) +
  geom_raster() +
  geom_text(aes(label = ASVs), alpha = 1) +
  scale_fill_manual(values = alpha(colour = colors5)) +
  scale_alpha(
    name = "Recovered sequence counts/\nInput DNA",
    breaks = c(0.01, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20),
    range = c(0.05, 1)
  ) +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  coord_fixed(ratio = (21 * 3) / 80) +
  ylab("Taxonomy") +
  theme_minimal(base_line_size = 0) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

jq_norm_presence_plot

# JQ non-norm ----
# Presence heat map for the "Non-normalized JQmc" group; same layout as the
# normalized JQ plot, but opacity follows `Recovered proportion non norm`.
jq_non_norm_presence_plot <- all_ps_tbl_sfjq_full_uniq %>%
  filter(
    Group %in% "Non-normalized JQmc",
    # drop unassigned rows and the taxa listed here
    !`revised final ID` %in% c(
      NA, "NA",
      "Acinocheirodon melanogramma",
      "Brycon orthotaenia",
      "Hoplias intermedius",
      "Moenkhausia costae",
      "Serrasalmus brandtii",
      "Coptodon zillii"
    )
  ) %>%
  ggplot(
    aes(
      x = Sample,
      y = `revised final ID`,
      fill = Primer,
      alpha = `Recovered proportion non norm`
    )
  ) +
  # yellow rectangle (first three x positions, first two y positions),
  # added before the raster so it stays underneath
  annotate(
    "rect",
    xmin = 0.45, xmax = 3.55,
    ymin = 0.5, ymax = 2.5,
    fill = "yellow", alpha = 0.5
  ) +
  geom_raster() +
  geom_text(aes(label = ASVs), alpha = 1, size = 4) +
  scale_fill_manual(values = alpha(colour = colors5)) +
  scale_alpha(
    name = "Recovered sequence counts/\nInput DNA",
    breaks = c(0.01, 1:16, 20),
    range = c(0.05, 1)
  ) +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  coord_fixed(ratio = (19 * 3) / 80) +
  ylab("Taxonomy") +
  theme_minimal(base_line_size = 0) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

jq_non_norm_presence_plot

# SF norm ----
# Presence heat map for the "Normalized SFmc" group: taxon (y) by sample (x),
# filled by primer, opacity mapped to `Recovered proportion norm`, ASV count
# printed on each cell.
sf_norm_presence_plot <- all_ps_tbl_sfjq_full_uniq %>%
  filter(
    Group %in% "Normalized SFmc",
    # drop unassigned rows and the taxa listed here
    !`revised final ID` %in% c(
      NA, "NA",
      "Acestrorhynchus lacustris",
      "Astyanax lacustris",
      "Curimatella lepidura",
      "Delturus brevis",
      "Geophagus brasiliensis",
      "Leporinus reinhardti",
      "Moenkhausia costae",
      "Planaltina myersi"
    )
  ) %>%
  ggplot(
    aes(
      x = Sample,
      y = `revised final ID`,
      fill = Primer,
      alpha = `Recovered proportion norm`
    )
  ) +
  # alternative tile layer that was tried:
  # geom_tile(colour = "#333333", width=0.95, height=0.95, size=1,linesize=0.05) +
  # yellow rectangle (first two x positions, first two y positions),
  # added before the raster so it stays underneath
  annotate(
    "rect",
    xmin = 0.45, xmax = 2.55,
    ymin = 0.5, ymax = 2.5,
    fill = "yellow", alpha = 0.5
  ) +
  geom_raster() +
  geom_text(aes(label = ASVs), alpha = 1) +
  scale_fill_manual(values = alpha(colour = colors5)) +
  scale_alpha(
    name = "Recovered sequence counts/\nInput DNA",
    breaks = c(0.01, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20),
    range = c(0.05, 1)
  ) +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  coord_fixed(ratio = (25 * 2) / 80) +
  ylab("Taxonomy") +
  theme_minimal(base_line_size = 0) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

sf_norm_presence_plot

# SF non-norm ----
# Presence heat map for the "Non-normalized SFmc" group; opacity follows
# `Recovered proportion non norm`.
# NOTE(review): unlike the sibling presence plots, this one does not filter
# out NA / "NA" taxa and does not set `range` on the alpha scale. Both are
# kept exactly as they were -- confirm whether that is intentional.
sf_non_norm_presence_plot <- all_ps_tbl_sfjq_full_uniq %>%
  filter(
    Group %in% "Non-normalized SFmc",
    !`revised final ID` %in% c(
      # NA,"NA",
      "Acestrorhynchus lacustris",
      "Astyanax lacustris",
      "Curimatella lepidura",
      "Delturus brevis",
      "Geophagus brasiliensis",
      "Leporinus reinhardti",
      "Moenkhausia costae",
      "Planaltina myersi"
    )
  ) %>%
  ggplot(
    aes(
      x = Sample,
      y = `revised final ID`,
      fill = Primer,
      alpha = `Recovered proportion non norm`
    )
  ) +
  # yellow rectangle (first two x positions, first two y positions),
  # added before the raster so it stays underneath
  annotate(
    "rect",
    xmin = 0.45, xmax = 2.55,
    ymin = 0.5, ymax = 2.5,
    fill = "yellow", alpha = 0.5
  ) +
  geom_raster() +
  geom_text(aes(label = ASVs), alpha = 1) +
  # theme(panel.background=element_rect(fill="#ffffff")) +
  scale_fill_manual(values = alpha(colour = colors5)) +
  scale_alpha(
    name = "Recovered sequence counts/\nInput DNA",
    breaks = c(0.01, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20)
  ) +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  coord_fixed(ratio = (27 * 2) / 80) +
  ylab("Taxonomy") +
  theme_minimal(base_line_size = 0, base_size = 10) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

sf_non_norm_presence_plot
# SFJQ norm ----
# Presence heat map for the "Normalized SFJQmc" group. Taxa are turned into a
# factor ordered by the reverse of `finalID_levels_SFandJQ` before filtering,
# which fixes their order on the y axis.
sfjq_norm_presence_plot <- all_ps_tbl_sfjq_full_uniq %>%
  mutate(
    `revised final ID` = factor(
      `revised final ID`,
      levels = rev(finalID_levels_SFandJQ)
    )
  ) %>%
  filter(
    Group %in% "Normalized SFJQmc",
    # drop unassigned rows and the taxa listed here
    !`revised final ID` %in% c(
      NA, "NA",
      "Acestrorhynchus lacustris",
      "Geophagus brasiliensis",
      "Pseudoplatystoma corruscans"
    )
  ) %>%
  ggplot(
    aes(
      x = Sample,
      y = `revised final ID`,
      fill = Primer,
      alpha = `Recovered proportion norm`
    )
  ) +
  # yellow rectangle (first two x positions, first four y positions),
  # added before the raster so it stays underneath
  annotate(
    "rect",
    xmin = 0.45, xmax = 2.55,
    ymin = 0.5, ymax = 4.5,
    fill = "yellow", alpha = 0.5
  ) +
  geom_raster() +
  geom_text(aes(label = ASVs), alpha = 1) +
  scale_fill_manual(values = alpha(colour = colors5)) +
  scale_alpha(
    name = "Recovered sequence counts/\nInput DNA",
    breaks = c(0.01, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20),
    range = c(0.05, 1)
  ) +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  coord_fixed(ratio = (42 * 2) / 120) +
  ylab("Taxonomy") +
  # theme_minimal() is a complete theme, so the axis-text tweak has to come
  # after it to take effect
  theme_minimal(base_line_size = 0, base_size = 10) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

sfjq_norm_presence_plot
#----


# save plots ----
# Print each presence plot once, then write it to a 600 dpi PNG; canvas
# sizes (in cm) are set per plot.

jq_norm_presence_plot
jq_non_norm_presence_plot
sf_norm_presence_plot
sf_non_norm_presence_plot
sfjq_norm_presence_plot

ggsave(
  filename = "~/prjcts/fish_eDNA/sfjq/results/figs/jq_norm_presence_plot.png",
  plot = jq_norm_presence_plot,
  device = "png",
  width = 18, height = 24, units = "cm", dpi = 600
)

ggsave(
  filename = "~/prjcts/fish_eDNA/sfjq/results/figs/jq_non_norm_presence_plot.png",
  plot = jq_non_norm_presence_plot,
  device = "png",
  width = 22, height = 24, units = "cm", dpi = 600
)

ggsave(
  filename = "~/prjcts/fish_eDNA/sfjq/results/figs/sf_norm_presence_plot.png",
  plot = sf_norm_presence_plot,
  device = "png",
  width = 20, height = 28, units = "cm", dpi = 600
)

ggsave(
  filename = "~/prjcts/fish_eDNA/sfjq/results/figs/sf_non_norm_presence_plot.png",
  plot = sf_non_norm_presence_plot,
  device = "png",
  width = 20, height = 30, units = "cm", dpi = 600
)

ggsave(
  filename = "~/prjcts/fish_eDNA/sfjq/results/figs/sfjq_norm_presence_plot.png",
  plot = sfjq_norm_presence_plot,
  device = "png",
  width = 20, height = 45, units = "cm", dpi = 600
)



# Scratch checks of gtools::foldchange(num, denom): each pair of values is
# evaluated in both argument orders to compare the resulting fold change
# (num/denom when num >= denom, otherwise -denom/num).
gtools::foldchange(3.765505e-02,0.056)
gtools::foldchange(0.056,3.765505e-02)
gtools::foldchange(2.458775e-05,0.0556)
gtools::foldchange(0.0556,2.458775e-05)

#trying to plot all of them together ----


#organizing by orders----
# Ordered vector of taxon labels. Entries are grouped by the pool(s) they are
# listed under (see the section comments inside the vector); `unique()` keeps
# the first occurrence of any label that appears more than once.
# The outer braces only provide a foldable region in the editor.
# Fix: a stray `{` in the middle of the c() call made this statement
# unparseable; it has been removed, together with two labels that were
# listed twice (the result after unique() is unchanged).
{finalID_levels_SFandJQ_tax <- unique(c(
#SFJQ
  "Moenkhausia costae",
  "Eugerres brasilianus",
#SF
  "Brycon orthotaenia",
  "Astyanax fasciatus",
  "Moenkhausia sanctaefilomenae",
  "Roeboides xenodon",
  "Tetragonopterus chalceus",
  "Characidium lagosantense",
  "Hoplias intermedius",
  "Myleus micans",
  "Serrasalmus brandtii",
  "Crenicichla lepidota",
  "Pamphorichthys hollandi",
  "Phalloceros uai",
  "Eigenmannia virescens",
  "Franciscodoras marmoratus",
  "Imparfinis minutus",
  "Hypostomus alatus",
  "Pterygoplichthys etentaculatus",
  "Pimelodus maculatus",
  "Pimelodus pohli",
  "Microglanis leptostriatus",
#SF & JQ
  "Gymnotus carapo",
  "Trachelyopterus galeatus",
  "Prochilodus costatus",
#JQ
  "Prochilodus argenteus",
  "Prochilodus hartii",
        # "Prochilodus argenteus/hartii",
        # "Prochilodus",
  "Hypomasticus steindachneri",
  "Megaleporinus elongatus",
  "Megaleporinus garmani",
  "Astyanax lacustris",
  "Cyphocharax gilbert",
  "Hoplias brasiliensis",
  "Hoplias malabaricus",
  "Australoheros sp",
  "Wertheimeria maculata",
  "Delturus brevis",
  "Hypostomus nigrolineatus",
  "Rhamdia quelen",
  "Steindachneridion amblyurum",
#other
# "Acestrorhynchus lacustris",
# "Acinocheirodon melanogramma",
  "Astyanax",
# "Coptodon zillii",
# "Curimatella lepidura",
# "Geophagus brasiliensis",
  "Hoplias brasiliensis/intermedius",
  "Hypostomus",
# "Leporinus reinhardti",
  "Pimelodus",
# "Planaltina myersi",
  "Prochilodus",
  "Prochilodus argenteus/hartii"
# ,"Pseudoplatystoma corruscans"
))
}

# taxonomicaly ordered ----

# {finalID_levels_SFandJQ_tax <- c( "Prochilodus argenteus", "Prochilodus hartii", "Prochilodus costatus",   "Prochilodus",   "Prochilodus argenteus/hartii", "Cyphocharax gilbert", "Myleus micans", "Serrasalmus brandtii", "Megaleporinus elongatus", "Megaleporinus garmani", "Hypomasticus steindachneri", "Hoplias malabaricus", "Hoplias intermedius", "Hoplias brasiliensis",   "Hoplias brasiliensis/intermedius", "Brycon orthotaenia", "Characidium lagosantense", "Roeboides xenodon", "Tetragonopterus chalceus", "Moenkhausia sanctaefilomenae", "Moenkhausia costae", "Astyanax lacustris", "Astyanax fasciatus",     "Astyanax", "Delturus brevis", "Pterygoplichthys etentaculatus", "Hypostomus nigrolineatus", "Hypostomus alatus",   "Hypostomus", "Trachelyopterus galeatus", "Franciscodoras marmoratus", "Wertheimeria maculata", "Rhamdia quelen", "Imparfinis minutus", "Microglanis leptostriatus", "Steindachneridion amblyurum", "Pimelodus pohli", "Pimelodus maculatus",   "Pimelodus", "Eigenmannia virescens", "Gymnotus carapo", "Eugerres brasilianus", "Pamphorichthys hollandi", "Phalloceros uai", "Crenicichla lepidota", "Australoheros sp") %>% unique()
# }



#----

# Print the `Recovered proportion norm` values in ascending order to inspect
# their range (base::sort() drops NA values by default).
all_ps_tbl_sfjq_full_uniq$`Recovered proportion norm` %>% base::sort()

# browseVignettes("treeio")

# Tile heat map of every normalized mock community in one figure: taxon (y)
# by sample (x), filled by `Ratio` on a diverging red/white/green scale
# limited to [-20, 20], ASV counts printed on the tiles, one facet per
# primer.
all_sfNjq_norm_presence_plot <-
  all_ps_tbl_sfjq_full_uniq %>%
  # taxa removed before the factor is built, so they never become y levels
  filter(
    !`revised final ID` %in% c(
      "Pseudoplatystoma corruscans",
      "Planaltina myersi",
      "Leporinus reinhardti",
      "Acestrorhynchus lacustris",
      "Acinocheirodon melanogramma",
      "Coptodon zillii",
      "Curimatella lepidura",
      "Geophagus brasiliensis"
    )
  ) %>%
  mutate(
    `revised final ID` = factor(
      `revised final ID`
      # , levels = rev(finalID_levels_SFandJQ_tax)
    ),
    # explicit sample order along the x axis
    # (previous variant: factor(Sample, levels = sample_levels))
    Sample = factor(
      Sample,
      levels = c(
        "Normalized SFmc\nMiFish B",
        "Normalized SFmc\nMiFish",
        "Non-normalized SFmc\nMiFish B",
        "Non-normalized SFmc\nMiFish",
        "Normalized SFJQmc\nMiFish B",
        "Normalized SFJQmc\nMiFish",
        "Non-normalized JQmc\nMiFish",
        "Normalized JQmc\nMiFish",

        "Normalized SFmc\nNeoFish B",
        "Normalized SFmc\nNeoFish",
        "Non-normalized SFmc\nNeoFish B",
        "Non-normalized SFmc\nNeoFish",
        "Normalized SFJQmc\nNeoFish B",
        "Normalized SFJQmc\nNeoFish",
        "Non-normalized JQmc\nNeoFish",
        "Normalized JQmc\nNeoFish",

        "Non-normalized JQmc\nTeleo",
        "Normalized JQmc\nTeleo",
        "Positive Control\n(P.glauca)",
        "neg-PCR2"
      )
    )
  ) %>%
  # rows removed after the factor is built: their labels stay as factor
  # levels and, because of scale_y_discrete(drop = FALSE), stay on the axis
  filter(
    !`revised final ID` %in% c(
      NA, "NA",
      "Acestrorhynchus lacustris",
      "Coptodon zillii",
      "Curimatela lepidura",
      "Acinocheirodon melanogramma",
      "Curimatella lepidura",
      "Planaltina myersi",
      "Pseudoplatystoma corruscans"
    ),
    Group %in% c("Normalized SFmc", "Normalized JQmc", "Normalized SFJQmc")
  ) %>%
  ggplot(
    aes(
      x = Sample,
      y = `revised final ID`,
      fill = Ratio
    )
  ) +
  geom_tile(width = 1, col = "black", linejoin = "round") +
  # geom_tile(aes(col=Normalization),width=2, linejoin = "round") +
  geom_text(aes(label = ASVs), alpha = 1) +
  scale_colour_manual(values = c("#000000", "#878787")) +
  scale_fill_gradient2(
    low = "#cc0000",
    mid = "#ffffff",
    high = "#007000",
    midpoint = 0,
    na.value = "gold",
    guide = "colourbar",
    limits = c(-20, 20),
    name = "Fold variation from\ninput DNA to RRA"
  ) +
  scale_y_discrete(drop = FALSE) +
  ylab("Taxonomy") +
  theme_minimal(base_line_size = 0.1, base_size = 15) +
  theme(
    panel.background = element_rect(fill = "#ffffff"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  facet_wrap(~Primer, ncol = 3, shrink = TRUE)
  # facet_grid(~Primer ,drop = TRUE,)

all_sfNjq_norm_presence_plot


# Write the combined presence plot twice: a 45 x 40 cm SVG and a wider
# 100 x 40 cm PDF.
ggsave(
  filename = "~/prjcts/fish_eDNA/sfjq/results/figs/all_sfNjq_norm_presence_plot.svg",
  plot = all_sfNjq_norm_presence_plot,
  device = "svg",
  width = 45, height = 40, units = "cm", dpi = 600
)

ggsave(
  filename = "~/prjcts/fish_eDNA/sfjq/results/figs/all_sfNjq_norm_presence_plot_2norms.pdf",
  plot = all_sfNjq_norm_presence_plot,
  device = "pdf",
  width = 100, height = 40, units = "cm", dpi = 600
)

# preview the `colors5` palette as swatches
scales::show_col(colors5)
  


```

## DNA/RRA correlation plots
```{r echo=TRUE,eval=FALSE}

# Quick look at the columns used below: all column names, then the values of
# the Group, MC and Normalization columns.
colnames(all_ps_tbl_sfjq_full_uniq)
all_ps_tbl_sfjq_full_uniq$Group
all_ps_tbl_sfjq_full_uniq$MC
all_ps_tbl_sfjq_full_uniq$Normalization




###### function to plot the lm equation

# lm_eqn <- function(df){
#     m <- lm(y ~ x, df);
#     eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, 
#          list(a = format(unname(coef(m)[1]), digits = 2),
#               b = format(unname(coef(m)[2]), digits = 2),
#              r2 = format(summary(m)$r.squared, digits = 3)))
#     as.character(as.expression(eq));
# }





# slopes of the regression lines ----

# LM
# One linear model per Primer x mock community (MC), fitted on the rows whose
# Normalization is "Non-normalized". `do()` stores each fitted model in the
# list-column `model`.
# NOTE(review): the formula has input DNA as the response and RRA as the
# predictor -- confirm this direction is the intended one.
models_corr <- all_ps_tbl_sfjq_full_uniq %>%
  filter(Normalization %in% c("Non-normalized")) %>%
  # filter(Primer %in% c("NeoFish")) %>%
  group_by(Primer, MC) %>%
  select(c("Percentage on respective Non-norm pool", "Relative abundance on sample")) %>%
  do(model = lm(formula = `Percentage on respective Non-norm pool` ~ `Relative abundance on sample`, data = .))
  # lm(formula = `Percentage on respective Non-norm pool` ~ `Relative abundance on sample`)
  # aov(formula = `Percentage on respective Non-norm pool` ~ `Relative abundance on sample`)

models_corr$model
models_corr$Primer
models_corr$MC

# Model-level summaries for every fitted model. Labels are taken from the
# grouping columns instead of hand-written comments on fixed indices, so they
# stay correct whatever the number and order of Primer x MC groups. The stray
# `model` argument previously passed to glance() (it named no object) is gone.
setNames(
  lapply(models_corr$model, broom::glance),
  paste(models_corr$Primer, models_corr$MC)
)

# AOV
# Same formula fitted with aov(), grouped by Primer only and restricted to
# the "Non-normalized" rows. This overwrites `models_corr` from the LM step.
models_corr <- all_ps_tbl_sfjq_full_uniq %>%
  filter(Normalization %in% "Non-normalized") %>%
  # filter(Primer %in% c("NeoFish")) %>%
  group_by(Primer) %>%
  select(c("Percentage on respective Non-norm pool", "Relative abundance on sample")) %>%
  do(
    model = aov(
      formula = `Percentage on respective Non-norm pool` ~ `Relative abundance on sample`,
      data = .
    )
  )
  # lm(formula = `Percentage on respective Non-norm pool` ~ `Relative abundance on sample`)
  # aov(formula = `Percentage on respective Non-norm pool` ~ `Relative abundance on sample`)

models_corr$model
models_corr$Primer

# term-level table for each of the first three fitted models
broom::tidy(models_corr$model[[1]])
broom::tidy(models_corr$model[[2]])
broom::tidy(models_corr$model[[3]])



#JQmc ----
# jq_coef <- lm(`Percentage on respective Non-norm pool` ~ `Relative abundance on sample`,
#    (all_ps_tbl_sfjq_full_uniq %>% 
#       filter(MC %in% c("JQmc")) %>% 
#       filter(Normalization %in% c("Non-normalized"))))




############################ trying with the tibble that has norm and non-norm in the same column
# NOTE(review): `tab_curated_SFJQ_all_pools` is (re)built in a later chunk of
# this file -- make sure it exists before running this part.
colnames(tab_curated_SFJQ_all_pools)

# Scatter of input DNA (%) against relative read abundance (%), coloured by
# primer, point shape by pool, with an lm smoother; both axes are
# square-root scaled and there is one facet per Primer x Group.
sfjq_sp_corr <-
  tab_curated_SFJQ_all_pools %>%
  ggplot(
    aes(
      x = `input DNA (%)`,
      y = `RRA (%)`,
      col = Primer,
      shape = Pool
    )
  ) +
  geom_point() +
  geom_smooth(method = lm) +
  scale_colour_manual(
    values = alpha(colour = colors_norm[c(1, 3, 5)], alpha = 0.8)
  ) +
  # scale_shape_manual(drop=TRUE) +
  scale_x_sqrt(
    breaks = c(0, 0.0001, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1) * 100
  ) +
  scale_y_sqrt(
    breaks = c(0, 0.0001, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1) * 100
  ) +
  coord_fixed(ratio = 1) +
  xlab("Input DNA (%)") +
  ylab("Relative Read Abundance (%)") +
  ggtitle("SFmc & JQmc: Correlation between\nInput DNA and RRA") +
  theme_bw(base_size = 10) +
  # facet_wrap(MC~Primer,ncol = 3)
  facet_wrap(Primer ~ Group, ncol = 5)
# alternatives tried: scale_x_log10() + scale_y_log10(), minor_breaks = mb

ggsave(
  filename = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_RRA_DNA_bySp2.png",
  plot = sfjq_sp_corr,
  device = "png",
  width = 30, height = 15, units = "cm", dpi = 600
)









###########################




# Same scatter built from `all_ps_tbl_sfjq_full_uniq`, restricted to the
# "Non-normalized" rows: the two proportion columns are multiplied by 100 to
# plot percentages, point shape is the mock community (MC). This overwrites
# the `sfjq_sp_corr` object created from the curated table.
sfjq_sp_corr <- all_ps_tbl_sfjq_full_uniq %>%
  # filter(MC %in% c("JQmc")) %>%
  filter(Normalization %in% "Non-normalized") %>%
  # (an earlier variant replaced NA values in both columns with 0 here)
  ggplot(
    aes(
      x = `Percentage on respective Non-norm pool` * 100,
      y = `Relative abundance on sample` * 100,
      col = Primer,
      shape = MC
    )
  ) +
  geom_point() +
  geom_smooth(method = lm) +
  scale_colour_manual(
    values = alpha(colour = colors_norm[c(1, 3, 5)], alpha = 0.8)
  ) +
  # scale_shape_manual(drop=TRUE) +
  scale_x_sqrt(
    breaks = c(0, 0.0001, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1) * 100
  ) +
  scale_y_sqrt(
    breaks = c(0, 0.0001, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1) * 100
  ) +
  coord_fixed(ratio = 1) +
  xlab("Input DNA (%)") +
  ylab("Relative Read Abundance (%)") +
  ggtitle("SFmc & JQmc: Correlation between\nInput DNA and RRA") +
  theme_bw(base_size = 10) +
  # facet_wrap(MC~Primer,ncol = 3)
  facet_wrap(Primer ~ Normalization, ncol = 3)
# alternatives tried: scale_x_log10() + scale_y_log10(), minor_breaks = mb

ggsave(
  filename = "~/prjcts/fish_eDNA/sfjq/results/figs/SFJQ_RRA_DNA_bySp.png",
  plot = sfjq_sp_corr,
  device = "png",
  width = 30, height = 15, units = "cm", dpi = 600
)




# SFmc ----

library(plotly)

# Input DNA (%) against relative read abundance (%) for the non-normalized
# SFmc rows only, one facet per primer, with an lm smoother and square-root
# axes.
sf_sp_corr <- all_ps_tbl_sfjq_full_uniq %>%
  filter(
    MC %in% "SFmc",
    Normalization %in% "Non-normalized"
  ) %>%
  # (an earlier variant replaced NA values in both columns with 0 here)
  ggplot(
    aes(
      x = `Percentage on respective Non-norm pool` * 100,
      y = `Relative abundance on sample` * 100,
      col = Primer,
      shape = MC
    )
  ) +
  geom_point() +
  geom_smooth(method = lm) +
  scale_colour_manual(
    values = alpha(colour = colors_norm[c(1, 3, 5)], alpha = 0.8)
  ) +
  scale_x_sqrt(
    breaks = c(0, 0.0001, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1) * 100
  ) +
  scale_y_sqrt(
    breaks = c(0, 0.0001, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1) * 100
  ) +
  coord_fixed(ratio = 1) +
  xlab("input DNA (%)") +
  ylab("Relative Read Abundance (%)") +
  ggtitle("SFmc: Correlation between\nInput DNA and RRA") +
  theme_bw(base_size = 10) +
  facet_wrap(~Primer, ncol = 3)

# interactive version of the same plot
ggplotly(sf_sp_corr)

ggsave(
  filename = "~/prjcts/fish_eDNA/sfjq/results/figs/SF_RRA_DNA_bySp.png",
  plot = sf_sp_corr,
  device = "png",
  width = 20, height = 15, units = "cm", dpi = 600
)



```





##  Additional Graphics
```{r echo=TRUE,eval=FALSE}
# set colors ----

# Six-colour palette, stored as (norm, non-norm) pairs per primer.
colors6 <- c(
  "#19821c", "#8dbc8f", # NeoFish: norm, non norm
  "#171d9a", "#8d90c7", # MiFish:  norm, non norm
  "#8c1717", "#c18d8d"  # Teleo:   norm, non norm
)

# preview the `colors6` palette as swatches
scales::show_col(colors6)






# loading the table
# https://docs.google.com/spreadsheets/d/1sTsOaI999E9Py_f4ll5j838r7CVc991a8mPmd8LvykU/edit#gid=0

# check.names = FALSE keeps the column headers as written in the file
# (e.g. names containing spaces or "%"), which later steps rely on.
tab_curated_SFJQ_all_pools <- as_tibble(
  read.csv(
    "~/prjcts/fish_eDNA/sfjq/data/SFJQ_all_pools-tabelas_curadas_para_o_R-SFJQmc-copy.csv",
    check.names = FALSE
  )
)

# Reduce the curated table to the columns used downstream and add the fold
# change between RRA and input DNA.
# Steps: keep rows where both `input DNA (%)` and `RRA (%)` are non-zero
# (filter() also drops rows where the comparison is NA), group by
# Pool/Normalization/Primer/Species, keep the listed columns, fix the Primer
# factor order, compute the fold change, and drop duplicated rows.
# NOTE(review): the value columns inside summarize() are passed through
# unaggregated, so a group with several rows yields several rows; recent
# dplyr releases deprecate this in favour of reframe() -- confirm against the
# dplyr version in use.
tab_curated_SFJQ_all_pools <- tab_curated_SFJQ_all_pools %>% 
    filter(`input DNA (%)` != 0 & `RRA (%)` !=0) %>%
  group_by(Pool,Normalization,Primer,Species) %>% 
  summarize(Pool = unique(Pool),
            Normalization = unique(Normalization),
            Status = unique(Status),
            Species = unique(Species),
            Primer = unique(Primer),
            `Num ASVs` = `Num ASVs`,
            `Num OTUs` = `Num OTUs`,
            `input DNA (%)` = `input DNA (%)`,
            `RRA (%)` = `RRA (%)`
            # `Expected species` = length(unique(`revised final ID`[`Expected Species` %in% c("expected")])),
            # `Expected species list` = list(unique(base::sort(`revised final ID`[`Expected Species` %in% c("expected")]))),
            # `revised final ID`= unique(`revised final ID`),
            # `RRA (%)` = sum(`Relative abundance on sample`),
            # `Percentage on respective Norm pool` = unique(`Percentage on respective Norm pool`),
            # `Percentage on respective Non-norm pool` = unique(`Percentage on respective Non-norm pool`),
            # `Relative abundance on sample` = unique(`Relative abundance on sample`)
            ) %>%
  ungroup() %>% 
  # mutate(`revised final ID`=factor(`revised final ID`
  #                                  # , levels = rev(finalID_levels)
  #                                  )) %>% 
  # fixed primer order (used for plotting/legend order)
  mutate(Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo", "NeoFish/MiFish","NeoFish/MiFish/Teleo"))) %>% 
  # signed fold change of RRA relative to input DNA
  mutate(`Fold change (RRA/DNA input)` = gtools::foldchange(denom = `input DNA (%)`,num = `RRA (%)`)) %>% 
  unique()
  # mutate(Group = factor(Group,levels = c("Normalized SFJQmc", "Non-normalized JQmc", "Normalized JQmc", "Non-normalized SFmc", "Normalized SFmc" ))) %>% 
  # mutate(MC = str_remove(Group,"Normalized |Non-normalized ")) %>% 
  # mutate(Normalization = str_remove(Group," SFmc| JQmc| SFJQmc")) 


# Flags, on the sorted fold-change values, which ones repeat an earlier value.
tab_curated_SFJQ_all_pools$`Fold change (RRA/DNA input)` %>% sort() %>% duplicated()

# Show every row whose fold change (compared as text) occurs more than once.
# The previous version picked each duplicated row plus the row directly above
# it, which only finds the matching row when the two happen to be adjacent;
# the table is not ordered by fold change, so rows are now matched by value
# and sorted so that equal values sit next to each other.
tab_curated_SFJQ_all_pools %>%
  mutate(fold_change_key = as.character(`Fold change (RRA/DNA input)`)) %>%
  group_by(fold_change_key) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  arrange(`Fold change (RRA/DNA input)`) %>%
  select(-fold_change_key) %>%
  View()




# SFJQmc ----
## build tree ----
# Pipeline: read the 12S sequences, strip gaps, align, compute a
# Jukes-Cantor distance matrix, build a neighbour-joining tree, write it as
# Newick and read it back.

### read 12S db seqs for the species present in pools
SFJQ_Sps_seqs <- Biostrings::readDNAStringSet(filepath = "~/prjcts/fish_eDNA/sfjq/data/trees/SFJQmc-38_SPs_fused-unique.fas") %>%
# SFJQ_Sps_seqs <- Biostrings::readDNAStringSet(filepath = "~/outros/sfjq_temp/trees/SFJQmc-38_SPs_fused-unique.fas") %>%
  DECIPHER::RemoveGaps()

### align seqs
SFJQ_Sps_algn <- DECIPHER::AlignSeqs(
  myXStringSet = SFJQ_Sps_seqs,
  refinements = 100,
  iterations = 100,
  verbose = TRUE
)

### generate distance matrix
# NOTE(review): `processors = 20` is hard-coded -- adjust to the machine in use.
SFJQ_Sps_dist <- DECIPHER::DistanceMatrix(
  myXStringSet = SFJQ_Sps_algn,
  includeTerminalGaps = FALSE,
  correction = "Jukes-Cantor",
  processors = 20,
  verbose = TRUE
)

### generate dendrogram/tree from alignment and distance matrix
SFJQmc_tree <- ape::nj(SFJQ_Sps_dist)
        # tree <- phangorn::NJ(SFJQ_Sps_dist)
class(SFJQmc_tree)

### save tree as newick
ape::write.tree(phy = SFJQmc_tree, file = "~/prjcts/fish_eDNA/sfjq/data/trees/SFJQmc-38_SPs_fused-unique_APEtree.nwk")
# ape::write.tree(phy = SFJQmc_tree,file = "~/outros/sfjq_temp/trees/SFJQmc-38_SPs_fused-unique_APEtree.nwk")

### read tree from file (or stay with the same object)
# Namespace-qualified like the other ape calls in this section, so it works
# without ape (or a package re-exporting read.tree) being attached.
# SFJQmc_tree <- read.tree("~/outros/sfjq_temp/trees/SFJQmc-38_SPs_fused-unique_APEtree.nwk")
SFJQmc_tree <- ape::read.tree("~/prjcts/fish_eDNA/sfjq/data/trees/SFJQmc-38_SPs_fused-unique_APEtree.nwk")


## species metadata ----
### read table with pools species, input DNA and RRA
# tab_curated_SFJQ_all_pools <- read.csv("~/outros/sfjq_temp/SFJQ_all_pools-tabelas_curadas_para_o_R-SFJQmc.csv",check.names = F)
# tab_curated_SFJQ_all_pools <- read.csv("~/prjcts/fish_eDNA/sfjq/data/SFJQ_all_pools-tabelas_curadas_para_o_R-SFJQmc.csv",check.names = F) %>% 
#   as_tibble()
# 
# tab_curated_SFJQ_all_pools




# Quick inspection: column names of the curated table, its distinct rows, and
# the column names of the full per-sample table.
tab_curated_SFJQ_all_pools %>% colnames()
tab_curated_SFJQ_all_pools %>% unique()
all_ps_tbl_sfjq_full_uniq %>% colnames()





# tab_curated_SFJQ_all_pools  <- all_ps_tbl_sfjq_full_uniq



# Convert the corrected wide table to the long format required by the next
# steps.
#
# Both pivot_longer() calls create one row per (norm, non-norm) source column;
# the two filter() calls then keep only the row whose source column matches the
# sample's own `Normalization`, so each sample ends with a single
# `input DNA (%)` and a single `Fold Change` value.
#
# NOTE: the pipeline previously ended in `%>% View()`, which assigned View()'s
# NULL return value to `tab_curated_SFJQ_all_pools` instead of the table. The
# inspection call now runs after the assignment.
tab_curated_SFJQ_all_pools <-
  all_ps_tbl_sfjq_full_uniq %>%
  pivot_longer(
    c("Percentage on respective Norm pool",
      "Percentage on respective Non-norm pool"),
    names_to = "DNA Originary Pool",
    values_to = "input DNA (%)"
  ) %>%
  pivot_longer(
    c("Recovered proportion norm", "Recovered proportion non norm"),
    names_to = "Fold Change Originary Pool",
    values_to = "Fold Change"
  ) %>%
  select(-c("Run")) %>%
  rename(
    Species = `revised final ID`,
    `Num OTUs` = `OTUs`,
    `Num ASVs` = `ASVs`,
    `Num IDs` = `IDs`,
    `Pool` = `MC`
  ) %>%
  # keep the input-DNA value that belongs to the sample's normalization
  filter(
    (`DNA Originary Pool` %in% c("Percentage on respective Norm pool") &
       `Normalization` %in% c("Normalized")) |
      (`DNA Originary Pool` %in% c("Percentage on respective Non-norm pool") &
         `Normalization` %in% c("Non-normalized"))
  ) %>%
  # keep the fold-change value that belongs to the sample's normalization
  filter(
    (`Fold Change Originary Pool` %in% c("Recovered proportion norm") &
       `Normalization` %in% c("Normalized")) |
      (`Fold Change Originary Pool` %in% c("Recovered proportion non norm") &
         `Normalization` %in% c("Non-normalized"))
  ) %>%
  select(-c("DNA Originary Pool", "Fold Change Originary Pool")) %>%
  mutate(
    Status = if_else(`Expected species` == 0, "Contamination", "Expected")
  ) %>%
  # proportions -> percentages
  mutate(
    `RRA (%)` = `RRA (%)` * 100,
    `input DNA (%)` = `input DNA (%)` * 100
  ) %>%
  unite(Normalization, Primer, col = "Primer_norm", remove = FALSE, sep = " ") %>%
  mutate(
    Primer_norm = factor(
      Primer_norm,
      levels = c("Normalized NeoFish", "Normalized MiFish", "Normalized Teleo",
                 "Non-normalized NeoFish", "Non-normalized MiFish",
                 "Non-normalized Teleo")
    )
  )

# Inspect the result without clobbering the assignment above.
if (interactive()) {
  View(tab_curated_SFJQ_all_pools)
}

# all_ps_tbl_sfjq_full_uniq %>% colnames()


 

# tab_curated_SFJQ_all_pools[sort(c(which(tab_curated_SFJQ_all_pools$`Fold Change` %>% as.character() %>% duplicated()),which(tab_curated_SFJQ_all_pools$`Fold Change`%>% as.character() %>% duplicated())-1)),] %>% View()
# tab_curated_SFJQ_all_pools[tab_curated_SFJQ_all_pools$`Fold Change` %>% duplicated(),] %>% View()

### tidy table
# Expected species of the SFJQmc pool only, with the plotting columns coerced
# to factors / numerics.
tab_curated_SFJQ <- tab_curated_SFJQ_all_pools %>%
  as_tibble() %>%
  filter(Status == "Expected", Pool == "SFJQmc") %>%
  mutate(
    Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo")),
    Species = factor(Species),
    Normalization = factor(Normalization),
    Pool = factor(Pool),
    Status = factor(Status),
    `RRA (%)` = as.numeric(`RRA (%)`),
    `input DNA (%)` = as.numeric(`input DNA (%)`)
  )


## correct tips labels ----
### check tip names in tree
SFJQmc_tree$tip.label


### rename tree tips to match table
SFJQmc_tree$tip.label
sort(unique(tab_curated_SFJQ$Species))

# Every tip whose label is a key of the lookup below is replaced by the
# corresponding species name; all other tips are left untouched.
SFJQmc_tree$tip.label <- local({
  tip_renames <- c(
    "SF_1339_Prochilodus_costatus_JQ_2860" = "Prochilodus costatus",
    "JQ_5612_Prochilodus_argenteus_hartii" = "Prochilodus argenteus/hartii",
    "JQ_6586_Steindachnerina_elegans-Cyphocharax_gilbert" = "Cyphocharax gilbert",
    "SF_1230_Serrasalmus_brandtii" = "Serrasalmus brandtii",
    "SF_1162_Myleus_micans" = "Myleus micans",
    "JQ_2984_Hypomasticus_steindachneri" = "Hypomasticus steindachneri",
    "JQ_4290_Megaleporinus_garmani" = "Megaleporinus garmani",
    "JQ_1762_Megaleporinus_elongatus" = "Megaleporinus elongatus",
    "SF_1037_Brycon_orthotaenia" = "Brycon orthotaenia",
    "SF_1381_Characidium_lagosantense" = "Characidium lagosantense",
    "1136_Acestrorhynchus_lacustris" = "Acestrorhynchus lacustris",
    "JQ_7822_Hoplias_malabaricus" = "Hoplias malabaricus",
    "SF_1377_Hoplias_intermedius_brasiliensis" = "Hoplias brasiliensis/intermedius",
    "SF_0916_Roeboides_xenodon" = "Roeboides xenodon",
    "SF_0901_Tetragonopterus_chalceus" = "Tetragonopterus chalceus",
    "SF_1113_Moenkhausia_sanctaefilomenae" = "Moenkhausia sanctaefilomenae",
    "SFJQ_7812_Moenkhausia_costae" = "Moenkhausia costae",
    "SF_1153_Astyanax_cf_fasciatus" = "Astyanax fasciatus",
    "JQ_7880_Astyanax_lacustris" = "Astyanax lacustris",
    "SF_1141_Pterygoplichthys_etentaculatus" = "Pterygoplichthys etentaculatus",
    "SF_1041_Hypostomus_alatus" = "Hypostomus alatus",
    "JQ_7893_Hypostomus_nigrolineatus" = "Hypostomus nigrolineatus",
    "SF_1248_Gymnotus_carapo_JQ_1631" = "Gymnotus carapo",
    "SF_1117_Eigenmannia_virescens" = "Eigenmannia virescens",
    "JQ_1936_Delturus_brevis" = "Delturus brevis",
    "SF_0708_Franciscodoras_marmoratus" = "Franciscodoras marmoratus",
    "JQ_7817_Wertheimeria_maculata" = "Wertheimeria maculata",
    "SF_1243_Trachelyopterus_galeatus_JQ_5675" = "Trachelyopterus galeatus",
    "SF_1403_Pimelodus_pohli" = "Pimelodus pohli",
    "SF_1280_Pimelodus_maculatus" = "Pimelodus maculatus",
    "1135_Pseudoplatystoma_corruscans" = "Pseudoplatystoma corruscans",
    "JQ_2996_Steindachneridion_amblyurum" = "Steindachneridion amblyurum",
    "SF_1104_Microglanis_leptostriatus" = "Microglanis leptostriatus",
    "SF_1161_Imparfinis_minutus" = "Imparfinis minutus",
    "JQ_5645_Rhamdia_aff_quelen" = "Rhamdia quelen",
    "SFJQ_2943_Eugerres_brasilianus" = "Eugerres brasilianus",
    "SF_1368_Phalloceros_uai" = "Phalloceros uai",
    "SF_1099_Pamphorichthys_hollandi" = "Pamphorichthys hollandi",
    "SF_1264_Crenicichla_lepidota" = "Crenicichla lepidota",
    "JQ_6584_Australoheros_sp" = "Australoheros sp",
    "304_Geophagus_brasiliensis" = "Geophagus brasiliensis"
  )
  tip_names <- SFJQmc_tree$tip.label
  has_rename <- tip_names %in% names(tip_renames)
  tip_names[has_rename] <- unname(tip_renames[tip_names[has_rename]])
  tip_names
})

### check if all names have correspondences
SFJQmc_tree$tip.label %in% tab_curated_SFJQ$Species
tab_curated_SFJQ$Species %in% SFJQmc_tree$tip.label
# species present in the table but absent from the tree tips
tab_curated_SFJQ$Species[!(tab_curated_SFJQ$Species %in% SFJQmc_tree$tip.label)]





### convert ape tree to prettier ggtree object
SFJQmc_tree_plot <-
  ggtree(SFJQmc_tree) +
  theme_tree2() +
  geom_tiplab(offset = 0, align = TRUE) +
  xlim(0, 0.42)


### save tree plot
# Alternative location:
#   "~/prjcts/fish_eDNA/sfjq/data/trees/SFJQmc-38_SPs_fused-unique-tree_plot.pdf"
ggsave(
  filename = "~/outros/sfjq_temp/trees/SFJQmc-38_SPs_fused-unique-tree_plot.pdf",
  plot = SFJQmc_tree_plot,
  device = "pdf",
  width = 24,
  height = 24,
  units = "cm",
  dpi = 600
)


### extract tips order from tree to reproduce on plots
# Reversed taxa order as reported by ggtree for the plotted tree.
SPs_order_in_SFJQ_tree <- rev(ggtree::get_taxa_name(SFJQmc_tree_plot))

            # SPs_order_in_SFJQ_tree <- extract_tree_data(tree_plot) %>%
            #     dplyr::filter(isTip) %>%
            #     dplyr::pull(label)



## plots ----
### RRA histogram ----

# library(ggdist)

# Dodged horizontal bars of relative read abundance per species (ordered as in
# the tree), filled by primer and outlined by normalization; the "|" marks show
# the input DNA percentage. The x axis is broken between 13 and 30.
# NOTE(review): only two fill colours are supplied while `Primer` is declared
# with three levels upstream -- confirm this pool has rows for two primers only.
SFJQmc_RRA_plot <-
  tab_curated_SFJQ %>%
  filter(Pool %in% c("SFJQmc")) %>%
  mutate(Species = factor(Species, levels = SPs_order_in_SFJQ_tree)) %>%
  ggplot(aes(y = Species,
             x = `RRA (%)`,
             fill = Primer,
             col = Normalization)) +
  geom_bar(stat = "identity", size = 0.3, width = .75,
           position = position_dodge(preserve = "single", width = 1.1)) +
  scale_color_manual(values = c("#000000", "#747474")) +
  scale_fill_manual(values = colors6[c(1, 3)]) +
  geom_point(aes(y = Species,
                 x = `input DNA (%)`),
             shape = "|",
             size = 3,
             colour = "#000000") +
  scale_x_break(c(13, 30),
                scales = "fixed") +
  scale_x_continuous(breaks = c(0, 5, 10, 13, 30, 32)) +
  xlab("Relative read abundance (%)") +
  theme(axis.text = element_text(vjust = -0.5))

## save plot

# Alternative location:
#   "~/prjcts/fish_eDNA/sfjq/data/trees/SFJQmc-38_SPs_fused-unique-RRA_barplot.pdf"
#   (width = 18, height = 24)

# Close a lingering graphics device only if one is open: a bare dev.off()
# raises an error when the only device is the null device.
if (!is.null(grDevices::dev.list())) {
  grDevices::dev.off()
}
ggsave(filename = "~/outros/sfjq_temp/trees/SFJQmc-38_SPs_fused-unique-RRA_barplot.pdf",
       plot = SFJQmc_RRA_plot, device = "pdf",
       width = 30, height = 24, units = "cm", dpi = 600)




# SFmc ----
## build tree ----
### read 12S db seqs for the species present in pools and select only pool species
# Print all available sequence names (formatted for copy/paste into the vector below).
names(SFJQ_Sps_seqs) %>% sort() %>% paste0(collapse = '",\n"') %>% cat()

SF_Sps_seqs <- SFJQ_Sps_seqs[c("SF_0708_Franciscodoras_marmoratus", "SF_0901_Tetragonopterus_chalceus",
                               "SF_0916_Roeboides_xenodon", "SF_1037_Brycon_orthotaenia",
                               "SF_1041_Hypostomus_alatus", "SF_1099_Pamphorichthys_hollandi",
                               "SF_1104_Microglanis_leptostriatus", "SF_1113_Moenkhausia_sanctaefilomenae",
                               "SF_1117_Eigenmannia_virescens", "SF_1141_Pterygoplichthys_etentaculatus",
                               "SF_1153_Astyanax_cf_fasciatus", "SF_1161_Imparfinis_minutus",
                               "SF_1162_Myleus_micans", "SF_1230_Serrasalmus_brandtii",
                               "SF_1243_Trachelyopterus_galeatus_JQ_5675", "SF_1248_Gymnotus_carapo_JQ_1631",
                               "SF_1264_Crenicichla_lepidota", "SF_1280_Pimelodus_maculatus",
                               "SF_1339_Prochilodus_costatus_JQ_2860", "SF_1368_Phalloceros_uai",
                               "SF_1377_Hoplias_intermedius_brasiliensis", "SF_1381_Characidium_lagosantense",
                               "SF_1403_Pimelodus_pohli")]
### align seqs
SF_Sps_algn <- DECIPHER::AlignSeqs(myXStringSet = SF_Sps_seqs,
                                   refinements = 100,
                                   iterations = 100,
                                   verbose = TRUE)

### generate distance matrix
SF_Sps_dist <- DECIPHER::DistanceMatrix(myXStringSet = SF_Sps_algn,
                                        includeTerminalGaps = FALSE,
                                        correction = "Jukes-Cantor",
                                        processors = 20,
                                        verbose = TRUE)

### generate dendrogram/tree from alignment and distance matrix
SFmc_tree <- ape::nj(SF_Sps_dist)
        # tree <- phangorn::NJ(SFJQ_Sps_dist)
class(SFmc_tree)

### save tree as newick
# ape::write.tree(phy = SFmc_tree,file = "~/prjcts/fish_eDNA/sfjq/data/trees/SFmc-23_SPs_fused-unique_APEtree.nwk")
ape::write.tree(phy = SFmc_tree,
                file = "~/outros/sfjq_temp/trees/SFmc-23_SPs_fused-unique_APEtree.nwk")

### read tree from file (or stay with the same object)
# Read back only the file written above. The former extra read from the
# "~/prjcts/..." location targeted a file this section never writes and its
# result was overwritten on the very next line.
# SFmc_tree <- ape::read.tree("~/prjcts/fish_eDNA/sfjq/data/trees/SFmc-23_SPs_fused-unique_APEtree.nwk")
SFmc_tree <- ape::read.tree("~/outros/sfjq_temp/trees/SFmc-23_SPs_fused-unique_APEtree.nwk")

## species metadata ----
### table with pools species, input DNA and RRA

### tidy table
# Expected species of the SFmc pool only, with the plotting columns coerced to
# factors / numerics.
tab_curated_SF <- tab_curated_SFJQ_all_pools %>%
  filter(Pool %in% c("SFmc"), Status %in% c("Expected")) %>%
  mutate(
    Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo")),
    Species = factor(Species),
    Normalization = factor(Normalization),
    Pool = factor(Pool),
    Status = factor(Status),
    `RRA (%)` = as.numeric(`RRA (%)`),
    `input DNA (%)` = as.numeric(`input DNA (%)`)
  )


## correct tips labels ----
### check tip names in tree
SFmc_tree$tip.label


### rename tree tips to match table
SFmc_tree$tip.label
sort(unique(tab_curated_SF$Species))

# Every tip whose label is a key of the lookup below is replaced by the
# corresponding species name; all other tips are left untouched.
SFmc_tree$tip.label <- local({
  tip_renames <- c(
    "SF_1339_Prochilodus_costatus_JQ_2860" = "Prochilodus costatus",
    "SF_1230_Serrasalmus_brandtii" = "Serrasalmus brandtii",
    "SF_1162_Myleus_micans" = "Myleus micans",
    "SF_1037_Brycon_orthotaenia" = "Brycon orthotaenia",
    "SF_1381_Characidium_lagosantense" = "Characidium lagosantense",
    "SF_1377_Hoplias_intermedius_brasiliensis" = "Hoplias intermedius",
    "SF_0916_Roeboides_xenodon" = "Roeboides xenodon",
    "SF_0901_Tetragonopterus_chalceus" = "Tetragonopterus chalceus",
    "SF_1113_Moenkhausia_sanctaefilomenae" = "Moenkhausia sanctaefilomenae",
    "SF_1153_Astyanax_cf_fasciatus" = "Astyanax fasciatus",
    "SF_1141_Pterygoplichthys_etentaculatus" = "Pterygoplichthys etentaculatus",
    "SF_1041_Hypostomus_alatus" = "Hypostomus alatus",
    "SF_1248_Gymnotus_carapo_JQ_1631" = "Gymnotus carapo",
    "SF_1117_Eigenmannia_virescens" = "Eigenmannia virescens",
    "SF_0708_Franciscodoras_marmoratus" = "Franciscodoras marmoratus",
    "SF_1243_Trachelyopterus_galeatus_JQ_5675" = "Trachelyopterus galeatus",
    "SF_1403_Pimelodus_pohli" = "Pimelodus pohli",
    "SF_1280_Pimelodus_maculatus" = "Pimelodus maculatus",
    "SF_1104_Microglanis_leptostriatus" = "Microglanis leptostriatus",
    "SF_1161_Imparfinis_minutus" = "Imparfinis minutus",
    "SF_1368_Phalloceros_uai" = "Phalloceros uai",
    "SF_1099_Pamphorichthys_hollandi" = "Pamphorichthys hollandi",
    "SF_1264_Crenicichla_lepidota" = "Crenicichla lepidota"
  )
  tip_names <- SFmc_tree$tip.label
  has_rename <- tip_names %in% names(tip_renames)
  tip_names[has_rename] <- unname(tip_renames[tip_names[has_rename]])
  tip_names
})

### check if all names have correspondences
SFmc_tree$tip.label %in% tab_curated_SF$Species
tab_curated_SF$Species %in% SFmc_tree$tip.label
# species present in the table but absent from the tree tips
tab_curated_SF$Species[!(tab_curated_SF$Species %in% SFmc_tree$tip.label)]





### convert ape tree to prettier ggtree object
SFmc_tree_plot <-
  ggtree(SFmc_tree) +
  theme_tree2() +
  geom_tiplab(offset = 0, align = TRUE) +
  xlim(0, 0.42)


### save tree plot
# Alternative location:
#   "~/prjcts/fish_eDNA/sfjq/data/trees/SFmc-23_SPs_fused-unique-tree_plot.pdf"
ggsave(
  filename = "~/outros/sfjq_temp/trees/SFmc-23_SPs_fused-unique-tree_plot.pdf",
  plot = SFmc_tree_plot,
  device = "pdf",
  width = 24,
  height = 20,
  units = "cm",
  dpi = 600
)


### extract tips order from tree to reproduce on plots
# Reversed taxa order as reported by ggtree for the plotted tree.
SPs_order_in_SF_tree <- rev(get_taxa_name(SFmc_tree_plot))

            # SPs_order_in_SF_tree <- extract_tree_data(tree_plot) %>%
            #     dplyr::filter(isTip) %>%
            #     dplyr::pull(label)




## plots ----
### RRA histogram ----

# Dodged horizontal bars of relative read abundance per species (ordered as in
# the tree), filled by normalization/primer combination; "|" marks show the
# input DNA percentage, coloured by normalization.
SFmc_RRA_plot <-
  tab_curated_SF %>%
  unite(Normalization, Primer, col = "Primer_norm", remove = FALSE, sep = " ") %>%
  mutate(
    Primer_norm = factor(
      Primer_norm,
      levels = c("Normalized NeoFish", "Normalized MiFish",
                 "Non-normalized NeoFish", "Non-normalized MiFish")
    ),
    Species = factor(Species, levels = SPs_order_in_SF_tree)
  ) %>%
  ggplot(aes(y = Species, x = `RRA (%)`, fill = Primer_norm, colour = Normalization)) +
  geom_bar(
    stat = "identity",
    size = 0.3,
    width = 1,
    position = position_dodge(preserve = "single", width = 1.5)
  ) +
  scale_fill_manual(values = colors6[c(1, 3, 2, 4)], name = "") +
  geom_point(
    aes(y = Species, x = `input DNA (%)`, colour = Normalization),
    shape = "|",
    size = 3
  ) +
  scale_color_manual(values = c("#747474", "#000000")) +
  xlab("Relative read abundance (%)")

## save plot

# Alternative location:
#   "~/prjcts/fish_eDNA/sfjq/data/trees/SFmc-23_SPs_fused-unique-RRA_barplot.pdf"
#   (width = 24, height = 20)
ggsave(
  filename = "~/outros/sfjq_temp/trees/SFmc-23_SPs_fused-unique-RRA_barplot.pdf",
  plot = SFmc_RRA_plot,
  device = "pdf",
  width = 30,
  height = 30,
  units = "cm",
  dpi = 600
)



# JQmc ----

## build tree ----
### read 12S db seqs for the species present in pools and select only pool species
# Print all available sequence names (formatted for copy/paste into the vector below).
names(SFJQ_Sps_seqs) %>% sort() %>% paste0(collapse = '",\n"') %>% cat()

JQ_Sps_seqs <- SFJQ_Sps_seqs[c("JQ_1762_Megaleporinus_elongatus",
                               "JQ_1936_Delturus_brevis",
                               "JQ_2984_Hypomasticus_steindachneri",
                               "JQ_2996_Steindachneridion_amblyurum",
                               "JQ_4290_Megaleporinus_garmani",
                               "JQ_5612_Prochilodus_argenteus_hartii",
                               "JQ_5645_Rhamdia_aff_quelen",
                               "JQ_6584_Australoheros_sp",
                               "JQ_6586_Steindachnerina_elegans-Cyphocharax_gilbert",
                               "JQ_7817_Wertheimeria_maculata",
                               "JQ_7822_Hoplias_malabaricus",
                               "JQ_7880_Astyanax_lacustris",
                               "JQ_7893_Hypostomus_nigrolineatus",
                               "SF_1243_Trachelyopterus_galeatus_JQ_5675",
                               "SF_1248_Gymnotus_carapo_JQ_1631",
                               "SF_1339_Prochilodus_costatus_JQ_2860",
                               "SF_1377_Hoplias_intermedius_brasiliensis")]
### align seqs
JQ_Sps_algn <- DECIPHER::AlignSeqs(myXStringSet = JQ_Sps_seqs,
                                   refinements = 100,
                                   iterations = 100,
                                   verbose = TRUE)

### generate distance matrix
JQ_Sps_dist <- DECIPHER::DistanceMatrix(myXStringSet = JQ_Sps_algn,
                                        includeTerminalGaps = FALSE,
                                        correction = "Jukes-Cantor",
                                        processors = 20,
                                        verbose = TRUE)

### generate dendrogram/tree from alignment and distance matrix
JQmc_tree <- ape::nj(JQ_Sps_dist)
        # tree <- phangorn::NJ(SFJQ_Sps_dist)
class(JQmc_tree)

### save tree as newick
# ape::write.tree(phy = JQmc_tree,file = "~/prjcts/fish_eDNA/sfjq/data/trees/JQmc-23_SPs_fused-unique_APEtree.nwk")
ape::write.tree(phy = JQmc_tree,
                file = "~/outros/sfjq_temp/trees/JQmc-23_SPs_fused-unique_APEtree.nwk")

### read tree from file (or stay with the same object)
# Read back the file written above. The previous call pointed at the
# "~/prjcts/..." directory (not written by this section) and used a truncated
# ".nw" extension, so it could not find the tree.
# JQmc_tree <- ape::read.tree("~/prjcts/fish_eDNA/sfjq/data/trees/JQmc-23_SPs_fused-unique_APEtree.nwk")
JQmc_tree <- ape::read.tree("~/outros/sfjq_temp/trees/JQmc-23_SPs_fused-unique_APEtree.nwk")

## species metadata ----
### table with pools species, input DNA and RRA

### tidy table
# Expected species of the JQmc pool only, with the plotting columns coerced to
# factors / numerics.
tab_curated_JQ <- tab_curated_SFJQ_all_pools %>%
  filter(Pool %in% c("JQmc"), Status %in% c("Expected")) %>%
  mutate(
    Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo")),
    Species = factor(Species),
    Normalization = factor(Normalization),
    Pool = factor(Pool),
    Status = factor(Status),
    `RRA (%)` = as.numeric(`RRA (%)`),
    `input DNA (%)` = as.numeric(`input DNA (%)`)
  )


## correct tips labels ----
### check tip names in tree
JQmc_tree$tip.label


### rename tree tips to match table
JQmc_tree$tip.label
sort(unique(tab_curated_JQ$Species))

# Every tip whose label is a key of the lookup below is replaced by the
# corresponding species name; all other tips are left untouched.
JQmc_tree$tip.label <- local({
  tip_renames <- c(
    "SF_1339_Prochilodus_costatus_JQ_2860" = "Prochilodus costatus",
    "JQ_5612_Prochilodus_argenteus_hartii" = "Prochilodus argenteus/hartii",
    "JQ_6586_Steindachnerina_elegans-Cyphocharax_gilbert" = "Cyphocharax gilbert",
    "JQ_2984_Hypomasticus_steindachneri" = "Hypomasticus steindachneri",
    "JQ_4290_Megaleporinus_garmani" = "Megaleporinus garmani",
    "JQ_1762_Megaleporinus_elongatus" = "Megaleporinus elongatus",
    "JQ_7822_Hoplias_malabaricus" = "Hoplias malabaricus",
    "SF_1377_Hoplias_intermedius_brasiliensis" = "Hoplias brasiliensis",
    "JQ_7880_Astyanax_lacustris" = "Astyanax lacustris",
    "JQ_7893_Hypostomus_nigrolineatus" = "Hypostomus nigrolineatus",
    "SF_1248_Gymnotus_carapo_JQ_1631" = "Gymnotus carapo",
    "JQ_1936_Delturus_brevis" = "Delturus brevis",
    "JQ_7817_Wertheimeria_maculata" = "Wertheimeria maculata",
    "SF_1243_Trachelyopterus_galeatus_JQ_5675" = "Trachelyopterus galeatus",
    "JQ_2996_Steindachneridion_amblyurum" = "Steindachneridion amblyurum",
    "JQ_5645_Rhamdia_aff_quelen" = "Rhamdia quelen",
    "JQ_6584_Australoheros_sp" = "Australoheros sp"
  )
  tip_names <- JQmc_tree$tip.label
  has_rename <- tip_names %in% names(tip_renames)
  tip_names[has_rename] <- unname(tip_renames[tip_names[has_rename]])
  tip_names
})

### check if all names have correspondences
JQmc_tree$tip.label %in% tab_curated_JQ$Species
tab_curated_JQ$Species %in% JQmc_tree$tip.label
# species present in the table but absent from the tree tips
tab_curated_JQ$Species[!(tab_curated_JQ$Species %in% JQmc_tree$tip.label)]




# invert branches to match Siluriformes position on the top

# Inspect the edge matrix / node table to find the node numbers to flip.
JQmc_tree$edge
JQmc_tree %>% as_tibble() %>% View()


# Flip the clades rooted at nodes 21 and 20.
# The whole ggtree expression is wrapped in parentheses: `%>%` binds tighter
# than `+`, so without them flip() was applied to `xlim(0, 0.42)` only instead
# of to the assembled tree plot.
(
  ggtree(JQmc_tree) +
    # geom_text(aes(label = node)) +
    theme_tree2() +
    geom_tiplab(offset = 0, align = TRUE) +
    xlim(0, 0.42)
) %>%
  ggtree::flip(node1 = 21, node2 = 20)


# Alternative: rotate every internal node before plotting.
rotateNodes(tree = JQmc_tree, "all") %>%
  ggtree() +
  theme_tree2() +
  geom_tiplab(offset = 0, align = TRUE) +
  xlim(0, 0.42)



### convert ape tree to prettier ggtree object
JQmc_tree_plot <-
  ggtree(JQmc_tree) +
  theme_tree2() +
  geom_tiplab(offset = 0, align = TRUE) +
  xlim(0, 0.42)


### save tree plot
# Alternative location:
#   "~/prjcts/fish_eDNA/sfjq/data/trees/JQmc-17_SPs_fused-unique-tree_plot.pdf"
ggsave(
  filename = "~/outros/sfjq_temp/trees/JQmc-17_SPs_fused-unique-tree_plot.pdf",
  plot = JQmc_tree_plot,
  device = "pdf",
  width = 24,
  height = 24,
  units = "cm",
  dpi = 600
)


### extract tips order from tree to reproduce on plots
# Reversed taxa order as reported by ggtree for the plotted tree.
SPs_order_in_JQ_tree <- rev(get_taxa_name(JQmc_tree_plot))

            # SPs_order_in_JQ_tree <- extract_tree_data(tree_plot) %>%
            #     dplyr::filter(isTip) %>%
            #     dplyr::pull(label)




## plots ----
### RRA histogram ----

# Dodged horizontal bars of relative read abundance per species (ordered as in
# the tree), filled by normalization/primer combination; "|" marks show the
# input DNA percentage, coloured by normalization.
JQmc_RRA_plot <-
  tab_curated_JQ %>%
  unite(Normalization, Primer, col = "Primer_norm", remove = FALSE, sep = " ") %>%
  mutate(
    Primer_norm = factor(
      Primer_norm,
      levels = c("Normalized NeoFish", "Normalized MiFish", "Normalized Teleo",
                 "Non-normalized NeoFish", "Non-normalized MiFish",
                 "Non-normalized Teleo")
    ),
    Species = factor(Species, levels = SPs_order_in_JQ_tree)
  ) %>%
  ggplot(aes(y = Species, x = `RRA (%)`, fill = Primer_norm, colour = Normalization)) +
  geom_bar(
    stat = "identity",
    size = 0.3,
    width = 1,
    position = position_dodge(preserve = "single", width = 1.5)
  ) +
  scale_fill_manual(values = colors6[c(1, 3, 5, 2, 4, 6)], name = "") +
  geom_point(
    aes(y = Species, x = `input DNA (%)`, colour = Normalization),
    shape = "|",
    size = 3
  ) +
  scale_color_manual(values = c("#747474", "#000000")) +
  xlab("Relative read abundance (%)")


## save plot

# Alternative location:
#   "~/prjcts/fish_eDNA/sfjq/data/trees/JQmc-23_SPs_fused-unique-RRA_barplot.pdf"
#   (width = 24, height = 20)
ggsave(
  filename = "~/outros/sfjq_temp/trees/JQmc-17_SPs_fused-unique-RRA_barplot.pdf",
  plot = JQmc_RRA_plot,
  device = "pdf",
  width = 30,
  height = 30,
  units = "cm",
  dpi = 600
)




# tree ----
# Exploratory section: experiments towards a combined "tree + bar plot" figure.
# NOTE(review): `tree_plot` is used below but never assigned in this section
# (both candidate assignments are commented out) -- presumably it is defined
# earlier in the file; confirm before running this section top to bottom.


# SFJQmc_tree <- read.tree("~/prjcts/fish_eDNA/sfjq/data/12S_full_SFJQmc_fused_SPs_e_3_contams.nwk")




# Bare tree drawn with ggplot() + geom_tree().
ggplot(SFJQmc_tree) + geom_tree() + theme_tree()

# This is convenient shorthand

# # tree_plot <- 
#   ggtree(SFJQmc_tree) + 
#   theme_tree2() +
#   geom_tiplab(offset = 0,align = T)+ 
#   xlim(0, 0.42) 

  
  # print the current tree plot object
  tree_plot
  
  
  
    
#TODO create function to build double graph of tree and bars

  
  
# newick tree to plot along the graph
  tree4plot <- SFJQmc_tree

  
# table for ggplot to go alongside the tree (long format, can have factors)
  tbl4plot <- tab_curated_SFJQ
  
# plot to put by side (y axis must be the species in tree (column name == Species))
  plot4tree <- SFJQmc_RRA_plot
  
  
  
### generate plot from phylo object (.nwk read by ape::read.tree)
  

class(SFJQmc_tree)

# total branch length of the tree, then its structure
tree4plot$edge.length %>% sort() %>% sum() 
tree4plot %>% str()


tree_plot$data

# The assignment below is commented out, so this ggtree is only printed and
# `tree_plot` is left unchanged.
# tree_plot <-
  ggtree(tr = tree4plot,layout = "rectangular") +
  theme_tree2() +
  # geom_tiplab(offset = 0,align = T)+
  geom_tiplab(align = T)+
  xlim(0, 0.42)
  
  
### extract tips order from tree to reproduce on plots
SPs_order_in_tree <- ggtree::get_taxa_name(SFJQmc_tree_plot) %>% rev()
  
  
  
  # Re-level Species by tree order; the result is printed, not stored.
  plot4tree$data %>% 
    dplyr::mutate(Species = factor(Species, levels = SPs_order_in_tree))
    
  
  
    
    
    ## check tip names in tree
JQmc_tree$tip.label







### convert ape tree to prettier ggtree object
JQmc_tree_plot <- ggtree(JQmc_tree) + 
  theme_tree2() +
  geom_tiplab(offset = 0,align = T)+ 
  xlim(0, 0.42)
    
    
# Placeholder for the tree + bar plot builder mentioned in the TODO above;
# the body is empty, so calling it currently returns NULL.
treeNbar_plot  <- function(){
    
  }  
  

# https://www.r-bloggers.com/2016/12/add-layer-to-specific-panel-of-facet_plot-output-2/
# facet_plot(tree_plot, panel = 'Stacked Barplot', 
#            data = tab_curated_SFJQ, geom = geom_histogram,
#            mapping = aes(x = Species,y =`RRA (%)`,  fill = as.factor(Primer)),
#            stat='identity',position = 'dodge' )
# 
# # 
# p3 <- facet_plot(tree_plot, panel='bar', data=tab_curated_SFJQ, geom=geom_bar, 
#                  aes(x=`RRA (%)`, y=Species,fill = Primer),
#                  stat = "identity",
#                  position = "dodge")


# ggtree supplementary material (Mol. Biol. Evol. 35(12), msy194): https://doi.org/10.1093/molbev/msy194
# (the original link was a time-limited signed CDN URL to msy194_supp.pdf and has expired)

# 
# tree_plot %<+% tab_curated_SFJQ + 
#   geom_histogram(aes(y = Species,
#                      x = `RRA (%)`,
#                      fill = Primer),
#                  stat = "identity",
#                  position = "dodge")

# Rename the species column to `tip.label` so it can be matched to the tree
# by the facet helpers below.
names(tab_curated_SFJQ)[names(tab_curated_SFJQ) == "Species"] <- "tip.label"



# Attempt 1: facet_plot() panel next to the tree.
# (previous idea kept for reference: a second facet_plot("Trait", bar_data,
#  ggstance::geom_barh, aes(x = dummy_bar_value, color = location,
#  fill = location), stat = "identity", width = .6))
p2 <- facet_plot(
  p = tree_plot,
  panel = "SNP",
  data = tab_curated_SFJQ,
  geom = geom_histogram,
  mapping = aes(y = tip.label, x = `RRA (%)`, fill = Primer),
  stat = "identity",
  position = "dodge"
) +
  theme_tree2(legend.position = c(.05, .85))
print(p2)



# Attempt 2: geom_facet() layer added to the tree plot (overwrites p2).
p2 <- tree_plot +
  geom_facet(
    panel = "RRA (%)",
    data = tab_curated_SFJQ,
    geom = geom_bar,
    mapping = aes(x = `RRA (%)`, fill = Primer),
    orientation = 'y',
    stat = "identity"
  )





# Return the `data` element of a tree plot object.
#
# tree_disp:    object with a `data` element (e.g. a ggtree plot).
# displayorder: when TRUE (default) the rows are sorted by their `y` column
#               via dplyr::arrange(); when FALSE the data is returned as is.
extract_tree_data <- function(tree_disp, displayorder=TRUE) {
  plot_data <- tree_disp$data
  if (!displayorder) {
    return(plot_data)
  }
  dplyr::arrange(plot_data, y)
}

# Tip labels of `tree_plot`, in the row order returned by extract_tree_data().
SPs_order_in_tree <- dplyr::pull(
  dplyr::filter(extract_tree_data(tree_plot), isTip),
  label
)



```

## dendrograms of ASVs & db species

```{r echo=FALSE,eval=FALSE}
# generate fasta files for the ASVs of each primer, identifying the pool where it was found

colnames(all_ps_tbl_sfjq_full)


# One entry per primer / ASV sequence with its distinct samples, OTU, final ID
# and ASV header; line breaks inside `Sample` become "-" and the identifying
# columns are collapsed into a single `final_header`.
identified_ASVs <- all_ps_tbl_sfjq_full %>%
  group_by(Primer, `ASV (Sequence)`) %>%
  summarise(
    `Sample` = unique(`Sample`),
    OTU = unique(OTU),
    `final ID` = unique(`final ID`),
    `ASV header` = unique(`ASV header`)
  ) %>%
  mutate(Sample = str_replace_all(Sample, pattern = "\n", replacement = "-")) %>%
  unite(final_header, `ASV header`, OTU, Sample, `final ID`, sep = "-")


  
# write fasta files with ASVs and taxonomy, one file per primer

# recursive = TRUE also creates missing parent directories; without it
# dir.create() fails when "results/" does not exist yet and every write()
# below then errors.
dir.create("~/prjcts/fish_eDNA/sfjq/results/ASVs",
           showWarnings = TRUE, recursive = TRUE)

# For each primer: show its rows, then interleave header / sequence lines
# (rbind() puts headers in row 1 and sequences in row 2; c() reads the matrix
# column-wise, yielding header, sequence, header, sequence, ...).
# NOTE(review): no ">" is prepended here -- presumably `ASV header` already
# carries it; confirm before using the files as FASTA.

# NeoFish
identified_ASVs %>%
  filter(Primer %in% c("NeoFish"))

neoASVs_fasta <- c(rbind(identified_ASVs$final_header[identified_ASVs$Primer == "NeoFish"],
                         identified_ASVs$`ASV (Sequence)`[identified_ASVs$Primer == "NeoFish"]))

write(neoASVs_fasta, "~/prjcts/fish_eDNA/sfjq/results/ASVs/NeoFish_ASVs_IDed.fasta")



# MiFish
identified_ASVs %>%
  filter(Primer %in% c("MiFish"))

mifASVs_fasta <- c(rbind(identified_ASVs$final_header[identified_ASVs$Primer == "MiFish"],
                         identified_ASVs$`ASV (Sequence)`[identified_ASVs$Primer == "MiFish"]))

write(mifASVs_fasta, "~/prjcts/fish_eDNA/sfjq/results/ASVs/MiFish_ASVs_IDed.fasta")



# Teleo
identified_ASVs %>%
  filter(Primer %in% c("Teleo"))

telASVs_fasta <- c(rbind(identified_ASVs$final_header[identified_ASVs$Primer == "Teleo"],
                         identified_ASVs$`ASV (Sequence)`[identified_ASVs$Primer == "Teleo"]))

write(telASVs_fasta, "~/prjcts/fish_eDNA/sfjq/results/ASVs/Teleo_ASVs_IDed.fasta")




```



## fold change standard deviation


```{r echo=FALSE,eval=FALSE}


# Print the species order as a quoted list, one name per line, so it can be
# pasted into the factor levels used in this block.
cat(paste0(SPs_order_in_SFJQ_tree, collapse = '", \n"'))

# Keep the rows flagged as "Expected", merge the two Hoplias species into a
# single label and convert the grouping columns to factors. Primer and
# Species get explicit level orders; the other factors use default levels.
tib_curated_SFJQ_all_pools <- tab_curated_SFJQ_all_pools %>%
  mutate(
    Species = replace(
      Species,
      Species %in% c("Hoplias brasiliensis", "Hoplias intermedius"),
      "Hoplias brasiliensis/intermedius"
    )
  ) %>%
  as_tibble() %>%
  filter(Status == "Expected") %>%
  mutate(
    Primer = factor(Primer, levels = c("NeoFish", "MiFish", "Teleo")),
    Species = factor(
      Species,
      levels = c(
        "Characidium lagosantense",
        "Brycon orthotaenia",
        "Hoplias malabaricus",
        "Hoplias brasiliensis/intermedius",
        "Serrasalmus brandtii",
        "Myleus micans",
        "Cyphocharax gilbert",
        "Prochilodus costatus",
        "Prochilodus argenteus/hartii",
        "Hypomasticus steindachneri",
        "Megaleporinus garmani",
        "Megaleporinus elongatus",
        "Roeboides xenodon",
        "Tetragonopterus chalceus",
        "Moenkhausia costae",
        "Moenkhausia sanctaefilomenae",
        "Astyanax fasciatus",
        "Astyanax lacustris",
        "Eugerres brasilianus",
        "Crenicichla lepidota",
        "Australoheros sp",
        "Phalloceros uai",
        "Pamphorichthys hollandi",
        "Gymnotus carapo",
        "Eigenmannia virescens",
        "Delturus brevis",
        "Pterygoplichthys etentaculatus",
        "Hypostomus alatus",
        "Hypostomus nigrolineatus",
        "Trachelyopterus galeatus",
        "Franciscodoras marmoratus",
        "Wertheimeria maculata",
        "Imparfinis minutus",
        "Rhamdia quelen",
        "Microglanis leptostriatus",
        "Steindachneridion amblyurum",
        "Pimelodus pohli",
        "Pimelodus maculatus"
      )
    ),
    Normalization = factor(Normalization),
    Pool = factor(Pool),
    Status = factor(Status)
  )
# %>% 
#  group_by(Normalization,Primer,Species) %>% 
#   summarize(Pool = unique(Pool),
#             Normalization = unique(Normalization),
#             Status = unique(Status),
#             Species = unique(Species),
#             Primer = unique(Primer),
#             `Num ASVs` = sum(`Num ASVs`),
#             `Num OTUs` = sum(`Num OTUs`),
#             `input DNA (%)` = sum(`input DNA (%)`),
#             `RRA (%)` = sum(`RRA (%)`)
#             ) %>%
#   ungroup() %>%   
#   mutate(`Fold change (RRA/DNA input)` = gtools::foldchange(denom = `input DNA (%)`,num = `RRA (%)`))

# !!!!!!!!!!!!!!!!versao antiga quando a tab_curated vinha d ednetro do codigo
# tib_curated_SFJQ_all_pools <- tab_curated_SFJQ_all_pools %>%
#   mutate(Species=replace(Species, Species %in% c("Hoplias brasiliensis", "Hoplias intermedius"), "Hoplias brasiliensis/intermedius")) %>%
#   as_tibble() %>%
#   filter(Status == "Expected") %>%
#   mutate(Primer = factor(Primer, levels = c("NeoFish","MiFish","Teleo")),
#          Species = factor(Species, levels = c("Characidium lagosantense", "Brycon orthotaenia","Hoplias malabaricus",
#                                               # "Hoplias brasiliensis",
#                                               "Hoplias brasiliensis/intermedius",
#                                               # "Hoplias intermedius",
#                                               "Serrasalmus brandtii", "Myleus micans", "Cyphocharax gilbert", "Prochilodus costatus","Prochilodus argenteus/hartii", "Hypomasticus steindachneri","Megaleporinus garmani","Megaleporinus elongatus","Roeboides xenodon","Tetragonopterus chalceus", "Moenkhausia costae","Moenkhausia sanctaefilomenae", "Astyanax fasciatus","Astyanax lacustris", "Eugerres brasilianus","Crenicichla lepidota", "Australoheros sp","Phalloceros uai","Pamphorichthys hollandi","Gymnotus carapo","Eigenmannia virescens","Delturus brevis","Pterygoplichthys etentaculatus", "Hypostomus alatus", "Hypostomus nigrolineatus","Trachelyopterus galeatus", "Franciscodoras marmoratus", "Wertheimeria maculata", "Imparfinis minutus","Rhamdia quelen", "Microglanis leptostriatus", "Steindachneridion amblyurum", "Pimelodus pohli", "Pimelodus maculatus")),
#          Normalization = factor(Normalization),
#          Pool = factor(Pool),
#          Status = factor(Status))
# 
library(ggallin)
library(plotly)

#fold change boxplot ----

# Box plot (plus jittered points) of `Fold change (RRA/DNA input)` per
# species, one panel per primer, on a pseudo-log10 x axis with dashed
# reference lines at the listed fold-change values.
fold_change_boxplot <- tib_curated_SFJQ_all_pools %>%
  # fold changes at or below -220000.3 are replaced by -500
  mutate(`Fold change (RRA/DNA input)` = if_else(`Fold change (RRA/DNA input)` <= -220000.3,
                                                 -500,
                                                 `Fold change (RRA/DNA input)`)) %>%
  ggplot(aes(y = Species,
             x = `Fold change (RRA/DNA input)`,
             col = Primer)) +
  facet_wrap(~Primer) +
  geom_vline(xintercept = 0) +
  geom_vline(xintercept = c(-100, 100), linetype = 4) +
  geom_vline(xintercept = c(-50, -10, -5, -1, 50, 10, 5, 1),
             linetype = 4,
             size = 0.25, alpha = 0.5) +
  geom_boxplot() +
  geom_jitter(width = 0, height = 0.1, size = 0.4) +
  scale_x_continuous(trans = ggallin::pseudolog10_trans,
                     breaks = c(-2000, -1000, -500, -100, -50, -10, -5, -1, 0, 1, 5, 10, 50, 100, 500, 1000, 2000)
                     ) +
  scale_color_manual(values = colors6[c(1, 3, 5)]) +
  # the complete theme goes first: theme_classic() replaces every theme
  # element, so adding it after theme() discarded the rotated tick labels
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45)) +
  theme(panel.background = element_rect(fill = "grey95", colour = "black")) +
  # geom_hline(yintercept = c(1:38)-0.5)
  # wide, nearly transparent lines on every other species row (zebra stripes)
  geom_hline(yintercept = c(seq(1, 38, 2)), size = 7.5, alpha = 0.04)


# library(ggplotly)


# interactive version of the same plot
ggplotly(p = fold_change_boxplot)


#ploty----

# Unfinished draft of an interactive version of the fold-change plot.
# NOTE(review): as written, the statements below are three separate
# expressions: the column-name listing, a group_by() whose result is not
# stored, and a ggplot() call that receives aes() but no data. The ggplot()
# call was presumably meant to be piped from the grouped table -- confirm.
# fold_change_boxplotly <- 
  tib_curated_SFJQ_all_pools %>% colnames
  tib_curated_SFJQ_all_pools %>% 
  group_by(Species)
  
  
  
  
  
  # NOTE(review): the x aesthetic uses `Fold Change`, while the box plot
  # above uses `Fold change (RRA/DNA input)` -- verify the column name.
  ggplot(aes(y = Species,
             x = `Fold Change`,
             col = Primer)) +
  facet_wrap(~Primer) +
  geom_vline(xintercept = 0)+
  geom_vline(xintercept = c(-100,100),linetype=4)+
  geom_boxplot() +
  geom_jitter(width = 0,height = 0.1,size  = 0.4)+ 
  scale_x_continuous(trans = ggallin::pseudolog10_trans, 
                     breaks = c(-2000,-1000,-500,-100,-50,-10,-5,-1,0,1,5,10,50,100,500,1000,2000)
                     ) +
      scale_color_manual(values = colors6[c(1,3,5)]) +
  theme(axis.text.x = element_text(angle = 45)) 

# Save the fold-change box plot object as a 40 x 24 cm PDF.
# `file` reaches ggsave()'s `filename` argument through partial matching.
ggsave(file = "~/prjcts/fish_eDNA/sfjq/results/figs/jun22_sfjq_foldchange_boxplot.pdf", plot = fold_change_boxplot, device = "pdf", width = 40, height = 24, units = "cm", dpi = 600)




# attempt at a scatter plot with shaded regression bands ----

# Input DNA (%) against RRA (%) for every expected species, coloured by the
# Normalization + Primer combination, with one linear fit per colour group
# and one panel per Pool/Primer pair. Both axes use a square-root scale.
tib_curated_SFJQ_all_pools %>%
  unite(Normalization, Primer, col = "Primer_norm", remove = FALSE, sep = " ") %>%
  mutate(Primer_norm = factor(Primer_norm, levels = c(
    "Normalized NeoFish", "Non-normalized NeoFish",
    "Normalized MiFish", "Non-normalized MiFish",
    "Normalized Teleo", "Non-normalized Teleo"))) %>%
  # colnames()
  ggplot(aes(
    x = `input DNA (%)`,
    y = `RRA (%)`,
    col = Primer_norm,
    # group = Group,
    shape = Group)) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm, aes(fill = Primer_norm)) +
  # scale_x_continuous(trans = ggallin::pseudolog10_trans,
  #                    breaks = c(-2000,-1000,-500,-100,-50,-10,-5,-1,0,1,5,10,50,100,500,1000,2000)
  #                    ) +
  scale_x_sqrt(breaks = c(0, 0.0001, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1) * 100) +
  scale_y_sqrt(breaks = c(0, 0.0001, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1) * 100) +
  scale_color_manual(values = colors6) +
  scale_fill_manual(values = colors6) +
  # a single coord_fixed(): the original added it twice, and the second
  # call simply replaced the first
  coord_fixed(ratio = 1) +
  xlab("Input DNA (%)") +
  ylab("Relative Read Abundance (%)") +
  ggtitle("SFmc & JQmc: Correlation between\nInput DNA and RRA") +
  # complete theme first, tweaks after: theme_bw() resets all theme elements,
  # so the rotated x labels were lost when it came after theme()
  theme_bw(base_size = 10) +
  theme(axis.text.x = element_text(angle = 45)) +
  # facet_wrap(MC~Primer,ncol = 3)
  facet_wrap(Pool~Primer, ncol = 5)





```

# References

These analyses were based on and inspired by other incredible resources, listed below:

* [Sample(Ecology) -- Jon Lefcheck](https://jonlefcheck.net/2012/10/24/nmds-tutorial-in-r/#:~:text=Keep%20going%2C%20and%20imagine%20as,and%20to%20spare%20your%20thinker).)

* [Principal Component Analysis in R: prcomp vs princomp -- kassambara  ](http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/118-principal-component-analysis-in-r-prcomp-vs-princomp/)
    
* [Analysis of community ecology data in R -- David Zelený](https://www.davidzeleny.net/anadat-r/doku.php/en:pcoa_nmds)




```{r echo=FALSE,eval=FALSE}
#cite
# https://jonlefcheck.net/2012/10/24/nmds-tutorial-in-r/#:~:text=Keep%20going%2C%20and%20imagine%20as,and%20to%20spare%20your%20thinker).

# http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/118-principal-component-analysis-in-r-prcomp-vs-princomp/

# https://www.davidzeleny.net/anadat-r/doku.php/en:pcoa_nmds

```

<br>




# Bonus

## NeoFish redesign

```{r echo=FALSE,eval=FALSE}
#info ----

# author: Heron O. Hilário
# purpose: integrate sequence files from different origins
#         to construct the LGC 12S sequence database and format it
#         for usage in DADA2.
#
# references:
# https://www.bioconductor.org/help/course-materials/2016/BioC2016/ConcurrentWorkshops2/Wright/BigBioSeqData.pdf
# https://www.bioconductor.org/packages/release/bioc/vignettes/DECIPHER/inst/doc/DECIPHERing.pdf
# http://www2.decipher.codes/RLessons/RLesson14.html
# https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/2041-210X.12687
# http://www2.decipher.codes/OligoDesign.html
# http://www2.decipher.codes/Bioinformatics/BigBioSeqData/BigBioSeqData2.html
# https://cran.r-project.org/web/packages/ggdendro/vignettes/ggdendro.html

#0- load libraries ----
{
  library(ggseqlogo)
  library(DECIPHER)
}
#1- import seqs to db ----

#create/load SQLdb
# Opens an SQLite connection on the given file.
# NOTE(review): no dbDisconnect() for this connection is visible in this
# chunk -- confirm it is closed elsewhere.
db_fish12S <- dbConnect(SQLite(), "~/outros/sfjq_temp/db/fish12S_all_seqs_nov21.sql")


#sequences of the LGC 12S db
# Imports the FASTA file into the database under the identifier "LGC 12S DB".
# Seqs2DB(seq = "~/prjcts/fish_eDNA/",
Seqs2DB(seq = "~/outros/sfjq_temp/db/dada_tax_fullDB_order_SPs_dez21.fasta",
        type =  "FASTA",dbFile =  db_fish12S, identifier =  "LGC 12S DB")


# interactive look at the database contents
BrowseDB(db_fish12S)

#2- add seq info to DB ----

#seqlengths to DB
# `l` holds the result of IdLengths() and is written back to the database
l <- DECIPHER::IdLengths(dbFile = db_fish12S)

Add2DB(l, db_fish12S, verbose=TRUE)
BrowseDB(db_fish12S)

#3- get all seqs from DB ----


# retrieve the sequences stored under "LGC 12S DB", named by their description
dna_12Sdb <- SearchDB(dbFile = db_fish12S,identifier = "LGC 12S DB",nameBy = "description")

#merge seq sets
# only one set is used here, so dna_all is simply a copy of dna_12Sdb
dna_all <- dna_12Sdb


#4- create and assign new names ----

# One row per sequence in dna_all, in the same order. The empty columns are
# filled by the steps in this block (Cluster is filled later on).
DB_tbl <- tibble(`Original names` = names(dna_all),
                 `Identifier` = "",
                 `Names` = "",
                 `River basin` = "",
                 `Cluster` = NA_real_,
                 `Composed name` = "",
                 `genus` = "") # it will be used to search NCBI


# identify basin from origin file
# The assignments run in the same order as the original chain of `if`s, so a
# name found in several sets keeps the label of the last set that has it.
# NOTE(review): dna_sf, dna_jq, dna_non and dna_sfjq2 are not created in this
# chunk -- they must already exist in the session.
DB_tbl$`River basin`[DB_tbl$`Original names` %in% names(dna_sf)] <- "SF"
DB_tbl$`River basin`[DB_tbl$`Original names` %in% names(dna_jq)] <- "Jq"
DB_tbl$`River basin`[DB_tbl$`Original names` %in% names(dna_non)] <- "Out"
DB_tbl$`River basin`[DB_tbl$`Original names` %in% names(dna_sfjq2)] <- "SFJq2"


# correct species name
# "<Identifier>_<Genus>_<epithet...>" is split at the first "_"; the rest
# becomes the species name with "_" turned into spaces, and the genus is the
# first word of that name.
DB_tbl <- DB_tbl %>%
  mutate(Identifier = str_split_fixed(string = `Original names`, pattern = "_", n = 2)[, 1],
         Names = str_split_fixed(string = `Original names`, pattern = "_", n = 2)[, 2] %>%
           str_replace_all(pattern = "_", replacement = " "), #make replacement to remov aff, cf...
         genus = str_split_fixed(string = Names, pattern = " ", n = 2)[, 1])

# create new name and assign to original sequences
# DB_tbl was built from names(dna_all) above, so row i describes sequence i
# and the new names can be assigned positionally. The previous nested loop
# compared every row with every sequence and left `Composed name` empty for
# rows whose original name was duplicated.
DB_tbl <- DB_tbl %>%
  mutate(`Composed name` = paste0(`River basin`, " | ", Names, " | ", Identifier))

names(dna_all) <- DB_tbl$`Composed name`



#6- align DB seqs ----
#remove non-fish sequences

            # dna_all <- dna_all[!(names(dna_all) %in% c("Out | Homo sapiens | JN034109.1", #removing
            #                                            "Out | Bos taurus | AJ885201.1"))] #removing
# select only species in pools
#orinentarsequencias antes do alinhamento caso tenha alguma invertida

# print all sequence names as a quoted list, one per line, for copy/paste
cat(paste0(names(dna_all), collapse = '",\n"'))




c(
" |  | SF | Franciscodoras marmoratus | 0708",
" |  | SF | Tetragonopterus chalceus | 0901",
" |  | SF | Roeboides xenodon | 0916",
" |  | SF | Brycon orthotaenia | 1037",
" |  | SF | Hypostomus alatus | 1041",
" |  | SF | Pamphorichthys hollandi | 1099",
" |  | SF | Microglanis leptostriatus | 1104",
" |  | SF | Moenkhausia sanctaefilomenae | 1113",
" |  | SF | Eigenmannia virescens | 1117",
" |  | SF | Pterygoplichthys etentaculatus | 1141",
" |  | SF | Astyanax cf fasciatus | 1153",
" |  | SF | Imparfinis minutus | 1161",
" |  | SF | Myleus micans | 1162",
" |  | SF | Serrasalmus brandtii | 1230",
" |  | SF | Trachelyopterus galeatus | 1243",
" |  | SF | Gymnotus carapo | 1248",
" |  | SF | Crenicichla lepidota | 1264",
" |  | SF | Pimelodus maculatus | 1280",
" |  | SF | Prochilodus costatus | 1339",
" |  | SF | Phalloceros uai | 1368",
" |  | SF | Hoplias intermedius | 1377",
" |  | SF | Characidium lagosantense | 1381",
" |  | SF | Pimelodus pohli | 1403",
" |  | Jq | Gymnotus carapo | 1631",
" |  | Jq | Hypostomus sp | 1711",
" |  | Jq | Megaleporinus elongatus | 1762",
" |  | Jq | Prochilodus hartii | 1765",
" |  | Jq | Megaleporinus elongatus | 1761",
" |  | Jq | Hoplias brasiliensis | 1772",
" |  | Jq | Delturus brevis | 1936",
" |  | Jq | Steindachnerina elegans | 231",
" |  | Jq | Hoplosternum littorale | 2643",
" |  | Jq | Prochilodus costatus | 2860",
" |  | Jq | Colossoma macropomum | 2871",
" |  | Jq | Anchoviella lepidentostole | 2912",
" |  | Jq | Wertheimeria maculata | 7906",
" |  | Jq | Hypostomus nigrolineatus | 7893",
" |  | Jq | Lycengraulis grossidens | 2918",
" |  | Jq | Serrasalmus brandtii | 7890",
" |  | Jq | Cichlasoma facetum | 292",
" |  | Jq | Cichla kelberi | 2932",
" |  | Jq | Eugerres brasilianus | 2943",
" |  | Jq | Hypomasticus steindachneri | 2984",
" |  | Jq | Steindachneridion amblyurum | 2996",
" |  | Jq | Geophagus brasiliensis | 303",
" |  | Jq | Geophagus brasiliensis | 304",
" |  | Jq | Rhamdia cf jequitinhonha | 3054",
" |  | Jq | Pareiorhaphis stephanus | 4196",
" |  | Jq | Megaleporinus garmani | 4290",
" |  | Jq | Harttia garavelloi | 439",
" |  | Jq | Prochilodus argenteus | 5612",
" |  | Jq | Rhamdia aff quelen | 5645",
" |  | Jq | Hypostomus sp | 5650",
" |  | Jq | Hypomasticus steindachneri | 5664",
" |  | Jq | Astyanax cf lacustris | 5672",
" |  | Jq | Trachelyopterus cf galeatus | 5675",
" |  | Jq | Astyanax aff fasciatus | 5693",
" |  | Jq | Astyanax aff fasciatus | 5694",
" |  | Jq | Astyanax aff fasciatus | 5695",
" |  | Jq | Hypostomus sp | 5703",
" |  | Jq | Hypostomus sp | 5704",
" |  | Jq | Serrasalmus brandtii | 6261",
" |  | Jq | Trachelyopterus striatulus | 6583",
" |  | Jq | Australoheros sp | 6584",
" |  | Jq | Steindachnerina elegans | 6586",
" |  | Jq | Cichla sp | 6590",
" |  | Jq | Hoplias sp | 6630",
" |  | Jq | Megaleporinus elongatus | 6633",
" |  | Jq | Prochilodus hartii | 7033",
" |  | Jq | Oligosarcus macrolepis | 7690",
" |  | Jq | Moenkhausia costae | 7812",
" |  | Jq | Wertheimeria maculata | 7817",
" |  | Jq | Hoplias malabaricus | 7822",
" |  | Jq | Hypomasticus steindachneri | 7826",
" |  | Jq | Astyanax lacustris | 7880",
" |  | Out | Prionace glauca | AF448008.1",
" |  | Out | Bos taurus | AJ885201.1",
" |  | Out | Homo sapiens | JN034109.1",
" |  | Out | Prochilodus costatus | NC027690.1",
" |  | Out | Canis familiaris | AY729880.1",
" |  | SFJq2 | Knodus moenkhausii  | KF209618.1",
" |  | SFJq2 | Conorhynchos conirostris | JX899737.1",
" |  | SFJq2 | Leporellus vittatus | LC104399.1")




# put every sequence in the same orientation before aligning
dna_all <- OrientNucleotides(dna_all)
names(dna_all)

# full alignment of the oriented sequences
dna_aligned <- AlignSeqs(
  myXStringSet = dna_all,
  refinements = 700,
  iterations = 700,
  verbose = TRUE
)

# BrowseSeqs(dna_aligned,colorPatterns = c(300,600))
BrowseSeqs(dna_aligned)

#7 - distance matrix ----

# cut alignment columns 300-1100, strip all gaps and realign that region
dna_alg_sub <- dna_aligned %>%
  subseq(start = 300, end = 1100) %>%
  RemoveGaps(processors = 10, removeGaps = "all") %>%
  AlignSeqs(refinements = 700, iterations = 700, verbose = TRUE)


BrowseSeqs(dna_alg_sub)


# Identify clusters for primer design
dna_alg_sub_dist <- DistanceMatrix(
  myXStringSet = dna_alg_sub,
  includeTerminalGaps = FALSE,
  correction = "Jukes-Cantor",
  processors = 20
)
dim(dna_alg_sub_dist) # a symmetric matrix

#8 - Clusters ----
# identify clusters among the sequences from their pairwise distances

# type = "both" returns a list: element 1 is the cluster table (one row per
# sequence, column `cluster`), which is what the code below uses.
dna_clust <- IdClusters(dna_alg_sub_dist,
                         method = "UPGMA",
                         cutoff = 0.05,
                         show = TRUE, # draws a cladogram, very hard to configure/interpret
                         type = "both")
head(dna_clust) # cluster numbers


# add the cluster information to the DB

# Identify sequences by cluster name in the database
Add2DB(data.frame(
  cluster = dna_clust[[1]]$cluster),
  dbFile = db_fish12S,
  verbose = TRUE)

BrowseDB(db_fish12S, orderBy = "cluster")


#add clusters to DB_tbl
# Look each composed name up in the row names of the cluster table. This
# replaces a nested loop that compared every row with every sequence.
# Names without a match get NA (values from an earlier run are not kept).
DB_tbl$Cluster <- as.numeric(
  dna_clust[[1]]$cluster[match(DB_tbl$`Composed name`, rownames(dna_clust[[1]]))]
)

# DB_tbl %>%
#   arrange(by = Cluster)

#order seqs by cluster and write fasta
# assumes DB_tbl rows are in the same order as the sequences in dna_alg_sub
dna_order <- dna_alg_sub[order(DB_tbl$Cluster)]





writeXStringSet(x = dna_order,
                filepath = "~/prjcts/fish_eDNA/data/refs/db/DB_order_clust.fasta",
                format = "fasta")

# this file was used to generate a MLtree on MEGAX, to identify identic sequences


#9- get taxonomic information to construct DADA2 formated headers----

#load entrez function
#' Look up one organism through the local Entrez helper script.
#'
#' Runs `bash /opt/src/entrez_taxonomy.sh <organism_name>` and parses the
#' text the script prints on stdout as JSON.
#'
#' @param organism_name Passed as the only argument to the script.
#' @return The object returned by `jsonlite::fromJSON()`; an empty tibble
#'   when the script printed nothing. A message reports which case occurred.
extract_taxonomy <- function(organism_name) {
  organism_xml <- sys::exec_internal("bash",
                                     args = c("/opt/src/entrez_taxonomy.sh", organism_name),
                                     timeout = 1000)
  # decode stdout once and reuse it for the emptiness check and the parsing
  script_output <- sys::as_text(organism_xml$stdout)

  if (length(script_output) == 0) {
    message(paste0("deu errado para: ", organism_name))
    # namespace-qualified like the other calls here, so the function does
    # not depend on tibble being attached
    return(tibble::tibble())
  }

  organism_df <- jsonlite::fromJSON(script_output)
  message(paste0("deu certo para: ", organism_name))
  organism_df
}

#clear warnings
#assign("last.warning", NULL, envir = baseenv())


# create and fill taxonomy table for the function

# unique genus names, sorted, used as search terms
orgs2search <- unique(sort.default(DB_tbl$genus))

taxonomy_df <- purrr::map_df(orgs2search, extract_taxonomy)
# #taxonomy_df <- purrr::map_df(DB_tbl$genus, extract_taxonomy)


# trying with explicit iteration

# One lineage table per searched genus, with the scientific name reported for
# that entry stored in `genus`. The tables are collected first and bound
# once, instead of growing taxons_tbl inside a loop; seq_along() also copes
# with an empty orgs2search, which 1:length() did not.
# NOTE(review): entries are addressed by position, which presumably assumes
# taxonomy_df$TaxaSet has one element per name in orgs2search -- confirm.
taxons_tbl <- dplyr::bind_rows(
  tibble::tibble(),
  lapply(seq_along(orgs2search), function(taxa) {

    # taxonomy_df <- purrr::map_df(orgs2search[taxa], extract_taxonomy) #use only if map_df doesnt work for all together

    taxon_entry <- taxonomy_df[1]$TaxaSet[[taxa]]

    taxon_entry$LineageEx$Taxon %>%
      mutate("genus" = taxon_entry$ScientificName)
  })
)



# orgs2search

# unique(taxons_tbl$genus)


taxons_tbl_bckp <- taxons_tbl


#o map_df ta dando erro, vou fazer com um for
#
# taxonomy_df <- vector(mode = "list", length = length(orgs2search))
#
# for (org in 1:length(orgs2search)) {
#
#   taxonomy_df[[org]]<- extract_taxonomy(organism_name = orgs2search[org])
#
# }
#





#
#
#
# # create a and fill a dataframe for taxonomic ranks
# orgs_tbl <- tibble::tibble()
#
# for (org in 1:nrow(taxonomy_df)) {
#
#   scin_name <- taxonomy_df$TaxaSet[[org]]$ScientificName
#
#   org_tbl <- tibble::as_tibble(taxonomy_df$TaxaSet[[org]]$LineageEx$Taxon) %>%
#     dplyr::mutate("genus" = scin_name)%>%
#     dplyr::filter(Rank %in% c("kingdom","phylum","class","order","family")) %>%
#     tidyr::pivot_wider(names_from = Rank,values_from = c(ScientificName,TaxId)) %>%
#     dplyr::select(genus,dplyr::starts_with("Scie"))
#
#   orgs_tbl <- dplyr::bind_rows(orgs_tbl,org_tbl)
# }


# create and fill a data frame of taxonomic ranks
# For every searched genus: keep its kingdom..family rows, spread the ranks
# into columns and keep the genus plus the ScientificName_* columns. The
# per-genus tables are built with lapply() and bound once (no table growing
# inside a loop, and an empty orgs2search is handled).
orgs_tbl <- dplyr::bind_rows(
  tibble::tibble(),
  lapply(orgs2search, function(searched_genus) {
    taxons_tbl %>%
      filter(genus == searched_genus) %>%
      dplyr::filter(Rank %in% c("kingdom", "phylum", "class", "order", "family")) %>%
      tidyr::pivot_wider(names_from = Rank, values_from = c(ScientificName, TaxId)) %>%
      dplyr::select(genus, dplyr::starts_with("Scie"))
  })
)




#10- bind tax rank cols to DB_tbl ----

# strip the "ScientificName_" prefix so the columns are named after the rank
colnames(orgs_tbl) <- str_remove(string = colnames(orgs_tbl),
                                 pattern = "ScientificName_")
orgs_tbl <- unique(orgs_tbl)

# DB_tbl_bckp <- DB_tbl
# DB_tbl <- DB_tbl_bckp


# attach the rank columns to every sequence through its genus
DB_tbl <- left_join(x = DB_tbl, y = orgs_tbl, by = "genus")

#11- creat fasta headers with ranks for DADA2 ----

# "kingdom;phylum;class;order;family;genus;species;identifier;basin",
# built for all rows at once
DB_tbl$headers <- paste0(DB_tbl$kingdom, ";",
                         DB_tbl$phylum, ";",
                         DB_tbl$class, ";",
                         DB_tbl$order, ";",
                         DB_tbl$family, ";",
                         DB_tbl$genus, ";",
                         DB_tbl$Names, ";",
                         DB_tbl$Identifier, ";",
                         DB_tbl$`River basin`)

#12- generate final DB fasta ----

# ungapped version of the realigned region
final_dna <- RemoveGaps(myXStringSet = dna_alg_sub,
                        processors = 10,
                        removeGaps = "all")


#rename final dna header
# Sequences whose name matches a `Composed name` receive that row's DADA2
# header; any other name is left untouched. A match() lookup replaces the
# nested loop over every row/sequence pair.
names(final_dna) <- ifelse(
  names(final_dna) %in% DB_tbl$`Composed name`,
  DB_tbl$headers[match(names(final_dna), DB_tbl$`Composed name`)],
  names(final_dna)
)


#order seqs by cluster and write fasta
final_dna_order <- final_dna[order(DB_tbl$Cluster)]



#13 - Identify problematic species to rule out

names(final_dna_order)


#the species to be removed from the final database are:

# NOTE(review): positions 197 and 29 are hard-coded and depend on the
# current clustering order -- verify after any realignment.
names(final_dna_order[197])
#193 - Trachelyopterus striatulus 6583
names(final_dna_order[29])
#193 - Orthospinus franciscensis 1071

# final_dna_clean <- final_dna_order[-c(29,197)]
final_dna_clean <- final_dna_order



#write fasta
writeXStringSet(x = final_dna_clean,filepath = "~/prjcts/fish_eDNA/data/refs/db/LGC/fev21/dada_tax_fullDB_order_fev21.fasta",format = "fasta")
# writeXStringSet(x = final_dna_clean,filepath = "~/prjcts/fish_eDNA/data/refs/db/LGC/dada_tax_fullDB_order.fasta",format = "fasta")




#create DB seqs for assigSpecies


#rename final dna header
# Sequences currently named with a DADA2 header are renamed to
# "<Identifier> <Names>"; any other name is left untouched. A match() lookup
# replaces the nested loop over every row/sequence pair.
names(final_dna) <- ifelse(
  names(final_dna) %in% DB_tbl$headers,
  paste(DB_tbl$Identifier, DB_tbl$Names)[match(names(final_dna), DB_tbl$headers)],
  names(final_dna)
)


#order seqs by cluster and write fasta
final_dna_order <- final_dna[order(DB_tbl$Cluster)]

# final_dna_clean <- final_dna_order[-c(29,197)]
final_dna_clean <- final_dna_order

#write fasta
# writeXStringSet(x = final_dna_clean,filepath = "~/prjcts/fish_eDNA/data/refs/db/LGC/dada_tax_fullDB_order_SPs.fasta",format = "fasta")
writeXStringSet(x = final_dna_clean,filepath = "~/prjcts/fish_eDNA/data/refs/db/LGC/fev21/dada_tax_fullDB_order_SPs_fev21.fasta",format = "fasta")








# rename "aff" and "cf"

# sed -i '/>/ s/aff /aff_/'
# sed -i '/>/ s/cf /cf_/'




  neo_FWD_seqs
  
  
"neo_FWD"
"SF | Synbranchus marmoratus | 1404"
"SF | Synbranchus marmoratus | 1405"
"Jq | Oligosarcus macrolepis | 7690"
"SF | Leporinus macrocephalus | 0982"
"SFJq2 | Conorhynchos conirostris | JX899737.1"
"Jq | Rhamdia cf jequitinhonha | 3054"
"SF | Cetopsorhamdia iheringi | 1372"
"SF | Imparfinis minutus | 1161"
"SF | Imparfinis minutus | 1370"
"SF | Rhamdia quelen | 1361"
"Jq | Rhamdia aff quelen | 5645"
"SF | Pimelodella vittata | 1287"
"SF | Pimelodella vittata | 1288"
"SF | Pimelodella vittata | 1289"
"SF | Pseudoplatystoma corruscans | 1135"
"Jq | Steindachneridion amblyurum | 2996"
"SF | Pimelodus maculatus | 0749"
"SF | Pimelodus fur | 1147"
"SF | Pimelodus maculatus | 1280"
"SF | Pimelodus fur | 1345"
"SF | Pimelodus pohli | 1400"
"SF | Pimelodus pohli | 1401"
"SF | Pimelodus pohli | 1403"
"SF | Microglanis leptostriatus | 1102"
"SF | Microglanis leptostriatus | 1104"
"SF | Pseudopimelodus sp | 0671"
  
  
  




#recover Database region for primers

BrowseSeqs(dna_alg_sub[order(DB_tbl$Cluster),]) #all seqs

#neo FWD: 741-760          CGCCGTCGCAAGCTTACCCT
# Two alternative windows: the second assignment overrides the first, so the
# "primer only" window is the one in effect when both lines are run.
neo_FWD_seqs <- subseq(dna_alg_sub[order(DB_tbl$Cluster),],433,472) # primer +-10
neo_FWD_seqs <- subseq(dna_alg_sub[order(DB_tbl$Cluster),],443,462) # primer only
# Put the primer itself first, and give it the name "neo_FWD": the selection
# by name further down fails if the primer entry is unnamed.
# NOTE(review): neo_FWD must already exist in the session at this point.
neo_FWD_seqs <- c(DNAStringSet(c(neo_FWD = neo_FWD)),neo_FWD_seqs)
BrowseSeqs(neo_FWD_seqs) #neo FWD seqs

BrowseSeqs(neo_FWD_seqs,colorPatterns = FALSE,highlight = 1) #neo FWD seqs



# same view restricted to the primer and a hand-picked set of sequences
BrowseSeqs(neo_FWD_seqs[c(
  "neo_FWD",
  "SF | Synbranchus marmoratus | 1404",
  "SF | Synbranchus marmoratus | 1405",
  "Jq | Oligosarcus macrolepis | 7690",
  "SF | Leporinus macrocephalus | 0982",
  "SFJq2 | Conorhynchos conirostris | JX899737.1",
  "Jq | Rhamdia cf jequitinhonha | 3054",
  "SF | Cetopsorhamdia iheringi | 1372",
  "SF | Imparfinis minutus | 1161",
  "SF | Imparfinis minutus | 1370",
  "SF | Rhamdia quelen | 1361",
  "Jq | Rhamdia aff quelen | 5645",
  "SF | Pimelodella vittata | 1287",
  "SF | Pimelodella vittata | 1288",
  "SF | Pimelodella vittata | 1289",
  "SF | Pseudoplatystoma corruscans | 1135",
  "Jq | Steindachneridion amblyurum | 2996",
  "SF | Pimelodus maculatus | 0749",
  "SF | Pimelodus fur | 1147",
  "SF | Pimelodus maculatus | 1280",
  "SF | Pimelodus fur | 1345",
  "SF | Pimelodus pohli | 1400",
  "SF | Pimelodus pohli | 1401",
  "SF | Pimelodus pohli | 1403",
  "SF | Microglanis leptostriatus | 1102",
  "SF | Microglanis leptostriatus | 1104",
  "SF | Pseudopimelodus sp | 0671")],colorPatterns = FALSE,highlight = 1) #neo FWD seqs


# Sequence logo (per-position base probabilities) of neo_FWD_seqs, with two
# shaded bands labelled as the original (top) and the adjusted (bottom)
# NeoFish forward primer.
# NOTE(review): the band positions (10.5-30.5 and 9.5-29.5) presumably refer
# to the "primer +-10" window of neo_FWD_seqs -- confirm which window is used.
seqs_logo_plot <- ggplot2::ggplot() +
  ggseqlogo::geom_logo(data = as.character(neo_FWD_seqs),
                       method = "probability") +
  ggplot2::annotate(geom = "rect",
                    xmin = 10.5,xmax = 30.5,
                    ymin = 1,ymax = 1.15,
                    fill="#c41422", alpha=0.50) +
  ggplot2::annotate(geom = "rect",
                    xmin = 9.5,xmax = 29.5,
                    ymin = -0.15,ymax = 0,
                    fill="#089900", alpha=0.50) +
  ggplot2::annotate(geom = "text", label = "Adjusted NeoFish FWD primer",
                    x = 20,y = -0.075) +
  ggplot2::annotate(geom = "text", label = "Original NeoFish FWD primer",
                    x = 20,y = 1.075)

seqs_logo_plot

# close the device opened by printing the plot, if there is one; an
# unconditional dev.off() stops with an error when no device is open
if (!is.null(grDevices::dev.list())) {
  grDevices::dev.off()
}

# pass the plot explicitly instead of relying on last_plot()
ggplot2::ggsave(filename = "~/prjcts/fish_eDNA/sfjq/results/figs/NeoFIsh_FWD_primer_adjustment.png",
                plot = seqs_logo_plot,
                device = "png",
                width = 40, height = 8,
                units = "cm")





# Interactive inspection of the primer binding sites in the full alignment.
# Each section cuts a column window out of dna_aligned (sequences ordered by
# cluster), puts the gapped primer string on top and opens it in the browser.
BrowseSeqs(neo_FWD_seqs,colorPatterns = FALSE,highlight = 0) #neo FWD seqs

#neo REV: 973-992          GC-ACACACCGCCCGTCACT
# NOTE(review): the comment above says 973-992 but the window cut below is
# 972-991 -- confirm which coordinates are current.
neo_REV_seqs <- subseq(dna_aligned[order(DB_tbl$Cluster),],972,991)
neo_REV_seqs <- c(DNAStringSet("GC-ACACACCGCCCGTCACT"),neo_REV_seqs)
BrowseSeqs(neo_REV_seqs)
BrowseSeqs(neo_REV_seqs,colorPatterns = FALSE,highlight = 1) #neo REV seqs


#mif FWD: 304-325
mif_FWD_seqs <- subseq(dna_aligned[order(DB_tbl$Cluster),],304,325)
mif_FWD_seqs <- c(DNAStringSet("GTCGGTAAAACTCGT-GCCAGC"),mif_FWD_seqs)
BrowseSeqs(mif_FWD_seqs) #mif FWD seqs
BrowseSeqs(mif_FWD_seqs,colorPatterns = FALSE,highlight = 1) #mif FWD seqs
BrowseSeqs(mif_FWD_seqs,colorPatterns = TRUE,highlight = 1) #mif FWD seqs

#mif REV: 533-559
# NOTE(review): the window cut below starts at 532, not 533 as stated above.
mif_REV_seqs <- subseq(dna_aligned[order(DB_tbl$Cluster),],532,559)
mif_REV_seqs <- c(DNAStringSet("CAAACTGGGATTAGATACCCCACTATGT"),mif_REV_seqs)
BrowseSeqs(mif_REV_seqs) #mif REV seqs
BrowseSeqs(mif_REV_seqs,colorPatterns = FALSE,highlight = 1) #mif REV seqs
BrowseSeqs(mif_REV_seqs,colorPatterns = TRUE,highlight = 1) #mif REV seqs








# Browse the alignment columns lying between each primer pair.
BrowseSeqs(neo_FWD_seqs,colorPatterns = FALSE,highlight = 1) #neo seqs
# the next two calls are identical
BrowseSeqs(subseq(dna_aligned[order(DB_tbl$Cluster),],761,972)) #neo seqs
BrowseSeqs(subseq(dna_aligned[order(DB_tbl$Cluster),],761,972)) #neo seqs
#mif FWD: 304-325
#mif REV: 533-560
BrowseSeqs(subseq(dna_aligned[order(DB_tbl$Cluster),],326,532)) #mif seqs
#tel FWD: 977-993
#tel REV: 1078-1098
BrowseSeqs(subseq(dna_aligned[order(DB_tbl$Cluster),],994,1077)) #tel seqs





#
#
# how many times each species is in pool
# (jq_tbl$Species %>% str_replace_all(pattern = " ",replacement = "_")) %in%
#   (DB_tbl$`Original names` %>% str_remove(pattern = "^.*.[[:digit:]]_") )



# ADDENDUM----
#13 - map primers on DB ----

# set primers sequences
neo_FWD <- "CGCCGTCGCAAGCTTACCCT" #Mini_bar3
neo_REV <- "AGTGACGGGCGGTGTGTGC"  #Mini_bar3

mif_FWD <- "GTCGGTAAAACTCGTGCCAGC"
mif_REV <-    "CATAGTGGGGTATCTAATCCCAGTTTG"
# mif_REV <- "ACATAGTGGGGTATCTAATCCCAGTTTG"

tel_FWD <- "ACACCGCCCGTCACTCT" #
tel_REV <-  "CTTCCGGTACACTTACCATG"
# tel_REV <- "ACTTCCGGTACACTTACCATG"

# The six primers as one set. The names are attached when the set is built,
# so the reverse-complement set carries them too (previously the names were
# assigned after reverseComplement() and primers_rev stayed unlabeled).
primers <- DNAStringSet(c(neo_FWD = neo_FWD, neo_REV = neo_REV,
                          mif_FWD = mif_FWD, mif_REV = mif_REV,
                          tel_FWD = tel_FWD, tel_REV = tel_REV))
primers_rev <- reverseComplement(primers)

##13a -generate primer orientations----

#function to get all possible primer orientations
#' Build the four orientations of one primer.
#'
#' @param primers A primer sequence as a character string; it is converted
#'   with `DNAString()`.
#' @return Character vector with the forward, complement, reverse and
#'   reverse-complement sequences, named Forward, Complement, Reverse and
#'   RevComp.
allOrients <- function(primers) {
  # library() stops with a clear error when Biostrings is missing, whereas
  # require() only returns FALSE and lets a later call fail instead
  library(Biostrings)
  dna <- DNAString(primers)  # The Biostrings works w/ DNAString objects rather than character vectors
  orients <- c(Forward = dna, Complement = Biostrings::complement(dna), Reverse = reverse(dna),
               RevComp = reverseComplement(dna))
  # vapply() guarantees one string per orientation, whatever the input
  vapply(orients, toString, character(1))  # Convert back to character vector
}


# generate every possible orientation of each primer; the orientation names
# returned by allOrients() are prefixed with the primer label
neo_FWD.orients <- allOrients(neo_FWD)
names(neo_FWD.orients) <- paste("neo_FWD", names(neo_FWD.orients), sep = "-")

neo_REV.orients <- allOrients(neo_REV)
names(neo_REV.orients) <- paste("neo_REV", names(neo_REV.orients), sep = "-")

mif_FWD.orients <- allOrients(mif_FWD)
names(mif_FWD.orients) <- paste("mif_FWD", names(mif_FWD.orients), sep = "-")

mif_REV.orients <- allOrients(mif_REV)
names(mif_REV.orients) <- paste("mif_REV", names(mif_REV.orients), sep = "-")

tel_FWD.orients <- allOrients(tel_FWD)
names(tel_FWD.orients) <- paste("tel_FWD", names(tel_FWD.orients), sep = "-")

tel_REV.orients <- allOrients(tel_REV)
names(tel_REV.orients) <- paste("tel_REV", names(tel_REV.orients), sep = "-")

# all orientations of all primers in one set: NeoFish, then MiFish, then Teleo
primers_all <- DNAStringSet(c(
  neo_FWD.orients, neo_REV.orients,
  mif_FWD.orients, mif_REV.orients,
  tel_FWD.orients, tel_REV.orients
))

primers_all
as.list(primers_all)

#13b - find primers in aligned DB ----

# Browse the cluster-ordered sequences with every primer orientation used as
# a highlight pattern. One colour per pattern, in the order of primers_all:
# eight greens (NeoFish), eight blues (MiFish), eight reds (Teleo).
neo_db_html <- BrowseSeqs(
  myXStringSet = dna_order,
  patterns = primers_all,
  colorPatterns = TRUE,
  colors = c(
    rep(c("#00ff00", "#99ff33"), times = 4), # neo green
    rep(c("#0000ff", "#0066cc"), times = 4), # mif blue
    rep(c("#ff0000", "#ff3300"), times = 4)  # tel red
  )
)



#14- isolate neo amplicon ----
                 # changes every time is realigned

# Hard-coded alignment columns of the NeoFish primer sites, used further
# down to cut the primer regions out of DNAFISH.
neoFWDstart <- 645
neoFWDend <- 664
neoREVstart <- 871
neoREVend <- 890




#identify mismatches in the primer sequence alignment




# NOTE(review): pool_sps is not created in this chunk -- it must already
# exist in the session.
# species of the pools that are absent from the database table
pool_sps[!(pool_sps %in% DB_tbl$Names)]


# composed names of the database entries that belong to pool species
DB_tbl$`Composed name`[DB_tbl$Names %in% pool_sps]

BrowseSeqs(dna_order[DB_tbl$`Composed name`[DB_tbl$Names %in% pool_sps]],colorPatterns = FALSE)


# NOTE(review): the window 685-896 differs from the neoFWD*/neoREV*
# coordinates set above (645-890) -- confirm which alignment each refers to.
neo_ampli<- subseq(x = dna_aligned,685,896)

BrowseSeqs(neo_ampli)

#rename neo ampli seqs for mega

neo_ampli

# columns 367-577 of the cluster-ordered sequences
neo_dna_order <- subseq(dna_order,367,577)

names(neo_dna_order)

BrowseSeqs(dna_order[126:136],highlight = 0,threshold=0.2,colorPatterns = FALSE)
BrowseSeqs(neo_dna_order[c(184,186)],highlight = 0)

writeXStringSet(x = neo_dna_order,filepath = "~/prjcts/fish_eDNA/data/refs/db/neo_DB_order_clust.fasta",format = "fasta")

names(neo_ampli)

BrowseSeqs(neo_ampli[c("Jq | Trachelyopterus cf galeatus | 5675","Jq | Trachelyopterus striatulus | 6583")],highlight = 1)



# take only the primer region of each sequence and sort the result
# NOTE(review): DNAFISH is not created in this chunk; the trailing comma in
# subseq(...,) passes an empty last argument.

neo_FWD_DNAFISH <- subseq(DNAFISH, neoFWDstart, neoFWDend,) %>% BiocGenerics::sort()
neo_REV_DNAFISH <- subseq(DNAFISH, neoREVstart, neoREVend,) %>% BiocGenerics::sort()


# consensus of the sequences found at each primer site
consensus_neo_FWD <- ConsensusSequence(neo_FWD_DNAFISH)
consensus_neo_REV <- ConsensusSequence(neo_REV_DNAFISH)


# NeoFIsh ggseqlogo ----


writeXStringSet(x = neo_ampli,filepath = "~/prjcts/fish_eDNA/data/refs/db/neo_ampli.fasta",format = "fasta")


names(dna_all)
```





**This is a dynamic report, intended to show the current state of the analyses. Many procedures and conclusions might change as the pipeline evolves. If you notice errors/mistakes/typos, or have any suggestions, we would be glad to know. _heronoh@gmail.com_**
