---
title: "Marsden Cove ddPCR data processing"
author: "Ulla & Oli"
date: "November 2023"
output: 
  distill::distill_article:
    toc: true
    theme: night
    highlight: pygments
    center: true
code_folding: true
---

### Loading libraries

```{r Loading libraries, message=FALSE, warning=FALSE, code_folding=TRUE}
library(biohelper)
library(ggplot2)
library(dplyr)
library(data.table)
library(Biostrings)
library(janitor)
library(ggthemes)
library(DT)
library(tibble)
library(tidyverse)
library(UpSetR)
library(vegan)
library(plotly)
library(crosstalk)
library(ranacapa)
library(plyr)
library(glue)
library(RColorBrewer)
library(DECIPHER)
library(UpSetR)
library(theseus)
library(microViz)
library(grid)
library(ggpubr)
library(pracma)
library(ggbreak) 
library(patchwork)
library(twoddpcr)

# options(rgl.useNULL=TRUE)
# library(dpcR)
dir.create("../Results_data_analysis")

time_func = function(df){
  df$time2 = 0
  for (i in 1:nrow(df)) {
    if(df$time[i] == 0.1){
      df$time2[i] = 10
    }else if(df$time[i] == 0.3){
      df$time2[i] = 30
    }else if(df$time[i] == 1.0){
      df$time2[i] = 60
    }else if(df$time[i] == 3.0){
      df$time2[i] = 180
    }else if(df$time[i] == 6.0){
      df$time2[i] = 360
    }else if(df$time[i] == 12.0){
      df$time2[i] = 720
      }
    }
    return(df)
  }
```


##### Bugula
```{r}
########### RUN 2
plate <- ddpcrPlate(wells = "../ddPCR_data/Bugula/Run 2/")
meta = readxl::read_xlsx("../ddPCR_data/Bugula/Run 2/ddPCR Run 2 Begula Marsden cove  22-10-31.xlsx",sheet = 2, col_names = F) %>%
  magrittr::set_colnames(c("well","sample_name"))

facetPlot(plate)
dropletPlot(plate[["G01"]], cMethod="Cluster")

ch1_lim = 18500
ch2_lim = 7500

plate <- thresholdClassify(plate, ch1Threshold=ch1_lim, ch2Threshold=ch2_lim)

facetPlot(plate[c("G01","H01")], 
          cMethod="thresholds", 
          pointSize = 1) +  
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted")

plate <- mahalanobisRain(plate, cMethod="thresholds",
                         maxDistances=list(NN=15, NP=15, PN=15, PP=15))
facetPlot(plate[c("G01","H01")], 
          cMethod="thresholdsMahRain", 
          pointSize = 1) +  
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted")

commonClassificationMethod(plate)

facetPlot(plate, 
          cMethod="thresholds", 
          pointSize = 0.5) +  
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted")


df_plate = plateSummary(plate, cMethod="thresholds")
df_plate_r = plateSummary(plate, cMethod="thresholdsMahRain") %>% 
  rownames_to_column("well")
df_Bugudula_r2 = left_join(df_plate_r %>% dplyr::select(c(well, TotalCopiesPer20uLWell)), meta)


########### RUN 3
plate <- ddpcrPlate(wells= "../ddPCR_data/Bugula/Run 3/")
meta = readxl::read_xlsx("../ddPCR_data/Bugula/Run 3/ddPCR Run 3 Begula Marsden cove  22-11-02.xlsx",sheet = 2, col_names = F) %>%
  magrittr::set_colnames(c("well","sample_name"))
#facetPlot(plate)
dropletPlot(plate[["G01"]], cMethod="Cluster")

ch1_lim = 18500
ch2_lim = 7500

plate <- thresholdClassify(plate, ch1Threshold=ch1_lim, ch2Threshold=ch2_lim)

facetPlot(plate[c("G01","H01")], 
          cMethod="thresholds", 
          pointSize = 1) +  
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted")

plate <- mahalanobisRain(plate, cMethod="thresholds",
                         maxDistances=list(NN=15, NP=15, PN=15, PP=15))
facetPlot(plate[c("G01","H01")], 
          cMethod="thresholdsMahRain", 
          pointSize = 1) +  
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted")

facetPlot(plate, 
          cMethod="thresholds", 
          pointSize = 0.5) +  
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted")


df_plate = plateSummary(plate, cMethod="thresholds")
df_plate_r = plateSummary(plate, cMethod="thresholdsMahRain") %>% 
  rownames_to_column("well")

df_Bugudula_r3 = left_join(df_plate_r %>% dplyr::select(c(well, TotalCopiesPer20uLWell)), meta)

########### RUN 5
plate <- ddpcrPlate(wells= "../ddPCR_data/Bugula/Run 5/")
meta = readxl::read_xlsx("../ddPCR_data/Bugula/Run 5/ddPCR Run 5 Begula Marsden cove 22-11-03.xlsx",sheet = 2, col_names = F) %>%
  magrittr::set_colnames(c("well","sample_name"))
#facetPlot(plate)
dropletPlot(plate[["H11"]], cMethod="Cluster")

ch1_lim = 18500
ch2_lim = 7500

plate <- thresholdClassify(plate, ch1Threshold=ch1_lim, ch2Threshold=ch2_lim)
dropletPlot(plate[["H11"]], cMethod="thresholds")+
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted") + 
  theme_classic()

facetPlot(plate[c("H11","C11","D11","E11","F11","G11")], 
          cMethod="thresholds", 
          pointSize = 1) +
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted") + 
  theme_classic()

plate <- mahalanobisRain(plate, cMethod="thresholds",
                         maxDistances=list(NN=15, NP=15, PN=15, PP=15))
# facetPlot(plate[c("G01","H01")], 
#           cMethod="thresholdsMahRain", 
#           pointSize = 1) +  
#   geom_hline(yintercept = ch1_lim, linetype="dotted")+
#   geom_vline(xintercept = ch2_lim, linetype="dotted")

df_plate = plateSummary(plate, cMethod="thresholds")
df_plate_r = plateSummary(plate, cMethod="thresholdsMahRain") %>% 
  rownames_to_column("well")
df_Bugudula_r5 = left_join(df_plate_r %>% dplyr::select(c(well, TotalCopiesPer20uLWell)), meta)
```


##### Styela
```{r}
########### RUN 1
plate <- ddpcrPlate(wells= "../ddPCR_data/Styela/Run 1/")
meta = readxl::read_xlsx("../ddPCR_data/Styela/Run 1/ddPCR Run 1 Styela  22-10-18.xlsx",sheet = 2, col_names = F) %>%
  magrittr::set_colnames(c("well","sample_name"))

#facetPlot(plate)
dropletPlot(plate[["H01"]], cMethod="Cluster")

ch1_lim = 2500
ch2_lim = 6000

plate <- thresholdClassify(plate, ch1Threshold=ch1_lim, ch2Threshold=ch2_lim)

facetPlot(plate[c("G01","H01")], 
          cMethod="thresholds", 
          pointSize = 1) +  
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted")

plate <- mahalanobisRain(plate, cMethod="thresholds",
                         maxDistances=list(NN=15, NP=15, PN=15, PP=15))
facetPlot(plate[c("G01","H01")], 
          cMethod="thresholdsMahRain", 
          pointSize = 1) +  
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted")

commonClassificationMethod(plate)

facetPlot(plate, 
          cMethod="thresholds", 
          pointSize = 0.5) +  
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted")


df_plate = plateSummary(plate, cMethod="thresholds")
df_plate_r = plateSummary(plate, cMethod="thresholdsMahRain") %>% 
  rownames_to_column("well")
df_Styela_r1 = left_join(df_plate_r %>% dplyr::select(c(well, TotalCopiesPer20uLWell)), meta)


########### RUN 2
plate <- ddpcrPlate(wells= "../ddPCR_data/Styela/Run 2/")
meta = readxl::read_xlsx("../ddPCR_data/Styela/Run 2/ddPCR Run 2 Styela Marsden cove   22-10-19.xlsx",sheet = 2, col_names = F) %>%
  magrittr::set_colnames(c("well","sample_name"))

#facetPlot(plate)
facetPlot(plate[c("G12","H12")], 
          cMethod="Cluster", 
          pointSize = 1)

ch1_lim = 2500
ch2_lim = 2500

plate <- thresholdClassify(plate, ch1Threshold=ch1_lim, ch2Threshold=ch2_lim)

facetPlot(plate[c("G12","H12")], 
          cMethod="thresholds", 
          pointSize = 1) +  
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted")

plate <- mahalanobisRain(plate, cMethod="thresholds",
                         maxDistances=list(NN=15, NP=15, PN=15, PP=15))
facetPlot(plate[c("G12","H12")], 
          cMethod="thresholdsMahRain", 
          pointSize = 1) +  
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted")

# facetPlot(plate, 
#           cMethod="thresholds", 
#           pointSize = 0.5) +  
#   geom_hline(yintercept = ch1_lim, linetype="dotted")+
#   geom_vline(xintercept = ch2_lim, linetype="dotted")

df_plate = plateSummary(plate, cMethod="thresholds")
df_plate_r = plateSummary(plate, cMethod="thresholdsMahRain") %>% 
  rownames_to_column("well")
df_Styela_r2 = left_join(df_plate_r %>% dplyr::select(c(well, TotalCopiesPer20uLWell)), meta)

########### RUN 3
plate <- ddpcrPlate(wells= "../ddPCR_data/Styela/Run 3/")
meta = readxl::read_xlsx("../ddPCR_data/Styela/Run 3/ddPCR Run 3 Styela  Marsden cove 22-10-19.xlsx",sheet = 2, col_names = F) %>%
  magrittr::set_colnames(c("well","sample_name"))

facetPlot(plate[c("G11","H11")])
facetPlot(plate[c("G11","H11")], 
          cMethod="Cluster", 
          pointSize = 1)

ch1_lim = 2500
ch2_lim = 2500

plate <- thresholdClassify(plate, ch1Threshold=ch1_lim, ch2Threshold=ch2_lim)

facetPlot(plate[c("G11","H11")], 
          cMethod="thresholds", 
          pointSize = 1) +  
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted")

plate <- mahalanobisRain(plate, cMethod="thresholds",
                         maxDistances=list(NN=15, NP=15, PN=15, PP=15))
facetPlot(plate[c("G11","H11")], 
          cMethod="thresholdsMahRain", 
          pointSize = 1) +  
  geom_hline(yintercept = ch1_lim, linetype="dotted")+
  geom_vline(xintercept = ch2_lim, linetype="dotted")

# facetPlot(plate, 
#           cMethod="thresholds", 
#           pointSize = 0.5) +  
#   geom_hline(yintercept = ch1_lim, linetype="dotted")+
#   geom_vline(xintercept = ch2_lim, linetype="dotted")

df_plate = plateSummary(plate, cMethod="thresholds")
df_plate_r = plateSummary(plate, cMethod="thresholdsMahRain") %>% 
  rownames_to_column("well")
df_Styela_r3 = left_join(df_plate_r %>% dplyr::select(c(well, TotalCopiesPer20uLWell)), meta)

###########################################
###########################################
# Merging and saving results
###########################################

df_Styela = bind_rows(df_Styela_r2 %>% dplyr::select(sample_name, TotalCopiesPer20uLWell),
                      df_Styela_r3 %>% dplyr::select(sample_name, TotalCopiesPer20uLWell)) %>% 
  janitor::clean_names() %>% 
  dplyr::mutate(copies_u_l = total_copies_per20u_l_well / 20)

which(df_Styela$sample_name %ni% metadata$id)
df_Styela_test = df_Styela %>% dplyr::filter(sample_name %ni% metadata$id)
```


---
title: "Data clean-up | 18S"
---

### Loading libraries
```{r Loading libraries, message=FALSE, warning=FALSE, code_folding=TRUE}
library(biohelper)
library(ggplot2)
library(dplyr)
library(data.table)
library(Biostrings)
library(janitor)
library(ggthemes)
library(DT)
library(tibble)
library(tidyverse)
library(UpSetR)
library(vegan)
library(plotly)
library(crosstalk)
library(ranacapa)
library(plyr)
library(glue)
library(theseus)
library(microViz)
```

### Loading the ps object and adding metadata
```{r Loading files, message=FALSE, warning=FALSE, code_folding=TRUE}
ps = readRDS("../Metabarcoding_data/ps_run.rds")

metadata =  read_csv("../Metadata/metadataMarsden.csv")%>%
  clean_names() #%>% as.matrix() %>% as.data.frame()

if(all(metadata$sample_id %in% sample_names(ps))){
    #cat("\nAll metadata samples are in the ASV table\n")
}else{
    cat("\nMissing samples in the ASV table?\n")
    cat(metadata[which(metadata$sample_id %ni% sample_names(ps)),]$sample_id)
}

if(all(sample_names(ps) %in% metadata$sample_id)){
  #cat("\nAll samples in the ASV table are in the metadata\n")
  metadata = metadata[match(sample_names(ps),metadata$sample_id),]
} else{
    cat("\nMissing samples in the metadata table?\n")
    cat(sample_names(ps)[which(sample_names(ps) %ni% metadata$sample_id)])
}

#cat("\nAdding metadata to the ps object\n")
#all(sample_names(ps) == metadata$sample_id)
ps = merge_phyloseq(ps,sample_data(metadata) %>% magrittr::set_rownames(metadata$sample_id))

ps@tax_table@.Data = ps@tax_table@.Data %>% as.data.frame() %>% janitor::clean_names() %>% dplyr::select(-"na")  %>% as.matrix()
# Normalising 18S taxonomy
ps = biohelper::taxo_normalisation(obj = ps, sqlFile = "~/Dropbox/R_scripts_N_bioinfo/Building_packages/biohelper/accessionTaxa.sql", ranks = c("superkingdom","kingdom", "phylum","class","order","family","genus","species"), keepSAR = T) 

# Adding ASV names in taxa table
ps@tax_table@.Data = ps@tax_table@.Data %>% as.data.frame() %>% dplyr::mutate("asv" = taxa_names(ps)) %>% as.matrix()
```


### Inspecting data and library size
```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
rd = fread("../Metabarcoding_data/Cleaned_data/Read_counts.csv")
rd$percent_retained_reads = round(rd$nonchim / rd$input * 100,2)
DT::datatable(rd, options = list(pageLength = 10))
```

```{r echo=FALSE, message=FALSE, warning=FALSE, code_folding=TRUE}
cat("\n\nData info\n")
ps
cat("\n\nMetadata info\n")
metadata %>% skimr::skim()

df <- ps %>% pstoveg_sample() 
if(is.null(df$sample_id)){
  df = df %>% rownames_to_column("sample_id")
}
df$LibrarySize <- sample_sums(ps)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
p1=ggplot(data=df[df$amplicon_type == "sample",], aes(x=Index, y=LibrarySize, colour = amplicon_type, label = sample_id)) + geom_point() + theme_stata() + ggtitle("Sequencing depth per sample") +xlab("Sample index") + ylab("Library size") + scale_color_brewer(palette="Paired",name = "Amplicon type")
p2=ggplot(data=df[df$amplicon_type != "sample",], aes(x=Index, y=LibrarySize, colour = amplicon_type, label = sample_id)) + geom_point() + theme_stata() + ggtitle("Sequencing depth per blank") +xlab("Sample index") + ylab("Library size") + scale_color_brewer(palette="Paired",name = "Amplicon type")

ggplotly(p1)
ggplotly(p2)
```

### Inspection of contamination
#### Looking at reads and richness in blanks
```{r echo=FALSE, message=FALSE, warning=FALSE}
ps_blanks = ps %>% 
  microViz::ps_filter(amplicon_type != "sample") %>%
  phyloseq::filter_taxa(function(x) sum(x) > 0, TRUE)
ps_blanks
# 
# # Creating a table
Contamination = ps_blanks %>% sample_names() %>% as.data.frame(stringsAsFactors=F)
names(Contamination) = "sample_name"
# ## Samples sum
Contamination$Read_count = ps_blanks %>% sample_sums() 
Contamination$Blank_type = ps_blanks@sam_data$amplicon_type
# 
# ## Diversity
R = estimate_richness(ps_blanks, measures="Observed")
Contamination$Richness = R$Observed
# 
# ## Ordering the table
Contamination = Contamination %>% dplyr::arrange(Richness)
# 
Contamination %>% datatable(class = 'cell-border stripe', caption = "Blanks and their read count and richness", options = list(pageLength = 100))
# 
dir.create("../Metabarcoding_data/Contamination")
write.table(Contamination,"../Metabarcoding_data/Contamination/Contamination.txt", quote = F, sep = "\t", row.names = F)
```

#### Proportion of blank ASVs only in blanks

```{r echo=FALSE, message=FALSE, warning=FALSE, collapse=T}
ASVs_in_Blanks = taxa_names(ps_blanks)
ASVs_in_Blanks_only = ASVs_in_Blanks[which(ASVs_in_Blanks %ni% (ps %>%subset_samples(amplicon_type == "sample") %>% phyloseq::filter_taxa(function(x) sum(x) > 0, TRUE) %>% taxa_names()) )]

paste0("Percent of ASVs in blanks only found in blanks: ",round(length(ASVs_in_Blanks_only)/length(ASVs_in_Blanks)*100,2)," %")
```

#### Looking at the importance of ASVs found in contamination

```{r echo=FALSE, message=FALSE, warning=FALSE, collapse=T, fig.height=8, fig.width=10}
ps_samples_RA = ps %>% subset_samples(amplicon_type == "sample") %>% phyloseq::filter_taxa(function(x) sum(x) > 0, TRUE) %>% microbiome::transform(transform = "compositional")
ps_samples_RA_contam = ps_samples_RA %>% subset_taxa(asv %in% ASVs_in_Blanks)

tb = round(taxa_sums(ps_samples_RA_contam) / nsamples(ps_samples_RA_contam) *100,2) %>% as.data.frame() %>% setNames("Mean relative abundance per taxa (%)") %>% dplyr::arrange("Mean relative abundance per taxa (%)") 
tax = ps@tax_table@.Data %>% as.data.frame()
tb$taxo = ""

for (i in 1:nrow(tb)) {
  tb$taxo[i] = tax[tax$asv == rownames(tb)[i],] %>% unite('taxo', kingdom:genus, sep = ": ")
}

round(sample_sums(ps_samples_RA_contam) *100,2) %>% as.data.frame() %>% setNames("Mean relative abundance per sample (%)") %>% dplyr::arrange(desc(`Mean relative abundance per sample (%)`)) %>% datatable(class = 'cell-border stripe', caption = "Mean relative abundance of taxa found in blanks in samples", options = list(pageLength = 10))

df_ps_samples_RA_contam = ps_samples_RA_contam %>% psmelt()

fig2 <- plot_ly(df_ps_samples_RA_contam, x = ~sample_id, y = ~Abundance*100, color = ~family, type = 'bar') %>% layout(yaxis = list(title = 'Relative abundance (%)'), barmode = 'stack') %>% layout(legend=list(title=list(text='<b> Family </b>')))
fig2

ASVs_in_Blanks_only2 = psmelt(ps_samples_RA_contam)
ASVs_in_Blanks_only2 = ASVs_in_Blanks_only2 %>% dplyr::group_by(superkingdom, kingdom, phylum, class, order, family, genus, species, asv) %>% dplyr::summarise(Mean_RA = round(mean(Abundance)*100,2), SD = sd(Abundance)*100 )

write.table(ASVs_in_Blanks_only2, "../Metabarcoding_data/Contamination/Taxo_blank_ASVs_in_samples.txt", quote = F, row.names = F, sep = "\t")
```

#### Effect of removing blanks following different methodologies

```{r echo=FALSE}
cat("\nRemoving all taxa found in blanks\n")
ps_trimmed = ps %>% biohelper::ps_decon(method = "complete_asv_removal")

cat("\nRemoving reads from taxa found in blanks using max_v\n")
ps_trimmed = ps %>% biohelper::ps_decon(method = "max_v")

cat("\nRemoving reads from taxa found in blanks using microDecon\n")
ps_trimmed = ps %>% biohelper::ps_decon(method = "microDecon", group = "matrix")

# In case contamination was substantial and we decide to keep all reads/ASVs to avoid creating a large impact on the data
ps_trimmed = ps %>% subset_samples(amplicon_type %in% c("sample","Sample")) %>% phyloseq::filter_taxa(function(x) sum(x) > 0, TRUE)
```


### Looking at eukaryotic and non-eukaryotic taxa

```{r echo=FALSE, fig.height=5, fig.width=10, collapse=TRUE}
paste0("Number of reads from Eukaryotes: ",ps_trimmed %>% subset_taxa(superkingdom == "Eukaryota" | family %ni% "Hominidae") %>% sample_sums() %>% sum())
paste0("Number of reads from non-eukaryotes: ",ps_trimmed %>% subset_taxa(superkingdom != "Eukaryota" | family %in% "Hominidae")%>% sample_sums() %>% sum())

Non_euk_ASVs = ps_trimmed %>% subset_taxa(superkingdom != "Eukaryota" | family %in% "Hominidae") %>% taxa_names()

ps_trimmed_non_euk = ps_trimmed %>% transform_sample_counts(function(x) 100 * x/sum(x)) %>% subset_taxa(superkingdom!="Eukaryota"| family %in% "Hominidae") 

ps_trimmed_non_euk@tax_table@.Data = ps_trimmed_non_euk@tax_table@.Data %>% as.data.frame() %>% replace(is.na(.), "NA") %>% as.matrix()
tax2 = ps_trimmed_non_euk@tax_table@.Data %>% as.data.frame() 

plot_bar(ps_trimmed_non_euk, 'sample_id', fill = "phylum", title = "Non-eukaryotes reads") + ylab("Reads (%)") + theme_calc() + theme(axis.text.x = element_text(angle = 90, size=4)) + xlab("Sample unique ID")

# Looking at percent of ASVs belonging to non-Eukaryota
Non_euk_ASVs_nb = ps_trimmed %>% subset_taxa(superkingdom != "Eukaryota" | family %in% "Hominidae") %>% phyloseq::filter_taxa(function(x) sum(x) > 0, TRUE) %>% taxa_names() %>% length()
Euk_ASVs_nb = ps_trimmed %>% subset_taxa(superkingdom == "Eukaryota" | family %ni% "Hominidae") %>% phyloseq::filter_taxa(function(x) sum(x) > 0, TRUE) %>% taxa_names() %>% length()
paste0("Percent of non-eukaryotes ASVs: ",round(Non_euk_ASVs_nb/Euk_ASVs_nb *100,2)," %")

# Looking at percent of reads belonging to non-Eukaryota
Non_euk_reads_nb = ps_trimmed %>% subset_taxa(superkingdom != "Eukaryota" | family %in% "Hominidae") %>% phyloseq::filter_taxa(function(x) sum(x) > 0, TRUE) %>% sample_sums() %>% sum()
Euk_reads_nb = ps_trimmed %>% subset_taxa(superkingdom == "Eukaryota" | family %ni% "Hominidae") %>% phyloseq::filter_taxa(function(x) sum(x) > 0, TRUE) %>% sample_sums() %>% sum()
paste0("Percent of non-eukaryotes reads: ",round(Non_euk_reads_nb/Euk_reads_nb *100,5)," %")
```

#### Removing non-eukaryotic ASVs (including homonidae)

```{r echo=FALSE, message=FALSE, warning=FALSE, collapse=T}
ps_trimmed_euk = ps_trimmed %>% subset_taxa(superkingdom=="Eukaryota" & family %ni% "Hominidae" & class %ni% "Aves") 
# # Looking at read counts
# cat("\nBefore data\n")
# ps_trimmed
# paste0("Total number of reads: ",ps_trimmed %>% sample_sums() %>% sum())
# cat("\nAfter data\n")
# ps_trimmed_euk
# paste0("Total number of reads: ",ps_trimmed_euk %>% sample_sums() %>% sum())

# Adding reads lost for sediment samples (processed in another script)
before = ps_trimmed %>% sample_sums() %>% sum()
after = ps_trimmed_euk %>% sample_sums() %>% sum()
before_r = ps_trimmed %>% taxa_names() %>% length()
after_r = ps_trimmed_euk %>% taxa_names() %>% length()

cat("\nTotal number of lost ASVs and reads after non-eukaryote ASVs removal\n")
paste0("Lost reads: ", before-after)
paste0("Lost ASVs: ", before_r-after_r)

cat("\nPercent of lost ASVs and reads after non-eukaryote ASVs removal\n")
paste0("Lost reads: ",round(100-(after/before * 100),2)," %")
paste0("Lost ASVs: ",round(100-(after_r/before_r * 100),2)," %")
```

### Looking at number of rare ASVs

```{r echo=FALSE, fig.height=6, fig.width=10, message=FALSE, warning=FALSE, collapse=TRUE}
TS = sort(taxa_sums(ps_trimmed_euk)) %>% as.data.frame() %>% rownames_to_column("ASV")

p1= ggplot(TS, aes(.)) + geom_histogram() + theme_stata() + scale_x_sqrt(limits= c(0,10000), breaks = c(1,5,10,500,2500,5000,7500,10000)) + scale_y_sqrt(breaks = c(100, 1000, 5000,10000,25000,50000)) + xlab("Reads/ASV") + ylab("Number of ASVs") + ggtitle("ASVs distribution")

cat("\nProportions of rare ASVs in samples\n")

paste0("Singletons: ",taxa_sums(ps_trimmed_euk)[taxa_sums(ps_trimmed_euk)  == 1] %>% length(),": ",round(taxa_sums(ps_trimmed_euk)[taxa_sums(ps_trimmed_euk)  == 1] %>% length()/taxa_sums(ps_trimmed_euk)%>% length()*100,2)," %")

paste0("Doubletons: ",taxa_sums(ps_trimmed_euk)[taxa_sums(ps_trimmed_euk)  == 2] %>% length(),": ",round(taxa_sums(ps_trimmed_euk)[taxa_sums(ps_trimmed_euk)  == 2] %>% length()/taxa_sums(ps_trimmed_euk)%>% length()*100,2)," %")

paste0("Less than 10 reads: ",taxa_sums(ps_trimmed_euk)[taxa_sums(ps_trimmed_euk)  <10] %>% length(),": ",round(taxa_sums(ps_trimmed_euk)[taxa_sums(ps_trimmed_euk)  <10] %>% length()/taxa_sums(ps_trimmed_euk)%>% length()*100,2)," %")

p1
```

#### Effect of removing rare ASVs (ASVs with less than 2 reads in a minimum of 3 samples)

\*This is not performed. It is just to see the impact.

```{r #### Removing taxa with less than 2 reads in a minimum of 3 samples (not performed at the moment), echo=FALSE, message=FALSE, warning=FALSE, collapse=TRUE}
# Filtering taxa -------------------------------------------------
ps_trimmed_euk_min = filter_taxa(ps_trimmed_euk, function(x) sum(x > 2) > (3), TRUE)

# Adding reads lost for sediment samples (processed in another script)
before = ps_trimmed_euk %>% sample_sums() %>% sum()
after = ps_trimmed_euk_min %>% sample_sums() %>% sum()

cat("\nPercent of lost reads after rare ASVs removal\n")
paste0("Lost reads: ",round(100-(after/before * 100),2)," %")
cat("\nPercent of lost ASVs after rare ASVs removal\n")
paste0("Lost ASVs: ",round(100-(ntaxa(ps_trimmed_euk_min)/ntaxa(ps_trimmed_euk) * 100),2)," %")
```

### Looking at sequencing depth per sample with rarefaction curves

```{r , message=FALSE, warning=FALSE, collapse=TRUE, include=FALSE}
ps_trimmed_euk_min = ps_trimmed_euk

dir.create("../Metabarcoding_data/Cleaned_data")
dir.create("../Metabarcoding_data/Cleaned_data/Sequencing_depth")

s = sample_sums(ps_trimmed_euk_min) %>% sort()
x = split(s, ceiling(seq_along(s)/10))
xx <- plyr::ldply(x, function(x){
    plyr::rename(ldply(x, function(z) {data.frame(read_count=z)}),c(".id" = ".id2"))
})
xx =xx %>% dplyr::group_by(.id) %>% dplyr::mutate(.id3 = max(read_count))
xx = xx[match(sample_names(ps_trimmed_euk_min),xx$.id2),]
ps_trimmed_euk_min@sam_data$read_count_gr = paste0("Group_",xx$.id," (<=",xx$.id3,")")
ps_trimmed_euk_min@sam_data$read_count_gr = factor(ps_trimmed_euk_min@sam_data$read_count_gr, levels = gtools::mixedsort(ps_trimmed_euk_min@sam_data$read_count_gr %>% unique()))
p <- ranacapa::ggrare(ps_trimmed_euk_min, step = 1000, color = "read_count_gr",  se = FALSE)
p <- p + facet_wrap(~read_count_gr,scales = "free_x") + theme_bw() + ylab("ASV richness") + xlab("Sequencing depth") + guides(color=guide_legend(title="Sequencing depth")) + theme(axis.text.x = element_text(angle = 45))
ggsave(plot = p, filename = "rarefaction_curves.pdf", path = "../Metabarcoding_data/Cleaned_data/Sequencing_depth/", device = "pdf", width = 12, height = 8)
```

```{r echo=FALSE, fig.height= 8, fig.width=16}
ggplotly(p)
```

### Looking at read count per sample and removing those with less than 1000 reads.

```{r echo=FALSE, message=FALSE, warning=FALSE}
df_read_count = ps_trimmed_euk_min %>% phyloseq::sample_sums() %>% sort() %>% as.data.frame(col.names = "Reads")
colnames(df_read_count) = "Read count"
df_read_count %>% datatable(class = 'cell-border stripe', caption = "Read count per sample after decontamination", options = list(pageLength = 10))

# Filtering samples -------------------------------------------------
ps_trimmed_euk_min_min = phyloseq::prune_samples(phyloseq::sample_sums(ps_trimmed_euk_min) >= 1000, ps_trimmed_euk_min)
```

### Saving results

```{r message=FALSE, warning=FALSE, code_folding=TRUE}
saveRDS(ps_trimmed_euk_min_min,"../Metabarcoding_data/Cleaned_data/ps_clean.rds")

# # Txt tables ----------------------------------------------------
ASV_table = ps_trimmed_euk_min_min %>% pstoveg_otu() %>% t() %>% as.data.frame() %>% rownames_to_column('feature-id') 
write.table(ASV_table,'../Metabarcoding_data/Cleaned_data/ASV_table.txt', quote = F, row.names = F, sep = "\t")

Taxo_table = ps_trimmed_euk_min_min@tax_table@.Data %>% as.data.frame()
Taxo_table  = Taxo_table %>% rownames_to_column('feature-id')
write.table(Taxo_table,"../Metabarcoding_data/Cleaned_data/taxonomy_table.txt", quote = F, row.names = F, sep = "\t")

Metadata = ps_trimmed_euk_min_min@sam_data %>% data.frame()
Metadata  = Metadata %>% rownames_to_column('sample-id')
write.table(Metadata,"../Metabarcoding_data/Cleaned_data/metadata_table.txt", quote = F, row.names = F, sep = "\t")

writeXStringSet(ps_trimmed_euk_min_min@refseq, "../Metabarcoding_data/Cleaned_data/ref_seq.fasta")
```
---
title: "Marsden Cove data analysis"
---

### Loading libraries

```{r Loading libraries, message=FALSE, warning=FALSE, code_folding=TRUE}
library(biohelper)
library(ggplot2)
library(dplyr)
library(data.table)
library(Biostrings)
library(janitor)
library(ggthemes)
library(DT)
library(tibble)
library(tidyverse)
library(UpSetR)
library(vegan)
library(plotly)
library(crosstalk)
library(ranacapa)
library(plyr)
library(glue)
library(RColorBrewer)
library(DECIPHER)
library(UpSetR)
library(theseus)
library(microViz)
library(grid)
library(ggpubr)
library(pracma)
library(ggbreak) 
library(patchwork)
library(twoddpcr)
library(ggpubr)
library(pairwiseAdonis)
library(metagMisc)
# options(rgl.useNULL=TRUE)
# library(dpcR)
dir.create("../Results_data_analysis")

time_func = function(df){
  df$time2 = 0
  for (i in 1:nrow(df)) {
    if(df$time[i] == 0.1){
      df$time2[i] = 10
    }else if(df$time[i] == 0.3){
      df$time2[i] = 30
    }else if(df$time[i] == 1.0){
      df$time2[i] = 60
    }else if(df$time[i] == 3.0){
      df$time2[i] = 180
    }else if(df$time[i] == 6.0){
      df$time2[i] = 360
    }else if(df$time[i] == 12.0){
      df$time2[i] = 720
      }
    }
    return(df)
  }
```


### ddPCR analyses
#### Loading data
Metadata
```{r}
metadata <- read.csv("../Metadata/metadataMarsden.csv") %>% 
  janitor::clean_names() %>% 
  time_func()
```


Sabella
```{r}
sabella <- read.csv("../ddPCR_info/preliminary analysis_Sab/PassiveSabella_cleaned.csv", h=T, row.names=1,stringsAsFactors = F) %>% 
  janitor::clean_names() %>% 
  time_func()
#Sabella <- Sabella[-c(79,77),] # Why are these being removed? Suspected too high??

meta_s = sabella %>% dplyr::select(c(id,matrix, time, replicates, extraction,time2))
```

Begula
```{r}
# begula_r4 <- readxl::read_xlsx("../ddPCR_data/Begula/Run 4/ddPCR Begula neritinia Run 4 Results Marsden cove  2022-11-02.xlsx") %>% 
#   janitor::clean_names()
# 
# begula_r5 <- readxl::read_xlsx("../ddPCR_data/Begula/Run 5/ddPCR Begula neritinia Run 5 Results Marsden cove 2022-11-03.xlsx") %>% 
#   janitor::clean_names()
# 
# begula = bind_rows(begula_r4 %>% dplyr::select(-well), begula_r5 %>% dplyr::select(-well)) 
# rm(begula_r4,begula_r5)
# colnames(begula)[3] = "conc_copies_u_l"
# begula$id = begula$sample
# write.csv(begula,"../ddPCR_data/Begula/ddPCR_results.csv",row.names = F)
begula = read_csv("../ddPCR_data/Bugula/ddPCR_results.csv")

begula = left_join(begula, metadata, by = "id") %>% 
  dplyr::group_by(sample,id, target, matrix, time, time2, replicates) %>% 
  dplyr::summarise(
                   copies_u_l = mean(conc_copies_u_l),
                   accepted_droplets = mean(accepted_droplets),
                   positives = mean(positives),
                   negatives = mean(negatives)) %>% 
  dplyr::mutate(target = "Begula neritina") %>% 
  dplyr::filter(!is.na(matrix)) %>% 
  dplyr::filter(!is.na(copies_u_l)) %>% 
  dplyr::filter(matrix != "field blank")
```


Styela
```{r}
# styela_r1 <- readxl::read_xlsx("../ddPCR_data/Styela/Run 1/ddPCR Run 1 Results Styela Marsden cove passive sampling 2022-10-18.xlsx") %>%
#   janitor::clean_names()
# 
# styela_r2 <- readxl::read_xlsx("../ddPCR_data/Styela/Run 2/ddCR Run 2 Styela Marsden cove passsive sampling copy number Results 22-10-19.xlsx") %>%
#   janitor::clean_names()
# 
# styela_r3 <- readxl::read_xlsx("../ddPCR_data/Styela/Run 3/ddCR Run 3 Results Styela Marsden cove passsive sampling copy number.xlsx") %>%
#   janitor::clean_names() %>% 
#   dplyr::mutate(conc_copies_m_l = as.numeric(conc_copies_m_l))
# 
# styela = bind_rows(styela_r2 %>% dplyr::select(-well), styela_r3 %>% dplyr::select(-well))
# rm(styela_r1,styela_r2,styela_r3)
# colnames(styela)[3] = "conc_copies_u_l"
# styela$id = styela$sample
# write.csv(styela,"../ddPCR_data/Styela/ddPCR_results.csv",row.names = F)

styela = read_csv("../ddPCR_data/Styela/ddPCR_results.csv")

styela = left_join(styela, metadata, by = "id") %>% 
  dplyr::group_by(sample,id, target, matrix, time, time2, replicates) %>% 
  dplyr::summarise(
                   copies_u_l = mean(conc_copies_u_l),
                   accepted_droplets = mean(accepted_droplets),
                   positives = mean(positives),
                   negatives = mean(negatives)) %>% 
  dplyr::mutate(target = "Styela") %>% 
  dplyr::filter(!is.na(matrix)) %>% 
  dplyr::filter(!is.na(copies_u_l)) %>% 
  dplyr::filter(matrix != "field blank")
```

Undaria
```{r}
# undaria_r6 <- readxl::read_xlsx("../ddPCR_data/Undaria/Run 6/ddPCR Run 6 Undaria new probe copies per uL Marsden cove 2022-11-23 Results.xlsx") %>%
#   janitor::clean_names()
# 
# undaria_r7 <- readxl::read_xlsx("../ddPCR_data/Undaria/Run 7/ddPCR Run 7 Undaria new probe Marsden cove samles copies per uL 2022-06-12 Results.xlsx") %>%
#   janitor::clean_names() %>% 
#   dplyr::mutate(conc_copies_m_l = as.numeric(conc_copies_m_l))
# 
# undaria_r8 <- readxl::read_xlsx("../ddPCR_data/Undaria/Run 8/ddPCR Run 8 Undaria new probe Marsden cove samles copies per uL  2022-12-07 Results.xlsx") %>%
#   janitor::clean_names() %>% 
#   dplyr::mutate(conc_copies_m_l = as.numeric(conc_copies_m_l))
# 
# undaria = bind_rows(undaria_r7 %>% dplyr::select(-well), undaria_r8 %>% dplyr::select(-well))
# rm(undaria_r6,undaria_r7,undaria_r8)
# colnames(undaria)[3] = "conc_copies_u_l"
# undaria$id = undaria$sample
# write.csv(undaria,"../ddPCR_data/Undaria/ddPCR_results.csv",row.names = F)

undaria = read_csv("../ddPCR_data/Undaria/ddPCR_results.csv")

undaria = left_join(undaria, metadata, by = "id") %>% 
  dplyr::group_by(sample,id, target, matrix, time, time2, replicates) %>% 
  dplyr::summarise(
                   copies_u_l = mean(conc_copies_u_l),
                   accepted_droplets = mean(accepted_droplets),
                   positives = mean(positives),
                   negatives = mean(negatives)) %>% 
  dplyr::mutate(target = "Undaria") %>% 
  dplyr::filter(!is.na(matrix)) %>% 
  dplyr::filter(!is.na(copies_u_l)) %>% 
  dplyr::filter(matrix != "field blank")
```

#### Creating functions
```{r}
reg_plot_func = function(df){
  df = df %>% 
    dplyr::mutate("sqrt_copies_uL" = sqrt(copies_u_l),
                  "fthrt_copies_uL" = nthroot(copies_u_l, 4))
  
  ref = df %>% dplyr::filter(matrix == "active") %>% dplyr::pull(sqrt_copies_uL)
  m_ref = mean(ref)
  top_ref = m_ref + sd(ref)
  bottom_ref = m_ref - sd(ref)
  if(bottom_ref<0){
    bottom_ref = 0
  }

  p = ggscatter(
    df %>% dplyr::filter(matrix != "active"), x = "time2", y = "sqrt_copies_uL",
    color = "matrix", palette = "jco",
    add = "loess", #loess
    conf.int = F,
    conf.int.level = 0.95
    ) +
    geom_hline(yintercept =top_ref ,alpha=1, linetype = "dotted")+
    geom_hline(yintercept =m_ref ,alpha=1, linetype = "twodash")+
    geom_hline(yintercept =bottom_ref ,alpha=1, linetype = "dotted")+
    facet_wrap(~matrix, scales = "free", nrow = 1) +
    stat_cor()+
    ylab("copies/ul (sqrt)")+ xlab("Submersion time (min)")+
    scale_x_continuous(breaks =c(0,10,30,60,180,360,720)) +
    guides(color=guide_legend(title="Matrix"))
  return(p)
}

barplot_func = function(df, sb = NA){
  df = df %>% 
    dplyr::mutate("sqrt_copies_uL" = sqrt(copies_u_l),
                  "fthrt_copies_uL" = nthroot(copies_u_l, 4))
  
  data_summary <- function(data, varname, groupnames){
    summary_func <- function(x, col){
      c(mean = mean(x[[col]], na.rm=TRUE),
        sd = sd(x[[col]], na.rm=TRUE))
    }  
    data_sum<-ddply(data, groupnames, .fun=summary_func,varname)
    data_sum <- rename(data_sum, c("mean" = varname))
    return(data_sum)
  }

  df2 <- data_summary(df %>% dplyr::filter(matrix != "active"), 
                      varname="sqrt_copies_uL", groupnames=c("matrix", "time2")) %>% 
    dplyr::filter(time2!=0)
  
  for (i in 1:nrow(df2)) {
    df2$bottom_sd[i] = df2$sqrt_copies_uL[i] - df2$sd[i]
    if(is.na(df2$bottom_sd[i])){
      df2$bottom_sd[i] = 0
    }
    if(df2$bottom_sd[i]<0){
      df2$bottom_sd[i] = 0
    }
  }
  
  ref = df %>% dplyr::filter(matrix == "active") %>% dplyr::pull(sqrt_copies_uL)
  m_ref = mean(ref)
  top_ref = m_ref + sd(ref)
  bottom_ref = m_ref - sd(ref)
  if(bottom_ref<0){
    bottom_ref = 0
  }  
  df2$time2 = factor(df2$time2,levels=c(10,30,60,180,360,720))

  p<- ggplot(df2, aes(x=time2, y=sqrt_copies_uL, fill=matrix)) + 
    geom_bar(stat="identity", color="black", position=position_dodge()) +
    geom_errorbar(aes(ymin=bottom_sd, ymax=sqrt_copies_uL+sd), width=.2,position=position_dodge(.9))+
    geom_hline(yintercept =top_ref ,alpha=1, linetype = "dotted")+
    geom_hline(yintercept =m_ref ,alpha=1, linetype = "twodash")+
    geom_hline(yintercept =bottom_ref ,alpha=1, linetype = "dotted")+
    theme_classic()+
    #scale_y_continuous(breaks=c(0,0.5,1,1.5,2,4.5,5,7.5,8))+
    labs(title=" ", x="Submersion time (min)", y = "copies/ul (sqrt)") + 
    guides(fill=guide_legend(title="Matrix"))
  p = set_palette(p,palette = "jco")
  # if(all(!is.na(sb))){
  #   p = p + scale_y_break(breaks = sb)
  # }
  return(p)
  }
```


#### Performing plots and analyses
Sabella spalenzanii
```{r}
# Plot
reg_plot_func(df = sabella)#+facet_wrap(~matrix, scales = "free")
ggsave(filename = "../Results_data_analysis/Figures/reg_ss_ddpcr.pdf",width = 15,height = 4)
barplot_func(df = sabella, sb = c(2,4.5,5,7.5))
ggsave(filename = "../Results_data_analysis/Figures/bp_ss_ddpcr.pdf",width = 7,height = 4)
#Stats
res.aov2 <- aov(copies_u_l ~ matrix * time, data = sabella %>% dplyr::filter(matrix!="active"))
summary(res.aov2)
```

Begula
```{r}
# Plot
reg_plot_func(df = begula)#+facet_wrap(~matrix, scales = "free")
ggsave(filename = "../Results_data_analysis/Figures/reg_beg_ddpcr.pdf",width = 15,height = 4)
barplot_func(df = begula, sb = NA)
ggsave(filename = "../Results_data_analysis/Figures/bp_beg_ddpcr.pdf",width = 7,height = 4)
#Stats
res.aov2 <- aov(copies_u_l ~ matrix * time, data = begula %>% dplyr::filter(matrix!="active"))
summary(res.aov2)
```

Styela
```{r}
# Plot
reg_plot_func(df = styela)#+facet_wrap(~matrix, scales = "free")
ggsave(filename = "../Results_data_analysis/Figures/reg_sty_ddpcr.pdf",width = 15,height = 4)
barplot_func(df = styela, sb = NA)
ggsave(filename = "../Results_data_analysis/Figures/bp_sty_ddpcr.pdf",width = 7,height = 4)
#Stats
res.aov2 <- aov(copies_u_l ~ matrix * time, data = styela %>% dplyr::filter(matrix!="active"))
summary(res.aov2)
```

Undaria
```{r}
# Plot
reg_plot_func(df = undaria)#+facet_wrap(~matrix, scales = "free")
ggsave(filename = "../Results_data_analysis/Figures/reg_und_ddpcr.pdf",width = 15,height = 4)
barplot_func(df = undaria, sb = NA)
ggsave(filename = "../Results_data_analysis/Figures/bp_und_ddpcr.pdf",width = 7,height = 4)
#Stats
res.aov2 <- aov(copies_u_l ~ matrix * time, data = undaria %>% dplyr::filter(matrix!="active"))
summary(res.aov2)
```

###scatter plot with exponential model
```{r}
# Required libraries
library(ggplot2)
library(dplyr)
library(ggpubr)
library(rcompanion) # for nthroot function

# Function to plot GLM with exponential curve and annotate p-values
reg_plot_func = function(df){
  df = df %>% 
    dplyr::mutate(
      "sqrt_copies_uL" = sqrt(copies_u_l),
      "fthrt_copies_uL" = nthroot(copies_u_l, 4)
    )
  
  ref = df %>% dplyr::filter(matrix == "active") %>% dplyr::pull(sqrt_copies_uL)
  m_ref = mean(ref)
  top_ref = m_ref + sd(ref)
  bottom_ref = m_ref - sd(ref)
  if(bottom_ref < 0){
    bottom_ref = 0
  }

  # Calculate the first and last three unique values for x-axis breaks
  x_breaks = sort(unique(df$time2))
  x_breaks = c(x_breaks[1], x_breaks[(length(x_breaks)-2):length(x_breaks)])

  p = ggscatter(
    df %>% dplyr::filter(matrix != "active"), x = "time2", y = "sqrt_copies_uL",
    color = "matrix", palette = "jco",
    add = "none", # Remove loess
    conf.int = F
  ) +
    geom_hline(yintercept = top_ref, alpha = 1, linetype = "dotted") +
    geom_hline(yintercept = m_ref, alpha = 1, linetype = "twodash") +
    geom_hline(yintercept = bottom_ref, alpha = 1, linetype = "dotted") +
    facet_wrap(~matrix, scales = "free", nrow = 1) +
    ylab("copies/ul (sqrt)") + xlab("Submersion time (min)") +
    scale_x_continuous(breaks = x_breaks) +
    scale_y_continuous(limits = c(0, 5)) + # Limit the y-axis to 5
    guides(color = guide_legend(title = "Matrix")) +
    theme(axis.text.x = element_text(size = 8),  # Adjust x-axis label size
          axis.text.y = element_text(size = 8),  # Adjust y-axis label size
          axis.title = element_text(size = 10),  # Adjust axis title size
          plot.margin = margin(1, 1, 1, 1, "cm"))  # Adjust plot margins for annotations
  
  glm_p_values = list() # List to store p-values for each matrix level
  
  # Adding GLM fit with exponential curve and calculating p-values
  for (matrix_level in unique(df$matrix[df$matrix != "active"])) {
    df_subset = df %>% dplyr::filter(matrix == matrix_level)
    # Ensure all response values are positive by adding a small constant if necessary
    df_subset$sqrt_copies_uL = ifelse(df_subset$sqrt_copies_uL <= 0, 1e-6, df_subset$sqrt_copies_uL)
    glm_model = glm(sqrt_copies_uL ~ time2, data = df_subset, family = gaussian(link = "log"))
    pred = predict(glm_model, newdata = data.frame(time2 = seq(min(df_subset$time2), max(df_subset$time2), length.out = 100)), type = "response")
    p = p + geom_line(data = data.frame(time2 = seq(min(df_subset$time2), max(df_subset$time2), length.out = 100), pred = pred, matrix = matrix_level), aes(x = time2, y = pred, color = matrix), size = 1)
    
    # Extract p-value from GLM summary
    glm_summary = summary(glm_model)
    p_value = coef(glm_summary)["time2", "Pr(>|t|)"]
    glm_p_values[[matrix_level]] = p_value
    
    # Annotate p-value on the plot within its corresponding facet
    p = p + geom_text(data = data.frame(matrix = matrix_level, label = paste0("p-value: ", round(p_value, 4))),
                      aes(x = min(df_subset$time2), y = 4.5, label = label),  # Adjust y position if needed
                      color = "black", size = 3, hjust = 0)
  }
  
  return(p)
}

# Using the function
reg_plot_func(df = sabella)




library(ggplot2)
library(dplyr)
library(ggpubr)
library(rcompanion) # for nthroot function

# Function to plot GLM with exponential curve and annotate p-values and R values
reg_plot_func = function(df){
  df = df %>% 
    dplyr::mutate(
      "sqrt_copies_uL" = sqrt(copies_u_l),
      "fthrt_copies_uL" = nthroot(copies_u_l, 4)
    )
  
  ref = df %>% dplyr::filter(matrix == "active") %>% dplyr::pull(sqrt_copies_uL)
  m_ref = mean(ref)
  top_ref = m_ref + sd(ref)
  bottom_ref = m_ref - sd(ref)
  if(bottom_ref < 0){
    bottom_ref = 0
  }

  # Calculate the first and last three unique values for x-axis breaks
  x_breaks = sort(unique(df$time2))
  x_breaks = c(x_breaks[1], x_breaks[(length(x_breaks)-2):length(x_breaks)])
  
  # Determine y-axis limits for styela matrix
  y_max_styela = ifelse("styela" %in% df$matrix, 1, 1)

  p = ggscatter(
    df %>% dplyr::filter(matrix != "active"), x = "time2", y = "sqrt_copies_uL",
    color = "matrix", palette = "jco",
    add = "none", # Remove loess
    conf.int = F
  ) +
    geom_hline(yintercept = top_ref, alpha = 1, linetype = "dotted") +
    geom_hline(yintercept = m_ref, alpha = 1, linetype = "twodash") +
    geom_hline(yintercept = bottom_ref, alpha = 1, linetype = "dotted") +
    facet_wrap(~matrix, scales = "free", nrow = 1) +
    ylab("copies/ul (sqrt)") + xlab("Submersion time (min)") +
    scale_x_continuous(breaks = x_breaks) +
    scale_y_continuous(limits = c(0, y_max_styela)) + # Limit the y-axis to 1 for styela and 5 for others
    guides(color = guide_legend(title = "Matrix")) +
    theme(axis.text.x = element_text(size = 8),  # Adjust x-axis label size
          axis.text.y = element_text(size = 8),  # Adjust y-axis label size
          axis.title = element_text(size = 10),  # Adjust axis title size
          plot.margin = margin(1, 1, 1, 1, "cm"))  # Adjust plot margins for annotations
  
  glm_p_values = list() # List to store p-values for each matrix level
  
  # Adding GLM fit with exponential curve and calculating p-values and R values
  for (matrix_level in unique(df$matrix[df$matrix != "active"])) {
    df_subset = df %>% dplyr::filter(matrix == matrix_level)
    # Ensure all response values are positive by adding a small constant if necessary
    df_subset$sqrt_copies_uL = ifelse(df_subset$sqrt_copies_uL <= 0, 1e-6, df_subset$sqrt_copies_uL)
    glm_model = glm(sqrt_copies_uL ~ time2, data = df_subset, family = gaussian(link = "log"))
    pred = predict(glm_model, newdata = data.frame(time2 = seq(min(df_subset$time2), max(df_subset$time2), length.out = 100)), type = "response")
    p = p + geom_line(data = data.frame(time2 = seq(min(df_subset$time2), max(df_subset$time2), length.out = 100), pred = pred, matrix = matrix_level), aes(x = time2, y = pred, color = matrix), size = 1)
    
    # Extract p-value and R-squared from GLM summary
    glm_summary = summary(glm_model)
    p_value = coef(glm_summary)["time2", "Pr(>|t|)"]
    glm_p_values[[matrix_level]] = p_value
    r_squared = 1 - glm_summary$deviance / glm_summary$null.deviance
    r_value = sqrt(r_squared)
    
    # Annotate R value and p-value on the plot within its corresponding facet
    p = p + geom_text(data = data.frame(matrix = matrix_level, label = paste0("R: ", round(r_value, 4), "\np-value: ", round(p_value, 4))),
                      aes(x = min(df_subset$time2), y = y_max_styela * 0.9, label = label),  # Adjust y position if needed
                      color = "black", size = 3, hjust = 0)
  }
  
  return(p)
}


reg_plot_func(df = styela)


reg_plot_func(df = begula)

# Save the plot
ggsave(filename = "../Results_data_analysis/Figures/reg_ss_ddpcr.pdf", width = 15, height = 4)

```

### Metabarcoding analyses
#### Loading the data
```{r}
# 18S #############################################################
# Load the RDS file
ps <- readRDS("../Metabarcoding_data/Cleaned_data/ps_clean.rds")
m = ps %>% pstoveg_sample()
```


#### Taxonomic composition (per time per membrane)
Do we capture any of the four NIS with 18S?
```{r}
ps_RA = ps %>% 
  tax_fix(unknowns = c("Unknown Family")) %>% 
  microViz::tax_agg(rank = "family") %>% 
  microbiome::transform(transform = "compositional") 

ps_RA@sam_data$time = factor(ps_RA@sam_data$time,levels = ps_RA@sam_data$time %>% unique())
ps_RA@sam_data$sample_id = factor(ps_RA@sam_data$sample_id, levels = ps_RA %>% pstoveg_sample() %>% dplyr::arrange(time) %>% dplyr::pull(sample_id))

l = levels(ps_RA@sam_data$sample_id)
names(l) = 1:length(l)
sample_names(ps_RA) = names(l[ps_RA@sam_data$sample_id])
ps_RA = microViz::phyloseq_validate(ps_RA)

library(speedyseq)
p = taxo_bar_plot(ps_RA, 
                  rank1 = "phylum", 
                  rank2 = "family", 
                  colors=c("cyan","palegreen","dodgerblue","white","yellow","deeppink","lightsalmon"),
                  alpha_num_ordering = T,
                  n_rank1 = NA, n_rank2 = 6)
p = p + scale_x_discrete(breaks=sample_names(ps_RA),
                          labels=ps_RA@sam_data$time)
pp = p +facet_grid(matrix~., scales="free", space="free")
pp = p +facet_wrap(.~matrix, scales="free")

pp

dir.create("../Results_data_analysis")
ggsave(pp  + theme(legend.position="none"),filename = "../Results_data_analysis/Figures/taxo_barplot.pdf",width = 10, height = 8)
legend <- cowplot::get_legend(pp)
ggsave(legend,filename = "../Results_data_analysis/Figures/taxo_barplot_leg.pdf",width = 8, height = 6)
```


#### Identify putative NIS
```{r}
nis = readxl::read_xlsx("../Metabarcoding_data/Cleaned_data/NIS.xlsx") %>% 
  dplyr::filter(pident==100) %>% 
  dplyr::arrange(qseqid, desc(pident), desc(length), evalue) %>% 
  dplyr::group_by(qseqid) %>% 
  dplyr::slice_head(n = 1) %>% 
  dplyr::mutate(species2 = gsub(".*;(.*)","\\1",sseqid),
                species2 = gsub("_"," ",species2)) %>% 
  as.matrix() %>% as.data.frame()
names(nis)[1] = "asv"

ps_RA = ps %>% 
  microbiome::transform(transform = "compositional") %>% 
  prune_taxa(taxa = nis$asv) 
tax = ps_RA@tax_table@.Data %>% as.data.frame()
tax = left_join(tax, nis %>% dplyr::select(asv, species2))
tax$genus = gsub("(.*) .*","\\1", tax$species2)
tax$species = tax$species2
tax$species2=NULL
ps_RA@tax_table@.Data = tax %>% as.matrix()
rownames(ps_RA@tax_table@.Data) = tax$asv

ps_RA@sam_data$time = factor(ps_RA@sam_data$time,levels = ps_RA@sam_data$time %>% unique())
ps_RA@sam_data$sample_id = factor(ps_RA@sam_data$sample_id, levels = ps_RA %>% pstoveg_sample() %>% dplyr::arrange(time) %>% dplyr::pull(sample_id))

l = levels(ps_RA@sam_data$sample_id)
names(l) = 1:length(l)
sample_names(ps_RA) = names(l[ps_RA@sam_data$sample_id])
ps_RA = microViz::phyloseq_validate(ps_RA)

p = taxo_bar_plot(ps_RA, 
                  rank1 = "genus", 
                  rank2 = "species", 
                  colors=c("cyan","palegreen","dodgerblue","white","yellow","deeppink","lightsalmon"),
                  alpha_num_ordering = T,
                  n_rank1 = NA, n_rank2 = 8)
p = p + scale_x_discrete(breaks=sample_names(ps_RA),
                          labels=ps_RA@sam_data$time)
pp = p +facet_grid(matrix~., scales="free", space="free")
pp = p +facet_wrap(.~matrix, scales="free")

pp

dir.create("../Results_data_analysis")
ggsave(pp  + theme(legend.position="none"),filename = "../Results_data_analysis/Figures/taxo_barplot_nis.pdf",width = 10, height = 8)
legend <- cowplot::get_legend(pp)
ggsave(legend,filename = "../Results_data_analysis/Figures/taxo_barplot_leg_nis.pdf",width = 8, height = 6)
```

#### Heatmap of NIS
I think there is too many categories to make a nice heatmap. Perhaps the barplot above is best?
```{r}
#heatmaps for metazoa/NIS from PAT for matrix as columns or time, 
ps_RA_nis = ps_RA 
#ps_temp = ps_RA_nis

nis_time_func = function(ps_temp){
  ps_temp@sam_data$time = as.factor(ps_temp@sam_data$time)
  ps_temp@tax_table@.Data[,2] = NA
  ps_temp@tax_table@.Data[,3] = NA
  ps_temp@tax_table@.Data[,4] = NA
  ps_temp@tax_table@.Data[,5] = NA
  ps_temp@tax_table@.Data[,6] = NA
  ps_temp@tax_table@.Data[,7] = NA

  ps_temp = ps_temp%>% tax_glom(taxrank = "species")
  ps_temp_active = ps_temp %>% subset_samples(matrix == "active") %>% merge_samples(group = "matrix")
  ps_temp_active@sam_data$time = 0
  ps_temp = ps_temp %>% subset_samples(matrix != "active") %>% merge_samples(group = "time")
  
  ps_temp = merge_phyloseq(ps_temp,ps_temp_active)  %>% phyloseq::filter_taxa(function(x) sum(x) >0, TRUE)
  taxa_names(ps_temp) = ps_temp@tax_table@.Data[,"species"] %>% as.character()
  ps_temp_df = ps_temp %>% pstoveg_otu() %>% pracma::nthroot(4) %>% t() 
  
  hmRA = ComplexHeatmap::Heatmap(ps_temp_df, 
                                cluster_columns = F,
                                column_split = factor(colnames(ps_temp_df), levels = gtools::mixedsort(colnames(ps_temp_df) %>% unique())), 
                                column_title = NULL, 
                                col = hcl.colors(palette="viridis",n=5),
                                row_title_rot = 0, 
                                column_title_rot = 0, 
                                heatmap_legend_param = list(title = "4th root transformed abundance",title_gp = grid::gpar(fontsize = 8),title_position = "lefttop-rot", labels_gp = grid::gpar(fontsize = 8)), 
                                column_names_rot = 35, 
                                row_names_gp = grid::gpar(fontsize = 8),
                                column_names_gp = grid::gpar(fontsize = 10))#,column_order = gtools::mixedsort(colnames(ps_g_trimmed) %>% unique()))
  return(hmRA)
}

res_sponge = nis_time_func(ps = ps_RA_nis %>% subset_samples(matrix %in% c("sponge","active")))
res_spongeW = nis_time_func(ps = ps_RA_nis %>% subset_samples(matrix %in% c("sponge water","active")))
res_nylon = nis_time_func(ps = ps_RA_nis %>% subset_samples(matrix %in% c("nylon","active")))
res_nylon_disc = nis_time_func(ps = ps_RA_nis %>% subset_samples(matrix %in% c("nylon disc","active")))
res_nylon_mesh = nis_time_func(ps = ps_RA_nis %>% subset_samples(matrix %in% c("nylon mesh","active")))
```

#### Richness (per time per membrane)
Overall richness
```{r}
reg_plot_func2 = function(ps_temp){
  ps_temp@sam_data$richness = rarefy_even_depth(ps_temp, rngseed=11, sample.size= min(sample_sums(ps_temp)), replace=F) %>%  
    phyloseq::estimate_richness(measures = "Observed") %>% 
    dplyr::pull(Observed)
  
  m_temp = ps_temp %>% pstoveg_sample()
  
  ref = m_temp %>% dplyr::filter(matrix == "active") %>% dplyr::pull(richness)
  m_ref = mean(ref)
  top_ref = m_ref + sd(ref)
  bottom_ref = m_ref - sd(ref)
  # if(bottom_ref<0){
  #   bottom_ref = 0
  # }

  p = ggscatter(
    m_temp %>% dplyr::filter(matrix != "active"), x = "time2", y = "richness",
    color = "matrix", palette = "jco",
    add = "reg", #loess
    conf.int = F,
    conf.int.level = 0.95
    ) +
    geom_hline(yintercept =top_ref ,alpha=1, linetype = "dotted")+
    geom_hline(yintercept =m_ref ,alpha=1, linetype = "twodash")+
    geom_hline(yintercept =bottom_ref ,alpha=1, linetype = "dotted")+
    facet_wrap(~matrix, scales = "fixed", nrow = 1) +
    stat_cor()+
    ylab("Richness")+ xlab("Submersion time (min)")+
    scale_x_continuous(breaks =c(0,10,30,60,180,360,720)) +
    guides(color=guide_legend(title="Matrix"))
  return(p)
}
```

```{r}
#####with exponential model#########
library(ggplot2)
library(dplyr)
library(ggpubr)
library(phyloseq)
library(vegan)

reg_plot_func2 = function(ps_temp) {
  # Estimate richness
  ps_temp@sam_data$richness = rarefy_even_depth(ps_temp, rngseed = 11, sample.size = min(sample_sums(ps_temp)), replace = F) %>%
    phyloseq::estimate_richness(measures = "Observed") %>%
    dplyr::pull(Observed)
  
  # Convert to data frame
  m_temp = ps_temp %>% pstoveg_sample()
  
  # Calculate reference lines
  ref = m_temp %>% dplyr::filter(matrix == "active") %>% dplyr::pull(richness)
  m_ref = mean(ref)
  top_ref = m_ref + sd(ref)
  bottom_ref = m_ref - sd(ref)

  p = ggscatter(
    m_temp %>% dplyr::filter(matrix != "active"), x = "time2", y = "richness",
    color = "matrix", palette = "jco",
    add = "none", # Remove loess, add = "reg.line" removed for manual addition
    conf.int = F,
    conf.int.level = 0.95
  ) +
    geom_hline(yintercept = top_ref, alpha = 1, linetype = "dotted") +
    geom_hline(yintercept = m_ref, alpha = 1, linetype = "twodash") +
    geom_hline(yintercept = bottom_ref, alpha = 1, linetype = "dotted") +
    facet_wrap(~matrix, scales = "fixed", nrow = 1) +
    ylab("Richness") + xlab("Submersion time (min)") +
    scale_x_continuous(breaks = c(10, 180, 360, 720)) +
    guides(color = guide_legend(title = "Matrix")) +
    theme(axis.text.x = element_text(size = 8),  # Adjust x-axis label size
          axis.text.y = element_text(size = 8),  # Adjust y-axis label size
          axis.title = element_text(size = 10))  # Adjust axis title size

  glm_p_values = list()  # List to store p-values for each matrix level

  # Adding GLM fit with exponential curve and calculating p-values and R-squared values
  for (matrix_level in unique(m_temp$matrix[m_temp$matrix != "active"])) {
    df_subset = m_temp %>% dplyr::filter(matrix == matrix_level)
    # Ensure all response values are positive by adding a small constant if necessary
    df_subset$richness = ifelse(df_subset$richness <= 0, 1e-6, df_subset$richness)
    
    # Fit GLM with log link
    glm_model = glm(richness ~ time2, data = df_subset, family = poisson(link = "log"))
    pred = predict(glm_model, newdata = data.frame(time2 = seq(min(df_subset$time2), max(df_subset$time2), length.out = 100)), type = "response")
    
    # Add GLM regression line to plot
    p = p + geom_line(data = data.frame(time2 = seq(min(df_subset$time2), max(df_subset$time2), length.out = 100), richness = pred, matrix = matrix_level), aes(x = time2, y = richness, color = matrix), size = 1)
    
    # Extract p-value and R-squared from the model summary
    glm_summary = summary(glm_model)
    p_value = coef(glm_summary)["time2", "Pr(>|z|)"]
    glm_p_values[[matrix_level]] = p_value
    r_squared = 1 - glm_summary$deviance / glm_summary$null.deviance
    r_value = sqrt(r_squared)
    
    # Annotate R-squared and p-value on the plot within its corresponding facet
    p = p + geom_text(data = data.frame(matrix = matrix_level, label = paste0("R²: ", round(r_squared, 4), "\np-value: ", round(p_value, 4))),
                      aes(x = min(df_subset$time2), y = max(df_subset$richness) * 0.9, label = label),
                      color = "black", size = 3, hjust = 0)
  }

  return(p)
}


```

```{r}
p_r = reg_plot_func2(ps_temp = ps)
ggsave(plot = p_r, filename = "../Results_data_analysis/Figures/richness_nis.pdf",width = 12, height = 5)
p_r
```

NIS richness / abundance 
```{r}
p_rnis = reg_plot_func2(ps_temp = ps %>% prune_taxa(taxa = nis$asv) )
ggsave(plot = p_rnis, filename = "../Results_data_analysis/Figures/richness_nis.pdf",width = 12, height = 5)
p_rnis
```

#### AMCOM-BC
Investigating whether the capture of some NIS taxa is associated with time and/or matrix
NOT RUN at the moment because not sure it is really worth for the MS
```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
pst = ps %>% 
  prune_taxa(taxa = nis$asv) 
pst@sam_data$matrix = factor(pst@sam_data$matrix, levels = c("active", "sponge", "sponge water","nylon","nylon mesh","nylon disc"))

library(ANCOMBC)
res_matrix = ancombc2(data = pst, #%>% subset_samples(time2 %in% c(0,720))
                  fix_formula = "matrix + time",
                  group = "matrix",
                  p_adj_method = "BH", 
                  alpha = 0.05, 
                  neg_lb = T,
                  global = TRUE, 
                  pairwise = TRUE, 
                  dunnet = TRUE)

ancom_res_matrix = res_matrix$res %>% 
  pivot_longer(cols = starts_with("diff_matrix"), 
               names_to = "matrix", 
               values_to = "diff") %>% 
  dplyr::filter(diff == T) %>% 
  dplyr::select(taxon,contains("matrix")) %>% 
  dplyr::mutate(variable = "matrix")

ancom_res_time = res_matrix$res %>% 
  pivot_longer(cols = starts_with("diff_time"), 
               names_to = "time", 
               values_to = "diff") %>% 
  dplyr::filter(diff == T) %>% 
  dplyr::select(taxon,contains("time")) %>% 
  dplyr::mutate(variable = "time")

#---------------------------------------------------------
# prep_func = function(resdf, name, tax){
#   tax_rank_func = function(res){
#       taxrank = c("Phylum","Class","Order","Family","Genus","Species")
#       res$rank=""
#       for (i in 1:nrow(res)) {
#         if(grepl(x = res$taxon[i],pattern=str_c(taxrank, collapse="|"))){
#           res$rank[i] = unlist(str_extract_all(str = res$taxon[i], str_c(taxrank, collapse="|")))
#         }else{
#           res$rank[i]="ASV"
#         }
#       }
#       return(res)
#   }  
#   resdf = tax_rank_func(resdf)
#   if(nrow(resdf)>0){
#       #resdf$lfc_n = paste0("lfc_",resdf$variable)
#       resdf = resdf %>% pivot_longer(cols= starts_with("lfc_"), values_to = "lfc")
#     #dplyr::filter(lfc_n == name)
#       df = resdf %>% 
#         dplyr::arrange(desc(lfc)) %>%
#         dplyr::mutate(direct = ifelse(lfc > 0, "Possitively associated with variable", "Negatively associated with variable")) #%>% dplyr::mutate(taxon = gsub(x = taxon, " Genus",""))
#   
#       df$taxon = factor(df$taxon, levels = df$taxon)
#       df$direct = factor(df$direct, 
#                         levels = c("Possitively associated with variable", "Negatively associated with variable"))
#       df = left_join(df, tax)
#   
# 
# }
# tax = 
# 
# 
#       p1 = ggplot(data = df, 
#            aes(x = taxon, y = lfc, fill = variable, color = variable)) + 
#         geom_bar(stat = "identity", width = 0.7, 
#            position = position_dodge(width = 0.4)) +
#         #geom_errorbar(aes(ymin = lfc - se_treatmentT1, ymax = lfc_treatmentT1 + se_treatmentT1), width = 0.2,
#                 #position = position_dodge(0.05), color = "black") + 
#         labs(x = NULL, y = "Log fold change", 
#           title = "Log fold changes") + 
#         scale_fill_discrete(name = "Variable") +
#         scale_color_discrete(name = NULL) +
#         ggthemes::theme_clean() + coord_flip()
#       mylist[["p"]] = p1
#     }
#   }
#   mylist[["out"]] = out
#   return(mylist)
# }

```


#### Beta-diversity
##### Ordination 
```{r message=FALSE, warning=FALSE}
#  
source("envtoverlay_oli.R")
pord1 = envtoverlay_oli(ps, covariates="time2", ordmet = "PCA", size = 3, shape = "matrix")
pord1
ggsave(plot = pord1, filename = "../Results_data_analysis/Figures/pca.pdf",width = 7, height = 7)
```

##### PERMANOVA
```{r}
perm_func = function(ps_temp, var){
  ps_temp = ps_temp %>% microbiome::transform(transform = "compositional")
  m_temp = ps_temp %>% pstoveg_sample()
  dist = distance(ps_temp, method="bray")
  formula = stats::as.formula(sprintf("%s ~ %s", "dist", paste(var,collapse = " * ")))
  ado = adonis2(formula = formula, data = m_temp, permutations = 999)
  return(ado)
}
perm_func(ps_temp = ps, var = c("time2","matrix"))

perm_func(ps_temp = ps %>% subset_samples(matrix=="nylon"),var = c("time2"))
perm_func(ps_temp = ps %>% subset_samples(matrix=="nylon disc"),var = c("time2"))
perm_func(ps_temp = ps %>% subset_samples(matrix=="nylon mesh"),var = c("time2"))
perm_func(ps_temp = ps %>% subset_samples(matrix=="sponge"),var = c("time2"))
perm_func(ps_temp = ps %>% subset_samples(matrix=="sponge water"),var = c("time2"))

# Pairwise-PERMANOVA
ppperm_func = function(ps_temp){
  ps_temp = ps_temp %>% microbiome::transform(transform = "compositional")
  m_temp = ps_temp %>% pstoveg_sample()
  dist = distance(ps_temp, method="bray")
  perm = pairwise.adonis(dist, m_temp$matrix)
  return(perm)
}

ppperm_res_all = ppperm_func(ps)
ppperm_res_12 = ppperm_func(ps %>% subset_samples(time%ni% c(0.1,0.3,1.0,3.0,6.0)))
ppperm_res_6 = ppperm_func(ps %>% subset_samples(time%ni% c(0.1,0.3,1.0,3.0,12)))
ppperm_res_3 = ppperm_func(ps %>% subset_samples(time%ni% c(0.1,0.3,1.0,6.0,12)))
ppperm_res_1 = ppperm_func(ps %>% subset_samples(time%ni% c(0.1,0.3,3.0,6.0,12)))
ppperm_res_03 = ppperm_func(ps %>% subset_samples(time%ni% c(0.1,1.0,3.0,6.0,12)))
ppperm_res_01 = ppperm_func(ps %>% subset_samples(time%ni% c(0.3,1.0,3.0,6.0,12)))

#networks for matrix community over time?
#Indicatorspecies OR ANCOM-BC per matrix and time
```

##### Evolution of beta-diversity similarity over time

```{r message=FALSE, warning=FALSE, include=FALSE}
similarity_time_func = function(ps, group, method, earliest_time){
    ps_trimmed = ps %>% subset_samples(matrix=="nylon") %>% 
      ps_mutate(time2 = as.factor(time2))
    ps_rar = ps_trimmed %>% rarefy_even_depth(sample.size = min(sample_sums(ps)), rngseed = 31, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) 
    ps_rar_clr <- ps_rar %>% microbiome::transform("compositional")
    temp = phyloseq_group_dissimilarity(ps_rar_clr, group = group, method = method, between_groups = T, justDF = T) %>% subset(Comparison =="between-groups")
    nylon = temp[gsub("(^.*)-.*$","\\1",temp$Group) == as.character(earliest_time),] %>% dplyr::mutate(matrix="nylon")
    
    ps_trimmed = ps %>% subset_samples(matrix=="nylon disc") %>% 
      ps_mutate(time2 = as.factor(time2))
    ps_rar = ps_trimmed %>% rarefy_even_depth(sample.size = min(sample_sums(ps)), rngseed = 31, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) 
    ps_rar_clr <- ps_rar %>% microbiome::transform("compositional")
    temp = phyloseq_group_dissimilarity(ps_rar_clr, group = group, method = method, between_groups = T, justDF = T) %>% subset(Comparison =="between-groups")
    nylon_disc = temp[gsub("(^.*)-.*$","\\1",temp$Group) == as.character(earliest_time),] %>% dplyr::mutate(matrix="nylon disc")
    
    ps_trimmed = ps %>% subset_samples(matrix=="nylon mesh") %>% 
      ps_mutate(time2 = as.factor(time2))
    ps_rar = ps_trimmed %>% rarefy_even_depth(sample.size = min(sample_sums(ps)), rngseed = 31, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) 
    ps_rar_clr <- ps_rar %>% microbiome::transform("compositional")
    temp = phyloseq_group_dissimilarity(ps_rar_clr, group = group, method = method, between_groups = T, justDF = T) %>% subset(Comparison =="between-groups")
    nylon_mesh = temp[gsub("(^.*)-.*$","\\1",temp$Group) == as.character(earliest_time),] %>% dplyr::mutate(matrix="nylon mesh")

    ps_trimmed = ps %>% subset_samples(matrix=="sponge") %>% 
      ps_mutate(time2 = as.factor(time2))
    ps_rar = ps_trimmed %>% rarefy_even_depth(sample.size = min(sample_sums(ps)), rngseed = 31, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) 
    ps_rar_clr <- ps_rar %>% microbiome::transform("compositional")
    temp = phyloseq_group_dissimilarity(ps_rar_clr, group = group, method = method, between_groups = T, justDF = T) %>% subset(Comparison =="between-groups")
    sponge = temp[gsub("(^.*)-.*$","\\1",temp$Group) == as.character(earliest_time),] %>% dplyr::mutate(matrix="sponge")
      
    ps_trimmed = ps %>% subset_samples(matrix=="sponge water") %>% 
      ps_mutate(time2 = as.factor(time2))
    ps_rar = ps_trimmed %>% rarefy_even_depth(sample.size = min(sample_sums(ps)), rngseed = 31, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) 
    ps_rar_clr <- ps_rar %>% microbiome::transform("compositional")
    temp = phyloseq_group_dissimilarity(ps_rar_clr, group = group, method = method, between_groups = T, justDF = T) %>% subset(Comparison =="between-groups")
    sponge_water = temp[gsub("(^.*)-.*$","\\1",temp$Group) == as.character(earliest_time),] %>% dplyr::mutate(matrix="sponge water")
    
  beta_all = bind_rows(nylon, nylon_disc, nylon_mesh, sponge, sponge_water)
  beta_all$Group = as.character(beta_all$Group)
  beta_all = beta_all %>% 
      dplyr::mutate(Group = str_replace(Group, '10-30', "30")) %>% 
      dplyr::mutate(Group = str_replace(Group, '10-60', "60")) %>% 
      dplyr::mutate(Group = str_replace(Group, '10-180', "180")) %>%
      dplyr::mutate(Group = str_replace(Group, '10-360', "360")) %>% 
      dplyr::mutate(Group = str_replace(Group, '10-720', "720")) %>% 
      dplyr::mutate(Group = as.numeric(Group))  
  sbreaks = c(30,60,180,360,720)
  df.summary <- beta_all %>%
  dplyr::group_by(Group,matrix) %>%
  dplyr::summarise(
    sd = sd(Dist, na.rm = TRUE),
    Dist = mean(Dist)
  )
  
  pdiss <- ggplot(beta_all, aes(x = Group, y = Dist, color = matrix, fill = matrix)) +
    #geom_jitter(size = 0.05, color="gray") +
    geom_line(size=2, data = df.summary) +
    #geom_errorbar( aes(ymin = Dist-sd, ymax = Dist+sd), data = df.summary, width = 0.25,size=0.8)+  
    labs(x = "Deployment time (min)", y = "Beta diversity dissimilarity") + 
    scale_fill_viridis_d(guide = guide_legend(title = "Matrix")) + 
    scale_color_viridis_d(guide = guide_legend(title = "Matrix")) +
    ggthemes::theme_clean()+ 
    scale_x_continuous(breaks=sbreaks) # +
    #scale_y_continuous(limits = c(0, NA))
  return(pdiss)
}

pdiss = similarity_time_func(ps = ps , group="time2", method="bray", earliest_time=10)
pdiss = set_palette(pdiss,palette = "jco")

ggsave(plot = pdiss, filename = paste0("../Results_data_analysis/Figures/Time_dissimilarity.pdf"),width = 9,height = 7,device = "pdf") 

dd = pdiss$data# %>% dplyr::mutate(deployment="ltd")

# smooth_pp = ggplot(dd, aes(x=Group %>% as.numeric(), y=Dist)) + 
#   #geom_smooth(method="gam", formula = y ~ s(x, bs = "cs", k=6),aes(color=location))+ 
#   geom_smooth(method="lm",aes(color=location,linetype = deployment))+ 
#   facet_wrap(.~location, drop = TRUE, scale="free", ncol=3)+
#   scale_color_brewer(palette="Paired") + 
#   ggthemes::theme_clean() + 
#   scale_x_continuous(breaks = c(2,6,12,26,39,52))+
#   xlab("Settlement time (week)")+
#   ylab("Dissimilarity")+
#   theme(legend.position="none")
# 
# smooth_pp
# ggsave(plot = smooth_pp, filename = paste0("Figures/Time_dissimilarity2.pdf"),width = 12,height = 4,device = "pdf") 
```