#ASV non chimeric sequences, taxonomic annotation and phylogenetic tree were obtained
#followin: 
#Callahan BJ, Sankaran K, Fukuyama JA et al. 
#Bioconductor Workflow for Microbiome 
#Data Analysis: from raw reads to community analyses 
#[version 2; peer review: 3 approved]. F1000Research 2016,
# 5:1492 (https://doi.org/10.12688/f1000research.8986.2)

#######         Phyloseq raw object

#Magellan penguin meta data
site <- c(rep("back", 11), rep("foot", 10), rep("nest", 9), rep("chest", 10))
sp <- rep("S. magellanicus", 40)
regs <- data.frame(Regions = site,
                   Species = sp)
#King peguin meta data
kreg <- c(rep("back",8), rep("foot", 8), rep("chest", 8))
Species <- rep("A. patagonicus", 8)
king_meta <- data.frame(Regions = kreg,
                        Species)

### Merge metadata objects by rows

Meta <- rbind(king_meta, regs)

#Assing sample names to rownames of metadata data frame
row.names(Meta) <- row.names(pen)


###### Phyloseq object
#ASV table - non chimeric sequences from DADA2
#tax table - non chimeric sequences annotation file 
#Sample data - described above
#phy tree - phylogenetic tree constructed with non chimeric sequences

RAW <- phyloseq(otu_table(pen, taxa_are_rows = FALSE),
                   tax_table(tax),
                   sample_data(Meta),
                   phy_tree(tree$tree))

# ADD DNA concentration data in DNA micrograms

DNA_quant_ug <- c(0.002,
                  0.004,
                  0.004,
                  0.004,
                  0.003,
                  0.002,
                  0.002,
                  0.009,
                  0.002,
                  0.002,
                  0.008,
                  0.002,
                  0.002,
                  0.002,
                  0.004,
                  0.006,
                  0.124,
                  0.004,
                  0.002,
                  0.027,
                  0.016,
                  0.005,
                  0.005,
                  0.059,
                  0.007,
                  0.002,
                  0.002,
                  0.005,
                  0.002,
                  0.002,
                  0.004,
                  0.004,
                  0.006,
                  0.004,
                  0.002,
                  0.009,
                  0.004,
                  0.006,
                  0.002,
                  0.003,
                  0.002,
                  0.001,
                  0.002,
                  0.003,
                  0.004,
                  0.029,
                  0.009,
                  0.035,
                  0.018,
                  0.025,
                  0.004,
                  0.004,
                  0.036,
                  0.017,
                  0.004,
                  0.006,
                  0.002,
                  0.013,
                  0.007,
                  0.002,
                  0.006,
                  0.005,
                  0.004,
                  0.02)

length(DNA_quant_ug)

# Add metadata field to RAW phyloseq object
sample_data(RAW)$DNA_quant_ug <- DNA_quant_ug

RAW

View(tax_table(RAW))

##### Filters

# Remove contaminants using the frecuency method

library(decontam)

contam_df_freq <- isContaminant(RAW, method = "frequency", conc = "DNA_quant_ug")

# Check number of ASVs identified as contaminants
table(contam_df_freq$contaminant)
# 20654 total ASVs
# 117 AsvS identified as contaminants 

# Remove this contaminant ASVs

RAW_nocontam <- prune_taxa(!contam_df_freq$contaminant, RAW)
sum(sample_sums(RAW)) - sum(sample_sums(RAW_nocontam))
#Remove taxa that only appears in one sample

#Compute prevalence
prevdf = apply(X = otu_table(RAW_nocontam),
               MARGIN = ifelse(taxa_are_rows(RAW_nocontam), yes = 1, no = 2),
               FUN = function(x) {sum(x>0)})

#Add taxonomy to prevalence data

prevdf = data.frame(prevalence = prevdf,
                    TotalAbundance = taxa_sums(RAW_nocontam),
                    tax_table(RAW_nocontam))


#Compute total and average prevalence for each phylum
plyr::ddply(prevdf, "Phylum", function(df1){cbind(mean(df1$prevalence), sum(df1$prevalence))})


#Phyla whose prevalence is 1, that is, only occurs in one sample
filterPhyla = c("Zixibacteria", "WS2", "Cloacimonadota", "FCPU426")
#FIlter low prevalence phyla
filtpen <- subset_taxa(RAW_nocontam, !Phylum %in% filterPhyla)
sum(sample_sums(filtpen))
sum(sample_sums(RAW_nocontam))

#Remove taxa with 0 observation
penps <- prune_taxa(taxa_sums(filtpen) > 0, filtpen)

#Remove taxa annotated as Chloroplast or Mithocondria
penclean <- penps %>% subset_taxa(Order!="Chloroplast")
penclean2 <- penclean %>% subset_taxa(Family!= "Mitochondria")

sum(sample_sums(RAW_nocontam)) - sum(sample_sums(penclean))

sum(sample_sums(penclean)) - sum(sample_sums(penclean2))

#Final ASV data set: 17 227 ASVs
range(sample_sums(penclean2))
#Library size spans 30 104 - 94 631 reads

### Normalize library size

penrare <- rarefy_even_depth(penclean2, 30100, rngseed = 123, replace = FALSE)             

# phyloseq object with normalized library size has
# 17 005 ASVs 

range(sample_sums(penrare))


######          COMMUNITY ANALYSIS        #####

######### LOAD REQUIRED LIBRARIES

library(MicrobiotaProcess)
library(reltools)
library(microbiomeutilities)
library(microbiome)
library(vegan)
library(picante)
library(gridExtra)
library(dplyr)
library(tidyr)
library(reshape2)
library(UpSetR)
library(minpack.lm)
library(Hmisc)
library(stats4)
library(patchwork)
library(phyloseq)

#Phyloseq object
penrare

### Add metadata file (useful for Upset plot)

sampleData(penrare)$Penguin_niche <- c(rep("Ap back", 8), 
                                       rep("Ap foot", 8), 
                                       rep("Ap chest", 8),
                                       rep("Sm back", 11), 
                                       rep("Sm foot", 10 ),
                                       rep("Sm nest soil", 9),
                                       rep("Sm chest", 10))


#### HEATMAPs of most abundant bacteria

### Collapse dataset taxonomy to 10 most abundant families

topfam <- aggregate_top_taxa2(penrare, "Family", top = 10)
View(otu_table(topfam))
# transform families dataset to relative abundance

famrel <- transform(topfam, "compositional")
View(otu_table(famrel))
#### Subset by species

king <- subset_samples(famrel, Species == "A. patagonicus")
mag <- subset_samples(famrel, Species == "S. magellanicus")

#### Subset by species from original phyloseq object

king_rare <- subset_samples(penrare, Species == "A. patagonicus")
mag_rare <- subset_samples(penrare, Species == "S. magellanicus")


### Define colors for heatmap
heat.cols <- c("#a8dadc","#457b9d", "#1d3557")
### Create a gradient color palette for abundance
grad_ab <- colorRampPalette(c("#96d4ca","#d3f3f1", "#7c65a9"))
heat.cols <- grad_ab(10)

### Heatmap for 10 most abundant bacterial families in king penguin body sites

heatmap_k <- simple_heatmap(king,
                            group.facet = "Regions",
                            group.order = NULL,
                            abund.thres = 0.01,
                            prev.thres = 0.01,
                            level = "Family",
                            scale.color = "log10",
                            na.fill = "white",
                            color.fill = heat.cols,
                            taxa.arrange=TRUE,
                            remove.other=TRUE,
                            panel.arrange="grid",
                            ncol=NULL,
                            nrow=NULL)
heatmap_k


#### TOP FIVE FAMILIES
library(microbiomeutilities)
#KING PENGUIN

kcols <- c("steelblue", "grey50", "red2")

pn <- plot_taxa_boxplot(king_rare,
                        taxonomic.level = "Family",
                        top.otu = 5, 
                        group = "Regions",
                        add.violin= FALSE,
                        title = "Top five families", 
                        keep.other = FALSE,
                        group.order = c("back","chest","foot"),
                        group.colors = kcols,
                        dot.size = 1)

print(pn + theme_biome_utils())

pn$data
View(pn$data)

#### MAGELLANIC PENGUIN

mcols <- c("steelblue", "grey50", "red2", "black")

magpn <- plot_taxa_boxplot(mag_rare,
                        taxonomic.level = "Family",
                        top.otu = 5, 
                        group = "Regions",
                        add.violin= FALSE,
                        title = "Top five families", 
                        keep.other = FALSE,
                        group.order = c("back","chest","foot", "nest"),
                        group.colors = mcols,
                        dot.size = 1)

print(magpn + theme_biome_utils())

View(magpn$data)


### Heatmap for 10 most abundant bacterial families in Magellan penguin body sites and nest soil

heatmap_ma <- simple_heatmap(mag,
                             group.facet = "Regions",
                             group.order = NULL,
                             abund.thres = 0.01,
                             prev.thres = 0.01,
                             level = "Family",
                             scale.color = "log10",
                             na.fill = "white",
                             color.fill = heat.cols,
                             taxa.arrange=TRUE,
                             remove.other=TRUE,
                             panel.arrange="grid",
                             ncol=NULL,
                             nrow=NULL)
heatmap_ma

### Arrange heatmaps in one graphic

heatmaps <- grid.arrange(heatmap_k, heatmap_ma, nrow = 2)

### Save heatmaps

ggsave("heatmaps_june.pdf", 
       plot = heatmaps,
       width = 20, 
       height = 20, 
       units = "cm")


#######       CORE BACTERIA     #####

#### Get data with binary format to perfom upset graphic between all penguin sample types

uppen <- get_upset(penrare, factorNames = "Penguin_niche")

### Upset plot with all penguin sample types directly exported as a PDF


pdf(file="pensppsharedASV_july.pdf", 
    onefile=FALSE,
    width = 10) 

upset(uppen,
      sets = c("Ap back", "Ap chest", "Ap foot", 
               "Sm back", "Sm chest", "Sm foot", "Sm nest soil"),
      order.by = "freq",
      mainbar.y.label = "Number of ASVs shared",
      queries = list(list(query = intersects,
                          params = list("Ap back", "Sm back"),
                          color = "lightgoldenrod",
                          active = T),
                     list(query = intersects,
                          params = list("Ap foot", "Sm foot"),
                          color = "lightblue",
                          active = T),
                     list(query = intersects, 
                          params = list("Ap chest", "Sm chest"),
                          color = "indianred2",
                          active = T),
                     list(query = intersects, 
                          params = list("Ap back", 
                                        "Ap chest",
                                        "Ap foot"),
                          color = "orange",
                          active = T),
                     list(query = intersects, 
                          params = list("Sm back",
                                        "Sm chest",
                                        "Sm foot"),
                          color = "pink",
                          active = T),
                     list(query = intersects, 
                          params = list("Sm back",
                                        "Sm chest",
                                        "Sm foot",
                                        "Sm nest soil"),
                          color = "green",
                          active = T),
                     list(query = intersects, 
                          params = list("Ap back", 
                                        "Ap chest",
                                        "Ap foot",
                                        "Sm back",
                                        "Sm chest",
                                        "Sm foot",
                                        "Sm nest soil"),
                          color = "purple",
                          active = T),
                     list(query = intersects, 
                          params = list("Ap back", 
                                        "Ap chest",
                                        "Ap foot",
                                        "Sm back",
                                        "Sm chest",
                                        "Sm foot"),
                          color = "blue",
                          active = T)))
dev.off()

### CORE BACTERIA

###Transform original phyloseq object to relative abundance
penrel <- transform(penrare, "compositional")
### ADD "ASV" tag to each ASV
allpenASV <- add_refseq(penrel, tag = "ASV")
### Add best taxonomy level possible to each ASV
allpenASV <- format_to_besthit(allpenASV)

# Core tresholds: minimum ASV relative abundance 0.001 % minimum prevalence among samples 90%

#Among all samples
full_core <- core(allpenASV, detection = 0.001, prevalence = .9)
View(tax_table(full_core))

#Among king penguin samples

#Add ASV tag and best taxonomic hit to king penguin samples
allking
kingASV <- add_refseq(allking, tag = "ASV")
kingASV2 <- format_to_besthit(kingASV)
king_ASV2 <- transform(kingASV2, "compositional")
#Core bacteria in king penguin

king_core <- core(king_ASV2, detection = 0.001, prevalence = .9)
View(tax_table(king_core))

#Among Magellan penguin samples

allmag
magASV <- add_refseq(allmag, tag = "ASV")
magASV2 <- format_to_besthit(magASV)
mag_ASV2 <- transform(magASV2, "compositional")

mag_core <- core(mag_ASV2, detection = 0.001, prevalence = .9)

View(tax_table(mag_core))



#######               ALPHA DIVERSITY ANALYSIS       #####

#### Subset by species from original phyloseq object
penrare

allking <- subset_samples(penrare, Species == "A. patagonicus")
allmag <- subset_samples(penrare, Species == "S. magellanicus")

###                          KING PENGUIN

allking

#DATA FRAME with alpha diversity (PD and Shannon) across body sites
div <- data.frame(
  "Shannon" = estimate_richness(allking, measures = "Shannon"),
  "PD" = pd(samp = otu_table(allking), tree = phy_tree(allking), include.root = FALSE),
  "Region" = sample_data(allking)$Region,
  "Species" = sample_data(allking)$Species)

colnames(div)

# Change column names

colnames(div) <- c("Shannon", "PD", "PD.SR", "Region", "Species")

### King penguin alpha diversty (Shannon & PD) trends across body sites

king_alpha <- div %>%
  gather(key = metric, value = value, c("Shannon", "PD")) %>%
  mutate(metric = factor(metric, levels = c("Shannon", "PD"))) %>%
  ggplot(aes(x = Region, y = value)) +
  geom_jitter(aes(colour = Region), size = 3, alpha = 1) +
  scale_color_manual(values = c("lightgoldenrod", "indianred2", "lightblue", "tan4")) +
  theme_classic() +
  labs(x = "", y = "") +
  facet_wrap(~ metric, scales = "free") +
  theme(legend.position="none") +
  labs(title = "King penguin alpha diversity measures")

king_alpha

#Exportamos grafico de alfa div a pdf

ggsave("king_alpha.pdf", 
       plot = king_alpha,
       width = 25, 
       height = 20, 
       units = "cm")

#### King penguin alpha diversity statistics

### Kruskal test between PD across king penguin body sites
kruskal.test(PD~Region, data=div)

#### Pairwise wilcoxon test to detect specific differences
pairwise.wilcox.test(div$PD, div$Region, exact=F)

### Kruskal test between Shannon diversity across king penguin body sites
kruskal.test(Shannon~Region, data=div) # NO DIFFERENCES

###             MAGELLAN PENGUIN

allmag

#DATA FRAME with alpha diversity (PD and Shannon) across body sites

magdiv <- data.frame(
  "Shannon" = estimate_richness(allmag, measures = "Shannon"),
  "PD" = pd(samp = otu_table(allmag), tree = phy_tree(allmag), include.root = FALSE),
  "Region" = sample_data(allmag)$Region,
  "Species" = sample_data(allmag)$Species)

### Change column names
colnames(magdiv) <- c("Shannon", "PD", "PD.SR", "Region", "Species")

### Magellan penguin alpha diversty (Shannon & PD) trends across body sites and nest

mag_alpha <- magdiv %>%
  gather(key = metric, value = value, c("Shannon", "PD")) %>%
  mutate(metric = factor(metric, levels = c("Shannon", "PD"))) %>%
  ggplot(aes(x = Region, y = value)) +
  geom_jitter(aes(colour = Region), size = 3, alpha = 1) +
  scale_color_manual(values = c("lightgoldenrod", "indianred2", "lightblue", "tan4")) +
  theme_classic()+
  labs(x = "", y = "") +
  facet_wrap(~ metric, scales = "free") +
  theme(legend.position="none") +
  labs(title = "Magellanic penguin alpha diversity measures")

mag_alpha

#### Magellan penguin alpha diversity statistics

### Kruskal test between PD across Magellan penguin body sites and nest soil
kruskal.test(PD~Region, data=magdiv)

#### Pairwise wilcoxon test to detect specific differences
pairwise.wilcox.test(magdiv$PD, magdiv$Region, exact=F)

### Kruskal test between Shannon across Magellan penguin body sites and nest soil
kruskal.test(Shannon~Region, data=magdiv) # NO DIFFERENCES


### Combine alpha diversity plots from each species and export to pdf

alpha <- king_alpha/mag_alpha

ggsave("penguin_alpha_july26.pdf", 
       plot = alpha,
       width = 25, 
       height = 20, 
       units = "cm")

##############      BETA DIVERSITY ANALYSIS     ##########

###   King penguin

# Transform king penguin phyloseq to relative abundances and compute weighted unifrac distance

allkrel <- transform(allking, "compositional")
wun_dist <- distance(allkrel, method = "wunifrac")

#Test dispersion among body sites
disp_wun <- betadisper(wun_dist, sample_data(allking)$Regions)
permutest(disp_wun)

#PCoA Ordination

kwuni <- ordinate(allkrel, method = "PCoA", distance = "wunifrac")

kingord <- plot_ordination(
  allkrel,
  kwuni, type="samples", color= "Regions") + 
  scale_colour_manual(values = c("lightgoldenrod", "indianred2", "lightblue")) +
  geom_point(size = 4) +
  stat_ellipse(aes(group = Regions), linetype = 2) +
  theme_classic() +                                                      #changes theme, removes grey background
  ggtitle("King penguin body sites")

### PERMANOVA global test among King penguin body sites

permall <- adonis(wun_dist ~ sample_data(allkrel)$Regions)

#Anova-like table for PERMANOVA results
permall$aov.tab

### Paired PERMANOVA for King penguin body site comparisons 


#Paired permanova function

pairwise.adonis.physeq <- function(x,factors,dist.m="wunifrac", perm=999,p.adjust.m ='BH'){
  co <- combn(unique(as.character(sample_data(x)[,factors][[1]])),2)
  pairs <- c()
  F.Model <- c()
  R2 <- c()
  p.value <- c()
  for (elem in 1:ncol(co)){ 
    cosa<-c(co[1,elem],co[2,elem])
    filtered<-prune_samples(sample_names(x)[which( sample_data(x)[,factors][[1]] %in% cosa)],x)
    filtered<- filter_taxa(filtered, function(x) sum(x) > 0, TRUE)
    x1=distance(filtered,dist.m)
    form<-paste("x1","~",factors)
    ad = adonis(as.formula(form),as(sample_data(filtered),"data.frame"),perm=perm);
    pairs = c(pairs,paste(co[1,elem],'vs',co[2,elem]));
    F.Model =c(F.Model,ad$aov.tab[1,4]);
    R2 = c(R2,ad$aov.tab[1,5]);
    p.value = c(p.value,ad$aov.tab[1,6])
  }
  p.adjusted = p.adjust(p.value,method=p.adjust.m)
  sig = c(rep('',length(p.adjusted)))
  sig[p.adjusted <= 0.05] <-'.'
  sig[p.adjusted <= 0.01] <-'*'
  sig[p.adjusted <= 0.001] <-'**'
  sig[p.adjusted <= 0.0001] <-'***'
  pairw.res = data.frame(pairs,F.Model,R2,p.value,p.adjusted,sig)
  print("Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1")
  return(pairw.res)
}

#Perform PERMANOVA paired comparisons

allkrel %>% filter_taxa(function(x) sum(x) > 0, T) %>% 
  pairwise.adonis.physeq(.,factors = "Regions", dist.m = "wunifrac")

###       SIMPER analysis

#We use phyloseq object with best taxonomy hit and ASV tags
kingASV2
kv2 <- transform(kingASV2, "compositional")

#Save as data frame sample data and ASV table
mdk <- data.frame(sample_data(kingASV2))
otuk2 <- data.frame(otu_table(kv2))

#Run simper analysis and save it
simking <- simper(otuk2, mdk$Regions, permutations = 999)
sumsk <- summary(simking)

#Access particular combinations and store them in csv format for manipulation in Excel

###back vs foot
back_foot <- sumsk$back_foot
head(back_foot)
write.csv(back, "back_foot_june.csv")
####back vs chest

back_chest <- sumsk$back_chest
head(back_chest)
write.csv(chest_back, "back_chest_june.csv")

####foot vs chest

chest_foot <- sumsk$foot_chest
head(chest_foot)
write.csv(back, "foot_chest_june.csv")

###       Magellanic penguin

# Transform Magellan penguin phyloseq to relative abundances and compute weighted unifrac distance

allmrel <- transform(allmag, "compositional")
mwundist <- distance(allmag, method = "wunifrac")

#Test dispersion among body sites
magdisp_wun <- betadisper(mwundist, sample_data(allmag)$Regions)
permutest(magdisp_wun)

#PCoA ordination

magwuni <- ordinate(allmrel, method = "PCoA", distance = "wunifrac")

magord <- plot_ordination(
  allmrel,
  magwuni, type="samples", color= "Regions") + 
  scale_colour_manual(values = c("lightgoldenrod", "indianred2", "lightblue","tan4")) +
  geom_point(size = 4) +
  stat_ellipse(aes(group = Regions), linetype = 2) +
  theme_classic() +                                                      #changes theme, removes grey background
  ggtitle("Magellanic penguin body sites and nest")

### PERMANOVA global test among Magellanic penguin body sites

per_mag <- adonis(mwundist ~ sample_data(allmrel)$Regions)

#Anova-like table for PERMANOVA results
per_mag$aov.tab

### Paired PERMANOVA for Magellanic penguin body site comparisons 

allmrel %>% filter_taxa(function(x) sum(x) > 0, T) %>% 
  pairwise.adonis.physeq(.,factors = "Regions", dist.m = "wunifrac")

###   SIMPER Analysis

#We use phyloseq object with best taxonomy hit and ASV tags

magASV2
mv2 <- transform(magASV2, "compositional")

#Save as data frame sample data and ASV table
magsam <- data.frame(sample_data(magASV2))
otum <- data.frame(otu_table(mv2))

#Run simper analysis and save it
simmag <- simper(otum, magsam$Regions, permutations = 999)
sumsm <- summary(simmag)

#Access particular combinations that render significant results in paired permanova comparisons and store them in csv format for manipulation in Excel

###back vs nest

back <- sumsm$back_nest
head(back)
write.csv(back, "back_nest_june.csv")

#chest vs nest
chest <- sumsm$nest_chest
head(chest)
write.csv(chest, "chest_nest_june.csv")

#foot vs nest
pata <- sumsm$foot_nest
head(pata)
write.csv(chest, "foot_nest_june.csv")

#### Intraspecific ordination array and export as PDF

pens <- kingord + magord
ggsave("penord2_july26.pdf",
       plot = pens,
       width = 30, 
       height = 20, 
       units = "cm")

####          Global ordination and interspecific beta diversity comparisons

#Normalized phyloseq object
penrare
#Remove nest, to display only body sites comparisons
bodies <- subset_samples(penrare, Penguin_niche != "Sm nest soil")

# Transform penguin spp phyloseq to relative abundances and compute weighted unifrac distance

bodrel <- transform(bodies, "compositional")
bodwund <- distance(bodies, method = "wunifrac")

#Test dispersions among sample types
boddisp_wun <- betadisper(bodwund, sample_data(bodies)$Species)
permutest(boddisp_wun) #Dispersion differs between species

#PCoA Species ordination

bodwuni <- ordinate(bodrel, method = "PCoA", distance = "wunifrac")

bodord <- plot_ordination(
  bodrel,
  bodwuni, type="samples", color= "Species") + 
  scale_colour_manual(values = c("lightgoldenrod", "indianred2")) +
  geom_point(size = 3) +
  stat_ellipse(linetype = 2) +
  theme_classic() +                                                     
  ggtitle("Penguin species comparison")

### PERMANOVA global test among Magellan and King penguin body sites

perm_species <- adonis(bodwund ~ sample_data(bodrel)$Species)

#Anova-like table for PERMANOVA results

perm_species$aov.tab 

############# Interspecific body site ordinations

##################    BACK

#Subset backs from both species
back <- subset_samples(bodrel, Regions == "back")

#Interspecific back ordination
backwuni <- ordinate(back, method = "PCoA", distance = "wunifrac")


backord <- plot_ordination(back, backwuni, type="samples", color= "Species") + 
  scale_colour_manual(values = c("lightgoldenrod", "indianred2")) +
  geom_point(size = 3) +
  stat_ellipse(linetype = 2) +
  theme_classic() +                                                      
  ggtitle("Back comparison")

#Test dispersions among penguin backs

backw <- distance(back, method = "wunifrac")
backdisp <- betadisper(backw, sample_data(back)$Species)
permutest(backdisp) #Dispersions differ among penguin backs

#PERMANOVA test between penguins backs

permback <- adonis(backw ~ sample_data(back)$Species)
permback$aov.tab 


##################    CHEST


#Subset chests from both species
chest <- subset_samples(bodrel, Regions == "chest")

#Interspecific chest ordination

chestwuni <- ordinate(chest, method = "PCoA", distance = "wunifrac")

chestord <- plot_ordination(
  chest,
  chestwuni, type="samples", color= "Species") + 
  scale_colour_manual(values = c("lightgoldenrod", "indianred2")) +
  geom_point(size = 3) +
  stat_ellipse(linetype = 2) +
  theme_classic() +                     
  ggtitle("Back comparison")

#Test dispersions among penguin chests
chestw <- distance(chest, method = "wunifrac")
chestdisp <- betadisper(chestw, sample_data(chest)$Species)
permutest(chestdisp) #Dispersions differ among penguin chests

#PERMANOVA test between penguins chests
permchest <- adonis(chestw ~ sample_data(chest)$Species)
permchest$aov.tab 

##################    FOOT

#Subset foot from both species
foot <- subset_samples(bodrel, Regions == "foot")

#Interspecific foot ordination

footwuni <- ordinate(foot, method = "PCoA", distance = "wunifrac")

footord <- plot_ordination(
  foot,
  footwuni, type="samples", color= "Species") + 
  scale_colour_manual(values = c("lightgoldenrod", "indianred2")) +
  geom_point(size = 4) +
  stat_ellipse(linetype = 2) +
  theme_classic() +                                                      #changes theme, removes grey background
  ggtitle("Foot comparison")

#Test dispersions among penguin feet
footw <- distance(foot, method = "wunifrac")
dispfo <- betadisper(footw, sample_data(foot)$Species)
permutest(dispfo) #Dispersions differ among penguin feet

#PERMANOVA test between penguins feet
permfoot <- adonis(footw ~ sample_data(foot)$Species)
permfoot$aov.tab 

### Arrange interspecific ordinations and export it as PDF

bodiesords <- bodord + backord/chestord/footord

ggsave("interpen2_june.pdf",
       plot = bodiesords,
       width = 39, 
       height = 20, 
       units = "cm")


#######         SLOAN NEUTRAL COMMUNITY MODEL          #######

#Code from:
#Burns, A., Stephens, W., Stagaman, K. et al. 
#Contribution of neutral processes to the assembly of 
#gut microbial communities in the zebrafish over host 
#development. ISME J 10, 655-664 (2016). 
#https://doi.org/10.1038/ismej.2015.142

# Normalized ASV table (here exemplified with king penguin)
# These steps must be repeated with Magellanic penguin (without nest samples)

allking

onlymag <- subset_samples(bodies, Species == "S. magellanicus")

kspp <- otu_table(allking)

# FIT SLOAN NEUTRAL COMMUNITY MODEL

fit_sncm <- function(spp, pool=NULL, taxon=NULL){
  
  options(warn=-1)
  
  #Calculate the number of individuals per community
  N <- mean(apply(spp, 1, sum))
  
  #Calculate the average relative abundance of each taxa across communities
  if(is.null(pool)){
    p.m <- apply(spp, 2, mean)
    p.m <- p.m[p.m != 0]
    p <- p.m/N
  } else {
    p.m <- apply(pool, 2, mean)
    p.m <- p.m[p.m != 0]
    p <- p.m/N
  }
  
  #Calculate the occurrence frequency of each taxa across communities
  spp.bi <- 1*(spp>0)
  freq <- apply(spp.bi, 2, mean)
  freq <- freq[freq != 0]
  
  #Combine
  C <- merge(p, freq, by=0)
  C <- C[order(C[,2]),]
  C <- as.data.frame(C)
  #Removes rows with any zero (absent in either source pool or local communities)
  C.0 <- C[!(apply(C, 1, function(y) any(y == 0))),]
  p <- C.0[,2]
  freq <- C.0[,3]
  names(p) <- C.0[,1]
  names(freq) <- C.0[,1]
  
  #Calculate the limit of detection
  d = 1/N
  
  ##Fit model parameter m (or Nm) using Non-linear least squares (NLS)
  m.fit <- nlsLM(freq ~ pbeta(d, N*m*p, N*m*(1-p), lower.tail=FALSE), start=list(m=0.001))
  m.ci <- confint(m.fit, 'm', level=0.95)
  
  ##Calculate goodness-of-fit (R-squared and Root Mean Squared Error)
  freq.pred <- pbeta(d, N*coef(m.fit)*p, N*coef(m.fit)*(1-p), lower.tail=FALSE)
  Rsqr <- 1 - (sum((freq - freq.pred)^2))/(sum((freq - mean(freq))^2))
  RMSE <- sqrt(sum((freq-freq.pred)^2)/(length(freq)-1))
  
  pred.ci <- binconf(freq.pred*nrow(spp), nrow(spp), alpha=0.05, method="wilson", return.df=TRUE)
  
  ##Calculate AIC for Poisson model
  pois.LL <- function(mu, sigma){
    R = freq - ppois(d, N*p, lower.tail=FALSE)
    R = dnorm(R, mu, sigma)
    -sum(log(R))
  }
  pois.mle <- mle(pois.LL, start=list(mu=0, sigma=0.1), nobs=length(p))
  
  aic.pois <- AIC(pois.mle, k=2)
  bic.pois <- BIC(pois.mle)
  
  ##Goodness of fit for Poisson model
  pois.pred <- ppois(d, N*p, lower.tail=FALSE)
  Rsqr.pois <- 1 - (sum((freq - pois.pred)^2))/(sum((freq - mean(freq))^2))
  RMSE.pois <- sqrt(sum((freq - pois.pred)^2)/(length(freq) - 1))
  
  pois.pred.ci <- binconf(pois.pred*nrow(spp), nrow(spp), alpha=0.05, method="wilson", return.df=TRUE)
  
  ##Results
  fitstats <- data.frame(
    m=as.numeric(coef(m.fit)),
    m.ci=as.numeric(coef(m.fit)-m.ci[1]),
    poisLL=as.numeric(pois.mle@details$value),
    Rsqr=as.numeric(Rsqr), # measuring fit, #comparing fit differing datasets to the same model
    Rsqr.pois=as.numeric(Rsqr.pois),
    RMSE=as.numeric(RMSE), # measuring fit #comparing fit differing datasets to the same model
    RMSE.pois=as.numeric(RMSE.pois),
    AIC.pois=as.numeric(aic.pois),  #comparing differing models to the dataset
    BIC.pois=as.numeric(bic.pois), #comparing differing models to the dataset
    N=as.numeric(N),
    Samples=as.numeric(nrow(spp)),
    Richness=as.numeric(length(p)),
    Detect=as.numeric(d))
  
  A <- cbind(p, freq, freq.pred, pred.ci[,2:3])
  A <- as.data.frame(A)
  colnames(A) <- c('p', 'freq', 'freq.pred', 'pred.lwr', 'pred.upr')
  if(is.null(taxon)){
    B <- A[order(A[,1]),]
  } else {
    B <- merge(A, taxon, by=0, all=TRUE)
    row.names(B) <- B[,1]
    B <- B[,-1]
    B <- B[order(B[,1]),]
  }
  B <- B[!is.na(B$freq),]
  # fit_class for graphing
  B$fit_class <-"As predicted"
  B[which(B$freq < B$pred.lwr),"fit_class"]<- "Below prediction"
  B[which(B$freq > B$pred.upr),"fit_class"]<- "Above prediction"
  B[which(is.na(B$freq)),"fit_class"]<- "NA"
  
  # combine fit stats and predicitons into list
  i <- list(fitstats, B)
  names(i) <- c("fitstats", "predictions")
  return(i)
}

#Run neutral community model

unloadNamespace("MicrobiotaProcess") #Unload microbiota process, since has a redundant function for extracting taxonomy

kspp.out <- fit_sncm(kspp, pool=NULL, taxon=data.frame(tax_table(allking)))
kspp.out$fitstats

### Plot results

#Code extracted from https://rdrr.io/github/DanielSprockett/reltools/man/plot_sncm_fit.html

plot_sncm_fit <- function(spp.out, fill = NULL, title = NULL){
  
  tax_levels <- colnames(spp.out$predictions)[7:length(colnames(spp.out$predictions))-1]
  
  if(is.null(fill)){
    fill <- "fit_class"
  }
  
  r2_val <- paste("r^2 ==", round(spp.out$fitstats$Rsqr,4))
  m_val <- paste("m ==", round(spp.out$fitstats$m,4))
  df <- data.frame(t(table(spp.out$predictions$fit_class)))
  df <- df[,c(2,3)]
  colnames(df) <- c("Prediction", "ASV Abundance")
  
  p <- ggplot(data=spp.out$predictions)
  
  if(fill == "fit_class"){
    p <- p + geom_point(aes(x = log(p), y = freq, fill=eval(parse(text=fill))), shape =21, color="black", size =2, alpha=0.75)
    p <- p + scale_fill_manual(
      name = "Prediction",
      values = c("Above prediction" = "cyan4", "As predicted" = "blue", "Below prediction" = "goldenrod", "NA" = "white"),
      breaks = c("Above prediction", "As predicted", "Below prediction", "NA"),
      labels = c(paste0("Above prediction (",round((df[1,2]/spp.out$fitstats$Richness)*100, 1),"%)"),
                 paste0("As predicted (",round((df[2,2]/spp.out$fitstats$Richness)*100, 1),"%)"),
                 paste0("Below Prediction (",round((df[3,2]/spp.out$fitstats$Richness)*100, 1),"%)"),
                 paste0("NA (",df[4,2],")")))

  }else if (fill %in% tax_levels){
    p <- p + geom_point(aes(x = log(p), y = freq, fill=eval(parse(text=fill))), shape =21, color="black", size =2, alpha=0.75)
    p <- p + scale_fill_discrete(name = "Taxon")
    
  } else{
    print(paste0("fill variable: ", fill, " is not a valid taxonomic level or fit_class"))
  }
  
  p <- p + geom_line(aes(x = log(p), y = freq.pred), color = "blue")
  p <- p + geom_line(aes(x = log(p), y = pred.lwr), color = "blue", linetype="dashed")
  p <- p + geom_line(aes(x = log(p), y = pred.upr), color = "blue", linetype="dashed")
  p <- p + xlab("log(Mean Relative Abundance)")
  p <- p + ylab("Frequency")
  p <- p + ggtitle(title)
  p <- p + theme_classic()
  p <- p + annotate("text", x=mean(log(spp.out$predictions$p), na.rm = TRUE), y=0.95, size=5, label = r2_val, parse=TRUE)
  p <- p + annotate("text", x=mean(log(spp.out$predictions$p), na.rm = TRUE), y=0.9, size=5, label = m_val, parse=TRUE)
  
  return(p)
}

?geom_jitter
# Plot 

neutralking <- plot_sncm_fit(kspp.out, title = "Neutral model in King penguin body sites")

neutralking

### Extract data to obtain the taxa with most ASV in above and below prediction categories

res_king <- neutralking$data
write.csv(res_king, file = "neutral_king_june.csv")

# King penguin
# 
#Load taxa with most ASVs in each category

AB <- data_frame(Taxa = c("Psychrobacter",
                          "Chryseobacterium",
                          "Flavobacterium",
                          "Nocardioides",
                          "Deinococcus"),
                 ASV.number = c(10,
                                7,
                                4,
                                4,
                                3))

#Barplot of 5 taxa with most ASVs above prediction

above_k <-  ggplot(AB, aes(x = reorder(Taxa, ASV.number), y = ASV.number)) +
  geom_bar(stat = 'identity', fill = "cyan4") +
  xlab("Genus") +
  theme_classic() +
  theme(axis.text.x = element_text(face = "italic")) +
  ggtitle("Genera with more ASVs above predicted frequency")


#Barplot of 5 taxa with most ASVs below prediction


BE <- data.frame(Taxa = c("Psychrobacter",
                          "Ornithobacterium",
                          "Fusobacterium",
                          "Suttonella",
                          "Corynebacterium"),
                 ASV.number = c(26,
                                11,
                                8,
                                8,
                                7))

#Barplot of 5 taxa with most ASVs below prediction

below_k <-  ggplot(BE, aes(x = reorder(Taxa, ASV.number), y = ASV.number)) +
  geom_bar(stat = 'identity', fill = "goldenrod") +
  xlab("Genus") +
  theme_classic() +
  theme(axis.text.x = element_text(face = "italic")) +
  ggtitle("Genera with more ASVs below predicted frequency")




### Merge neutral model and barplots and export it to pdf

k_neutral_grobs <- neutralking + above_k/below_k

###EXPORTAMOS PDF

ggsave("neutral_king_july26.pdf",
       plot = k_neutral_grobs,
       width = 39, 
       height = 20, 
       units = "cm")

# Magellanic penguin
# 
#Load taxa with most ASVs in each category

ABmag <- data_frame(Taxa = c("Nocardioides",
                          "Sporosarcina",
                          "Pedobacter",
                          "Rhodococcus",
                          "Sphingomonas"),
                 ASV.number = c(11,
                                6,
                                4,
                                4,
                                4))

#Barplot of 5 taxa with most ASVs above prediction

above_mag <-  ggplot(ABmag, aes(x = reorder(Taxa, ASV.number), y = ASV.number)) +
  geom_bar(stat = 'identity', fill = "cyan4") +
  xlab("Genus") +
  theme_classic() +
  theme(axis.text.x = element_text(face = "italic")) +
  ggtitle("Genera with more ASVs above predicted frequency")

above_mag

#Barplot of 5 taxa with most ASVs below prediction


BE_mag <- data.frame(Taxa = c("Psychrobacter",
                          "Fusobacterium",
                          "Bacteroides",
                          "Ornithobacterium",
                          "Staphylococcus"),
                 ASV.number = c(28,
                                10,
                                7,
                                7,
                                7))

#Barplot of 5 taxa with most ASVs below prediction

below_m <-  ggplot(BE_mag, aes(x = reorder(Taxa, ASV.number), y = ASV.number)) +
  geom_bar(stat = 'identity', fill = "goldenrod") +
  xlab("Genus") +
  theme_classic() +
  theme(axis.text.x = element_text(face = "italic")) +
  ggtitle("Genera with more ASVs below predicted frequency")




### Merge neutral model and barplots and export it to pdf
### neutralking could have either data from king penguin or magellanic penguin!

k_neutral_grobs <- neutralking + above_k/below_k

mag_neutral_grobs <- neutralking + above_mag/below_m

###EXPORTAMOS PDF

ggsave("neutral_mag_july26.pdf",
       plot = mag_neutral_grobs,
       width = 39, 
       height = 20, 
       units = "cm")


##############      Supplemental figures/data     #########

### Rarefaction plots

cleanking <- subset_samples(penclean2, Species == "A. patagonicus")
cleanmag <- subset_samples(penclean2, Species == "S. magellanicus")


#Set seed
set.seed(1)
subsamples <- seq(0, 50000, by=1000)[-1]

#Repeat for each data set, here exemplified with king penguin clean data set
sample_data(cleanking)
p <- plot_alpha_rcurve(cleanmag, index="observed", 
                       subsamples = subsamples,
                       lower.conf = 0.025, 
                       upper.conf = 0.975,
                       group="Regions",
                       label.color = "brown3",
                       label.size = 3,
                       label.min = TRUE) 


### Most abundant genera

# In King penguin body site samples
dom_k <- dominant_taxa(allking,level = "Genus", group="Regions")
write.csv(dom_k$dominant_overview, "dom_genera_king_june.csv")

### In Magellan penguin body site and nest samples
dom_m <- dominant_taxa(allmag,level = "Genus", group="Regions")
write.csv(dom_m$dominant_overview, "dom_genera_mag_june.csv")


##### Psychrobacter phylogenetic tree sequences

allking <- subset_samples(penrare, Species == "A. patagonicus")
allmag <- subset_samples(penrare, Species == "S. magellanicus")

### Extract Psychrobacter sequences that only occur in King penguin
k_psy <- subset_taxa(allking, Genus == "Psychrobacter")

### Prune sequences with 0 abundance
k_psy <- prune_taxa(taxa_sums(k_psy) > 0, k_psy)

### TREE
k_tree <- plot_tree(k_psy,
                    color="Regions") +
  scale_color_manual(values = c("lightgoldenrod", "indianred2", "lightblue")) +
  ggtitle("Psychrobacter sequences associated to King penguin body sites")

### Magellan penguin phylogenetic tree

### Extract Psychrobacter sequences that only occur in King penguin
m_psy <- subset_taxa(allmag, Genus == "Psychrobacter")

### Prune sequences with 0 abundance
m_psy <- prune_taxa(taxa_sums(m_psy) > 0, m_psy)

### TREE
magtree <- plot_tree(m_psy,
                     color="Regions") +
  scale_color_manual(values = c("lightgoldenrod", "indianred2", "lightblue", "tan4")) +
  ggtitle("Psychrobacter sequences associated to Magellanic penguin samples")

### Arrange phylogenetic trees in one graphic and export it to PDF
trees <- k_tree + magtree

ggsave("psy_penspp_tree_july26.pdf", 
       plot = trees,
       width = 30, 
       height = 20, 
       units = "cm")

#####     Interspecific diversity comparisons

######## Shannon between penguin body sites

##Get specific columns from previous alpha diversity data frames
div[,c(1,4,5)]
magdiv[,c(1,4,5)]

#Shannon data frame 
Shdf <- rbind(div[,c(1,4,5)], magdiv[,c(1,4,5)])

### Remove nest soil samples to have only body site alpha diversity
Sh_bird <- subset(Shdf, Region != "nest")

#Prepare data to compare shannon diversity between body sites
Shdf <- data.frame(melt(Sh_bird))

cols <- c("lightgoldenrod", "indianred2")

SHintbox <- ggplot(data=Shdf) +
  geom_boxplot(aes(x=Region, y=value, fill=Species)) +
  theme_classic() + scale_fill_manual(values=cols) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size=13)) +
  theme(axis.text.y = element_text(hjust = 1, size=13)) +
  ggtitle("Shannon diversity") + theme(plot.title = element_text(lineheight=3, face="bold", color="black", size=18))


#Specific tests comparing Shannon values between body sites
#foot
SH_foot <- subset(Sh_bird, Region == "foot")
kruskal.test(Shannon~Species, data=SH_foot)

#chest
SH_chest <- subset(Sh_bird, Region == "chest")
kruskal.test(Shannon~Species, data=SH_chest)
#back
SH_back <- subset(Sh_bird, Region == "back")
kruskal.test(Shannon~Species, data=SH_back)

########   PD between penguin body sites

##Get specific columns from previous alpha diversity data frames

div[,c(2,4,5)]
magdiv[,c(2,4,5)]

#PD data frame
PDdf <- rbind(div[,c(2,4,5)], magdiv[,c(2,4,5)])

### Remove nest soil samples to have only body site alpha diversity
PD_bird <- subset(PDdf, Region != "nest")

#Prepare data to compare PD diversity between body sites
df <- data.frame(melt(PD_bird))

cols <- c("lightgoldenrod", "indianred2")

PDintbox <- ggplot(data=df) +
  geom_boxplot(aes(x=Region, y=value, fill=Species)) +
  theme_classic() + scale_fill_manual(values=cols) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size=13)) +
  theme(axis.text.y = element_text(hjust = 1, size=13)) +
  ggtitle("PD diversity") + theme(plot.title = element_text(lineheight=3, face="bold", color="black", size=18))

### PD diversity statistics between penguin samples

#Kruskal test comparing PD values between penguin species
kruskal.test(PD~Species, data=PD_bird)

#Specific tests comparing PD values between body sites
#foot
PD_foot <- subset(PD_bird, Region == "foot")
kruskal.test(PD~Species, data=PD_foot)
#Chest
PD_chest <- subset(PD_bird, Region == "chest")
kruskal.test(PD~Species, data=PD_chest)
#Back
PD_back <- subset(PD_bird, Region == "back")
kruskal.test(PD~Species, data=PD_back)


#Arrange plots and export to pdf
inter <- grid.arrange(PDintbox, SHintbox, ncol = 2)

ggsave("inter_alpha.pdf",
       plot = inter,
       width = 30, 
       height = 20, 
       units = "cm")