---
title: |
  | **No sex difference in preen oil chemical composition during incubation in Kentish Plovers**
  |
  | R code
author: "*M. Gilles, A. Kosztolányi, A. Rocha, I. C. Cuthill, T. Székely & B. A. Caspers*"
output: 
  pdf_document: 
    toc: true # to generate a table of contents
    toc_depth: 3  # upto three depths of headings (specified by #, ## and ###)
    number_sections: true  ## if you want number sections at each table header
    latex_engine: xelatex
  html_document: default
header-includes: 
 \usepackage{float}
 \floatplacement{figure}{H} # for the position of figures/plots
---

```{r global-options, include=FALSE}
knitr::opts_chunk$set(warning=FALSE, message=FALSE) 
# settings for the whole documents: no message and no warning will be displayed
```

>

Required packages
```{r, results='hide'}
library(qpcR) # to use rbind.na function
library(dplyr) # to manipulate data
library(GCalignR) # to align and process chemical data
library(vegan) # to analyse chemical data with mutlivariate analyses
library(lme4) # to analyse alpha diversity with linear mixed models
library(betareg) # to run beta regressions
library(broom.mixed) # to get confidence intervals from linear mixed models 
library(partR2) # to get R2 and confidence intervals of fixed effects from mixed models
library(performance) # to verify model assumptions
library(simr) # to conduct power analyses
library(ggplot2) # to plot
library(lemon) # for plotting (coord_capped_cart)
```

# Data

## Chemical data

Import the chemical data 
```{r, results='hide'}
prepforGCalignR <- read.csv("chemdata.csv", header = F, check.names = F)
```

Format the data for GCalignR
```{r, results='hide'}
prepforGCalignR <- prepforGCalignR[,c(T,T,F,F)] # remove every 3rd and 4th row
sampleids <- prepforGCalignR[1,] # create a dataframe with just the first row
sampleids <- sampleids[,c(T,F)] # remove empty cell between each sampleID
RTarea <- matrix(c("RT","area")) # prepare the row with RT/area
RTarea <- t(RTarea) # flip columns and rows
RTarea <- data.frame(RTarea) 
prepforGCalignR <- qpcR:::rbind.na(sampleids, RTarea, prepforGCalignR) 
# bind sampleIDs, RTarea and the data, fill the end of sampleIDs and RT/area with NAs
prepforGCalignR <- prepforGCalignR[-c(3,4),] # remove unnecessary rows
# save as a .txt file
write.table(prepforGCalignR,"dataforGCalignR.txt",
            row.names = F, col.names = F, sep="\t", na = "", quote = F)
check_input("dataforGCalignR.txt") # check that the data format is good
```

Create a vector with the name of blank samples
```{r}
blanknames <- c("blank2707","blank2807","blank2907","blank3007",
                "blank0308","blank0408","blank0209")
```

Align chromatograms
```{r, cache=TRUE, results='hide'}
aligned <- align_chromatograms(data = "dataforGCalignR.txt",
           rt_col_name = "RT",
           reference = "D59121_250619", 
           # set the first sample analysed in GC-MS as the reference
           blanks = blanknames, # remove substances that are detected in blanks
           delete_single_peak = TRUE, 
           # remove substances that are detected in one sample only
           remove_empty = TRUE, # remove empty samples
           max_diff_peak2mean = 0.02, # maximum difference to the mean RT across samples
           permute = F) # keep the order of samples constant
```

Check the output of the alignment procedure
```{r}
print(aligned) 
```

Heatmap of the aligned chromatograms
```{r}
gc_heatmap(aligned,threshold = 0.02) 
```

Normalize to get relative abundances
```{r results='hide'}
chemdata <- norm_peaks(aligned, conc_col_name = "area",rt_col_name = "RT", 
                       out = "data.frame")
```

Average (and SD) number of substances per sample
```{r}
mean <- mean(rowSums(chemdata != 0))
sd <- sd(rowSums(chemdata != 0))
data.frame(mean,sd)
```

Number and proportion of samples in which each substance occurs
```{r}
# RT of each substance
RT <- round(as.numeric(colnames(chemdata)), digits = 3)
# number of samples where each substance occurs
n_samples <- colSums(chemdata != 0) 
# proportion of samples where each substance occurs
prop_samples <- colSums(chemdata != 0)/nrow(chemdata) 
# put it in a data frame
subst_details <- data.frame(RT, n_samples, prop_samples)
# number of substances detected in all samples
nrow(subst_details[subst_details$n_samples == 20,])
```

Relationship between the proportion of samples in which a substance occurs and the retention time of the substance
```{r, fig.dim = c(3, 3.5), fig.align="center"}
# plot 
plot(subst_details$prop_samples~subst_details$RT, 
     ylab = "Proportion of samples",
     xlab = "Retention time (min)")
```

Beta regression to test the quadratic relationship between the proportion of samples in which a substance occurs and the retention time of the substance
```{r, fig.dim = c(4, 3), fig.align="center"}
# center RT to reduce the correlation between the linear and quadratic terms
subst_details$RT_centered <- scale(subst_details$RT)

# slightly modify proportion of samples to fit a beta regression (should not include 0 or 1)
subst_details_modified <- subst_details
subst_details_modified$prop_samples <- ifelse(subst_details_modified$prop_samples == 0, 0.000001, 
                                       ifelse(subst_details_modified$prop_samples == 1, 0.999999, 
                                       subst_details_modified$prop_samples))
# orthogonal polynomials using poly(X,2)
# complementary log-log link as the proportion of samples is mostly very high or very low
ortho <- betareg(prop_samples ~ poly(RT_centered,2), data = subst_details_modified,
                 link = "cloglog")
summary(ortho)
check_model(ortho)
```
*The substances that are detected in all samples (N = 35 substances) all have an intermediate retention time (between 12 and 24 min), while substances with a lower or higher retention times are detected in relatively less samples. Indeed, we find a negative quadratic effect of the retention time of a substance on the proportion of samples in which it occurs (beta = -5.57, P < 0.001).* 

Average (and SD) relative proportion of each substance in samples
```{r}
options(scipen = 999) # to disable scientific notation
# average relative abundance of each substance
subst_details$mean_abun <- colMeans(chemdata)
subst_details$mean_abun <- round(subst_details$mean_abun, digits = 2)
# sd relative abundance of each substance
subst_details$sd_abun <- sqrt(diag(cov(chemdata)))
subst_details$sd_abun <- round(subst_details$sd_abun, digits = 2)
```

Log-transform relative abundances to give more weight to low-abundance substances
```{r}
chemdata <- log(chemdata + 1)
```

## Metadata

Import metadata
```{r}
metadata <- read.csv("metadata.csv")
```

Add variable on independent versus paired (in a breeding pair) individuals
```{r}
metadata$paired <- ifelse(duplicated(metadata$nest) | 
                          duplicated(metadata$nest, fromLast = TRUE), "paired", "independent")
metadata$paired <- as.factor(metadata$paired)
```

Transform sex and nest variables as a factor
```{r}
metadata$sex <- as.factor(metadata$sex)
metadata$nest<- as.factor(metadata$nest)
```

Check that the design is balanced
```{r}
table(metadata$sex)
```

Check how many samples are from individuals from the same nest (i.e. pair ID)
```{r}
table(metadata$nest)
```

Check the variation in the seasonal variable (i.e. number of days after laying)
```{r}
mean <- mean(metadata$daysafterlay)
sd <- sd(metadata$daysafterlay)
min <- min(metadata$daysafterlay)
max <- max(metadata$daysafterlay)
data.frame(mean,sd,min,max)
```
```{r, fig.dim = c(3.5, 3.5), fig.align="center"}
hist(metadata$daysafterlay, main = NULL, xlab = "Days after laying")
```

Set sample names as row names
```{r}
rownames(metadata) <- metadata[,1]
metadata <- metadata[,-1]
```

Make sure that the rows in chemdata and metadata are consistent
```{r}
metadata <- metadata[rownames(metadata) %in% rownames(chemdata),]
chemdata <- chemdata[rownames(chemdata) %in% rownames(metadata),] 
chemdata <- chemdata[match(rownames(metadata),rownames(chemdata)),]
```

# Analysis

## PERMANOVA

Some samples are from independent individuals (N = 12) and from paired individuals (N = 8, from 4 breeding pairs). Because individuals from the same breeding pair may be more similar, we need to run the PERMANOVA using only independent individuals. Thus, we select the 12 independent samples, as well as 4 samples randomly selected among the paired individuals (always 1 male and 3 females, never from the same breeding pair), and iterate PERMANOVAs 1000 times. Doing so, we use all 20 samples, but run PERMANOVAs with 16 independent samples in a balanced design (8 females, 8 males). Having a balanced design is important, as PERMANOVA can be sensitive to differences in dispersion under unbalanced designs, especially with type I (sequential) sums of square. Here, we use type I (sequential) sums of square in order to test for the interaction between sex and the number of days after laying.

```{r}
# number of PERMANOVAs (iterations)
nad <- 1000
# number of permutations per PERMANOVA
perm <- 9999
# vectors to store the results from the iterated PERMANOVA
SSsex <- rep(NA,nad)
SSdays <- rep(NA,nad)
SSsexdays <- rep(NA,nad)
SSres <- rep(NA,nad)
R2sex <- rep(NA,nad)
R2days <- rep(NA,nad)
R2sexdays <- rep(NA,nad)
R2res <- rep(NA,nad)
Fsex <- rep(NA,nad)
Fdays <- rep(NA,nad)
Fsexdays <- rep(NA,nad)
nFsex <- rep(NA,nad)
nFdays <- rep(NA,nad)
nFsexdays <- rep(NA,nad)

# iterated PERMANOVA
for(r in 1:nad) {
  # select the data
  # 12 independent birds
  independent_birds <- metadata %>% 
    filter(paired == "independent")%>%
    select(nest)
  # 8 paired birds
  paired <- metadata %>% 
    filter(paired == "paired")
  # select randomly 2 males from the paired birds
  paired_males <- paired %>% 
    filter(sex == "M") %>%
    distinct(nest) %>%
    sample_n(2) 
  # select randomly 2 females from the paired birds 
  # (but not the partners of the selected males)
  paired_females <- paired %>% 
    filter(sex == "F" & !nest %in% paired_males$nest) %>%
    distinct(nest) %>%
    sample_n(2) 
  # 16 independent birds (7 females and 9 males)
  selection <- bind_rows(independent_birds, paired_males, paired_females) 
  # selection of chemical data and metadata
  chemdata1 <- chemdata[rownames(selection),]
  metadata1 <- metadata[rownames(selection),]
  # PERMANOVA
  aout <- adonis2(chemdata1~sex*daysafterlay,data=metadata1,permutations=perm,by="terms")
  # store the results
  SSsex[r] <- aout$SumOfSqs[1]
  SSdays[r] <- aout$SumOfSqs[2]
  SSsexdays[r] <- aout$SumOfSqs[3]
  SSres[r] <- aout$SumOfSqs[4]
  R2sex[r] <- aout$R2[1]
  R2days[r] <- aout$R2[2]
  R2sexdays[r] <- aout$R2[3]
  R2res[r] <- aout$R2[4]
  Fsex[r] <- aout$F[1]
  Fdays[r] <- aout$F[2]
  Fsexdays[r] <- aout$F[3]
  nFsex[r] <- sum(attr(aout,"F.perm")[,1]>=aout$F[1])
  nFdays[r] <- sum(attr(aout,"F.perm")[,2]>=aout$F[2])
  nFsexdays[r] <- sum(attr(aout,"F.perm")[,3]>=aout$F[3])
}
```

Results from iterated PERMANOVA: Sums of squares
```{r}
data.frame(sex=unclass(summary(SSsex)),days=unclass(summary(SSdays)),
           "sex:days"=unclass(summary(SSsexdays)),residuals=unclass(summary(SSres)))
```

Results from iterated PERMANOVA: *R<sup>2</sup>*
```{r}
data.frame(sex=unclass(summary(R2sex)),days=unclass(summary(R2days)),
           "sex:days"=unclass(summary(R2sexdays)),residuals=unclass(summary(R2res)))
```

Results from iterated PERMANOVA: *F*-statistics
```{r}
data.frame(sex=unclass(summary(Fsex)),days=unclass(summary(Fdays)),
           "sex:days"=unclass(summary(Fsexdays)))
```

Results from iterated PERMANOVA: *P*-values
```{r}
data.frame(sex=sum(nFsex+1)/(nad*(perm+1)),days=sum(nFdays+1)/(nad*(perm+1)),
           "sex:days"=sum(nFsexdays+1)/(nad*(perm+1)))
```

*No effect of sex, the number of days after laying,the interaction between sex and the number of days after laying on preen oil composition (Bray-Curtis dissimilarities). No difference in location (centroids) between females and males.*

Test whether dispersion differs between the sexes
```{r}
chemdata_bc <- vegdist(chemdata) # Bray-Curtis dissimilarity matrix
disp <- betadisper(chemdata_bc, metadata$sex, type = "centroid")
disp$group.distances # distance to median for each group
anova(disp) # test of difference in dispersion
```

*No difference in dispersion between females and males.*


## Non-metric multidimensional scaling (NMDS)

Calculate Bray-Curtis dissimlarities
```{r, results='hide'}
bc <- metaMDS(chemdata, distance = "bray")
```

Check the stress (how good the distance between samples in actual multivariate distance is represented in two dimensions)
```{r}
bc$stress
```
*A stress value inferior to 0.1 indicates a good fit: the distance between samples in two dimensions represents well the actual multivariate distance between samples.*

Plot
```{r, fig.dim = c(4, 4), fig.align="center"}
# add NMDS to the metadata
bc <- as.data.frame(bc[["points"]])
metadata$MDS1 <- bc$MDS1
metadata$MDS2 <- bc$MDS2
# plot
ggplot(data = metadata) + 
  geom_point(aes(MDS1, MDS2, color = sex, shape = sex, size = sex)) +
  # 95% confidence level for a multivariate t-distribution
  stat_ellipse(aes(MDS1, MDS2, color = sex), type = "t", level = 0.95) + 
  theme_void() +
  scale_size_manual(values=c(5,6)) +
  scale_shape_manual(values=c(16,18)) + 
  scale_color_manual(values = c("grey64","tan2")) + 
  annotate("text", x = 0.305, y = -.195, label = "2D Stress: 0.06", size = 3.5) + 
  theme(panel.background = element_rect(colour = "black", size = 1, fill = NA),
        aspect.ratio = 1, 
        legend.title = element_blank(), 
        legend.position = c(1.885, .88), 
        legend.text = element_text(size=14),
        legend.background = element_rect(size = 0.4, linetype = "solid", color = "black"), 
        legend.margin = margin(4, 8, 7, 5),
        plot.margin=unit(c(1,1,1,1),"cm")) 
```

## Shannon diversity (H)

Calculate and add Shannon diversity (H)
```{r}
metadata$H <- diversity(chemdata, "shannon")
``` 

Distribution of diversity
```{r, fig.dim = c(3.5, 3.5), fig.align="center"}
hist(metadata$H, main = NULL, xlab = "Shannon diversity") 
```

*The variable is positive, continuous, normal. We can therefore fit the linear regression with a Gaussian distribution.*

Linear mixed model testing the effects of sex and season (days after laying) while controlling for pair ID
```{r}
lmm.H <- lmer(H ~ sex * daysafterlay + (1|nest), data = metadata)
summary(lmm.H)
```

Beta estimates (with confidence intervals)
```{r}
tidy(lmm.H, conf.int = TRUE, conf.method = 'boot')
```

*Chemical diversity is not affected by sex or season.*

Check model assumptions
```{r, fig.dim = c(8, 9), fig.align="center"}
check_model(lmm.H)
```

Boxplot
```{r, fig.dim = c(2,4), fig.align="center"}
ggplot(metadata, aes(x=sex, y=H, fill = sex))+ 
  geom_boxplot()+
  geom_jitter(position=position_jitter(0.1))+
  scale_fill_manual(values=c("grey64","tan2"))+
  labs(x = element_blank(), y = "Shannon diversity")+
  theme_classic()+
  theme(legend.position = "none",
        axis.title = element_text(size = 12))+
  annotate("text", x = 1.5, y=4.025, label = "ns") +
  geom_segment(aes(x = 1, y = 4.008, xend = 2, yend = 4.008))
```

## Chemical richness (S)

Calculate and add richness (number of substances; S)
```{r}
metadata$S <- specnumber(chemdata)
``` 

Mean and SD number of substances in females and males
```{r}
aggregate(metadata$S, list(metadata$sex), FUN=mean)
aggregate(metadata$S, list(metadata$sex), FUN=sd)
```

Distribution of richness
```{r, fig.dim = c(3.5, 3.5), fig.align="center"}
hist(metadata$S, main = NULL, xlab = "Chemical richness") 
```

*The variable is positive continuous and normal. We can therefore fit the linear regression with a Gaussian distribution.*

Linear mixed model testing the effects of sex and season (days after laying) while controlling for pair ID
```{r}
lmm.S <- lmer(S ~ sex * daysafterlay + (1|nest), data = metadata)
summary(lmm.S)
```

Beta estimates (with confidence intervals)
```{r}
tidy(lmm.S, conf.int = TRUE, conf.method = 'boot')
```

*Chemical diversity is not affected by sex or season.*

Check model assumptions
```{r, fig.dim = c(8, 9), fig.align="center"}
check_model(lmm.S)
```

Boxplot
```{r, fig.dim = c(2, 4), fig.align="center"}
ggplot(metadata, aes(x=sex, y=S, fill = sex))+ 
  geom_boxplot()+
  geom_jitter(position=position_jitter(0.1))+
  scale_fill_manual(values=c("grey64","tan2"))+
  scale_x_discrete(labels= c("Female","Male"))+
  labs(x = element_blank(), y = "Number of substances")+
  theme_classic()+
  theme(legend.position = "none",
        axis.ticks.x =element_blank(),
        axis.text.x = element_text(size = 13),
        axis.text.y = element_text(size = 12),
        axis.title.y = element_text(size = 14))+
  annotate("text", x = 1.5, y=86.25, label = "ns") +
  geom_segment(aes(x = 1, y = 84.9, xend = 2, yend = 84.9))
```