## Supplementary file Code S1

# Analysis of temporal dynamics: HF Ant case study 
# Single location, multiple temporal samples

###################################################
### LOAD REQUIRED PACKAGES
###################################################
library(tidyverse) # for data manipulation and graphing
library(vegan) # for distance metrics
library(labdsv) # for distance metrics and ordination
library(codyn) # for temporal turnover
library(ggrepel) # for graphing 
library(VennDiagram) # overlap and Venn diagrams

###################################################
### READ IN THE DATA and SELECT DATA SUBSET FOR ANALYSIS
###################################################
# Data can be downloaded from https://harvardforest1.fas.harvard.edu/exist/apps/datasets/showData.html?id=hf118
dat.raw <- read.csv("hf118-01-ants.csv", header = T)
#table(dat.raw$date, dat.raw$year)

dat.temp <- dat.raw %>%
  dplyr::select(year, block, plot, treatment, genus, species, abundance) %>%
  mutate(abundance = ifelse(is.na(dat.raw$abundance) == T, 1, dat.raw$abundance)) %>%
  group_by(year, block, plot, treatment) %>%
  summarise(total_abundance = sum(abundance, na.rm = T)) %>%
  ungroup() %>%
  mutate(plot = as.factor(as.character(plot)))

dat.temp %>% 
  ggplot(aes(x = year, y = total_abundance, colour = treatment, shape = plot)) +
    geom_line() + geom_point() +
    scale_x_continuous(breaks = function(x) seq(ceiling(x[1]), floor(x[2]), by = 1)) + 
    scale_shape_manual(values = 0:8) +
    theme_bw(base_size = 18, )

# We selected plot 4 of the logged treatment because it was one of the most dynamic
# Generate the longer format dataset
dat.long <- dat.raw %>%
  mutate(abundance = ifelse(is.na(dat.raw$abundance) == T, 1, dat.raw$abundance)) %>%
  mutate(plot = as.factor(as.character(plot))) %>%
  filter(treatment == "Logged", plot == "4") %>%
  dplyr::select(year, genus, species, abundance) %>%
  filter(complete.cases(.)) %>%
  group_by(year, genus, species) %>%
  summarise(total_abundance = sum(abundance, na.rm = T))

# Generate the wider format dataset
dat.wide <- dat.raw %>%
  mutate(abundance = ifelse(is.na(dat.raw$abundance) == T, 1, dat.raw$abundance)) %>%
  mutate(plot = as.factor(as.character(plot))) %>%
  filter(treatment == "Logged", plot == "4") %>%
  dplyr::select(year, genus, species, abundance) %>%
  filter(complete.cases(.)) %>%
  group_by(year, genus, species) %>%
  summarise(total_abundance = sum(abundance, na.rm = T)) %>%
  pivot_wider(names_from = c(genus, species), values_from = total_abundance, values_fill = list(total_abundance = 0), names_sep = " ") %>% 
  column_to_rownames("year")

# vector of years of sampling
ant.years <- unique(dat.long$year)
  
###################################################
### EXPLORATORY DATA ANALYSIS (EDA)
###################################################

## COUNT THE NUMBER OF TAXA, NUMBER OF SAMPLES, NUMBER OF EMPTY SAMPLES

### NUMBER OF TAXA and NUMBER OF SAMPLES

dim(dat.wide) # 32 species sampled over 13 years
ant.years

### ROW AND COLUMN SUMMARIES AND RELATIVE ABUNDANCE DISTRIBUTION

#### ROW SUMMARIES, i.e. time sample summaries

# Total abundance across all species for each time by summing the abundance in each row. Interpretation: There are no empty transects and there are some transects with greater total relative abundance (i.e. cover) than others.
time.totals <- rowSums(dat.wide)
summary(time.totals)
hist(time.totals)

ggplot(dat.wide, aes(x = ant.years, y = time.totals)) +
  geom_line() + geom_point(size = 3) +
  scale_x_continuous(breaks = function(x) seq(ceiling(x[1]), floor(x[2]), by = 1)) + 
  ylab("Number of individuals") + xlab("Year") +
  theme_bw(base_size = 18) +
  theme(axis.text.x = element_text(angle = 90))

#### COLUMN SUMMARIES, i.e. taxa summary

# Calculate the total cover of each species across all transect-times  
species.totals <- colSums(dat.wide)
summary(species.totals)

#### RELATIVE ABUNDANCE DISTRIBUTION
# Interpretation: Most species were rare across the sampling period; only 3-4 species dominated the community
dat.wide %>%
  colSums(.) %>%
  as.data.frame() %>%
  rename(Total_abundance = ".") %>%
  rownames_to_column("Species") %>%
  ggplot(aes(x = reorder(Species, -Total_abundance), Total_abundance)) + 
  geom_bar(stat = "identity", fill = "steelblue4") +
  ylab("Total number of\nindividuals across\nall years") +
  xlab("") + 
  theme_bw(base_size = 16) +
  theme(axis.text.x = element_text(face = "italic", angle = 90, vjust = 0.5, hjust = 1)) 

### VISUALISE and QUANTIFY COMPOSITIONAL TURNOVER VIA DETRENDED CORRESPONDENCE ANALYSIS (DCA) and PRINCIPAL COORDINATES ANALYSIS (PCoA)

# The length of the gradient is given to us from the inertia values (Leps and Smilauer 2003). Interpretation: The first axis is <2.2. This indicates low to moderate temporal turnover (Leps and Smilauer 2003).

# Perform the DCA
dca.res <- decorana(dat.wide) 
dca.res
plot(dca.res)

## DETERMINE THE OPTIMAL DISSIMILARITY MEASURE FOR DISSIMILARITY-BASED ANALYSES
# We used Principal coordinates analysis (PCoA) for this, a.k.a. Classical Multidimensional Scaling. Interpretation: The Canberra distance shows the best spread of points.

# Euclidean distance: from vegdist function d[jk] = sqrt(sum(x[ij]-x[ik])^2)
pco.eu <- cmdscale(vegdist(dat.wide, method="euclidean"), eig=T) 

# Bray-Curtis dissimilarity: from vegdist function d[jk] = (sum abs(x[ij]-x[ik]))/(sum (x[ij]+x[ik]))
pco.bc <- cmdscale(vegdist(dat.wide, method="bray"), eig=T) 

# Alternative Gower distance: from vegdist function d[jk] = (1/NZ) sum(abs(x[ij] - x[ik])), NZ is the number of non-zero columns excluding double-zeros (Anderson et al. 2006)
pco.ag <- cmdscale(vegdist(dat.wide, method="altGower"), eig=T) 

# Manhattan dissimilarity: from vegdist function d[jk] = sum(abs(x[ij] - x[ik]))
pco.mh <- cmdscale(vegdist(dat.wide, method="manhattan"), eig=T) 

# Chi-square distance: from dsvdis function (exp - obs) / sqrt{exp}
pco.cs <- cmdscale(dsvdis(dat.wide, "chisq"), eig=T)

# Canberra distance: from vegdist function d[jk] = (1/NZ) sum ((x[ij]-x[ik])/(x[ij]+x[ik]))
pco.ca <- cmdscale(vegdist(dat.wide, "canberra"), eig=T) 

par(mfrow=c(3,2))
plot(pco.eu$points, main="PCoA Euclidean (i.e. PCA)")
plot(pco.bc$points, main="PCoA Bray-Curtis") 
plot(pco.ag$points, main="PCoA alt Gower") 
plot(pco.mh$points, main="PCoA Manhattan") 
plot(pco.cs$points, main="PCoA Chi-squared") 
plot(pco.ca$points, main="PCoA Canberra")
par(mfrow=c(1,1))

## PRINCIPAL CO-ORDINATES ANALYSIS (PCoA)

# Perform the PCoA using Canberra distance (as determined in the EDA). Canberra distance: from vegdist function d[jk] = (1/NZ) sum ((x[ij]-x[ik])/(x[ij]+x[ik])) where NZ is the number of non-zero entries
pco.ca <- cmdscale(vegdist(dat.wide, "canberra"), eig=T) 

# The variation explained by each axis is its eigenvalue divided by the sum of the positive eigenvalues. Interpretation: The first axis explains 20.6% and the second axis explains 15% of the variation. This is reasonably a good level of variance explained (35% of variation is explained by the first two axes).
pco.ca$eig[1] / sum(pco.ca$eig[pco.ca$eig > 0])  
pco.ca$eig[2] / sum(pco.ca$eig[pco.ca$eig > 0])

# Extract the pcoa year scores for graphing
year.PCO1 <- pco.ca$points[, 1]
year.PCO2 <- pco.ca$points[, 2]

gdat <- as.data.frame(cbind(ant.years, year.PCO1, year.PCO2))

# Create 'species scores': the correlation between the raw data and the year scores
pco.ca.species <- as.data.frame(cor(dat.wide, pco.ca$points)) 
pco.ca.species$species <- rownames(pco.ca.species)
names(pco.ca.species)[1:2] <- c("PCO1", "PCO2")

pco.ca.species <- pco.ca.species %>%
  separate(species, into = c("Genus", "Species"), sep = " ") %>%
  unite(species, Genus, Species, sep = "\n")

# Graph a trajectory of the year scores and the 'species scores'
# Interpretation: Certain taxa initially increased during the sampling period (A. picea, S. impar, T. lognispinosus, L. neoniger, and C. herculeanus). However, after 2008, composition reverted to being similar to the initial state.
ggplot() +
  geom_text_repel(data = pco.ca.species, aes(x = PCO1, y = PCO2, label = species, fontface = 3), size = 4, col = "firebrick", alpha = 0.8) +
  geom_text_repel(data = gdat, aes(x = year.PCO1, y = year.PCO2), size = 5, col = "steelblue4", alpha = 0.8, label = ant.years) +
  geom_path(data = gdat, aes(x = year.PCO1, y = year.PCO2)) +
  #ggtitle("Year scores and 'species scores', PCoA Canberra") +
  xlab("PCoA axis 1 (20.6%)") + ylab("PCoA axis 2 (15.0%)") +
  theme_bw(base_size = 18)
ggsave("Ant_Ord_Can.png", width = 10, height = 6, dpi = 600)

###################################################
### DATA ANALYSIS: 
### 1. Turnover analysis
### 2. Time lag analysis
### 3. Venn diagrams 
###################################################

# Question: Is there rapid and permanent compositional change after logging treatment in 2005?

##########################################################
# 1. Temporal turnover analysis
##########################################################
# Use the turnover function in the package 'codyn' to compare temporal turnover values over time. Interpretation: turnover between times increased and stayed relatively high after the after the experimental logging treatment was conducted (2005), but then fell between 2011 and 2012, before rising again from 2013. Turnover was caused by both gains and losses in species. A few species were lost from the system in the first half of the sampling period from 2003-2008. These species, and others, were regained from 2009-13. A balance of gains and losses occurred 2014-15.

total.to <-  # total turnover 
  dat.long %>% 
  unite(Species, genus, species, sep = " ") %>% 
  turnover(., time.var = "year", species.var = "Species", abundance.var = "total_abundance", metric = "total") %>%
  mutate(Measure = "Total turnover") %>% rename(Value = total)

appear.to <-  # relative species appearances over time
  dat.long %>% 
  unite(Species, genus, species, sep = " ") %>% 
  turnover(., time.var = "year", species.var = "Species", abundance.var = "total_abundance", metric = "appearance") %>%
  mutate(Measure = "Appearances") %>% rename(Value = appearance)

disappear.to <-  # relative species disappearances over time
  dat.long %>% 
  unite(Species, genus, species, sep = " ") %>% 
  turnover(., time.var = "year", species.var = "Species", abundance.var = "total_abundance", metric = "disappearance") %>%
  mutate(Measure = "Disappearances") %>% rename(Value = disappearance)

rbind(total.to, appear.to, disappear.to) %>%
  ggplot(aes(x = year, y = Value, col = Measure)) +
  geom_line() + geom_point(size = 3) +
  scale_x_continuous(breaks = function(x) seq(ceiling(x[1]), floor(x[2]), by = 1)) + 
  ylab("Turnover") + xlab("Year") +
  theme_bw(base_size = 18) +
  theme(axis.text.x = element_text(angle = 90))
ggsave("Ant_Turnover_Can.png", width = 10, height = 6, dpi = 600)

##########################################################
# 2. Time lag analysis
##########################################################

# Interpretation: Dissimilarity did not increase significantly over the entire sampling period; largest dissimlarities are at least 2.5 years apart.  

# Plot time Lag regression using Bray-Curtis dissimilarity
Ant_time_lag <- data.frame(can.dist = as.numeric(vegdist(dat.wide, method = "canberra")), time.lag = as.numeric(vegdist(ant.years, method = "euclidean")))

summary(lm(can.dist ~ time.lag, data = Ant_time_lag))

Ant_time_lag %>% 
  ggplot(aes(x = time.lag, y = can.dist)) +
  geom_point() +
  #stat_smooth(method = "lm", se = F, size = 1) + # fit a regression line
  stat_smooth(method = "loess", se = F, size = 1, col = "firebrick3") + # fit a curve
  xlab("Time lag (years)") + ylab("Canberra distance in composition") +
  theme_bw(base_size = 18)
ggsave("Ant_TimeLag_Can.png", width = 10, height = 6, dpi = 600)


#### 2. VENN DIAGRAMS showing overlap in species between temporal samples
# Interpretation: The 2003 baseline sample is completely nested within the final 2015 sample, showing that five novel species invaded the plot over the 13-year study period.

# Make a list for 2003 and 2015 showing those species that occur in that year.
spp.lists.year <- list("2003" = names(which(colSums(ant.pa[1, ]) > 0)),
                       "2015" = names(which(colSums(ant.pa[12, ]) > 0)))

# Show the list
spp.lists.year

# Calculate the shared species in the two samples
# Show the species that are shared and not shared between 2003 and 2015; 
# lists correspond to the three circles in the Venn diagram
# a1 = species in 2003; a2 = species in 2015; a3 = species in both 2003 and 2015

(overlap <- calculate.overlap(spp.lists.year))

# Plot the Venn diagram
venn.plot <- venn.diagram(x = spp.lists.year, category.names = c("2003", "2015"), filename = NULL, cex.category.names = 2, cex = 2, fill = c(4, 5), lwd = 3, alpha = rep(0.5, 2))

grid.draw(venn.plot)