# Sean Kinard
# 1-20-2020
# Spring 17 Texas Coastal Prairie

# Invertebrate Canonical Ordination Using Hellinger-transformation and 
# RDA constrained to annual precipitation

# Set Working Directory "...PeerJ_Sp17_REVISED\\"
# setwd("C:\\Users\\s2kin\\Dropbox\\Research\\Manuscipts\\Diss.2_Gradient\\PeerJ_Sp17_REVISED\\")

# Load Packages
library(BiodiversityR)
library(ggsci)
library(ggrepel)
library(ggforce)
library(ade4)
library(adegraphics)
library(adespatial)
library(cocorresp)
library(MASS)
library(ellipse)
library(FactoMineR)
library(rrcov)
library(tidyverse)

# Source additional functions 
source("sp17_r_source_files\\hcoplot.R")
source("sp17_r_source_files\\triplot.rda.R")
source("sp17_r_source_files\\plot.lda.R")
source("sp17_r_source_files\\polyvars.R")
source("sp17_r_source_files\\screestick.R")

# Load Data
env <- read.csv("sp17_data_files/sp17_site_x_env.csv")
invert <- read.csv("sp17_data_files/sp17_site_x_invert.csv")

# - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # -

# cleaning up data frames to include only the a priori selected variables and community abundance matrix
(env <- select(env, STAID, AP, flash.index, LFPP, NH4., log.cond, Rosgen.Index))

invert <- select(invert, STAID, ALISOTRICHIA:VALVATA)

# Merge frames by STAID to ensure same order of sites (rows)
mster <- merge(env,invert, by = "STAID")

# - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # -

# # # # # # # # # # # # # # # 
# Redundancy Analysis (RDA) #
# # # # # # # # # # # # # # # 

# step 1: The ordiplot object is obtained from the result of via function rda. The ordination is done with a community data set that is transformed by the Hellinger method as recommended in this article.

# script generated by the BiodiversityR GUI from the constrained ordination menu
invert.Hellinger <- disttransform(select(mster, ALISOTRICHIA:VALVATA), method='hellinger')
Ordination.model2 <- rda(invert.Hellinger ~ .,data=env, scaling="species")
summary(Ordination.model2)

# This can be plotted via ordiplot
plot2 <- ordiplot(Ordination.model2, choices=c(1,2))

# Step 2 : To plot data via ggplot2

# information on the locations of sites (circles in the ordiplot) is obtained via function sites.long.
sites.long2 <- sites.long(plot2, env.data=mster[,2:7])
head(sites.long2)

# Information on species is extracted by function species.long.
species.long2 <- species.long(plot2)
species.long2 

# Information on the labeling of the axes is obtained with function axis.long. This information is extracted from the ordination model and not the ordiplot, hence it is important to select the same axes via argument 'choices'. The extracted information includes information on the percentage of explained variation. I suggest that you cross-check with the summary of the redundancy analysis, where information on proportion explained is given.

axis.long2 <- axis.long(Ordination.model2, choices=c(1, 2))
axis.long2

# Step 3: Generate plot adding information on sites and species similar to the ordiplot:

plot_irda <- ggplot() +
  geom_vline(xintercept = c(0), color = "grey70", linetype = 2) +
  geom_hline(yintercept = c(0), color = "grey70", linetype = 2) +
  xlab(axis.long2[1, "label"]) +
  ylab(axis.long2[2, "label"]) +
  scale_x_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
  scale_y_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
  geom_point(data=sites.long2,
             aes(x=axis1, y=axis2, colour=AP),
             size=6) +
  geom_point(data=species.long2,
             aes(x=axis1, y=axis2)) +
  scale_colour_viridis_c(direction=-1) +
  coord_fixed(ratio=1) +
  theme_classic() +
  theme(text = element_text(size = 16)) +
  theme(axis.text = element_text(size = 16)) +
  theme(axis.title = element_text(size = 20))

plot_irda

# With an ordination via redundancy analysis, the positions of species should be interpreted as arrows. It is straightforward to show the positions of the species as arrows. However, with a large number of species, it is better to reduce the number to those that explain more of the variation among the sites. This information is obtained with the envfit function of vegan.

spec.envfit <- envfit(plot2, env=invert.Hellinger)
spec.data.envfit <- data.frame(r=spec.envfit$vectors$r, p=spec.envfit$vectors$pvals)
species.long2 <- species.long(plot2, spec.data=spec.data.envfit)
species.long2

# Species are selected that have a statistically significant p-value
species.long3 <- species.long2[species.long2$p <= 0.15, ]
species.long3

# adding environmental fit vectors
vectors.envfit <- envfit(plot2, env=mster[,2:7])
vectors.long2 <- vectorfit.long(vectors.envfit)
vectors.long2

# Only Predictors are selected that have a statistically significant p-value
vectors.long3 <- vectors.long2[vectors.long2$p <= 0.05, ]
vectors.long3

# Invetebrate RDA ggplot
plot_irda <- ggplot() +
  geom_vline(xintercept = c(0), color = "grey70", linetype = 2) +
  geom_hline(yintercept = c(0), color = "grey70", linetype = 2) +
  xlab(axis.long2[1, "label"]) +
  ylab(axis.long2[2, "label"]) +
  scale_x_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
  scale_y_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
  geom_segment(data=vectors.long2,
               aes(x=0, 
                   y=0, 
                   xend=axis1*.7,
                   yend=axis2*.7),
               colour="gray", 
               size=1.2, 
               arrow=arrow()) +
  geom_segment(data=vectors.long3,
               aes(x=0, 
                   y=0, 
                   xend=axis1*.7,
                   yend=axis2*.7),
               colour="black", 
               size=1.2, 
               arrow=arrow()) +
  geom_point(data=species.long2,
               aes(x=axis1*2, y=axis2*2),
               colour="red", shape = 3) +
  geom_label_repel(data=species.long3,
                   aes(x=axis1*2, y=axis2*2, label=labels),
                   box.padding = 1)  +
  geom_label_repel(data=vectors.long2,
                   aes(x=axis1*.5, y=axis2*.5, label=rownames(vectors.long2)),
                   colour="black", box.padding = 0.7
  ) +
  geom_point(data=sites.long2, alpha = 0.8,
             aes(x=axis1, y=axis2, colour=AP),
             size=5, show.legend = FALSE) +
  scale_colour_viridis_c(direction=-1) +
  coord_fixed(ratio=1) +
  theme_classic() +
  theme(legend.position="none") +
  theme(text = element_text(size = 16)) +
  theme(axis.text = element_text(size = 16)) +
  theme(axis.title = element_text(size = 18))

plot_irda

# which gives the following ordination diagram (length of vectors were multiplied, which does not change their interpretation)

# species vectors without labels
plot_irda_nolabel <- ggplot() +
  geom_vline(xintercept = c(0), color = "grey70", linetype = 2) +
  geom_hline(yintercept = c(0), color = "grey70", linetype = 2) +
  xlab(axis.long2[1, "label"]) +
  ylab(axis.long2[2, "label"]) +
  scale_x_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
  scale_y_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
  geom_point(data=species.long2,
             aes(x=axis1*2, y=axis2*2),
             colour="red", shape = 3) +
  geom_point(data=sites.long2, alpha = 0.8,
             aes(x=axis1, y=axis2, colour=AP),
             size=5, show.legend = FALSE) +
  scale_colour_viridis_c(direction=-1) +
  coord_fixed(ratio=1) +
  theme_classic() +
  theme(legend.position="none") +
  theme(text = element_text(size = 16)) +
  theme(axis.text = element_text(size = 16)) +
  theme(axis.title = element_text(size = 18))

plot_irda_nolabel

# - # - # - # - # - # - # - # - # - # - # - # - # # - # - # - # - #

# Seek Alternative RDA constrained to all environmental predictors 
# because all variables are not advised for small sample size
spe <- select(mster, ALISOTRICHIA:VALVATA)
env <- select(mster, AP:Rosgen.Index)

# RDA: all environmental variables
spe.hel <- disttransform(spe, method='hellinger')
(spe.rda <- rda(spe.hel ~ ., env)) 
vif.cca(spe.rda) 
# AP            flash.index  LFPP         NH4.         log.cond     Rosgen.Index 
# 10.466060     2.090356     3.416691     6.289334     8.231157     1.152091  

#     RDA with all explanatory variables
spe.rda.all <- rda(spe.hel ~ ., data = env)
# Global adjusted R^2
(R2a.all <- RsquareAdj(spe.rda.all))
# R2 = 0.6514918

# Forward selection using vegan's ordistep()
# This function allows the use of factors. 
mod0 <- rda(spe.hel ~ 1, data = env)
step.forward <- 
  ordistep(mod0, 
           scope = formula(spe.rda.all), 
           direction = "forward", 
           permutations = how(nperm = 499) # Step: spe.hel ~ log.cond 
  )
RsquareAdj(step.forward)

# Forward model selection stops at spe.hel ~ log.cond with  R2 = 0.159445


## 1. Variation partitioning Conductivity vs all other environmental variables
(spe.part.all <- varpart(spe.hel, env$log.cond, select(env, -log.cond)))

# Plot of the partitioning results
par(mfrow = c(1, 1))
    plot(spe.part.all, digits = 2, bg = c("red", "blue", "yellow"))

# No combination of predictors contains a useful amount of variation in the 
# hellinger transformed invertebrate community matrix
    
# conducitivity  <0
# shared         0.103
# other Env      <0
# residuals      1.046
    
# End   
# - # - # - # - # - # - # - # - # - # - # - # - # # - # - # - # - #