---
title: "Cymothoidae outline analysis"
author: "van der Wal, Schädel, Ekrt & Haug"
date: "30.09.2020"
output:
  pdf_document: default
  html_document: default
---

# Setting up the work environment
Loading the necessary libraries.

```{r, results="hide", message=FALSE}
library(Momocs) # elliptic fourier transformation and plotting
library(magrittr) # T pipes
library(dplyr) # data wrangling
library(reshape2) # converting between wide and long table formats
library(ggplot2) # plotting
library(ggrepel) # labels on plots
library(ggpubr) # arranging plots
library(MASS) # for the lda model
library(nnet) # for the multinomial regression
```


```{r}
setwd("/home/mario/Documents/Kuclin_Cymothoidae/momocs_stuff")
metadata <- read.csv("shape_outlines_28_09_ms.csv")

```

The image files are read by walking through the filename column in the metadata table. Files that do not occur there will not be read. The outlines are stored as coordinates.

```{r, results="hide"}
outlines_coo <- import_jpg(paste("/home/mario/Documents/Kuclin_Cymothoidae/outline_rasters_aug_2020_resized/", metadata$filename, sep = ""))
outlines_coo <- Out(outlines_coo)
```

The factors by which the outlines are grouped are read from the metadata table and assigned to the coordinates object.
```{r}
fac <- metadata[, names(metadata) %in% c("ontogeny_coarse",
                                         "ontogeny_fine", 
                                         "attachment_site_coarse", 
                                         "attachment_site_fine")]
outlines_coo$fac <- fac
```

Check if the outlines are correctly read and that the factors match the outlines.

```{r}
panel(outlines_coo, names = TRUE, fac = "ontogeny_coarse")
```

# Performing the elliptic fourier transformation (EFT)
Before the eFT is done, the outlines must be alligned correctly. This can also be checked visually.

```{r}
outlines_coo %>%
  coo_center %>%
  coo_scale %>%
  coo_align %>%
  coo_slidedirection() %T>%
  print() %>% stack()
```

If the alignment looks fine, it can be applied.

```{r}
outlines_coo <- outlines_coo %>%
  coo_center %>%
  coo_scale %>%
  coo_align %>%
  coo_slidedirection()
```

Now, the EFT can be done. ```{r} nb.h``` sets the number of harmonics to use. The coefficients are normalized (the default procedure). This can come with problems depending on the data. See the helpfile for the function for urther information.

```{r}
outlines_coo.f <- efourier(outlines_coo, nb.h = 10, norm = TRUE)
```
The outline data is now stored as fourier coefficients. The first harmonic holds the most variation.


```{r}
boxplot(outlines_coo.f)
```

The variation is getting smaller in further harmonics.

```{r}
boxplot(outlines_coo.f, drop = 1)
```

# Ordination
To visualise the variation in the outline shapes and to simplyfy the data a principal component analysis (PCA) is done.

```{r}
outlines_coo.p <- PCA(outlines_coo.f)
outlines_coo.p[["x"]][,1] <- outlines_coo.p[["x"]][,1]*-1 # invert the x axis for the plots
```

The outlines can now be visualised taking in account the assigned factors.


```{r}
pal <- c("#C10093", "#000000","#4CAB38","#0085F8") # A color blind friendly palette.
plot(outlines_coo.p, 
     chull=TRUE, 
     pos.shp = "full", 
     abbreviate.labelsgroups = FALSE, 
     abbreviate.labelspoints = FALSE,
     points=TRUE,
     labelspoints = FALSE,
     morphospace = TRUE,
     rug = FALSE, 
     box = FALSE,
     "ontogeny_coarse",
     center.origin = TRUE,
     zoom = 1.55,
     title = "",
     col = pal
     )
```

```{r}
pal2 <- c("#C10093","#4CAB38","#0085F8", "#000000") # A color blind friendly palette.
plot(outlines_coo.p, 
     chull=TRUE, 
     pos.shp = "full", 
     abbreviate.labelsgroups = FALSE, 
     abbreviate.labelspoints = FALSE,
     points=TRUE, 
     labelspoints = FALSE,
     morphospace = TRUE,
     "attachment_site_coarse",
     rug = FALSE, 
     box = FALSE,
     center.origin = TRUE,
     zoom = 1.55,
     title = "",
     col = pal2)
```


It can also be visually represented what each principle component stands for.

```{r}
PCcontrib(outlines_coo.p)
```

The variation in the principal components can also be visualised in a boxplot. This is just an alternative representation and holds the same information like the scatterplots.

```{r}
boxplot(outlines_coo.p, "ontogeny_coarse")
```

A scree plot visualises, proportionally, how much variation each of the overall variation is explained by each principal component.

```{r}
scree_plot(outlines_coo.p)
```


```{r}

 # mean shape of females 
outlines_coo.ms <- mshapes(outlines_coo.f, "ontogeny_coarse")
female <- outlines_coo.ms$shp$female
immature <- outlines_coo.ms$shp$immature  %T>% coo_plot(border="#4CAB38", lwd = 3)
coo_ruban(immature, edm(immature, female), lwd=15)
immature <- outlines_coo.ms$shp$immature    %T>% coo_draw(border="#4CAB38", lwd = 3)
male <- outlines_coo.ms$shp$male  %T>% coo_draw(border="#0085F8", lwd = 3)
female %T>% coo_draw(border="#C10093", lwd = 3)
legend("center", lwd=3,
       col=c("#4CAB38", "#0085F8", "#C10093"), legend=c("immature",  "male", "female"))
title("Mean shapes of immatures, males and females\nand euclidean distance between immatures and adult females\n(colour band)")
#("#0085F8","#4CAB38","#C10093","#000000")


```


Adjusting the data for the use outside of Momocs.


```{r, warning=FALSE}
outlines_pca <- outlines_coo.p %>% as_df()
outlines_pca$filename <- paste(outlines_pca$.id,".jpg", sep = "")
matching <- metadata[, names(metadata) %in% c("filename", "size_mm")]
outlines_pca_size <- left_join(outlines_pca, matching)
outlines_pca_size$filename <- NULL
outlines_pca_size$ontogeny_coarse <- factor(outlines_pca_size$ontogeny_coarse, levels = c("immature","male","female", "fossil"))
```

To track the individual specimens, here is a plot, with the labels for each specimen. It can be compared with the plots from the momocs package (which I can not customise as much). It is just a bit stretched/compressed.

```{r}
shapes <- c(15,18,17,16)
ggplot(data = outlines_pca_size, aes(x=PC1, y=PC2))+
  geom_point(aes(color=ontogeny_coarse, shape=ontogeny_coarse))+
  geom_text_repel(data=subset(outlines_pca_size, !ontogeny_coarse %in% c("female", "male", "immature")), 
                  label=subset(outlines_pca_size, !ontogeny_coarse %in% c("female", "male", "immature"))$.id)+
  scale_color_manual(values =  c("#0085F8","#4CAB38","#C10093","#000000"))+ # color blind friendly
  scale_shape_manual(values=shapes)+
  labs(color  = "Ontogeny / sex", shape = "Ontogeny / sex")

```


# Shape vs. size

The shape parameters can also be visualised in relation to the body length. For this the PCA data has to be joined with the metadata.


Here is a plot, visualising the relationship between PC1 and the body length. Some datapoints are excluded because there is no body length available in the literature.


```{r, warning=FALSE}
library(ggpmisc)
myformula <- y ~ x
shapes <- c(15,18,17,16)
(sizeplot1 <- ggplot(data = outlines_pca_size, aes(x=size_mm, y=PC1))+
  geom_smooth(method=lm, formula = myformula, color="gray50")+
  stat_poly_eq(formula = myformula, 
                aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
                parse = TRUE) + 
  geom_point(aes(color=ontogeny_coarse, shape=ontogeny_coarse))+
  geom_text_repel(data=subset(outlines_pca_size, !ontogeny_coarse %in% c("female", "male", "immature")), 
                  label=subset(outlines_pca_size, !ontogeny_coarse %in% c("female", "male", "immature"))$.id)+
  scale_color_manual(values =  c("#0085F8","#4CAB38","#C10093","#000000"))+ # color blind friendly
  scale_shape_manual(values=shapes)+
  scale_x_log10()+
  labs(color  = "Ontogeny / sex", shape = "Ontogeny / sex")
)
```

```{r}
lm(PC1 ~ log(size_mm), data = outlines_pca_size) %>% summary
```


And here is a plot, visualising the relationship between PC2 and the body length. Some datapoints are excluded because there is no body length available in the literature.


```{r, warning=FALSE}
myformula <- y ~ x
shapes <- c(15,18,17,16)
(sizeplot2 <- ggplot(data = outlines_pca_size, aes(x=size_mm, y=PC2))+
  geom_smooth(method=lm, formula = y~x, color="gray50")+
  stat_poly_eq(formula = myformula, 
              aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
              parse = TRUE) + 
  geom_point(aes(color=ontogeny_coarse, shape=ontogeny_coarse))+
  geom_text_repel(data=subset(outlines_pca_size, !ontogeny_coarse %in% c("female", "male", "immature")), 
                  label=subset(outlines_pca_size, !ontogeny_coarse %in% c("female", "male", "immature"))$.id)+
  scale_color_manual(values =  c("#0085F8","#4CAB38","#C10093","#000000"))+ # color blind friendly
  scale_shape_manual(values=shapes)+
  scale_x_log10()+
  labs(color  = "Ontogeny / sex", shape = "Ontogeny / sex")
)
```

```{r}
lm(PC2 ~ log(size_mm), data = outlines_pca_size) %>% summary
```

```{r}
ggarrange(sizeplot1, sizeplot2, ncol=2, common.legend=TRUE, legend="bottom")
```

# Predictive models

Given the extant outline data, from which we know factors like the sex and the ontogenetic stage, we can try to make predictions about the outline data from the fossils.

For this, the dataset needs to be split in 'training data' (extant specimens) and 'testing data' (fossils).

```{r}
outlines_pca_train <- subset(outlines_pca_size, ontogeny_coarse != "fossil")
outlines_pca_test <- subset(outlines_pca_size, ontogeny_coarse == "fossil")
```

Some data cleaning.


```{r}
outlines_pca_train$ontogeny_coarse <- factor(outlines_pca_train$ontogeny_coarse)
outlines_pca_train$ontogeny_coarse <- relevel(outlines_pca_train$ontogeny_coarse, ref = "immature")
outlines_pca_train <- na.omit(outlines_pca_train)
```

# Linear discriminant analysis (LDA)
This method is not optimal for the data at hand, it assumes that the observations (the specimens) are independent of each other. This is not the case her because there are multiple specimens for the same species and the species are species are related to each other by phylogeny and other factors like the attachment site. However, a LDA can provide a visually accessible output (a 2D scatter plot). And by using other tools in parallel, the accuracy and the compatibility of a LDA can be assessed. 

Unlike a PCA, LDA does not create a space with the most variation but a space with the most separation of the predefined groups. 

The model uses PC1 to PC6 as variables from which the outcome will be predicted.
```{r}
outlines_lda_model <- lda(ontogeny_coarse ~ PC1 +
                                PC2 +
                                PC3 +
                                PC4 +
                                PC5 +
                                PC6 ,
                              data = outlines_pca_train)
```

With this model we can predict to which category the fossil specimens belong. But first the test data is to be formatted.

```{r}
outlines_test <- outlines_pca_test[, names(outlines_pca_test) %in% c("PC1","PC2",
                                                            "PC3","PC4",
                                                            "PC5","PC6")]
```

Now we can make a prediction. Two dimensions are specified because we want a 2D scatter plot.

```{r}
predict(outlines_lda_model, outlines_test, dimen = 2)
```
The output provides the classes (```$class```), which the model predicted for the specimens and the probabilities (```$posterior```)for each class and specimen. And it provides X and Y coordinates for the Scatterplot (```$x```).

Now, the test data has to be combined with the training data again, to produce a single plot.

```{r}
training_lda <- predict(outlines_lda_model)$x
training_lda <- cbind(as.character(predict(outlines_lda_model)$class), training_lda)

test_lda <- predict(outlines_lda_model, outlines_test)$x
test_lda <- cbind(as.character(outlines_pca_test$.id), test_lda)

all <- as.data.frame(rbind(test_lda, training_lda))
all$LD1 <- as.numeric(as.character(all$LD1))
all$LD2 <- as.numeric(as.character(all$LD2))
colnames(all) <- c("ontogeny_coarse", "LD1", "LD2")
```

```{r}
hull_f <- all %>% subset(ontogeny_coarse == "female") %>%
  slice(chull(LD1, LD2))
hull_m <- all %>% subset(ontogeny_coarse == "male") %>%
  slice(chull(LD1, LD2))
hull_i <- all %>% subset(ontogeny_coarse == "immature") %>%
  slice(chull(LD1, LD2))

shapes <- c(15,18,17,16,16,16,16)

factor(all$ontogeny_coarse)
all$ontogeny_coarse <- factor(all$ontogeny_coarse, levels = c("immature","male","female", "Specimen_01", "Specimen_02", "Specimen_06", "Specimen_07", "Specimen_08"))


(ldaplot <- ggplot(all, aes(x=LD1, y=LD2))+
  geom_polygon(data = hull_i, alpha = 0.2, fill="#0085F8")+
  geom_polygon(data = hull_m, alpha = 0.2, fill="#4CAB38")+  
  geom_polygon(data = hull_f, alpha = 0.2, fill="#C10093")+
  geom_point(data = subset(all, ontogeny_coarse %in% c("female", "male", "immature")),
             aes(color= ontogeny_coarse, shape= ontogeny_coarse))+
  geom_point(data = subset(all, !ontogeny_coarse %in% c("female", "male", "immature")))+
  scale_color_manual(values =  c("#0085F8","#4CAB38","#C10093","#000000"))+
  theme(legend.key = element_rect(fill = NA, color = NA))+
  geom_text_repel(data = subset(all, !ontogeny_coarse %in% c("female", "male", "immature")),
                  label=subset(all, !ontogeny_coarse %in% c("female", "male", "immature"))$ontogeny_coarse)+
      labs(color  = "Ontogeny / sex", shape = "Ontogeny / sex")
)





```

# Multinomial logistic regression
A multinomial regression is much more robust to non-independence of the observations. However, it does not come with such a graphical representation.

We fit the model without the size. We can use the training and test data from above.

```{r}
mult_nom_model <- multinom(ontogeny_coarse ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 , data = outlines_pca_train)
summary(mult_nom_model)
```

```{r}
mult_nom_pred <- outlines_pca_test[, 1:2]
mult_nom_pred$prediction <- predict(mult_nom_model, outlines_pca_test)
mult_nom_pred <- cbind(mult_nom_pred, predict(mult_nom_model, outlines_pca_test, type = "prob"))
mult_nom_pred

```

This gives us the discrete predictions and the probabilities that the model produces. This way, we can assess with what certainty the modelpredicts the outcomes. Here, we can see that the model is quite certain that specimen 01 is an immature but not very certain about the others.