#######################################################################################################################
####The biosonar of the boto: evidence of differences among species of river dolphins (Inia spp.) from the Amazon####
####Melo et al.####
####PeerJ####
#######################################################################################################################

#Kruskal-wallis
kruskal.test(data~spp)
plot(data~spp)

#boxplot
qqnorm(data)
qqline(data)

#Mann-Whitney
wilcox.test(data~spp, pared=F)
plot(data~spp)

#post hoc Dunn, method Bonferroni
library(dunn.test)
library(FSA)
dunnTest(data,spp, method="bonferroni")

##Random Forest
#Loading packages
library(randomForest)
library(caret)
library(e1071)
library(pROC)

# Split into Training and Validation datasets
set.seed(500)
train <- sample(nrow(data1), 0.80*nrow(data1), replace = FALSE)
TrainSet <- data1[train,]
ValidSet <- data1[-train,]
summary(TrainSet)
summary(ValidSet)

# Choosing the best mtry
set.seed(1)           
mtry <- tuneRF(x = TrainSet[,2:4], y = TrainSet[,1], mtryStart=4, ntreeTry = 500, trace=TRUE, plot=TRUE)
print(mtry)

# Random Forest with the best mtry
model <- randomForest(spp ~ ., data = TrainSet, ntree = 500, mtry = 2, importance = TRUE, proximity=TRUE)

# Predicting on training set
predTrain <- predict(model, TrainSet, type = "class")
confusionMatrix(predTrain, TrainSet$spp)

# Predicting on Validation set
predValid <- predict(model, ValidSet, type = "class")
confusionMatrix(predValid, ValidSet$spp)

# Variables importance
importance(model)
varImp(model)
varImpPlot(model)

#ROC curves with AUC
predictions <- as.data.frame(predict(model, ValidSet, type = "prob"))

# predict class and then attach test class
predictions$predict <- names(predictions)[1:4][apply(predictions[,1:4], 1, which.max)]
predictions$observed <- ValidSet$spp
head(predictions)
roc.Ia <- roc(ifelse(predictions$observed=="Ia", "Ia", "non-Ia"), as.numeric(predictions$Ia), smoothed = TRUE)
roc.Ia
plot(roc.Ia, col="red")
lines(roc.Ia, col="red", text(0.1, 0.0, cex=0.8, labels=sprintf("AUC Ia: %0.3f", auc(roc.Ia)), col="red"))

##K-means Cluster
#Loading packages
library(tidyverse)
library(cluster)
library(factoextra)
library(NbClust)

# Scaling data
data2 <- data.frame(scale(dat1))

# Verifying uniformity of variance and getting principal components
plot(sapply(data2, var))
pc <- prcomp(data2)
comp <- data.frame(pc$x[,1:2])

#Choosing number of K
fviz_nbclust(comp, kmeans, method = "silhouette", barfill = "black",
             barcolor = "black", linecolor = "black", nboot=999)+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.border = element_rect(linetype = "solid", fill = NA))

# Compute k-means with k = 3
set.seed(123)
km.res3 <- kmeans(comp, 3, nstart = 25)
print(km.res)
aggregate(comp, by=list(cluster=km.res$cluster), mean)
km.res$cluster

fviz_cluster(km.res3, comp, geom = "point", ellipse.type = "norm", cex=0.8, ellipse.level = 0.95, ellipse.alpha = 0.2, pointsize=1)+
  theme(legend.key = element_rect(fill = "white"), legend.text = element_text(size=10), legend.position="right",
        legend.box.background = element_rect(),legend.background= element_blank(),
        panel.background= element_blank(), panel.grid.major = element_line(colour = "darkgray", size = 0.5, linetype = 'dotted'),
        panel.grid.minor = element_blank(),panel.border = element_rect(linetype = "solid", fill = NA))+
  xlab("PC1 - 62.38%") + ylab("PC2 - 24.15%")

#Number of k Cluster validation
nb <- NbClust(comp, distance = "euclidean", min.nc = 2, max.nc = 10, method = "kmeans", index ="all")
fviz_nbclust(nb) + theme_minimal()
hist(nb$Best.nc[1,],breaks = 15)