####################################################################################################################### ####The biosonar of the boto: evidence of differences among species of river dolphins (Inia spp.) from the Amazon#### ####Melo et al.#### ####PeerJ#### ####################################################################################################################### #Kruskal-wallis kruskal.test(data~spp) plot(data~spp) #boxplot qqnorm(data) qqline(data) #Mann-Whitney wilcox.test(data~spp, pared=F) plot(data~spp) #post hoc Dunn, method Bonferroni library(dunn.test) library(FSA) dunnTest(data,spp, method="bonferroni") ##Random Forest #Loading packages library(randomForest) library(caret) library(e1071) library(pROC) # Split into Training and Validation datasets set.seed(500) train <- sample(nrow(data1), 0.80*nrow(data1), replace = FALSE) TrainSet <- data1[train,] ValidSet <- data1[-train,] summary(TrainSet) summary(ValidSet) # Choosing the best mtry set.seed(1) mtry <- tuneRF(x = TrainSet[,2:4], y = TrainSet[,1], mtryStart=4, ntreeTry = 500, trace=TRUE, plot=TRUE) print(mtry) # Random Forest with the best mtry model <- randomForest(spp ~ ., data = TrainSet, ntree = 500, mtry = 2, importance = TRUE, proximity=TRUE) # Predicting on training set predTrain <- predict(model, TrainSet, type = "class") confusionMatrix(predTrain, TrainSet$spp) # Predicting on Validation set predValid <- predict(model, ValidSet, type = "class") confusionMatrix(predValid, ValidSet$spp) # Variables importance importance(model) varImp(model) varImpPlot(model) #ROC curves with AUC predictions <- as.data.frame(predict(model, ValidSet, type = "prob")) # predict class and then attach test class predictions$predict <- names(predictions)[1:4][apply(predictions[,1:4], 1, which.max)] predictions$observed <- ValidSet$spp head(predictions) roc.Ia <- roc(ifelse(predictions$observed=="Ia", "Ia", "non-Ia"), as.numeric(predictions$Ia), smoothed = TRUE) roc.Ia plot(roc.Ia, col="red") lines(roc.Ia, col="red", text(0.1, 0.0, cex=0.8, labels=sprintf("AUC Ia: %0.3f", auc(roc.Ia)), col="red")) ##K-means Cluster #Loading packages library(tidyverse) library(cluster) library(factoextra) library(NbClust) # Scaling data data2 <- data.frame(scale(dat1)) # Verifying uniformity of variance and getting principal components plot(sapply(data2, var)) pc <- prcomp(data2) comp <- data.frame(pc$x[,1:2]) #Choosing number of K fviz_nbclust(comp, kmeans, method = "silhouette", barfill = "black", barcolor = "black", linecolor = "black", nboot=999)+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.border = element_rect(linetype = "solid", fill = NA)) # Compute k-means with k = 3 set.seed(123) km.res3 <- kmeans(comp, 3, nstart = 25) print(km.res) aggregate(comp, by=list(cluster=km.res$cluster), mean) km.res$cluster fviz_cluster(km.res3, comp, geom = "point", ellipse.type = "norm", cex=0.8, ellipse.level = 0.95, ellipse.alpha = 0.2, pointsize=1)+ theme(legend.key = element_rect(fill = "white"), legend.text = element_text(size=10), legend.position="right", legend.box.background = element_rect(),legend.background= element_blank(), panel.background= element_blank(), panel.grid.major = element_line(colour = "darkgray", size = 0.5, linetype = 'dotted'), panel.grid.minor = element_blank(),panel.border = element_rect(linetype = "solid", fill = NA))+ xlab("PC1 - 62.38%") + ylab("PC2 - 24.15%") #Number of k Cluster validation nb <- NbClust(comp, distance = "euclidean", min.nc = 2, max.nc = 10, method = "kmeans", index ="all") fviz_nbclust(nb) + theme_minimal() hist(nb$Best.nc[1,],breaks = 15)