#
# Machine Learning Code for Palaeontol- and Archaeo-stratigraphy
#
# Lloyd A. Courtenay - ladc1995@gmail.com
# TIDOP Research Group - University of Salamanca
#

# Here lies a generic example of the code used to produce results for the paper
# "Application of Artificially Intelligent Systems for the Identification of
# Discrete Fossiliferous Levels" by Martin-Perea and colleagues.
# There is no one-code-for-all that can be applied to any site.
# The analyst is thus required to adapt the code to the site at hand.
# The generic building blocks have therefore been supplied here:

# Libraries ------------------------------------------------------------------

library(xlsx)
library(openxlsx)
library(e1071)
library(caret)
library(tidyverse)
library(fpc)
library(dbscan)
library(ggplot2)
library(RColorBrewer) # used for colouring the decision-boundary plot

# Read data -----------------------------------------------------------------

a <- openxlsx::read.xlsx(file.choose(), sheet = 1) # Load data
a <- as_tibble(a) %>% na.omit %>% as.data.frame # Clean database - ensure no NA values exist
plot(a$x, # Plot and inspect loaded data <- NOTE : change x and y where necessary
     a$z)

# Unsupervised Algorithm ----------------------------------------------------

# Set hyperparameters for DBSCAN

MINPTS <- ## Here the analyst has to input hyperparameter settings ##

# Recommendations for hyperparameter settings have to consider the site at hand,
# however as a general guide:
#
# MinPts - a value between 3 and 5 is recommended. These values are intuitive
#          depending on slices with tightly packed densities (MinPts = 5) and
#          lower densities (MinPts = 3). Recommendations by Sander et al. 1998
#          and Schubert et al. 2017 describe MinPts = 2 * number of dimensions
#          to be optimal. This should then be adjusted according to the
#          complexity of the data under study.
# eps    - This is calculated in accordance with the MinPts value and the
#          complexity of the data under study. Optimisation of this value can
#          be performed using:

dbscan::kNNdistplot(a, k = MINPTS, all = TRUE)

# From the graph produced, search for the elbow/knee joint that defines the
# optimal eps value. Recommendations usually favour the smallest workable eps
# value, so we recommend selecting a value slightly under the elbow joint.

# Set EPS value

EPS <- ## Here the analyst has to input hyperparameter settings ##

# DBSCAN - using the previously defined hyperparameter values

db <- fpc::dbscan(a, eps = EPS, MinPts = MINPTS, method = "hybrid")
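# Optional sketch (not part of the original script): a quick inspection of the
# raw DBSCAN output before the geologist-in-the-loop step. fpc::dbscan labels
# noise points as cluster 0.

table(db$cluster) # number of points assigned to each cluster (0 = noise)
plot(a$y, # NOTE <- be sure to change x and y where necessary
     a$z,
     col = db$cluster + 1, pch = 19,
     main = "Raw DBSCAN clustering")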
# Geologist-in-the-loop intervention -------------------------------------------

# Here human interaction takes place. The analyst is required to carefully sift
# through the results, searching for areas where the algorithm has created too
# many clusters which can be grouped together - this may be a product of
# underlying geological components the algorithm is unable to detect/quantify.

# First plot results

cluster_groups <- db$cluster
cluster_groups <- factor(cluster_groups) # Extract cluster groupings

b <- tibble(x = a$x, y = a$y, z = a$z,
            cluster = cluster_groups) %>% # Add cluster groupings to the old database
    filter(cluster != "0") %>% droplevels() %>% # clean out noise (cluster "0")
    as.data.frame

hull <- b %>% group_by(cluster) %>% # Create convex hulls
    slice(chull(y, # NOTE <- be sure to change x and y where necessary
                z))

ggplot(b, aes(x = y, # NOTE <- be sure to change x and y where necessary
              y = z,
              color = cluster)) + # colour by cluster <- optional
    geom_point(size = 2) +
    geom_polygon(data = hull, alpha = 0.2)

# Once the analyst has identified the overall layers, create a final labelling
# (sense-making):

corrected_clustering <- b
corrected_clustering$cluster <- fct_collapse(corrected_clustering$cluster,
    Level_I = c("1") # here insert the number assigned by DBSCAN
    # example: Level_II = c("2"),
    # example: Level_III = c("3"),
    # ...
    # indet = c("0") # Here the analyst can insert groups that are considered noise by DBSCAN or the geologist etc.
)

# Replot new cleaned dataset for revision

ggplot(corrected_clustering, aes(x = y, # NOTE <- be sure to change x and y where necessary
                                 y = z,
                                 color = cluster)) + # colour by cluster <- optional
    geom_point(size = 2)

# plot without noise

corrected_clustering_no_noise <- as_tibble(corrected_clustering) %>%
    filter(cluster != "indet") %>% droplevels() %>%
    as.data.frame

ggplot(corrected_clustering_no_noise, aes(x = y, # NOTE <- be sure to change x and y where necessary
                                          y = z,
                                          color = cluster)) + # colour by cluster <- optional
    geom_point(size = 2)

# Supervised learning ------------------------------------------------------------------

# Separate indets and the data that will be used for training.
# The separation of indets can be performed via the following:

indet <- as_tibble(corrected_clustering) %>%
    filter(cluster == "indet") %>%
    droplevels() %>% # Drop the factor levels (Level_I, Level_II, ...) that were previously defined by geologist-in-the-loop intervention
    as.data.frame

corrected_clustering_no_indet <- as_tibble(corrected_clustering) %>%
    filter(cluster != "indet") %>% droplevels() %>%
    as.data.frame

indet <- na.omit(indet)
corrected_clustering_no_indet <- na.omit(corrected_clustering_no_indet)

# Split into training and test sets

split.data = function(data, p = 0.7, s = 666) {
    set.seed(s)
    index = sample(1:dim(data)[1])
    train = data[index[1:floor(dim(data)[1] * p)], ]
    test = data[index[((ceiling(dim(data)[1] * p)) + 1):dim(data)[1]], ]
    return(list(train = train, test = test))
}

allset <- split.data(corrected_clustering_no_indet, p = 0.7) # split data 70% for training
trainset <- allset$train
testset <- allset$test
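# Optional alternative sketch (not part of the original script): caret's
# createDataPartition() produces a stratified 70/30 split that preserves the
# proportion of points per level in both sets, which can help when some levels
# contain far fewer points than others.

set.seed(666)
train_index <- caret::createDataPartition(corrected_clustering_no_indet$cluster,
                                          p = 0.7, list = FALSE)
trainset_strat <- corrected_clustering_no_indet[train_index, ]
testset_strat <- corrected_clustering_no_indet[-train_index, ]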
# From this point onwards, supervised learning can be performed in a number of
# different manners. Here are some basic algorithms that can be used:

# SVMs can be trained in a number of different packages. Here 'e1071' has been
# used to train the SVM itself, with 'caret' used for evaluation:

# Example of a cross-validation control object (for caret models below)

ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)

# Tune the SVM

best_score <- 0 # Set initial score to 0 - this will be updated during tuning until reaching the best model performance
C <- c(0); gamma <- c(0); best_params <- data.frame(C, gamma) # Initialise placeholders for the hyperparameter values

for (i in 1:50) { # Run the tuning process for 50 iterations
    svm_tune <- svm(cluster ~ ., data = trainset,
                    kernel = "radial", # SVM model using a radial kernel
                    cost = runif(1, min = 0, max = 500), # draw a random cost value per iteration between 0 and 500
                    gamma = runif(1, min = 0, max = 500), # draw a random gamma value per iteration between 0 and 500
                    cross = 10, probability = TRUE) # Use k-fold cross validation and allow for probability calculations
    svm.fit <- predict(svm_tune, testset[, !names(testset) %in% c("cluster")]) # Evaluate tuned model on the test set
    conmax <- confusionMatrix(table(svm.fit, testset$cluster)) # Evaluate model performance
    score <- conmax$overall["Accuracy"] # Extract the accuracy of the tested model
    if (score > best_score) { # Update previously defined values using the best values obtained during tuning
        best_score <- score # Save the best accuracy value obtained
        best_cost <- svm_tune$cost # Save the optimal cost value used to obtain the best accuracy score
        best_gamma <- svm_tune$gamma # Save the optimal gamma value used to obtain the best accuracy score
    }
}
optimal_model <- data.frame(best_score, best_cost, best_gamma) # Create a table with the best values
optimal_model # Print the optimal model hyperparameters

# Train the final SVM

svm.model <- svm(cluster ~ ., data = trainset,
                 kernel = "radial",
                 cost = optimal_model$best_cost, # Choosing the best cost value found in tuning
                 gamma = optimal_model$best_gamma, # Choosing the best gamma value found in tuning
                 cross = 10, probability = TRUE) # Use k-fold cross validation and allow for probability calculations

summary(svm.model) # View the final model constructed

# evaluate performance

svm.predict <- predict(svm.model, testset[, !names(testset) %in% c("cluster")])
svm.table <- table(svm.predict, testset$cluster)
confusionMatrix(table(svm.predict, testset$cluster)) # confusion matrix
svm.prob <- predict(svm.model, testset[, !names(testset) %in% c("cluster")],
                    decision.values = TRUE, probability = TRUE)
svm.classprob <- attr(svm.prob, "probabilities"); View(svm.classprob) # Returns class probabilities

# Random Forests can be trained in a number of different packages. Here 'caret'
# has been used to provide an example:

RF.model <- train(cluster ~ ., data = trainset,
                  method = "rf",
                  trControl = ctrl)

# model evaluation

RF.predict <- predict(RF.model, testset[c("x", "y", "z")])
confusionMatrix(RF.predict, testset$cluster)
RF.perc <- predict(RF.model, indet[c("x", "y", "z")], type = "prob") # for class probabilities
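# Optional sketch (not part of the original script): compare the two trained
# classifiers on the same test set, so that the better-performing model can be
# chosen for labelling the indeterminable points below.

svm_acc <- confusionMatrix(table(predict(svm.model,
                                         testset[, !names(testset) %in% c("cluster")]),
                                 testset$cluster))$overall["Accuracy"]
rf_acc <- confusionMatrix(RF.predict, testset$cluster)$overall["Accuracy"]
data.frame(model = c("SVM", "Random Forest"),
           accuracy = c(svm_acc, rf_acc))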
# Use a model to predict the labels of the indeterminable points, i.e. profile
# cleaning

indet_labels <- predict(RF.model, # or the SVM, depending on the best performing model
                        indet[c("x", "y", "z")],
                        type = "prob") # or "raw" for hard class labels
View(indet_labels)

# Drawing SVM decision boundaries

# Create a fine grid of the feature space

x <- seq(from = min(trainset$x), to = max(trainset$x), length = 20) # Lengths can vary depending on the desired resolution
y <- seq(from = min(trainset$y), to = max(trainset$y), length = 100) # Lengths can vary depending on the desired resolution
z <- seq(from = min(trainset$z), to = max(trainset$z), length = 30) # Lengths can vary depending on the desired resolution
x.grid <- expand.grid(x = x, y = y, z = z) # name the columns so they match the training variables

# Get class predictions over the grid

pred <- predict(svm.model, newdata = x.grid)

# Plot the results

cols <- brewer.pal(3, "Set1")
plot(x.grid$y, x.grid$z,
     pch = 15, cex = 1.5,
     col = adjustcolor(cols[pred], alpha.f = 0.05))
points(a$y, # adjust accordingly
       a$z,
       pch = ".", cex = 3)
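# Optional final step (a sketch, not part of the original script): export the
# complete labelled dataset, e.g. with openxlsx, so that the corrected
# stratigraphic levels can be merged back into the site database. The output
# file name is only an illustrative placeholder.

indet_final <- indet
indet_final$cluster <- predict(RF.model, indet[c("x", "y", "z")]) # hard class labels
final_levels <- rbind(corrected_clustering_no_indet, indet_final)
openxlsx::write.xlsx(final_levels, "corrected_stratigraphic_levels.xlsx")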