# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#
# Machine Learning Code for Palaeo- and Archaeo-stratigraphy
#
# Lloyd A. Courtenay - ladc1995@gmail.com - TIDOP Research Group - University of Salamanca
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Here lies a generic example of the code used to produce results for the paper "Application of Artificially Intelligent Systems for the Identification of
# Discrete Fossiliferous Levels" by Martin-Perea and colleagues.
# There is no one-size-fits-all code that can be applied to every site.
# The analyst is thus required to adapt the code according to the site at hand. The generic building blocks have therefore been supplied here:
# Libraries ------------------------------------------------------------------
library(xlsx)         # Excel input/output
library(openxlsx)     # read.xlsx() with the 'sheet' argument used below (masks xlsx::read.xlsx)
library(e1071)        # Support Vector Machines
library(caret)        # trainControl(), train() and confusionMatrix()
library(tidyverse)    # data manipulation (dplyr, tibble, forcats)
library(fpc)          # fpc::dbscan()
library(dbscan)       # dbscan::kNNdistplot()
library(ggplot2)      # plotting (also loaded with the tidyverse)
library(RColorBrewer) # brewer.pal() palette used for the decision boundary plot
# Read data -----------------------------------------------------------------
a<-read.xlsx(file.choose(), sheet = 1) # Load data
a<-as_tibble(a) %>% na.omit %>% as.data.frame # Clean the database - ensure that no NA values remain
plot(a$x, # Plot and inspect loaded data <- NOTE : Change x and y where necessary
a$z)
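# Depending on the spreadsheet, it may also be necessary to keep only the spatial coordinates before clustering
# (an optional, illustrative step; the column names x, y and z are assumed):
# a <- a[, c("x", "y", "z")]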
# Unsupervised Algorithm ----------------------------------------------------
# Set hyperparameters for DBSCAN
MINPTS <- ## Here the analyst has to input hyperparameter settings ##
# Recommendations for hyperparameter settings have to consider the site at hand; however, as a general guide:
# MinPts - a value between 3 and 5 is recommended. These values are intuitive depending on whether slices present tightly packed densities (MinPts = 5) or
# lower densities (MinPts = 3). Recommendations by Sander et al. 1998 and Schubert et al. 2017 describe MinPts = 2 * number of dimensions
# as optimal. This should then be adjusted according to the complexity of the data under study (see the illustrative example below).
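# As a purely illustrative example (assuming only the three spatial columns x, y and z are used as input),
# the 2 * dimensions rule of thumb would give:
# MINPTS <- 2 * 3   # i.e. MinPts = 6; adjust according to the density and complexity of the site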
# eps - This is calculated in accordance with the MinPts value and the complexity of the data under study. Optimisation of this value can be
# performed using:
dbscan::kNNdistplot(a, k = MINPTS, all = TRUE)
# From the graph produced, search for the elbow/knee point that defines the optimal eps value. Recommendations
# usually favour the smallest eps value possible; we therefore recommend selecting a value slightly below the elbow point
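# As an optional visual aid (the 0.05 below is a hypothetical value, not a recommendation), a candidate eps
# can be drawn onto the kNN distance plot to check where it sits relative to the elbow:
# abline(h = 0.05, lty = 2)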
# Set EPS value
EPS <- ## Here the analyst has to input hyperparameter settings ##
# DBSCAN - using previously defined hyperparameter values
db<-fpc::dbscan(a, eps = EPS, MinPts = MINPTS, method = "hybrid")
# Geologist-in-the-loop intervention -------------------------------------------------------
# Here human interaction takes place. The analyst is required to carefully sift through the results, searching for areas where the algorithm has created too many clusters
# that can be grouped together - this may be a product of underlying geological components the algorithm is unable to detect/quantify
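# As an optional aid, a quick tabulation of the number of points DBSCAN assigned to each cluster (cluster 0 = noise) can help guide this revision:
table(db$cluster)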
# Then plot the results
cluster_groups<-db$cluster; cluster_groups<-factor(cluster_groups) # Extract cluster groupings
b<-tibble(x = a$x, y = a$y, z = a$z,
          cluster = cluster_groups) %>% # Add the cluster groupings to the old database
  filter(cluster != "0") %>% droplevels(exclude = "0") %>% # clean out the noise points (cluster 0)
  as.data.frame
hull<-b %>% group_by(cluster) %>% # Create convex hulls
slice(chull(x, # NOTE <- be sure to change x and y where necessary
z)
)
ggplot(b, aes(x = y, # NOTE <- be sure to change x and y where necessary
y = z,
color = cluster)) + # colour by cluster <- optional
geom_point(size = 2) +
geom_polygon(data = hull, alpha = 0.2)
# Once the analyst has identified the overall layers, create a final labelling (sense-making):
corrected_clustering<-b; corrected_clustering$cluster<-fct_collapse(corrected_clustering$cluster,
                                                                    Level_I = c("1"), # here insert the cluster number assigned by DBSCAN
                                                                    # example: # Level_II = c("2"),
                                                                    # example: # Level_III = c("3"),
                                                                    # ... #
                                                                    indet = c("0") # Here the analyst can insert the numbers of any groups considered noise by DBSCAN or by the geologist, etc.
)
# Replot new cleaned dataset for revision
ggplot(corrected_clustering,
aes(x = y, # NOTE <- be sure to change x and y where necessary
y = z,
color = cluster)) + # colour by cluster <- optional
geom_point(size = 2)
# plot without noise
corrected_clustering_no_noise<-as_tibble(corrected_clustering) %>% filter(cluster != "indet") %>% droplevels %>% as.data.frame
ggplot(corrected_clustering_no_noise,
aes(x = y, # NOTE <- be sure to change x and y where necessary
y = z,
color = cluster)) + # colour by cluster <- optional
geom_point(size = 2)
# Supervised learning ------------------------------------------------------------------
# Separate the indeterminate points (indets) from the data that will be used for training
# The code to separate the indets can be performed as follows:
indet<-as_tibble(corrected_clustering) %>%
  filter(cluster == "indet") %>%
  droplevels(exclude = c("Level_I" # , Remove the factor levels that were previously defined during the geologist-in-the-loop intervention
                         # "Level_II"
                         # ... #
  )) %>% as.data.frame
corrected_clustering_no_indet<-as_tibble(corrected_clustering) %>% filter(cluster != "indet") %>% droplevels %>% as.data.frame
indet<-na.omit(indet); corrected_clustering_no_indet<-na.omit(corrected_clustering_no_indet)
# Split into training and test sets
split.data = function(data, p = 0.7, s = 666) {
  set.seed(s) # use the seed argument so that the split is reproducible
  index = sample(1:dim(data)[1])
  train = data[index[1:floor(dim(data)[1] * p)], ]
  test = data[index[((ceiling(dim(data)[1] * p)) + 1):dim(data)[1]], ]
  return(list(train = train, test = test))}
allset<-split.data(corrected_clustering_no_indet, p = 0.7) # split data 70% for training
trainset<-allset$train
testset<-allset$test
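# As an optional sanity check (not part of the original pipeline), confirm that every level defined above is
# represented in both partitions before training:
# table(trainset$cluster); table(testset$cluster)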
# From this point onwards, supervised learning can be performed in a number of different ways. Here are some basic algorithms that can be used:
# SVMs can be trained in a number of different packages. Here 'e1071' is used to fit the SVM, while 'caret' provides cross-validation control and confusion matrices:
# Example of a repeated k-fold cross-validation scheme
ctrl<-trainControl(method = "repeatedcv", number = 10, repeats = 10)
# Train the SVM using e1071, tuning the hyperparameters by random search
best_score<-0 # Set initial score to 0 - this will be updated during tuning until reaching the best model performance
best_cost<-0; best_gamma<-0 # Set initial values to 0 - these will be updated as before
for (i in 1:50) { # Run the tuning process for 50 iterations
  svm_tune<-svm(cluster~., data = trainset, kernel = "radial", # SVM model using a radial kernel
                cost = runif(1, min = 0, max = 500), # draw a random cost value between 0 and 500 per iteration
                gamma = runif(1, min = 0, max = 500), # draw a random gamma value between 0 and 500 per iteration
                probability = TRUE) # allow for class probability calculations
  svm.fit<-predict(svm_tune, testset[, !names(testset) %in% c("cluster")]) # Evaluate the tuned model on the test set
  conmax<-confusionMatrix(table(svm.fit, testset$cluster)) # Evaluate model performance
  score<-conmax$overall["Accuracy"] # Extract the accuracy of the tested model
  if (score > best_score) { # Update the previously defined values whenever a better model is found
    best_score <- score # Save the best accuracy value obtained
    best_cost <- svm_tune$cost # Save the cost value used to obtain the best accuracy score
    best_gamma <- svm_tune$gamma # Save the gamma value used to obtain the best accuracy score
  }
}
optimal_model <- data.frame(best_score, best_cost, best_gamma); optimal_model # Print the optimal model hyperparameters
svm.model<-svm(cluster~., data = trainset, kernel = "radial",
               cost = optimal_model$best_cost, # Use the best cost value found during tuning
               gamma = optimal_model$best_gamma, # Use the best gamma value found during tuning
               probability = TRUE) # allow for class probability calculations
summary(svm.model) # View final model constructed
# evaluate performance
svm.predict<-predict(svm.model, testset[, !names(testset) %in% c("cluster")])
svm.table<-table(svm.predict, testset$cluster)
confusionMatrix(table(svm.predict, testset$cluster)) # confusion matrix
svm.prob<-predict(svm.model, testset[, !names(testset) %in% c("cluster")], decision.values = TRUE, probability = TRUE)
svm.classprob<-attr(svm.prob,"probabilities"); View(svm.classprob) # Returns class probabilities
# Random Forests can be trained in a number of different packages. Here 'caret' has been used to provide an example
RF.model<-train(cluster~., data = trainset, method = "rf", trControl = ctrl) # class probabilities are obtained later via predict(..., type = "prob")
# model evaluation
RF.predict<-predict(RF.model, testset[c("x","y","z")])
confusionMatrix(RF.predict, testset$cluster)
RF.perc<-predict(RF.model, indet[c("x","y","z")], type = "prob") # for class probabilities
# Use a model to predict the labels of the indeterminate points, i.e. profile cleaning
indet_labels<-predict(RF.model, # or the SVM, depending on the best performing model
                      indet[c("x","y","z")],
                      type = "prob" # or "raw" to return the predicted class labels directly
)
View(indet_labels)
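# If desired, the predictions for the indeterminate points can be exported alongside their coordinates
# (a minimal sketch assuming openxlsx is loaded; the file name is purely illustrative):
# write.xlsx(cbind(indet[c("x", "y", "z")], indet_labels), "indet_predictions.xlsx")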
# Drawing SVM decision boundaries
# Create a fine grid of the feature space
x <- seq(from = min(trainset$x), to = max(trainset$x), length = 20) # Lengths can vary depending on the desired resolution
y <- seq(from = min(trainset$y), to = max(trainset$y), length = 100) # Lengths can vary depending on the desired resolution
z <- seq(from = min(trainset$z), to = max(trainset$z), length = 30) # Lengths can vary depending on the desired resolution
x.grid<-expand.grid(x = x, y = y, z = z) # name the grid columns so they match the variables the model was trained on
# Get class predictions over the grid
pred<-predict(svm.model, newdata = x.grid)
# Plot the results
cols <- brewer.pal(3, "Set1") # adjust the number of colours to the number of stratigraphic levels defined
plot(x.grid$y, x.grid$z, pch = 15, cex = 1.5, col = adjustcolor(cols[pred], alpha.f = 0.05))
points(a$y, # adjust accordingly
a$z,
pch = ".", cex = 3)
#