#
# Machine Learning Code for Palaeontol- and Archaeo-stratigraphy
#
# Lloyd A. Courtenay - ladc1995@gmail.com
# TIDOP Research Group - University of Salamanca
#

# Here lies a generic example of the code used to produce results for the paper
# "Application of Artificially Intelligent Systems for the Identification of
# Discrete Fossiliferous Levels" by Martin-Perea and colleagues.
# There is no one-code-for-all that can be applied to any site.
# The analyst is thus required to adapt the code to the site at hand.
# The generic building blocks have therefore been supplied here:

# Libraries ------------------------------------------------------------------

library(xlsx)
library(openxlsx)
library(e1071)
library(caret)
library(tidyverse)
library(fpc)
library(dbscan)
library(ggplot2)
library(RColorBrewer) # used for colouring the decision-boundary plot

# Read data -----------------------------------------------------------------

a <- openxlsx::read.xlsx(file.choose(), sheet = 1) # Load data
a <- as_tibble(a) %>% na.omit %>% as.data.frame # Clean database - ensure no NA values exist
plot(a$x, # Plot and inspect loaded data <- NOTE : change x and y where necessary
     a$z)

# Unsupervised Algorithm ----------------------------------------------------

# Set hyperparameters for DBSCAN

MINPTS <- ## Here the analyst has to input hyperparameter settings ##

# Recommendations for hyperparameter settings have to consider the site at hand,
# however as a general guide:
#
# MinPts - a value between 3 and 5 is recommended. These values are intuitive
#          depending on slices with tightly packed densities (MinPts = 5) and
#          lower densities (MinPts = 3). Recommendations by Sander et al. 1998
#          and Schubert et al. 2017 describe MinPts = 2 * number of dimensions
#          to be optimal. This should then be adjusted according to the
#          complexity of the data under study.
# eps    - This is calculated in accordance with the MinPts value and the
#          complexity of the data under study. Optimisation of this value can
#          be performed using:

dbscan::kNNdistplot(a, k = MINPTS, all = TRUE)

# From the graph produced, search for the elbow/knee joint that defines the
# optimal eps value. Recommendations usually favour the smallest workable eps
# value, so we recommend selecting a value slightly under the elbow joint.

# Set EPS value

EPS <- ## Here the analyst has to input hyperparameter settings ##

# DBSCAN - using the previously defined hyperparameter values

db <- fpc::dbscan(a, eps = EPS, MinPts = MINPTS, method = "hybrid")
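# Optional sketch (not part of the original script): a quick inspection of the
# raw DBSCAN output before the geologist-in-the-loop step. fpc::dbscan labels
# noise points as cluster 0.

table(db$cluster) # number of points assigned to each cluster (0 = noise)
plot(a$y, # NOTE <- be sure to change x and y where necessary
     a$z,
     col = db$cluster + 1, pch = 19,
     main = "Raw DBSCAN clustering")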
# Geologist-in-the-loop intervention -------------------------------------------

# Here human interaction takes place. The analyst is required to carefully sift
# through the results, searching for areas where the algorithm has created too
# many clusters which can be grouped together - this may be a product of
# underlying geological components the algorithm is unable to detect/quantify.

# First plot results

cluster_groups <- db$cluster
cluster_groups <- factor(cluster_groups) # Extract cluster groupings

b <- tibble(x = a$x, y = a$y, z = a$z,
            cluster = cluster_groups) %>% # Add cluster groupings to the old database
    filter(cluster != "0") %>% droplevels() %>% # clean out noise (cluster "0")
    as.data.frame

hull <- b %>% group_by(cluster) %>% # Create convex hulls
    slice(chull(y, # NOTE <- be sure to change x and y where necessary
                z))

ggplot(b, aes(x = y, # NOTE <- be sure to change x and y where necessary
              y = z,
              color = cluster)) + # colour by cluster <- optional
    geom_point(size = 2) +
    geom_polygon(data = hull, alpha = 0.2)

# Once the analyst has identified the overall layers, create a final labelling
# (sense-making):

corrected_clustering <- b
corrected_clustering$cluster <- fct_collapse(corrected_clustering$cluster,
    Level_I = c("1") # here insert the number assigned by DBSCAN
    # example: Level_II = c("2"),
    # example: Level_III = c("3"),
    # ...
    # indet = c("0") # Here the analyst can insert groups that are considered noise by DBSCAN or the geologist etc.
)

# Replot new cleaned dataset for revision

ggplot(corrected_clustering, aes(x = y, # NOTE <- be sure to change x and y where necessary
                                 y = z,
                                 color = cluster)) + # colour by cluster <- optional
    geom_point(size = 2)

# plot without noise

corrected_clustering_no_noise <- as_tibble(corrected_clustering) %>%
    filter(cluster != "indet") %>% droplevels() %>%
    as.data.frame

ggplot(corrected_clustering_no_noise, aes(x = y, # NOTE <- be sure to change x and y where necessary
                                          y = z,
                                          color = cluster)) + # colour by cluster <- optional
    geom_point(size = 2)

# Supervised learning ------------------------------------------------------------------

# Separate indets and the data that will be used for training.
# The separation of indets can be performed via the following:

indet <- as_tibble(corrected_clustering) %>%
    filter(cluster == "indet") %>%
    droplevels() %>% # Drop the factor levels (Level_I, Level_II, ...) that were previously defined by geologist-in-the-loop intervention
    as.data.frame

corrected_clustering_no_indet <- as_tibble(corrected_clustering) %>%
    filter(cluster != "indet") %>% droplevels() %>%
    as.data.frame

indet <- na.omit(indet)
corrected_clustering_no_indet <- na.omit(corrected_clustering_no_indet)

# Split into training and test sets

split.data = function(data, p = 0.7, s = 666) {
    set.seed(s)
    index = sample(1:dim(data)[1])
    train = data[index[1:floor(dim(data)[1] * p)], ]
    test = data[index[((ceiling(dim(data)[1] * p)) + 1):dim(data)[1]], ]
    return(list(train = train, test = test))
}

allset <- split.data(corrected_clustering_no_indet, p = 0.7) # split data 70% for training
trainset <- allset$train
testset <- allset$test
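# Optional alternative sketch (not part of the original script): caret's
# createDataPartition() produces a stratified 70/30 split that preserves the
# proportion of points per level in both sets, which can help when some levels
# contain far fewer points than others.

set.seed(666)
train_index <- caret::createDataPartition(corrected_clustering_no_indet$cluster,
                                          p = 0.7, list = FALSE)
trainset_strat <- corrected_clustering_no_indet[train_index, ]
testset_strat <- corrected_clustering_no_indet[-train_index, ]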
# From this point onwards, supervised learning can be performed in a number of
# different manners. Here are some basic algorithms that can be used:

# SVMs can be trained in a number of different packages. Here 'e1071' has been
# used to train the SVM itself, with 'caret' used for evaluation:

# Example of a cross-validation control object (for caret models below)

ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)

# Tune the SVM

best_score <- 0 # Set initial score to 0 - this will be updated during tuning until reaching the best model performance
C <- c(0); gamma <- c(0); best_params <- data.frame(C, gamma) # Initialise placeholders for the hyperparameter values

for (i in 1:50) { # Run the tuning process for 50 iterations
    svm_tune <- svm(cluster ~ ., data = trainset,
                    kernel = "radial", # SVM model using a radial kernel
                    cost = runif(1, min = 0, max = 500), # draw a random cost value per iteration between 0 and 500
                    gamma = runif(1, min = 0, max = 500), # draw a random gamma value per iteration between 0 and 500
                    cross = 10, probability = TRUE) # Use k-fold cross validation and allow for probability calculations
    svm.fit <- predict(svm_tune, testset[, !names(testset) %in% c("cluster")]) # Evaluate tuned model on the test set
    conmax <- confusionMatrix(table(svm.fit, testset$cluster)) # Evaluate model performance
    score <- conmax$overall["Accuracy"] # Extract the accuracy of the tested model
    if (score > best_score) { # Update previously defined values using the best values obtained during tuning
        best_score <- score # Save the best accuracy value obtained
        best_cost <- svm_tune$cost # Save the optimal cost value used to obtain the best accuracy score
        best_gamma <- svm_tune$gamma # Save the optimal gamma value used to obtain the best accuracy score
    }
}
optimal_model <- data.frame(best_score, best_cost, best_gamma) # Create a table with the best values
optimal_model # Print the optimal model hyperparameters

# Train the final SVM

svm.model <- svm(cluster ~ ., data = trainset,
                 kernel = "radial",
                 cost = optimal_model$best_cost, # Choosing the best cost value found in tuning
                 gamma = optimal_model$best_gamma, # Choosing the best gamma value found in tuning
                 cross = 10, probability = TRUE) # Use k-fold cross validation and allow for probability calculations

summary(svm.model) # View the final model constructed

# evaluate performance

svm.predict <- predict(svm.model, testset[, !names(testset) %in% c("cluster")])
svm.table <- table(svm.predict, testset$cluster)
confusionMatrix(table(svm.predict, testset$cluster)) # confusion matrix
svm.prob <- predict(svm.model, testset[, !names(testset) %in% c("cluster")],
                    decision.values = TRUE, probability = TRUE)
svm.classprob <- attr(svm.prob, "probabilities"); View(svm.classprob) # Returns class probabilities

# Random Forests can be trained in a number of different packages. Here 'caret'
# has been used to provide an example:

RF.model <- train(cluster ~ ., data = trainset,
                  method = "rf",
                  trControl = ctrl)

# model evaluation

RF.predict <- predict(RF.model, testset[c("x", "y", "z")])
confusionMatrix(RF.predict, testset$cluster)
RF.perc <- predict(RF.model, indet[c("x", "y", "z")], type = "prob") # for class probabilities
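# Optional sketch (not part of the original script): compare the two trained
# classifiers on the same test set, so that the better-performing model can be
# chosen for labelling the indeterminable points below.

svm_acc <- confusionMatrix(table(predict(svm.model,
                                         testset[, !names(testset) %in% c("cluster")]),
                                 testset$cluster))$overall["Accuracy"]
rf_acc <- confusionMatrix(RF.predict, testset$cluster)$overall["Accuracy"]
data.frame(model = c("SVM", "Random Forest"),
           accuracy = c(svm_acc, rf_acc))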
# Use a model to predict the labels of the indeterminable points, i.e. profile
# cleaning

indet_labels <- predict(RF.model, # or the SVM, depending on the best performing model
                        indet[c("x", "y", "z")],
                        type = "prob") # or "raw" for hard class labels
View(indet_labels)

# Drawing SVM decision boundaries

# Create a fine grid of the feature space

x <- seq(from = min(trainset$x), to = max(trainset$x), length = 20) # Lengths can vary depending on the desired resolution
y <- seq(from = min(trainset$y), to = max(trainset$y), length = 100) # Lengths can vary depending on the desired resolution
z <- seq(from = min(trainset$z), to = max(trainset$z), length = 30) # Lengths can vary depending on the desired resolution
x.grid <- expand.grid(x = x, y = y, z = z) # name the columns so they match the training variables

# Get class predictions over the grid

pred <- predict(svm.model, newdata = x.grid)

# Plot the results

cols <- brewer.pal(3, "Set1")
plot(x.grid$y, x.grid$z,
     pch = 15, cex = 1.5,
     col = adjustcolor(cols[pred], alpha.f = 0.05))
points(a$y, # adjust accordingly
       a$z,
       pch = ".", cex = 3)
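# Optional final step (a sketch, not part of the original script): export the
# complete labelled dataset, e.g. with openxlsx, so that the corrected
# stratigraphic levels can be merged back into the site database. The output
# file name is only an illustrative placeholder.

indet_final <- indet
indet_final$cluster <- predict(RF.model, indet[c("x", "y", "z")]) # hard class labels
final_levels <- rbind(corrected_clustering_no_indet, indet_final)
openxlsx::write.xlsx(final_levels, "corrected_stratigraphic_levels.xlsx")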