# Persistent Homology Classification Algorithm (PHCA)

# Required libraries
library(TDA)
library(rgl)
library(dplyr)
library(caTools)
library(caret)
library(RColorBrewer)
library(devtools)
library(TDAmapper)
library(igraph)

# General parameters
maxd  <- 0    # maxdimension: maximum homology dimension passed to ripsDiag()
maxsc <- 1    # maxscale: maximum filtration value passed to ripsDiag()
SR    <- 0.8  # split ratio between training and testing

##########################################################################
## CREATING THE SYNTHETIC DATA
## This part can be modified depending on which data you want to analyze.
##########################################################################

#####################################################
# Uniform sample of points on the circle defined by x^2 + y^2 = r^2
#####################################################
set.seed(7)
sizen <- 200
circleSample <- circleUnif(n = sizen, r = 5)
#circleSample$x3 <- matrix(0, 500, 1)
zerocol <- matrix(2, sizen, 1)  # constant third coordinate for the circle
onescol <- matrix(1, sizen, 1)  # class label 1
Circle1 <- cbind(circleSample, zerocol, onescol)
#plot3d(Circle1[,1:3])

##########################################################################
# Uniform sample on the sphere defined by x^2 + y^2 + z^2 = 1
##########################################################################
set.seed(7)
sphereSample <- sphereUnif(n = sizen, d = 2, r = 1)
#plot3d(sphereSample)
twoscol <- matrix(2, sizen, 1)  # class label 2
Sphere2 <- cbind(sphereSample, twoscol)
Sphere2[,3] <- Sphere2[,3] + 1  # shift the sphere upward to separate the shapes

##########################################################################
# Uniform sample on the torus defined by (3 - sqrt(x^2 + y^2))^2 + z^2 = 1
##########################################################################
set.seed(7)
#torusSample <- torusUnif(n = 1.5*sizen, a = 1, c = 3)
torusSample <- torusUnif(n = sizen, a = 1, c = 3)
#plot3d(torusSample)
#threescol <- matrix(3, 1.5*sizen, 1)
threescol <- matrix(3, sizen, 1)  # class label 3
Torus3 <- cbind(torusSample, threescol)
Torus3[,3] <- Torus3[,3] - 1      # shift the torus downward
head(Torus3)

CircleSphereTorus <- rbind(Circle1, Sphere2, Torus3)
#plot3d(CircleSphereTorus[,1:3])
dataset <- data.frame(CircleSphereTorus)
data <- dataset
colnames(dataset) <- c("x1", "x2", "x3", "classname")

######################
# FORMATTING DATASET #
######################
XY <- dataset
nr <- nrow(XY)
nc <- ncol(XY)
X <- dataset[-nc]               # removes the target column
XY[,nc] <- as.numeric(XY[,nc])  # turning non-numeric data to numeric

# Max-abs scaling: divide each feature by its largest absolute value
absmax <- function(x) { x[which.max(abs(x))] }
for (f in 1:(nc-1)) {
  colabsmax <- abs(absmax(X[,f]))
  XY[,f] <- X[,f] * (1/colabsmax)
}

# Encoding the target feature as a factor
XY[,nc] <- factor(XY[,nc])
classes <- levels(XY[,nc])
numclasses <- as.numeric(classes)
maxclass <- max(numclasses)
len <- length(numclasses)

KofCV <- 5                             # number of cross-validation folds
ConM <- matrix(0, maxclass, maxclass)  # confusion matrix accumulated over folds
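# Minimal sanity check (an illustrative addition, not part of the original
# pipeline): after the max-abs scaling above, every feature column of XY
# should lie within [-1, 1].
stopifnot(all(abs(XY[, 1:(nc-1)]) <= 1))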
for (CV in 0:(KofCV-1)) {

  # Splitting data into classes
  trainM <- list()
  testM <- list()
  DiagTr <- list()

  # Initializing matrices used
  BB <- matrix(1:maxclass, nrow = 1, byrow = FALSE)
  CC <- matrix(1:maxclass, nrow = 1, byrow = FALSE)
  DD <- matrix(1:maxclass, nrow = 1, byrow = FALSE)
  EE <- matrix(1:(maxclass*3), nrow = maxclass, byrow = FALSE)
  FF <- matrix(1:(maxclass*3), nrow = maxclass, byrow = FALSE)
  GG <- matrix(1:(maxclass*(nc-1)), nrow = maxclass, byrow = FALSE)
  LL <- matrix(1:maxclass, nrow = maxclass, byrow = FALSE)
  nrtr <- matrix(1:maxclass, nrow = 1, byrow = FALSE)
  nctr <- matrix(1:maxclass, nrow = 1, byrow = TRUE)
  nrte <- matrix(1:maxclass, nrow = 1, byrow = TRUE)
  ncte <- matrix(1:maxclass, nrow = 1, byrow = TRUE)
  numofDiagTrrows <- matrix(1:maxclass, nrow = 1, byrow = TRUE)

  MM <- filter(XY, XY[,nc] == 100)  # empty data frame with the right columns
  newdata <- MM
  maxnum <- maxclass*3

  # Empty accumulators for the per-class persistence features
  DiagTrcolSums <- MM[1:3]
  DiagTrcolmean <- MM[1:(nc-1)]
  Featmean <- MM[1:(nc-1)]
  FeatLength <- MM[1:1]

  #######################
  # PHCA Implementation #
  #######################
  for (class in numclasses) {
    M <- filter(XY, XY[,nc] == class)
    print(M)

    # Assign a fold label to every observation of this class
    for (k in 1:nrow(M)) {
      M[k, nc+1] <- k %% KofCV
    }
    colnames(M)[nc+1] <- c("CVlabel")
    test_set     <- subset(M, CVlabel == CV)[, -(nc+1)]
    training_set <- subset(M, CVlabel != CV)[, -(nc+1)]

    # Optional feature scaling, depending on dataset
    #training_set[-nc] = scale(training_set[-nc])
    #test_set[-nc] = scale(test_set[-nc])

    trainM[[class]] <- training_set
    testM[[class]] <- test_set
    newdata <- rbind(newdata, test_set)
    nrtr[class] <- nrow(training_set)
    nctr[class] <- ncol(training_set)
    nrte[class] <- nrow(test_set)
    ncte[class] <- ncol(test_set)
    print(c(nrtr[class], nctr[class], nrte[class], ncte[class]))

    ##################################
    # PH COMPUTATION of TRAINING SETS
    ##################################

    # Computing the Rips persistence diagram of this class's training points
    MMM <- trainM[[class]][1:(nc-1)]
    DiagTr[[class]] <- ripsDiag(X = MMM, maxdimension = maxd, maxscale = maxsc,
                                library = "GUDHI", printProgress = FALSE)
    par(mfrow = c(1,2))
    plot(DiagTr[[class]][["diagram"]], main = "Persistence Diagram")
    plot(DiagTr[[class]][["diagram"]], barcode = TRUE, main = "Persistence Barcode")

    # Summary features of the training diagram: row count, column means, column sums
    numofDiagTrrows[class] <- nrow(DiagTr[[class]][["diagram"]])
    for (g in 1:3) {
      DiagTrcolmean[class, g] <- mean(DiagTr[[class]][["diagram"]][, g])
    }
    DiagTrcolSums <- rbind(DiagTrcolSums,
                           colSums(DiagTr[[class]][["diagram"]]),
                           deparse.level = 0)

    for (gg in 1:(nc-1)) {
      Featmean[class, gg] <- mean(MMM[, gg])
      # Total persistence: sum of (death - birth) over all diagram rows
      Lengthsum <- 0
      for (i in 1:nrow(DiagTr[[class]][["diagram"]])) {
        Lengthsum <- Lengthsum + DiagTr[[class]][["diagram"]][i, 3] -
                                 DiagTr[[class]][["diagram"]][i, 2]
      }
      FeatLength[class, ] <- Lengthsum
    }
  }
  newdata

  # PHCA CLASSIFIER
  resulta <- matrix(1:maxclass, nrow = 1, byrow = FALSE)
  result  <- matrix(1:1, nrow = 1, byrow = FALSE)

  # Classify one observation: append it to each class's training set, recompute
  # the Rips diagram, and choose the class whose diagram changes the least.
  Classifying <- function(M5) {
    for (j in numclasses) {
      MM <- rbind(trainM[[j]], M5, deparse.level = 0)
      MM
      #countit <- countit + 1 # Optional
      MMtest <- MM[-nc]
      DiagTe <- ripsDiag(X = MMtest, maxdimension = maxd, maxscale = maxsc,
                         library = "GUDHI", printProgress = FALSE)
      #BB[j] <- bottleneck(Diag1 = DiagTr[[j]][["diagram"]],
      #                    Diag2 = DiagTe[["diagram"]], dimension = 1)
      #CC[j] <- wasserstein(Diag1 = DiagTr[[j]][["diagram"]],
      #                     Diag2 = DiagTe[["diagram"]], p = 2, dimension = 1)
      #EE[j,1] <- abs(sum(DiagTe[["diagram"]][,1]) - DiagTrcolSums[j,1])
      EE[j,2] <- abs(sum(DiagTe[["diagram"]][,2]) - DiagTrcolSums[j,2])
      EE[j,3] <- abs(sum(DiagTe[["diagram"]][,3]) - DiagTrcolSums[j,3])
      #FF[j,1] <- abs(mean(DiagTe[["diagram"]][,1]) - DiagTrcolmean[j,1])
      #FF[j,2] <- abs(mean(DiagTe[["diagram"]][,2]) - DiagTrcolmean[j,2])
      #FF[j,3] <- abs(mean(DiagTe[["diagram"]][,3]) - DiagTrcolmean[j,3])
      #Lengthsum <- 0
      #for (i in 1:nrow(DiagTe[["diagram"]])) {
      #  Lengthsum <- Lengthsum + DiagTe[["diagram"]][i,3] - DiagTe[["diagram"]][i,2]
      #}
      #LL[j,1] <- abs(Lengthsum - FeatLength[j,])
      cc <- 0
      for (kk in 1:(nc-1)) {
        GG[j,kk] <- abs(Featmean[j,kk] - M5[1,kk])
        cc <- cc + GG[j,kk]
      }
      #DD[j] <- -EE[j,1] + EE[j,3] - FF[j,1] + FF[j,3] + cc + CC[j]
      DD[j] <- EE[j,3] - EE[j,2] #+ cc #+ CC[j]
    }
    resulta <- which(DD == min(DD), arr.ind = FALSE)
    return(resulta[1])
  }

  Count <- 0
  for (ii in numclasses) {
    for (jj in 1:nrte[ii]) {
      Count <- Count + 1
      newdata[Count, nc+1] <- Classifying(testM[[ii]][jj, ])
    }
  }

  # Predicting the test set results for PHCA
  y_label <- newdata[, nc]
  y_pred  <- newdata[, nc+1]
  #predict(classifier, newdata = test_set[-nc])
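  # Fold-level sanity check (an illustrative addition, not in the original):
  # every held-out row of this fold should now carry a predicted label, so the
  # running Count must equal the total number of test rows across the classes.
  stopifnot(Count == sum(nrte))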
  # Making the confusion matrix for this fold
  cm <- table(y_label, y_pred)

  # Accumulating the confusion matrix across cross-validation folds
  ConM <- ConM + cm
}

###################################
# OTHER CLASSIFICATION ALGORITHMS #
###################################
#CircleSphereTorus <- rbind(Circle1, Sphere2, Torus3)
#plot3d(CircleSphereTorus[,1:3])
dataset <- data.frame(CircleSphereTorus)
dataset[,nc] <- as.character(dataset[,nc])
data <- dataset
colnames(dataset) <- c("x1", "x2", "x3", "classname")

# Create a list of 80% of the rows in the original dataset we can use for training
validation_index <- createDataPartition(dataset$classname, p = SR, list = FALSE)
# Select 20% of the data for validation
validation <- dataset[-validation_index, ]
# Use the remaining 80% of the data to train and test the models
dataset <- dataset[validation_index, ]

# Dimensions of the dataset
dim(dataset)
# List the type of each attribute
sapply(dataset, class)
# Take a peek at the first rows of the data
head(dataset)
# List the levels of the class
dataset$classname <- factor(dataset$classname)
levels(dataset$classname)
# Summarize the class distribution
percentage <- prop.table(table(dataset$classname)) * 100
cbind(freq = table(dataset$classname), percentage = percentage)
# Summarize attribute distributions
summary(dataset)

nc <- ncol(dataset)
# Split input and output
x <- dataset[, 1:(nc-1)]
y <- dataset[, nc]

# Boxplot for each attribute on one image
par(mfrow = c(1, (nc-1)))
for (i in 1:(nc-1)) {
  boxplot(x[,i], main = names(dataset)[i])
} #uninteresting
# Barplot for class breakdown
plot(y)
# Scatterplot matrix
featurePlot(x = x, y = y, plot = "ellipse")
# Box and whisker plots for each attribute
featurePlot(x = x, y = y, plot = "box")
# Density plots for each attribute by class value
scales <- list(x = list(relation = "free"), y = list(relation = "free"))
featurePlot(x = x, y = y, plot = "density", scales = scales)

# Run the algorithms using KofCV-fold cross-validation
control <- trainControl(method = "cv", number = KofCV)
metric <- "Accuracy"

# VARIOUS MODELS
# a) Linear algorithms: LDA
set.seed(7)
fit.lda <- train(classname ~ ., data = dataset, method = "lda",
                 metric = metric, trControl = control)
# b) Nonlinear algorithms
# CART
set.seed(7)
fit.cart <- train(classname ~ ., data = dataset, method = "rpart",
                  metric = metric, trControl = control)
# kNN
set.seed(7)
fit.knn <- train(classname ~ ., data = dataset, method = "knn",
                 metric = metric, trControl = control)
# c) Advanced algorithms
# SVM
set.seed(7)
fit.svm <- train(classname ~ ., data = dataset, method = "svmRadial",
                 metric = metric, trControl = control)
# Random forest
set.seed(7)
fit.rf <- train(classname ~ ., data = dataset, method = "rf",
                metric = metric, trControl = control)

# Summarize the accuracy of the models and select the best one
results <- resamples(list(lda = fit.lda, cart = fit.cart, knn = fit.knn,
                          svm = fit.svm, rf = fit.rf))
summary(results)
# Compare the accuracy of the models
dotplot(results)

############################################################
## Generate the performance measures for each classifier. ##
############################################################
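# The five per-model blocks below repeat the same extraction from caret's
# confusionMatrix() object. A hedged refactor is sketched here for reference
# only; `get_perf` is a name introduced in this sketch, not part of the
# original script.
get_perf <- function(fit, validation, nc) {
  cm <- confusionMatrix(predict(fit, validation), factor(validation[, nc]))
  list(byClass  = cm$byClass,                        # per-class Sensitivity, ..., F1
       accuracy = as.numeric(cm$overall["Accuracy"]))
}
# Example usage: get_perf(fit.lda, validation, nc)$accuracy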
# Estimate skill of LDA on the validation dataset
predictions <- predict(fit.lda, validation)
confusionMatrix(predictions, factor(validation[,nc]))[4]  # per-class statistics
confusionMatrix(predictions, factor(validation[,nc]))[2]  # confusion table
confusionMatrix(predictions, factor(validation[,nc]))[3]  # overall statistics
perflda <- array(as.numeric(unlist(confusionMatrix(predictions, factor(validation[,nc]))[4])),
                 dim = c(1, maxclass, 11))
perf_sen  <- perflda[,,1]
perf_spec <- perflda[,,2]
perfacc <- array(as.numeric(unlist(confusionMatrix(predictions, factor(validation[,nc]))[3])),
                 dim = c(1, maxclass, 11))[1]  # first element is Accuracy
perfacc
perf_acc  <- perfacc
perf_prec <- perflda[,,5]
perf_rec  <- perflda[,,6]
perf_f1   <- perflda[,,7]

# Estimate skill of CART on the validation dataset
predictions <- predict(fit.cart, validation)
confusionMatrix(predictions, factor(validation[,nc]))[4]
confusionMatrix(predictions, factor(validation[,nc]))[2]
confusionMatrix(predictions, factor(validation[,nc]))[3]
perfcart <- array(as.numeric(unlist(confusionMatrix(predictions, factor(validation[,nc]))[4])),
                  dim = c(1, maxclass, 11))
perf_sen  <- rbind(perf_sen, perfcart[,,1])
perf_spec <- rbind(perf_spec, perfcart[,,2])
perfacc <- array(as.numeric(unlist(confusionMatrix(predictions, factor(validation[,nc]))[3])),
                 dim = c(1, maxclass, 11))[1]
perfacc
perf_acc  <- rbind(perf_acc, perfacc)
perf_prec <- rbind(perf_prec, perfcart[,,5])
perf_rec  <- rbind(perf_rec, perfcart[,,6])
perf_f1   <- rbind(perf_f1, perfcart[,,7])

# Estimate skill of KNN on the validation dataset
predictions <- predict(fit.knn, validation)
confusionMatrix(predictions, factor(validation[,nc]))[4]
confusionMatrix(predictions, factor(validation[,nc]))[2]
confusionMatrix(predictions, factor(validation[,nc]))[3]
perfknn <- array(as.numeric(unlist(confusionMatrix(predictions, factor(validation[,nc]))[4])),
                 dim = c(1, maxclass, 11))
perf_sen  <- rbind(perf_sen, perfknn[,,1])
perf_spec <- rbind(perf_spec, perfknn[,,2])
perfacc <- array(as.numeric(unlist(confusionMatrix(predictions, factor(validation[,nc]))[3])),
                 dim = c(1, maxclass, 11))[1]
perfacc
perf_acc  <- rbind(perf_acc, perfacc)
perf_prec <- rbind(perf_prec, perfknn[,,5])
perf_rec  <- rbind(perf_rec, perfknn[,,6])
perf_f1   <- rbind(perf_f1, perfknn[,,7])

# Estimate skill of SVM on the validation dataset
predictions <- predict(fit.svm, validation)
confusionMatrix(predictions, factor(validation[,nc]))[4]
confusionMatrix(predictions, factor(validation[,nc]))[2]
confusionMatrix(predictions, factor(validation[,nc]))[3]
perfsvm <- array(as.numeric(unlist(confusionMatrix(predictions, factor(validation[,nc]))[4])),
                 dim = c(1, maxclass, 11))
perf_sen  <- rbind(perf_sen, perfsvm[,,1])
perf_spec <- rbind(perf_spec, perfsvm[,,2])
perfacc <- array(as.numeric(unlist(confusionMatrix(predictions, factor(validation[,nc]))[3])),
                 dim = c(1, maxclass, 11))[1]
perfacc
perf_acc  <- rbind(perf_acc, perfacc)
perf_prec <- rbind(perf_prec, perfsvm[,,5])
perf_rec  <- rbind(perf_rec, perfsvm[,,6])
perf_f1   <- rbind(perf_f1, perfsvm[,,7])

# Estimate skill of RF on the validation dataset
predictions <- predict(fit.rf, validation)
confusionMatrix(predictions, factor(validation[,nc]))[4]
confusionMatrix(predictions, factor(validation[,nc]))[2]
confusionMatrix(predictions, factor(validation[,nc]))[3]
perfrf <- array(as.numeric(unlist(confusionMatrix(predictions, factor(validation[,nc]))[4])),
                dim = c(1, maxclass, 11))
perf_sen  <- rbind(perf_sen, perfrf[,,1])
perf_spec <- rbind(perf_spec, perfrf[,,2])
perfacc <- array(as.numeric(unlist(confusionMatrix(predictions, factor(validation[,nc]))[3])),
                 dim = c(1, maxclass, 11))[1]
perfacc
perf_acc  <- rbind(perf_acc, perfacc)
perf_prec <- rbind(perf_prec, perfrf[,,5])
perf_rec  <- rbind(perf_rec, perfrf[,,6])
perf_f1   <- rbind(perf_f1, perfrf[,,7])
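# Sanity check (an illustrative addition): after the five caret models above,
# each per-class metric matrix should hold one row per model; the PHCA row is
# appended next.
stopifnot(nrow(perf_sen) == 5, nrow(perf_f1) == 5)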
#######################
## Result Using PHCA ##
#######################
confusionMatrix(ConM)
confusionMatrix(ConM)[4]  # per-class statistics
confusionMatrix(ConM)[2]  # confusion table
confusionMatrix(ConM)[3]  # overall statistics
perfphca <- array(as.numeric(unlist(confusionMatrix(ConM)[4])), dim = c(1, maxclass, 11))
perf_sen  <- rbind(perf_sen, perfphca[,,1])
perf_spec <- rbind(perf_spec, perfphca[,,2])
perfacc <- array(as.numeric(unlist(confusionMatrix(ConM)[3])), dim = c(1, maxclass, 11))[1]
perfacc
perf_acc  <- rbind(perf_acc, perfacc)
perf_prec <- rbind(perf_prec, perfphca[,,5])
perf_rec  <- rbind(perf_rec, perfphca[,,6])
perf_f1   <- rbind(perf_f1, perfphca[,,7])
dev.off()

perf_prec
perf_rec
perf_acc
perf_spec
perf_f1

# Collect all performance measures into one table:
# rows are measures per class, columns are classifiers
PI <- cbind(perf_prec, perf_rec, perf_acc, perf_spec, perf_f1)
M <- t(PI)
colnames(M) <- c('LDA', 'CART', 'KNN', 'SVM', 'RF', 'PHCA')
rownames(M) <- c('prec_class1', 'prec_class2', 'prec_class3',
                 'rec_class1', 'rec_class2', 'rec_class3',
                 'accuracy',
                 'spec_class1', 'spec_class2', 'spec_class3',
                 'f1score_class1', 'f1score_class2', 'f1score_class3')

# Nemenyi post-hoc test across the six classifiers
# (c(M) flattens column-wise, so each group of 13 values belongs to one classifier)
library(DescTools)
g <- factor(rep(1:6, each = 13),
            labels = c("LDA", "CART", "KNN", "SVM", "RF", "PHCA"))
NemenyiTest(c(M), g)

######################################
## Plotting of Performance Measures ##
######################################
# Plot for Sensitivity/Recall
colnames(perf_sen) <- c("Class 1", "Class 2", "Class 3")
par(mar = c(4,4,4,4))
barplot(perf_sen,
        #main = "Recall/Sensitivity per class per classifier",
        xlab = "Classes",
        ylab = "Recall/Sensitivity",
        col = brewer.pal(n = 6, name = "BuGn"), #RdBu
        beside = TRUE)
#legend("bottomleft",
#       c("lda", "cart", "knn", "svm", "rf", "phca"),
#       fill = brewer.pal(n = 6, name = "BuGn"))

# Plot for Specificity
colnames(perf_spec) <- c("Class 1", "Class 2", "Class 3")
par(mar = c(4,4,4,4))
barplot(perf_spec,
        #main = "Specificity per class per classifier",
        xlab = "Classes",
        ylab = "Specificity",
        col = brewer.pal(n = 6, name = "BuGn"),
        beside = TRUE)
#legend("bottomleft",
#       c("lda", "cart", "knn", "svm", "rf", "phca"),
#       fill = brewer.pal(n = 6, name = "BuGn"))

# Plot for Precision
colnames(perf_prec) <- c("Class 1", "Class 2", "Class 3")
par(mar = c(4,4,4,4))
barplot(perf_prec,
        #main = "Precision per class per classifier",
        xlab = "Classes",
        ylab = "Precision",
        col = brewer.pal(n = 6, name = "BuGn"),
        beside = TRUE)
#legend("bottomleft",
#       c("lda", "cart", "knn", "svm", "rf", "phca"),
#       fill = brewer.pal(n = 6, name = "BuGn"))

# Plot for F1 Score
colnames(perf_f1) <- c("Class 1", "Class 2", "Class 3")
par(mar = c(4,4,4,4))
barplot(perf_f1,
        #main = "F1 Score per class per classifier",
        xlab = "Classes",
        ylab = "F1 Score",
        col = brewer.pal(n = 6, name = "BuGn"),
        beside = TRUE)
#legend("bottomleft",
#       c("lda", "cart", "knn", "svm", "rf", "phca"),
#       fill = brewer.pal(n = 6, name = "BuGn"))

# Plot for Accuracy
par(mar = c(4,6,4,6))
barplot(perf_acc,
        #main = "Accuracy per classifier",
        ylim = c(0, 1.0),
        xlab = "Classifiers",
        ylab = "Accuracy",
        col = brewer.pal(n = 6, name = "BuGn"),
        names.arg = c("LDA", "CART", "KNN", "SVM", "RF", "PHCA"),
        beside = TRUE)
#legend("topright", c("LDA", "CART", "KNN", "SVM", "RF", "PHCA"),
#       fill = brewer.pal(n = 6, name = "BuGn"), inset = c(-0.25, 0))

M
t(M)
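# Compact rounded view of the final table (an illustrative addition), plus a
# commented-out export; the file name is an assumption, not from the original.
round(M, 3)
# write.csv(M, file = "phca_performance_measures.csv")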