# XGBoost classification of `Gender`, evaluated by leave-two-folds-out
# cross-validation.
#
# The data are split into 8 random folds (A..H). Every pair of folds is used
# once as the test set (28 splits) while the remaining six folds form the
# training set. For each split an "xgbTree" model is tuned with caret over
# `grid_tune`, used to predict the test set, and summarised with a confusion
# matrix and a variable-importance table.
#
# Results are exposed under the original per-split names (XG1, PR_XG1,
# XG1_CONF, IMP_XG1, Te1, Tr1, ...) and additionally collected in the list
# `cv_results`.

# Packages ----

# Install only what is missing instead of re-installing on every run.
required_pkgs <- c("xgboost", "caret", "dplyr", "tidyverse", "sjPlot")
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

library(xgboost)
library(caret)
library(dplyr)
library(tidyverse)
library(sjPlot)

# Data ----

df <- read_csv("CMU_XG_GEND.csv")
df <- select(df, -W_Length, -GEND)
df$Gender <- as.factor(df$Gender)

# NOTE(review): the modelling code below takes column 1 as the response and
# columns 2:40 as predictors. This presumably means `Gender` is the first of
# 40 columns after the select() above -- TODO confirm against the CSV.
response_col <- 1
predictor_cols <- 2:40

# Folds ----

set.seed(1)

fold_labels <- LETTERS[1:8]
n_folds <- length(fold_labels)

# Peel the folds off one at a time: 1/8 of the data, then 1/7 of what is
# left, then 1/6, ... The last fold is whatever remains. The order of the
# sample() calls matches the original script so the folds are reproducible.
folds <- vector("list", n_folds)
names(folds) <- fold_labels
remaining <- df
for (k in seq_len(n_folds - 1L)) {
  share <- 1 / (n_folds - k + 1L)
  picked <- sample(nrow(remaining), share * nrow(remaining))
  folds[[k]] <- remaining[picked, ]
  # A logical mask is used rather than `remaining[-picked, ]` because an
  # empty `picked` would make the negative index drop every row.
  remaining <- remaining[!(seq_len(nrow(remaining)) %in% picked), ]
}
folds[[n_folds]] <- remaining

# Keep the original per-fold objects (df_A ... df_H) available.
for (label in fold_labels) {
  assign(paste0("df_", label), folds[[label]])
}

# Tuning grid and resampling scheme ----

grid_tune <- expand.grid(
  nrounds = c(5000),                  # number of trees
  max_depth = c(2, 4, 6),
  # Learning rate (reducing options here will greatly speed up processing).
  eta = c(0.025, 0.05, 0.1, 0.3),
  gamma = c(0, 0.05, 0.1, 0.5, 0.7, 0.9, 1.0),  # pruning
  # Subsample ratio of columns per tree; alternatives: c(0.4, 0.6, 0.8, 1.0).
  colsample_bytree = 1,
  # The larger, the more conservative the model; alternatives: c(1, 2, 3).
  min_child_weight = 1,
  # Fraction of the training rows sampled, used to prevent overfitting;
  # alternatives: c(0.5, 0.75, 1.0).
  subsample = 1
)

train_control <- trainControl(
  method = "cv",
  number = 3,
  verboseIter = TRUE,
  allowParallel = TRUE
)

# Leave-two-folds-out evaluation ----

# Columns of `test_pairs` enumerate the fold pairs in the order
# (A,B), (A,C), ..., (A,H), (B,C), ..., (G,H), i.e. splits 1..28.
test_pairs <- combn(n_folds, 2L)
n_splits <- ncol(test_pairs)

# Fixed set of class codes so that the predicted and reference factors
# always share the same levels, even when a class is absent from one of them.
class_codes <- seq_along(levels(df$Gender))

cv_results <- vector("list", n_splits)

for (i in seq_len(n_splits)) {
  held_out <- test_pairs[, i]

  # Test set: the two held-out folds; training set: the other six folds.
  test_set <- do.call(rbind, unname(folds[held_out]))
  train_set <- as.data.frame(do.call(rbind, unname(folds[-held_out])))

  # Tune and fit the XGBoost model over the tuning grid.
  model <- train(
    x = train_set[, predictor_cols],
    y = train_set[, response_col],
    trControl = train_control,
    tuneGrid = grid_tune,
    method = "xgbTree",
    verbose = TRUE
  )

  # Feed the testing data into the fitted model.
  predicted <- predict(model, test_set)

  # Confusion matrix on the integer class codes (labels "1", "2", ...).
  confusion <- confusionMatrix(
    factor(as.numeric(predicted), levels = class_codes),
    factor(as.numeric(test_set$Gender), levels = class_codes)
  )

  importance <- varImp(model)

  cv_results[[i]] <- list(
    test_folds = fold_labels[held_out],
    model = model,
    predictions = predicted,
    confusion = confusion,
    importance = importance
  )

  # Keep the original per-split object names available.
  assign(paste0("Te", i), test_set)
  assign(paste0("Tr", i), train_set)
  assign(paste0("XG", i), model)
  assign(paste0("PR_XG", i), predicted)
  assign(paste0("XG", i, "_CONF"), confusion)
  assign(paste0("IMP_XG", i), importance)
}