install.packages("tidyverse") library(tidyverse) install.packages("DataExplorer") library(DataExplorer) library(caret) library(randomForest) library(pROC) rawdata <- read.csv(file.choose(), stringsAsFactors = F) # 打开data.csv str(rawdata) rawdata$gender <- as.factor(rawdata$gender) rawdata$dyspnea <- as.factor(rawdata$dyspnea) rawdata$class <- as.factor(rawdata$class) rawdata$CD4 <- as.numeric(rawdata$CD4) glimpse(rawdata) plot_missing(rawdata) rawdata3 <- as.data.frame(na.roughfix(rawdata)) plot_missing(rawdata3) prop.table(table(rawdata3$class)) contrasts(rawdata3$class) set.seed(9560) up_rawdata3 <- upSample(x = rawdata3[, -ncol(rawdata3)], y = rawdata3$class, yname = "class") table(up_rawdata3$class) set.seed(623) head(up_rawdata3) NCOL(up_rawdata3) subsets = seq(2,16,1) ctrl= rfeControl(functions = rfFuncs, method = "cv",verbose = FALSE, returnResamp = "final") Profile = rfe(up_rawdata3[,-c(1,ncol(up_rawdata3))], up_rawdata3$class, sizes = subsets, rfeControl = ctrl) print(Profile) Profile$optVariables colnames(up_rawdata3) indexFeature = which(colnames(up_rawdata3) %in% Profile$optVariables) ## 这句是为了找出最优子集的在原来数据中列坐标 print(indexFeature) head(up_rawdata3) new_up_rawdata3 = up_rawdata3[,indexFeature] new_up_rawdata3$class = up_rawdata3$class colnames(new_up_rawdata3) str(new_up_rawdata3) set.seed(42) trains <- createDataPartition(y = new_up_rawdata3$class, p = 0.8, list = F) traindata <- new_up_rawdata3[trains, ] testdata <- new_up_rawdata3[-trains, ] prop.table(table(traindata$class)) prop.table(table(testdata$class)) tc_set <- trainControl(method = "repeatedcv", repeats = 5, number = 5, classProbs = T, summaryFunction = prSummary) tune_set <- expand.grid(mtry = 1:6) set.seed(42) #install.packages('MLmetrics') library("MLmetrics") up_rfs <- train(x = traindata[,-ncol(traindata)], y = traindata$class, method = "rf", trControl = tc_set, tuneGrid = tune_set, importance = T) up_rfs plot(up_rfs) final_rf <- up_rfs$finalModel final_rf par(mfrow=c(1,1)) plot(final_rf, main = "ERROR & TREES") legend("top", legend = colnames(final_rf$err.rate), lty = 1:3, col = 1:3, horiz = T) importance(final_rf) varImpPlot(final_rf, main = "varImpPlot", type=2) partialPlot(x = final_rf, pred.data = traindata, x.var = Myo, which.class = "death", ylab = "death") ############################################################# trainpredprob <- predict(final_rf, newdata = traindata, type = "prob") trainroc <- roc(response = traindata$class, predictor = trainpredprob[, 2]) trainroc$thresholds plot(trainroc, print.auc = TRUE, auc.polygon = TRUE, grid = c(0.1, 0.2), grid.col = c("green", "red"), max.auc.polygon = TRUE, auc.polygon.col = "skyblue", print.thres = TRUE) trainpredlab <- as.factor( ifelse(trainpredprob[, 2] > 0.414, "survival", "death") ) confusionMatrix(data = trainpredlab, reference = traindata$class, positive = "death", mode = "everything") ############################################################# testpredprob <- predict(final_rf, newdata = testdata, type = "prob") testpredlab <- as.factor( ifelse(testpredprob[, 2] > 0.414, "survival", "death") ) confusionMatrix(data = testpredlab, reference = testdata$class, positive = "death", mode = "everything") testroc <- roc(response = testdata$class, predictor = testpredprob[, 2]) plot(testroc, print.auc = TRUE, auc.polygon = TRUE, grid = c(0.1, 0.2), grid.col = c("green", "red"), max.auc.polygon = TRUE, auc.polygon.col = "skyblue", print.thres = TRUE) plot(trainroc, print.auc = TRUE, grid = c(0.1, 0.2), auc.polygon = F, max.auc.polygon = T, main = "模型效果图示", 
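# Optional sanity check (a minimal sketch): the hard-coded 0.414 cutoff and the
# use of trainpredprob[, 2] assume that the second probability column is
# "survival". predict(..., type = "prob") orders its columns by the factor
# levels of `class`, so confirm the order, and note that the "best" threshold
# can be recomputed from the ROC curve rather than typed in by hand.
colnames(trainpredprob)   # expected to match levels(traindata$class)
levels(traindata$class)
coords(trainroc, x = "best", ret = "threshold", best.method = "youden")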
grid.col=c("green", "red")) plot(testroc, print.auc = TRUE, print.auc.y = 0.4, add = T, col = "red") legend("bottomright", legend = c("traindata", "testdata"), col = c(par("fg"), "red"), lwd = 2, cex = 0.9) ########################################################## traindata <- cbind(traindata, trainpredprob) traindata$predclass <- trainpredlab write.csv(traindata, "过抽样训练集数据加预测.csv", row.names = F) testdata <- cbind(testdata, testpredprob) testdata$predclass <- testpredlab write.csv(testdata, "过抽样测试集数据加预测.csv", row.names = F)
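# Optional add-on (a minimal sketch using pROC): report the training and test
# AUCs with 95% confidence intervals rather than point estimates alone.
ci.auc(trainroc)
ci.auc(testroc)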