#
# Library install
#
library(randomForest)
# NOTE(review): prediction() and performance() are not defined in this file;
# presumably they come from the ROCR package attached elsewhere -- confirm.

#
# Calculate AUC of random forest model
#
# trained_model: fitted model handed to predict() with type = "prob".
# variables:     predictor columns of the observations to score.
# outcomes:      observed outcomes; "Yes" is coded 1, anything else 0.
# Returns the first value reported by performance(..., "auc").
auc_rf <- function(trained_model, variables, outcomes) {
  prob_yes <- predict(trained_model, variables, type = "prob")[, "Yes"]
  is_yes <- ifelse(outcomes == "Yes", 1, 0)
  performance(prediction(prob_yes, is_yes), "auc")@y.values[[1]]
}

#
# Mean cross-validated AUC of a random forest
#
# predictors: character vector of predictor column names.
# data:       data frame holding the predictors and the outcome column.
# fold_order: row indices of `data`; consecutive chunks of `fold_size`
#             entries form the validation folds.
# fold_size:  number of rows per validation fold.
# n_folds:    number of folds.
# mtry:       optional mtry passed to randomForest(); NULL keeps its default.
# seed:       RNG seed set once before the first fold is fitted.
# outcome:    name of the outcome column.
# Returns the mean of the per-fold AUCs.
cv_mean_auc <- function(predictors, data, fold_order, fold_size, n_folds,
                        mtry = NULL, seed = 1, outcome = "diarrhea") {
  set.seed(seed)
  # Build the formula from the predictor vector so the formula and the
  # columns scored in validation cannot drift apart.
  model_formula <- reformulate(predictors, response = outcome)

  fold_aucs <- vapply(seq_len(n_folds), function(fold) {
    # Split the data into the training part and the held-out fold
    holdout_rows <- fold_order[(fold_size * (fold - 1) + 1):(fold_size * fold)]
    train_fold <- data[-holdout_rows, ]
    validate_fold <- data[holdout_rows, ]

    # Develop the model on the training part
    fold_model <- if (is.null(mtry)) {
      randomForest(model_formula, data = train_fold)
    } else {
      randomForest(model_formula, mtry = mtry, data = train_fold)
    }

    # Calculate AUC on the held-out fold
    auc_rf(
      trained_model = fold_model,
      variables = validate_fold[, predictors, drop = FALSE],
      outcomes = validate_fold[, outcome]
    )
  }, numeric(1))

  # na.rm = TRUE is kept from the original script's averaging step.
  mean(fold_aucs, na.rm = TRUE)
}

#
# Perform the feature selection by 5-fold cross-validation
#
# TODO: Remove variable one by one
full_predictors <- c(
  "age", "sex", "rankin_scale", "tube_feeding", "ppi", "h2_blocker", "abx"
)
cv_mean_auc(
  predictors = full_predictors,
  data = train_data,
  fold_order = sample_order,
  fold_size = validate_count,
  n_folds = folds_in_cv
)

#
# Set the hyperparameter of mtry by 5-fold cross-validation
#
# PPI and H2RA were removed
selected_predictors <- c("age", "sex", "rankin_scale", "tube_feeding", "abx")

# Search hyperparameter mtry from 1 to 5
mtry_grid <- 1:5
# One row per candidate mtry; there is no placeholder (0, 0) row any more.
param_results <- cbind(
  mtry = mtry_grid,
  mean_auc = vapply(mtry_grid, function(candidate) {
    cv_mean_auc(
      predictors = selected_predictors,
      data = train_data,
      fold_order = sample_order,
      fold_size = validate_count,
      n_folds = folds_in_cv,
      mtry = candidate
    )
  }, numeric(1))
)
param_results

#
# Validate on the independent test dataset
#
set.seed(1)
# NOTE(review): mtry is fixed at 1 here, presumably picked by hand from
# param_results -- confirm it still matches the search above.
best_mtry <- 1
# Develop the model by train_data
model_rf <- randomForest(
  reformulate(selected_predictors, response = "diarrhea"),
  mtry = best_mtry,
  data = train_data
)
auc_rf(
  trained_model = model_rf,
  variables = test_data[, selected_predictors, drop = FALSE],
  outcomes = test_data[, "diarrhea"]
)