library(gplots) library(caret) library(randomForest) set.seed(1) # set the directory you are working from directory <- "/home/shenzy/software/deltaBS/data/faa_build_rf_good_72genomes" traindata <- read.delim(paste(directory, "/bitscores.tsv", sep="")) traindata <- t(traindata) phenotype <- read.delim(paste(directory, "/phenotype.tsv", sep=""), header=F) phenotype[,1] <- make.names(phenotype[,1]) traindata <- cbind.data.frame(traindata, phenotype=phenotype[match(row.names(traindata), phenotype[,1]),2]) traindata[is.na(traindata)] <- 0 # traindata <- na.roughfix(traindata) names(traindata) <- make.names(names(traindata)) # Set the seed for reproducibility set.seed(1) # Initialize vectors to store error rates and sparsity error <- vector() sparsity <- vector() # Convert phenotype to factor if it's categorical if (!is.numeric(traindata$phenotype)) { traindata$phenotype <- as.factor(traindata$phenotype) } # Loop over different values of ntree for(i in c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000)) { # Fit the random forest model model <- randomForest(phenotype ~ ., data=traindata, ntree=i) # Store the OOB error rate (for classification) or MSE (for regression) if (is.factor(traindata$phenotype)) { error <- c(error, model$err.rate[model$ntree, "OOB"]) } else { error <- c(error, model$mse[model$ntree]) } # Calculate sparsity (fraction of features with zero importance) sparsity <- c(sparsity, (sum(model$importance[,1] <= 0)) / (ncol(traindata) - 1)) } set.seed(1) # varying ntree error <- vector() sparsity <- vector() for(i in c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000)) { model <- randomForest(phenotype ~ ., data=traindata, ntree=i) error <- c(error, model$err.rate[length(model$err.rate)]) sparsity <- c(sparsity, (sum(model$importance[,1]<=0))/(ncol(traindata)-1)) } # varying mtry error2 <- vector() sparsity2 <- vector() param <- ncol(traindata)-1 for(i in c(1, round(param/10), round(param/5), round(param/3), round(param/2), param)) { model <- randomForest(phenotype ~ ., data=traindata, ntree=10000, mtry=i) error2 <- c(error2, model$err.rate[length(model$err.rate)]) sparsity2 <- c(sparsity2, (sum(model$importance[,1]<=0))/(ncol(traindata)-1)) } model <- randomForest(phenotype ~ ., data=traindata, ntree=10000, mtry=param/10) png(paste(directory, "/model_training/m1_error_vs_ntree.png", sep=""), width=350, height = 350) plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=error, xlab="Number of trees", ylab="OOB error rate", pch=16) dev.off() png(paste(directory, "/model_training/m1_sparsity_vs_ntree.png", sep=""), width=350, height = 350) plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=sparsity, xlab="Number of trees", ylab="% genes uninformative", pch=16) dev.off() png(paste(directory, "/model_training/m1_error_vs_mtry.png", sep=""), width=350, height = 350) plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=error2, xlab="Number of genes sampled per tree", ylab="OOB error rate", pch=16) dev.off() png(paste(directory, "/model_training/m1_sparsity_vs_mtry.png", sep=""), width=350, height = 350) plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=sparsity2, xlab="Number of genes sampled per tree", ylab="% genes uninformative", pch=16) dev.off() train2 <- traindata[,match(names(model$importance[model$importance[,1]>0,]), colnames(traindata))] train2 <- cbind(train2, phenotype=traindata$phenotype) error <- vector() sparsity <- vector() for(i in c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000)) { model <- randomForest(phenotype ~ ., data=train2, ntree=i) error <- c(error, median(model$err.rate)) sparsity <- c(sparsity, (sum(model$importance[,1]<=0))/(ncol(train2)-1)) } error2 <- vector() sparsity2 <- vector() param <- ncol(train2)-1 for(i in c(1, round(param/10), round(param/5), round(param/3), round(param/2), param)) { model <- randomForest(phenotype ~ ., data=train2, ntree=10000, mtry=i) error2 <- c(error2, median(model$err.rate)) sparsity2 <- c(sparsity2, (sum(model$importance[,1]<=0))/(ncol(train2)-1)) } model <- randomForest(phenotype ~ ., data=train2, ntree=10000, mtry=param/10) png(paste(directory, "/model_training/m2_error_vs_ntree.png", sep=""), width=350, height = 350) plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=error, xlab="Number of trees", ylab="OOB error rate", pch=16) dev.off() png(paste(directory, "/model_training/m2_sparsity_vs_ntree.png", sep=""), width=350, height = 350) plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=sparsity, xlab="Number of trees", ylab="% genes uninformative", pch=16) dev.off() png(paste(directory, "/model_training/m2_error_vs_mtry.png", sep=""), width=350, height = 350) plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=error2, xlab="Number of genes sampled per tree", ylab="OOB error rate", pch=16) dev.off() png(paste(directory, "/model_training/m2_sparsity_vs_mtry.png", sep=""), width=350, height = 350) plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=sparsity2, xlab="Number of genes sampled per tree", ylab="% genes uninformative", pch=16) dev.off() train3 <- train2[,match(names(model$importance[model$importance[,1]>quantile(model$importance[,1], 0.5),]), colnames(train2))] train3 <- cbind(train3, phenotype=train2$phenotype) error <- vector() sparsity <- vector() param <- ncol(train3)-1 for(i in c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000)) { model <- randomForest(phenotype ~ ., data=train3, ntree=i) error <- c(error, median(model$err.rate)) sparsity <- c(sparsity, (sum(model$importance[,1]<=0))/(ncol(train3)-1)) } error2 <- vector() sparsity2 <- vector() for(i in c(1, round(param/10), round(param/5), round(param/3), round(param/2), param)) { model <- randomForest(phenotype ~ ., data=train3, ntree=10000, mtry=i) error2 <- c(error2, median(model$err.rate)) sparsity2 <- c(sparsity2, (sum(model$importance[,1]<=0))/(ncol(train3)-1)) } model <- randomForest(phenotype ~ ., data=train3, ntree=10000, mtry=param/10) png(paste(directory, "/model_training/m3_error_vs_ntree.png", sep=""), width=350, height = 350) plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=error, xlab="Number of trees", ylab="OOB error rate", pch=16) dev.off() png(paste(directory, "/model_training/m3_sparsity_vs_ntree.png", sep=""), width=350, height = 350) plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=sparsity, xlab="Number of trees", ylab="% genes uninformative", pch=16) dev.off() png(paste(directory, "/model_training/m3_error_vs_mtry.png", sep=""), width=350, height = 350) plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=error2, xlab="Number of genes sampled per tree", ylab="OOB error rate", pch=16) dev.off() png(paste(directory, "/model_training/m3_sparsity_vs_mtry.png", sep=""), width=350, height = 350) plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=sparsity2, xlab="Number of genes sampled per tree", ylab="% genes uninformative", pch=16) dev.off() train4 <- train3[,match(names(model$importance[model$importance[,1]>quantile(model$importance[,1], 0.5),]), colnames(train3))] train4 <- cbind(train4, phenotype=train3$phenotype) error <- vector() sparsity <- vector() for(i in c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000)) { model <- randomForest(phenotype ~ ., data=train4, ntree=i) error <- c(error, median(model$err.rate)) sparsity <- c(sparsity, (sum(model$importance[,1]<=0))/(ncol(train4)-1)) } error2 <- vector() sparsity2 <- vector() param <- ncol(train4)-1 for(i in c(1, round(param/10), round(param/5), round(param/3), round(param/2), param)) { model <- randomForest(phenotype ~ ., data=train4, ntree=10000, mtry=i) error2 <- c(error2, median(model$err.rate)) sparsity2 <- c(sparsity2, (sum(model$importance[,1]<=0))/(ncol(train4)-1)) } model <- randomForest(phenotype ~ ., data=train4, ntree=10000, mtry=param/10) png(paste(directory, "/model_training/m4_error_vs_ntree.png", sep=""), width=350, height = 350) plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=error, xlab="Number of trees", ylab="OOB error rate", pch=16) dev.off() png(paste(directory, "/model_training/m4_sparsity_vs_ntree.png", sep=""), width=350, height = 350) plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=sparsity, xlab="Number of trees", ylab="% genes uninformative", pch=16) dev.off() png(paste(directory, "/model_training/m4_error_vs_mtry.png", sep=""), width=350, height = 350) plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=error2, xlab="Number of genes sampled per tree", ylab="OOB error rate", pch=16) dev.off() png(paste(directory, "/model_training/m4_sparsity_vs_mtry.png", sep=""), width=350, height = 350) plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=sparsity2, xlab="Number of genes sampled per tree", ylab="% genes uninformative", pch=16) dev.off() model$predicted names(model$importance[order(model$importance, decreasing=T),][1:10]) save(model, train2, train3, train4, file=paste(directory, "/traindata.Rdata", sep="")) set.seed(1) param <- ncol(traindata)-1 model1 <- randomForest(phenotype ~ ., data=traindata, ntree=10000, mtry=param/10) model1 png(paste(directory, "/VI_full.png", sep=""), width=400, height=350) plot(1:param, model1$importance[order(model1$importance, decreasing=T)], xlim=c(1,1000), ylab="Variable importance", xlab="Top genes") dev.off() train2 <- traindata[,match(names(model1$importance[model1$importance[,1]>0,]), colnames(traindata))] train2 <- cbind(train2, phenotype=traindata$phenotype) param <- ncol(train2)-1 model2 <- randomForest(phenotype ~ ., data=train2, ntree=10000, mtry=param/10) model2 train3 <- train2[,match(names(model2$importance[model2$importance[,1]>quantile(model2$importance[,1], 0.5),]), colnames(train2))] train3 <- cbind(train3, phenotype=train2$phenotype) param <- ncol(train3)-1 model3 <- randomForest(phenotype ~ ., data=train3, ntree=10000, mtry=param/10) model3 train4 <- train3[,match(names(model3$importance[model3$importance[,1]>quantile(model3$importance[,1], 0.5),]), colnames(train3))] train4 <- cbind(train4, phenotype=train3$phenotype) param <- ncol(train4)-1 model4 <- randomForest(phenotype ~ ., data=train4, ntree=10000, mtry=param/10) model4 train5 <- train4[,match(names(model4$importance[model4$importance[,1]>quantile(model4$importance[,1], 0.5),]), colnames(train4))] train5 <- cbind(train5, phenotype=train4$phenotype) param <- ncol(train5)-1 model5 <- randomForest(phenotype ~ ., data=train5, ntree=10000, mtry=param/10, proximity=T) model5 train6 <- train5[,match(names(model5$importance[model5$importance[,1]>quantile(model5$importance[,1], 0.5),]), colnames(train5))] train6 <- cbind(train6, phenotype=train5$phenotype) param <- ncol(train6)-1 model6 <- randomForest(phenotype ~ ., data=train6, ntree=10000, mtry=param/10, proximity=T) model6 train7 <- train6[,match(names(model6$importance[model6$importance[,1]>quantile(model6$importance[,1], 0.5),]), colnames(train6))] train7 <- cbind(train7, phenotype=train6$phenotype) param <- ncol(train7)-1 model7 <- randomForest(phenotype ~ ., data=train7, ntree=10000, mtry=param/10, proximity=T) model7 model7$predicted names(model7$importance[order(model7$importance[,1], decreasing=T),])[1:10] png(paste(directory, "final_model.png", sep=""), width=400, height=350) plot(1:param, model7$importance[order(model7$importance, decreasing=T),], xlab="", ylab="Variable importance") dev.off() save(model1, model2, model3, model4, model5,model6, model7, traindata, train2, train3, train4, train5, train6, train7, file=paste("finalmodel.Rdata", sep="")) topgenes <- vector() topgenes <- c(topgenes, colnames(traindata)) table(topgenes) write(topgenes,"detected_topgenes_list_train1.csv", sep="") topgenes <- vector() topgenes <- c(topgenes, colnames(train2)) table(topgenes) write(topgenes,"detected_topgenes_list_train2.csv", sep="") topgenes <- vector() topgenes <- c(topgenes, colnames(train3)) table(topgenes) write(topgenes,"detected_topgenes_list_train3.csv", sep="") topgenes <- vector() topgenes <- c(topgenes, colnames(train4)) table(topgenes) write(topgenes,"detected_topgenes_list_train4.csv", sep="") topgenes <- vector() topgenes <- c(topgenes, colnames(train5)) table(topgenes) write(topgenes,"detected_topgenes_list_train5.csv", sep="") topgenes <- vector() topgenes <- c(topgenes, colnames(train6)) table(topgenes) write(topgenes,"detected_topgenes_list_train6.csv", sep="") topgenes <- vector() topgenes <- c(topgenes, colnames(train7)) table(topgenes) write(topgenes,"detected_topgenes_list_train7.csv", sep="") votedata <- rbind.data.frame(cbind.data.frame(model=rep("1", 72), Serovar=row.names(model1$votes), Invasive=model1$votes[,2]), cbind.data.frame(model=rep("2", 72), Serovar=row.names(model1$votes), Invasive=model2$votes[,2]), cbind.data.frame(model=rep("3", 72), Serovar=row.names(model1$votes), Invasive=model3$votes[,2]), cbind.data.frame(model=rep("4", 72), Serovar=row.names(model1$votes), Invasive=model4$votes[,2]), cbind.data.frame(model=rep("5", 72), Serovar=row.names(model1$votes), Invasive=model5$votes[,2]), cbind.data.frame(model=rep("6", 72), Serovar=row.names(model1$votes), Invasive=model6$votes[,2]), cbind.data.frame(model=rep("7", 72), Serovar=row.names(model1$votes), Invasive=model7$votes[,2])) votedata$Phenotype <- phenotype[match(votedata$Serovar, phenotype[,1]),2] ggplot(votedata, aes(x=model, y=Invasive, col=Serovar)) + geom_jitter(width=0.1) + theme_classic(8) + ylab("Proportion of votes for non-BCG wild-type") + xlab("Model iteration") + geom_hline(yintercept=0.5, lty=2, col="grey") + theme(legend.key.size = unit(0.3, "cm")) ggsave("votes.png")