library(gplots)
library(caret)
library(randomForest)
set.seed(1)
# set the directory you are working from
directory <- "/home/shenzy/software/deltaBS/data/faa_build_rf_good_72genomes"
traindata <- read.delim(paste(directory, "/bitscores.tsv", sep=""))
traindata <- t(traindata)
phenotype <- read.delim(paste(directory, "/phenotype.tsv", sep=""), header=F)
phenotype[,1] <- make.names(phenotype[,1])
traindata <- cbind.data.frame(traindata, phenotype=phenotype[match(row.names(traindata), phenotype[,1]),2])
traindata[is.na(traindata)] <- 0
# traindata <- na.roughfix(traindata)
names(traindata) <- make.names(names(traindata))
# Set the seed for reproducibility
set.seed(1)
# Initialize vectors to store error rates and sparsity
error <- vector()
sparsity <- vector()

# Convert phenotype to factor if it's categorical
if (!is.numeric(traindata$phenotype)) {
  traindata$phenotype <- as.factor(traindata$phenotype)
}

# Loop over different values of ntree
for(i in c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000)) {
# Fit the random forest model
  model <- randomForest(phenotype ~ ., data=traindata, ntree=i)
# Store the OOB error rate (for classification) or MSE (for regression)
  if (is.factor(traindata$phenotype)) {
    error <- c(error, model$err.rate[model$ntree, "OOB"])
  } else {
    error <- c(error, model$mse[model$ntree])
  }
# Calculate sparsity (fraction of features with zero importance)
  sparsity <- c(sparsity, (sum(model$importance[,1] <= 0)) / (ncol(traindata) - 1))
}
set.seed(1)
# varying ntree
error <- vector()
sparsity <- vector()
for(i in c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000)) { 
  model <- randomForest(phenotype ~ ., data=traindata, ntree=i)
  error <- c(error, model$err.rate[length(model$err.rate)])
  sparsity <- c(sparsity, (sum(model$importance[,1]<=0))/(ncol(traindata)-1))
}
# varying mtry
error2 <- vector()
sparsity2 <- vector()
param <- ncol(traindata)-1
for(i in c(1, round(param/10), round(param/5), round(param/3), round(param/2), param)) {
  model <- randomForest(phenotype ~ ., data=traindata, ntree=10000, mtry=i)
  error2 <- c(error2, model$err.rate[length(model$err.rate)])
  sparsity2 <- c(sparsity2, (sum(model$importance[,1]<=0))/(ncol(traindata)-1))
}
model <- randomForest(phenotype ~ ., data=traindata, ntree=10000, mtry=param/10)
png(paste(directory, "/model_training/m1_error_vs_ntree.png", sep=""), width=350, height = 350)
plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=error, xlab="Number of trees", ylab="OOB error rate", pch=16)
dev.off()
png(paste(directory, "/model_training/m1_sparsity_vs_ntree.png", sep=""), width=350, height = 350)
plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=sparsity, xlab="Number of trees", ylab="% genes uninformative", pch=16)
dev.off()
png(paste(directory, "/model_training/m1_error_vs_mtry.png", sep=""), width=350, height = 350)
plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=error2, xlab="Number of genes sampled per tree", ylab="OOB error rate", pch=16)
dev.off()
png(paste(directory, "/model_training/m1_sparsity_vs_mtry.png", sep=""), width=350, height = 350)
plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=sparsity2, xlab="Number of genes sampled per tree", ylab="% genes uninformative", pch=16)
dev.off()
train2 <- traindata[,match(names(model$importance[model$importance[,1]>0,]), colnames(traindata))]
train2 <- cbind(train2, phenotype=traindata$phenotype)
error <- vector()
sparsity <- vector()
for(i in c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000)) {
  model <- randomForest(phenotype ~ ., data=train2, ntree=i)
  error <- c(error, median(model$err.rate))
  sparsity <- c(sparsity, (sum(model$importance[,1]<=0))/(ncol(train2)-1))
}
error2 <- vector()
sparsity2 <- vector()
param <- ncol(train2)-1
for(i in c(1, round(param/10), round(param/5), round(param/3), round(param/2), param)) {
  model <- randomForest(phenotype ~ ., data=train2, ntree=10000, mtry=i)
  error2 <- c(error2, median(model$err.rate))
  sparsity2 <- c(sparsity2, (sum(model$importance[,1]<=0))/(ncol(train2)-1))
}
model <- randomForest(phenotype ~ ., data=train2, ntree=10000, mtry=param/10)
png(paste(directory, "/model_training/m2_error_vs_ntree.png", sep=""), width=350, height = 350)
plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=error, xlab="Number of trees", ylab="OOB error rate", pch=16)
dev.off()
png(paste(directory, "/model_training/m2_sparsity_vs_ntree.png", sep=""), width=350, height = 350)
plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=sparsity, xlab="Number of trees", ylab="% genes uninformative", pch=16)
dev.off()
png(paste(directory, "/model_training/m2_error_vs_mtry.png", sep=""), width=350, height = 350)
plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=error2, xlab="Number of genes sampled per tree", ylab="OOB error rate", pch=16)
dev.off()
png(paste(directory, "/model_training/m2_sparsity_vs_mtry.png", sep=""), width=350, height = 350)
plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=sparsity2, xlab="Number of genes sampled per tree", ylab="% genes uninformative", pch=16)
dev.off()
train3 <- train2[,match(names(model$importance[model$importance[,1]>quantile(model$importance[,1], 0.5),]), colnames(train2))]
train3 <- cbind(train3, phenotype=train2$phenotype)
error <- vector()
sparsity <- vector()
param <- ncol(train3)-1
for(i in c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000)) {
  model <- randomForest(phenotype ~ ., data=train3, ntree=i)
  error <- c(error, median(model$err.rate))
  sparsity <- c(sparsity, (sum(model$importance[,1]<=0))/(ncol(train3)-1))
}
error2 <- vector()
sparsity2 <- vector()
for(i in c(1, round(param/10), round(param/5), round(param/3), round(param/2), param)) {
  model <- randomForest(phenotype ~ ., data=train3, ntree=10000, mtry=i)
  error2 <- c(error2, median(model$err.rate))
  sparsity2 <- c(sparsity2, (sum(model$importance[,1]<=0))/(ncol(train3)-1))
}
model <- randomForest(phenotype ~ ., data=train3, ntree=10000, mtry=param/10)
png(paste(directory, "/model_training/m3_error_vs_ntree.png", sep=""), width=350, height = 350)
plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=error, xlab="Number of trees", ylab="OOB error rate", pch=16)
dev.off()
png(paste(directory, "/model_training/m3_sparsity_vs_ntree.png", sep=""), width=350, height = 350)
plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=sparsity, xlab="Number of trees", ylab="% genes uninformative", pch=16)
dev.off()
png(paste(directory, "/model_training/m3_error_vs_mtry.png", sep=""), width=350, height = 350)
plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=error2, xlab="Number of genes sampled per tree", ylab="OOB error rate", pch=16)
dev.off()
png(paste(directory, "/model_training/m3_sparsity_vs_mtry.png", sep=""), width=350, height = 350)
plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=sparsity2, xlab="Number of genes sampled per tree", ylab="% genes uninformative", pch=16)
dev.off()
train4 <- train3[,match(names(model$importance[model$importance[,1]>quantile(model$importance[,1], 0.5),]), colnames(train3))]
train4 <- cbind(train4, phenotype=train3$phenotype)
error <- vector()
sparsity <- vector()
for(i in c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000)) {
  model <- randomForest(phenotype ~ ., data=train4, ntree=i)
  error <- c(error, median(model$err.rate))
  sparsity <- c(sparsity, (sum(model$importance[,1]<=0))/(ncol(train4)-1))
}
error2 <- vector()
sparsity2 <- vector()
param <- ncol(train4)-1
for(i in c(1, round(param/10), round(param/5), round(param/3), round(param/2), param)) {
  model <- randomForest(phenotype ~ ., data=train4, ntree=10000, mtry=i)
  error2 <- c(error2, median(model$err.rate))
  sparsity2 <- c(sparsity2, (sum(model$importance[,1]<=0))/(ncol(train4)-1))
}
model <- randomForest(phenotype ~ ., data=train4, ntree=10000, mtry=param/10)
png(paste(directory, "/model_training/m4_error_vs_ntree.png", sep=""), width=350, height = 350)
plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=error, xlab="Number of trees", ylab="OOB error rate", pch=16)
dev.off()
png(paste(directory, "/model_training/m4_sparsity_vs_ntree.png", sep=""), width=350, height = 350)
plot(x=c(1, 10, 50, 250, 500, 1000, 1500, 2000, 5000, 10000), y=sparsity, xlab="Number of trees", ylab="% genes uninformative", pch=16)
dev.off()
png(paste(directory, "/model_training/m4_error_vs_mtry.png", sep=""), width=350, height = 350)
plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=error2, xlab="Number of genes sampled per tree", ylab="OOB error rate", pch=16)
dev.off()
png(paste(directory, "/model_training/m4_sparsity_vs_mtry.png", sep=""), width=350, height = 350)
plot(x=c(1, round(param/10), round(param/5), round(param/3), round(param/2), param), y=sparsity2, xlab="Number of genes sampled per tree", ylab="% genes uninformative", pch=16)
dev.off()
model$predicted
names(model$importance[order(model$importance, decreasing=T),][1:10])
save(model, train2, train3, train4, file=paste(directory, "/traindata.Rdata", sep=""))
set.seed(1)
param <- ncol(traindata)-1
model1 <- randomForest(phenotype ~ ., data=traindata, ntree=10000, mtry=param/10)
model1
png(paste(directory, "/VI_full.png", sep=""), width=400, height=350)
plot(1:param, model1$importance[order(model1$importance, decreasing=T)], xlim=c(1,1000), ylab="Variable importance", xlab="Top genes")
dev.off()

train2 <- traindata[,match(names(model1$importance[model1$importance[,1]>0,]), colnames(traindata))]
train2 <- cbind(train2, phenotype=traindata$phenotype)
param <- ncol(train2)-1
model2 <- randomForest(phenotype ~ ., data=train2, ntree=10000, mtry=param/10)
model2
train3 <- train2[,match(names(model2$importance[model2$importance[,1]>quantile(model2$importance[,1], 0.5),]), colnames(train2))]
train3 <- cbind(train3, phenotype=train2$phenotype)
param <- ncol(train3)-1
model3 <- randomForest(phenotype ~ ., data=train3, ntree=10000, mtry=param/10)
model3
train4 <- train3[,match(names(model3$importance[model3$importance[,1]>quantile(model3$importance[,1], 0.5),]), colnames(train3))]
train4 <- cbind(train4, phenotype=train3$phenotype)
param <- ncol(train4)-1
model4 <- randomForest(phenotype ~ ., data=train4, ntree=10000, mtry=param/10)
model4
train5 <- train4[,match(names(model4$importance[model4$importance[,1]>quantile(model4$importance[,1], 0.5),]), colnames(train4))]
train5 <- cbind(train5, phenotype=train4$phenotype)
param <- ncol(train5)-1
model5 <- randomForest(phenotype ~ ., data=train5, ntree=10000, mtry=param/10, proximity=T)
model5
train6 <- train5[,match(names(model5$importance[model5$importance[,1]>quantile(model5$importance[,1], 0.5),]), colnames(train5))]
train6 <- cbind(train6, phenotype=train5$phenotype)
param <- ncol(train6)-1
model6 <- randomForest(phenotype ~ ., data=train6, ntree=10000, mtry=param/10, proximity=T)
model6
train7 <- train6[,match(names(model6$importance[model6$importance[,1]>quantile(model6$importance[,1], 0.5),]), colnames(train6))]
train7 <- cbind(train7, phenotype=train6$phenotype)
param <- ncol(train7)-1
model7 <- randomForest(phenotype ~ ., data=train7, ntree=10000, mtry=param/10, proximity=T)
model7
model7$predicted
names(model7$importance[order(model7$importance[,1], decreasing=T),])[1:10]
png(paste(directory, "final_model.png", sep=""), width=400, height=350)
plot(1:param, model7$importance[order(model7$importance, decreasing=T),], xlab="", ylab="Variable importance")
dev.off()
save(model1, model2, model3, model4, model5,model6, model7, traindata, train2, train3, train4, train5, train6, train7, file=paste("finalmodel.Rdata", sep=""))

topgenes <- vector()
topgenes <- c(topgenes, colnames(traindata))
table(topgenes)
write(topgenes,"detected_topgenes_list_train1.csv", sep="")


topgenes <- vector()
topgenes <- c(topgenes, colnames(train2))
table(topgenes)
write(topgenes,"detected_topgenes_list_train2.csv", sep="")

topgenes <- vector()
topgenes <- c(topgenes, colnames(train3))
table(topgenes)
write(topgenes,"detected_topgenes_list_train3.csv", sep="")

topgenes <- vector()
topgenes <- c(topgenes, colnames(train4))
table(topgenes)
write(topgenes,"detected_topgenes_list_train4.csv", sep="")

topgenes <- vector()
topgenes <- c(topgenes, colnames(train5))
table(topgenes)
write(topgenes,"detected_topgenes_list_train5.csv", sep="")

topgenes <- vector()
topgenes <- c(topgenes, colnames(train6))
table(topgenes)
write(topgenes,"detected_topgenes_list_train6.csv", sep="")

topgenes <- vector()
topgenes <- c(topgenes, colnames(train7))
table(topgenes)
write(topgenes,"detected_topgenes_list_train7.csv", sep="")

votedata <- rbind.data.frame(cbind.data.frame(model=rep("1", 72), Serovar=row.names(model1$votes), Invasive=model1$votes[,2]), cbind.data.frame(model=rep("2", 72), Serovar=row.names(model1$votes), Invasive=model2$votes[,2]), cbind.data.frame(model=rep("3", 72), Serovar=row.names(model1$votes), Invasive=model3$votes[,2]), cbind.data.frame(model=rep("4", 72), Serovar=row.names(model1$votes), Invasive=model4$votes[,2]), cbind.data.frame(model=rep("5", 72), Serovar=row.names(model1$votes), Invasive=model5$votes[,2]), cbind.data.frame(model=rep("6", 72), Serovar=row.names(model1$votes), Invasive=model6$votes[,2]), cbind.data.frame(model=rep("7", 72), Serovar=row.names(model1$votes), Invasive=model7$votes[,2]))

votedata$Phenotype <- phenotype[match(votedata$Serovar, phenotype[,1]),2]
ggplot(votedata, aes(x=model, y=Invasive, col=Serovar)) + geom_jitter(width=0.1) + theme_classic(8) + ylab("Proportion of votes for non-BCG wild-type") + xlab("Model iteration") + geom_hline(yintercept=0.5, lty=2, col="grey") + theme(legend.key.size = unit(0.3, "cm"))
ggsave("votes.png")