library(readr) brca_metabric_clinical_data <- read_tsv("/Users/goncamert/Desktop/Gonca research/comparison of coxph and random survival forest/metabric dataset and papers/brca_metabric_clinical_data.tsv") View(brca_metabric_clinical_data) br <- data.frame(brca_metabric_clinical_data) library(dplyr) ###########data preprocessing########## brnew <- br[complete.cases(br$Patient.s.Vital.Status),] brnew$age <- as.integer(round(brnew$Age.at.Diagnosis)) brnew1 <- brnew %>% select(-Study.ID, -Sample.ID,-Integrative.Cluster, -Primary.Tumor.Laterality,-Oncotree.Code, -Age.at.Diagnosis,-Type.of.Breast.Surgery, -Cancer.Type, -Pam50...Claudin.low.subtype, -Cancer.Type.Detailed, -Cohort, -ER.status.measured.by.IHC, -HER2.status.measured.by.SNP6, -Tumor.Other.Histologic.Subtype, -Number.of.Samples.Per.Patient,-Sample.Type, -Sex, -Patient.s.Vital.Status, -X3.Gene.classifier.subtype) brnew1$Cellularity <- factor(brnew1$Cellularity, levels = c("Low", "Moderate", "High")) brnew1$Chemotherapy <- factor(brnew1$Chemotherapy, levels = c("NO", "YES")) brnew1$ER.Status <- factor(brnew1$ER.Status, levels = c("Negative", "Positive")) brnew1$HER2.Status <- factor(brnew1$HER2.Status, levels = c("Negative", "Positive")) brnew1$Hormone.Therapy <- factor(brnew1$Hormone.Therapy, levels = c("NO", "YES")) brnew1$Inferred.Menopausal.State <- factor(brnew1$Inferred.Menopausal.State, levels = c("Post", "Pre")) brnew1$PR.Status <- factor(brnew1$PR.Status, levels = c("Negative", "Positive")) brnew1$Radio.Therapy <- factor(brnew1$Radio.Therapy, levels = c("NO", "YES")) brnew1$Tumor.Stage[brnew1$Tumor.Stage==4] <-3 brnew1$Tumor.Stage[brnew1$Tumor.Stage==0] <-1 brnew1$Tumor.Stage <- factor(brnew1$Tumor.Stage) brnew1$Relapse.Free.Status <- substr(brnew1$Relapse.Free.Status, 1,1) brnew1$Relapse.Free.Status<- as.integer(brnew1$Relapse.Free.Status) brnew1$Overall.Survival.Status <- substr(brnew1$Overall.Survival.Status, 1,1) brnew1$Neoplasm.Histologic.Grade <- factor(brnew1$Neoplasm.Histologic.Grade) #brnew1 <- subset(brnew1, select=-c(Overall.Survival.Status, Overall.Survival..Months.)) #####delete missing values brn <- na.omit(brnew1) na_count <-sapply(brn, function(y) sum(length(which(is.na(y))))) ###pool the event ii <- brn$Relapse.Free.Status==1 & brn$Overall.Survival.Status==1 io <- brn$Relapse.Free.Status==1 & brn$Overall.Survival.Status==0 oi <- brn$Relapse.Free.Status==0 & brn$Overall.Survival.Status==1 oo <- brn$Relapse.Free.Status==0 & brn$Overall.Survival.Status==0 brn$time <- rep(0, dim(brn)[1]) brn$time[ii] <- brn$Relapse.Free.Status..Months.[ii] brn$time[io] <- brn$Relapse.Free.Status..Months.[io] brn$time[oi] <- brn$Overall.Survival..Months.[oi] brn$time[oo] <- brn$Overall.Survival..Months.[oo] brn$time <- 30*brn$time brn$Status <- rep(0, dim(brn)[1]) brn$Status[!oo] <- 1 brn <- brn %>% select(-c(Patient.ID, Relapse.Free.Status, Relapse.Free.Status..Months., Overall.Survival.Status, Overall.Survival..Months., TMB..nonsynonymous.)) ####OUR DATA IS brn #############baseline characteristics of the data brn %>% group_by(Status) %>% summarise(avg = mean(age)) brn %>% group_by(Status) %>% summarise(avg = sd(age)) brn %>% group_by(Status) %>% summarise(avg = mean(Lymph.nodes.examined.positive)) brn %>% group_by(Status) %>% summarise(avg = sd(Lymph.nodes.examined.positive)) brn %>% group_by(Status) %>% summarise(avg1= mean(Mutation.Count), avg = sd(Mutation.Count)) brn %>% group_by(Status) %>% summarise(avg1= mean(Nottingham.prognostic.index), avg = sd(Nottingham.prognostic.index)) brn %>% group_by(Status) %>% summarise(avg1= mean(Tumor.Size), avg = sd(Tumor.Size)) brn %>% group_by(Status) %>% summarise(percent = Radio.Therapy/sum(Radio.Therapy)) table(brn$ER.Status, brn$Status) ##------------------------------------------------------------ ## compare RF-SRC to Cox regression ## Illustrates C-index and Brier score measures of performance ## assumes "pec" and "survival" libraries are loaded ##------------------------------------------------------------ library(survival) library(pec) library(party) library(randomForestSRC) library(prodlim) library(ipred) library(vip) trainrows <- sample(1:nrow(brn), .7*nrow(brn)) # GBSG2 <- GBSG2 %>% mutate(event = factor(cens, # labels = c("censored", "failed"))) #table(GBSG2$event) #GBSG2 <- GBSG2[,-10] surv.f.br <- as.formula(Surv(time, Status) ~.) pec.f.br <- as.formula(Hist(time, Status) ~ 1) ## run cox/rfsrc models ## for illustration we use a small number of trees cox.obj.br <- coxph(surv.f.br, data = brn, x = TRUE) rfsrc.obj.br <- rfsrc(surv.f.br, brn, ntree = 1000) fitcforest.br <- pecCforest(surv.f.br, data = brn, controls = cforest_classical(ntree = 1000)) ### estimate conditional Kaplan-Meier curves bst.br <- cforest(Surv(time, Status) ~ ., data = brn, control = cforest_unbiased(ntree = 50)) treeresponse(bst, newdata = GBSG2[1:2,], OOB = TRUE) ### if you can't resist to look at individual trees ... party:::prettytree(bst@ensemble[[1]], names(bst@data@get("input"))) vi <- varimp(bst.br) ### compare variable importances and absolute z-statistics layout(matrix(1:2)) barplot(vi) barplot(abs(summary(coxph(Surv(time, cens) ~ ., data = GBSG2))$coeff[,"z"])) ### looks more or less the same plot.variable(rfsrc.obj, surv.type = "rel.freq", partial = TRUE) vip(bst.br)#Variable importance for survival forests (cforest) bst.importance <- varimp(bst.br, conditional = TRUE) names(bst.importance) <- c("Cell", "Chemo", "ER", "Neo", "HER2", "HormoneT", "Menostate","Lymph", "Mut.count","Notti.Prog", "PR", "Radio", "Tsize", "Tgrade", "Age") par(mfrow=c(1,1)) dotchart(sort(bst.importance), main = "Conditional Importance of Variables for METABRIC Data") rsf.cindex.br <- get.cindex(time = brn$time, censoring = brn$Status, predicted = rfsrc.obj.br$predicted.oob) calPlot(rfsrc.obj, data = ) newData.br <- data.frame(Cellularity=factor("Moderate",levels=c("Low", "Moderate", "High")), Chemotherapy=factor("NO",levels=c("NO", "YES")), ER.Status=factor("Negative",levels=c("Negative", "Positive")), Neoplasm.Histologic.Grade=factor(2, levels=c(1,2,3)), HER2.Status=factor("Negative",levels=c("Negative", "Positive")), Hormone.Therapy=factor("NO",levels=c("NO", "YES")), age=as.integer(c(24,53,73)), Inferred.Menopausal.State=factor(cbind("Pre", "Post", "Post"),levels=c("Post","Pre")), Lymph.nodes.examined.positive =rep(3, 3), Mutation.Count=rep(4,3), Nottingham.prognostic.index=rep(4.12,3), PR.Status=factor("Negative",levels=c("Negative", "Positive")), Radio.Therapy=factor("NO",levels=c("NO", "YES")), TMB..nonsynonymous. = rep(7.16, 3), Tumor.Size=as.numeric(rep(25, 3)), Tumor.Stage=factor(2,levels=c(1,2,3))) print(newData.br) cox.obj2.br <- coxph(surv.f.br, data = brn[trainrows,], x = TRUE) rfsrc.obj2.br <- rfsrc(surv.f.br, brn[trainrows,], ntree = 1000) fitcforest2.br <- pecCforest(surv.f.br, data = brn[trainrows,], controls = cforest_classical(ntree = 1000)) cforestImpPlot <- function(x) { cforest_importance <<- v <- varimp(x) dotchart(v[order(v)]) } cforestImpPlot(bst.br) ######################################################################################## #################predicted survival curves for newdata############################# ######################################################################################## pcox <- predictSurvProb(cox.obj.br, newdata = newData.br, times = quantile(brn$time)) prsf <- predictSurvProb(rfsrc.obj.br, newdata = newData.br, times = quantile(brn$time)) pcf <- predictSurvProb(fitcforest.br, newdata = newData.br, times = quantile(brn$time)) psp <- round(cbind(pcox,prsf, pcf),2) newNames <- paste("newData", 1:3) mat <- cbind( newData$age, psp) colnames(mat) <- c( "Age", rep("Coxph", 5), rep("rsf",5), rep("cforest",5)) mat1 <- data.frame(newNames, mat) print(mat1) par(mfrow = c(1, 3)) lapply(1:3, function(x) { plotPredictSurvProb(cox.obj.br, newdata = newData.br[x, ], col = 1) plotPredictSurvProb(rfsrc.obj.br, newdata = newData.br[x, ], add = TRUE, col = 2) plotPredictSurvProb(fitcforest.br, newdata = newData.br[x, ], add = TRUE,col = 3) }) legend(x = "bottom",inset = 0, legend = c("Cox PH", "randomSurvivalForest", "cforest"), lwd = 3, col=1:3,bty="n",cex=1.1) ######################################################################################## ######################################################################################### ######################################################################################## ######################################################################################## ##################pec curves for bootstrap and bootstrap .632+ estimates############# ######################################################################################## ## compute bootstrap cross-validation estimate of expected Brier score ## see Mogensen, Ishwaran and Gerds (2012) Journal of Statistical Software set.seed(13) ptm <- proc.time() prederror.brn <- pec(list(cox.obj2.br,rfsrc.obj2.br, fitcforest2.br), data = brn[-trainrows,], formula = pec.f.br, splitMethod = "bootcv", B = 500, M=200) print(prederror.brn, times = seq(8, 2659,50)) plot(prederror.brn) stoptime<- proc.time() - ptm # Stop the clock print(stoptime[3]/60) set.seed(13) ptm <- proc.time() prederror.brn.632 <- pec(list(cox.obj2.br,rfsrc.obj2.br, fitcforest2.br), data = brn[-trainrows,], formula = pec.f.br, splitMethod = "Boot632", B = 500, M=200) print(prederror.brn.632, times = seq(8, 2659,50)) plot(prederror.brn.632) stoptime<- proc.time() - ptm # Stop the clock print(stoptime[3]/60) plot(prederror.brn, legend=FALSE) legend("bottomright", legend=c("Reference", "Cox PH","RSF", "Cforest"), col=c("black", "red","green", "blue"), lty=1) title(main = "METABRIC bootstrap estimates") plot(prederror.brn.632, legend=FALSE) legend("bottomright", legend=c("Reference", "Cox PH","RSF", "Cforest"), col=c("black", "red","green", "blue"), lty=1) title(main = "METABRIC bootstrap.632 estimates") ######################################################################################## ######################################################################################### ######################################################################################## ######################################################################################### ###########cindex############################################################################## ######################################################################################## # compute the bootstrap-crossvalidation estimate of # the C-index at different time points # ptm <- proc.time() set.seed(13) bcvCindex.br <- pec::cindex(list("Cox"=cox.obj.br, "RSF"=rfsrc.obj.br, "Cforest"= fitcforest.br), formula=surv.f.br, data=brn, splitMethod="Bootcv", B = 500, M=200, eval.times=c(50,quantile(brn$time)[-1])) bcvCindex.632.br <- pec::cindex(list("Cox"=cox.obj.br, "RSF"=rfsrc.obj.br, "Cforest"= fitcforest.br), formula=surv.f.br, data=brn, splitMethod="Boot632", B = 500, M=200, eval.times=c(50,quantile(brn$time)[-1])) print(bcvCindex.br) par(mfrow = c(2,2)) plot(bcvCindex.br, legend=FALSE) legend("topleft", legend=c("Cox PH","RSF", "Cforest"), col=c("black", "red","green"), lty=1) title(main = "METABRIC bootstrap estimates") print(bcvCindex.632.br) plot(bcvCindex.632.br, legend=FALSE) legend("topleft", legend=c("Cox PH","RSF", "Cforest"), col=c("black", "red","green"), lty=1) title(main = "METABRIC bootstrap .632 estimates") stoptime<- proc.time() - ptm # Stop the clock print(stoptime[3]/60) ######################################################################################## ######################################################################################### ######################################################################################## #Rkare <- R2(prederror.GBSG2,times=seq(8, 2659,50)) ######calibration plot cizdirmeye calistim ama olmadi.... #veriyi train diye bolup oyle model kurdum hata bulmaya calisiyorum set.seed(14) prederror2lam <- pec(list(cox.obj2,rfsrc.obj2, fitcforest2), data = GBSG2[trainrows,], formula = pec.f, splitMethod = "bootcv", B = 50) print(prederror2) plot(prederror2) # oo <- subsample(rfsrc.obj) # # take a delete-d-jackknife procedure for example # vimpCI <- extract.subsample(oo)$var.jk.sel.Z # # Confidence Intervals for VIMP # plot.subsample(oo) # # take the variable "Month" for example for partial plot # plot.variable(rfsrc.obj, xvar.names = "days", partial = TRUE) ## compute out-of-bag C-index for cox regression and compare to rfsrc rfsrc.obj.brr <- rfsrc(surv.f.br, brn) cat("out-of-bag Cox Analysis ...", "\n") cox.err <- sapply(1:100, function(b) { if (b%%10 == 0) cat("cox bootstrap:", b, "\n") train <- sample(1:nrow(brn), nrow(brn), replace = TRUE) cox.obj <- tryCatch({coxph(surv.f.br, brn[train, ])}, error=function(ex){NULL}) if (!is.null(cox.obj.br)) { get.cindex(brn$time[-train], brn$Status[-train], predict(cox.obj.br, brn[-train, ])) } else NA }) cat("\n\tOOB error rates\n\n") cat("\tRSF : ", rfsrc.obj.br$err.rate[rfsrc.obj.br$ntree], "\n") cat("\tCox regression : ", mean(cox.err, na.rm = TRUE), "\n") ## illustrates the various splitting rules ## illustrates event specific and non-event specific variable selection ## events are transplant (1) and death (2) GBSG2$id <- NULL ## modified Gray's weighted log-rank splitting ## (equivalent to cause=c(1,1) and splitrule="logrankCR") GBSG2.cr <- rfsrc(Surv(time, cens) ~ ., GBSG2) ## log-rank cause-1 specific splitting and targeted VIMP for cause 1 GBSG2.log1 <- rfsrc(Surv(time, cens) ~ ., GBSG2, splitrule = "logrank",importance="permute") brn.new <- brn colnames(brn.new) <- c("Cell", "Chemo", "ER", "Neo", "HER2", "HormoneT", "Menostate","Lymph", "Mut.count","Notti.Prog", "PR", "Radio", "Tsize", "Tgrade", "Age", "time", "Status") brn.log1 <- rfsrc(Surv(time, Status) ~ ., brn.new, importance="permute") brn.cr <- rfsrc(Surv(time, Status) ~ ., brn.new) plot(brn.log1)# bunu ekledim sonuclara ## log-rank cause-2 specific splitting and targeted VIMP for cause 2 #GBSG2.log2 <- rfsrc(Surv(time, cens) ~ ., GBSG2, # splitrule = "logrank", cause = c(0,1), importance = TRUE) ## extract VIMP from the log-rank forests: event-specific ## extract minimal depth from the Gray log-rank forest: non-event specific var.perf.br <- data.frame(md = max.subtree(brn.cr)$order[, 1], vimp = 100 * brn.log1$importance) lap.br <- print(var.perf.br[order(var.perf.br$vimp), ]) stargazer(cox.obj,summary=TRUE, type = "text", title = "var per", out="cox.obj.txt") plot(GBSG2.log1) p<-ggplot(data=lap.br, aes(x=md, y=vimp)) + geom_bar(stat="identity")+ geom_text(aes(label=vimp), vjust=1.6, color="white", size=3.5)+ theme_minimal() p + coord_flip() # Inside bars ggplot(data=df, aes(x=dose, y=len)) + geom_bar(stat="identity", fill="steelblue")+ geom_text(aes(label=len), vjust=1.6, color="white", size=3.5)+ theme_minimal() ############print results A <- summary(cox.obj.br) write.table(round(A$coefficients,4), file = "cox.obj.br.txt", sep = ",", quote = FALSE) B <- as.data.frame(t(mat1)) write.table(B, "metabric.newdata.txt", sep = ",", quote = FALSE) A2 <- summary(cox.obj2.br) write.table(round(A2$coefficients,4), file = "cox.obj.br2.txt", sep = ",", quote = FALSE) save.image("cforest.rf_br.RData") save.image("cforest.rf_brandGBSG2.RData")