##########use the bypt dataset (it is a full data set that I ignored before) library(sas7bdat); bypt.sails<-read.csv("/Users/zhang/Documents/work2017/ARDStrend/data/sails/data/csv/bypt.csv",header=T,sep=",") bypt.Omega<-read.sas7bdat("/Users/zhang/Documents/work2017/ARDStrend/data/Omega/data/bypt.sas7bdat") vars<-c("age","gender","admtype","admitfrom","reside", "surgel","icureadmit","chrondial","aids","leuk","lymph","tumor", "immune","hepa","cirr","diab","hyper","myocard","heart","vascular", "aestroke","dementia","chrpulm","arthritis","ulcer","vasol24", "templ","temph","sysbpl","sysbph","mapl","maph","hratel", "hrateh","respl","resph","ventl","venth","urineout_0","fluidout_0", "fluidin_0","hctl","hcth","wbcl","wbch","plate","sodiuml", "sodiumh", "potasl", "potash","bun","creatl", "creath","glucl", "gluch", "albuml", "albumh","bilih","bicarbl","ck_0","alt_0","ast", "crp_0","simv_0","prvc_0","pressup_0","volassist_0", "pcirv_0","aprv_0","ventoth_0","hfov_0","tidal_0","setrate_0", "resp_0","minvent_0","peep_0","fio2_0","SpO2_0","pplat_0", "pip_0","meanair_0","status") bypt.sails$study.tag <- 'sails'; bypt.Omega$study.tag <- 'Omega'; dfall <- rbind(bypt.sails[,c(vars,'study.tag')], bypt.Omega[,c(vars,'study.tag')]) mvname1<-c("simv_0","prvc_0","pressup_0","volassist_0", "pcirv_0","aprv_0","ventoth_0","hfov_0") for(var in mvname1){ dfall[,var]<-ifelse(is.na(dfall[,var]),0,1) } dfall <- data.frame(lapply(dfall, function(xx){ if(length(unique(xx))<=10){ ddd <- as.factor(xx) }else{ddd <- xx} return(ddd) })) dfall$status <- factor(ifelse(dfall$status==2,2,1), labels=c("alive","die")) # ##show the difference between the two datasets library(tableone) tableone <- CreateTableOne(vars=vars,strata = 'study.tag',dfall) print(tableone, quote = T) library(mice)# to perform multiple imputation set.seed(123)#allow replication dfm <- mice(dfall,m=1) df <- mice::complete(dfm, action=1L) names(df)[2]<-'sex'; names(df) <- gsub('_0','',names(df)) ##split the dataset trainSet <- df[df$study.tag=='sails',] testSet <- df[df$study.tag=='Omega',] library(dummies) admtype<-data.frame(dummy(trainSet$admtype)) admitfrom<-data.frame(dummy(trainSet$admitfrom)) reside<-data.frame(dummy(trainSet$reside)) sex<-data.frame(dummy(trainSet$sex)) trainSet1<-cbind(trainSet[,-c(2:5)],admtype,admitfrom,reside,sex) y<-trainSet1$status X<-t(subset(trainSet1,select=-c(status,study.tag))) ###bild a model library(galgo) df.nc <- configBB.VarSel( data=X, classes=y, classification.method="nnet", chromosomeSize=15, maxSolutions=200, goalFitness = 0.77, saveVariable="df.nc", saveFrequency=30, saveFile="df.nc.Rdata") blast(df.nc) plot(df.nc, type="fitness")# Evolution of the maximum fitness across generations plot(df.nc, type="confusion")#overall acuracy plot(df.nc, type="confusion", set=c(1,0), splits=1, filter="solutions")#evaluate the error in the first training set rchr <- lapply(df.nc$bestChromosomes[1:10], robustGeneBackwardElimination, df.nc, result="shortest") fsm <- forwardSelectionModels(df.nc)#Developing Representative Models heatmapModels(df.nc, fsm, subset=2)#view the best model ##logistic regression model library(MASS); full.model <- glm(status ~., data = trainSet[,-83],family = 'binomial') step.model <- stepAIC(full.model, direction = "both", scope = list(lower = ~1), trace = FALSE) summary(step.model) modLogit <- glm(status~., data = trainSet[,all.vars(formula(step.model))], family = 'binomial') pred.Logit <- predict(modLogit,testSet,type='response') rocLogit <- roc(testSet$status,pred.Logit) ci.auc(rocLogit) ShowRegTable(modLogit,quote = T) ##univariate analysis in overall cohorts tableCom <- CreateTableOne(vars=xvars,strata = 'status',dfall) print(tableCom,quote = T)