##########use the bypt dataset (it is a full data set that I ignored before)
library(sas7bdat);
bypt.sails<-read.csv("/Users/zhang/Documents/work2017/ARDStrend/data/sails/data/csv/bypt.csv",header=T,sep=",")
bypt.Omega<-read.sas7bdat("/Users/zhang/Documents/work2017/ARDStrend/data/Omega/data/bypt.sas7bdat")
vars<-c("age","gender","admtype","admitfrom","reside",
    "surgel","icureadmit","chrondial","aids","leuk","lymph","tumor",
    "immune","hepa","cirr","diab","hyper","myocard","heart","vascular",
    "aestroke","dementia","chrpulm","arthritis","ulcer","vasol24",
    "templ","temph","sysbpl","sysbph","mapl","maph","hratel",
    "hrateh","respl","resph","ventl","venth","urineout_0","fluidout_0",
    "fluidin_0","hctl","hcth","wbcl","wbch","plate","sodiuml", "sodiumh",
    "potasl", "potash","bun","creatl", "creath","glucl", "gluch",
    "albuml", "albumh","bilih","bicarbl","ck_0","alt_0","ast",
    "crp_0","simv_0","prvc_0","pressup_0","volassist_0",
    "pcirv_0","aprv_0","ventoth_0","hfov_0","tidal_0","setrate_0",
    "resp_0","minvent_0","peep_0","fio2_0","SpO2_0","pplat_0",
    "pip_0","meanair_0","status")
bypt.sails$study.tag <- 'sails'; 
bypt.Omega$study.tag <- 'Omega';
dfall <- rbind(bypt.sails[,c(vars,'study.tag')],
               bypt.Omega[,c(vars,'study.tag')])
mvname1<-c("simv_0","prvc_0","pressup_0","volassist_0",
          "pcirv_0","aprv_0","ventoth_0","hfov_0")
for(var in mvname1){
  dfall[,var]<-ifelse(is.na(dfall[,var]),0,1)
}
dfall <- data.frame(lapply(dfall, function(xx){
  if(length(unique(xx))<=10){
    ddd <- as.factor(xx)
  }else{ddd <- xx}
  return(ddd)
}))
dfall$status <- factor(ifelse(dfall$status==2,2,1),
                  labels=c("alive","die"))
#

##show the difference between the two datasets
library(tableone)
tableone <- CreateTableOne(vars=vars,strata = 'study.tag',dfall)
print(tableone,
      quote = T)

library(mice)# to perform multiple imputation
set.seed(123)#allow replication
dfm <- mice(dfall,m=1)
df <- mice::complete(dfm, action=1L)
names(df)[2]<-'sex';
names(df) <- gsub('_0','',names(df))
##split the dataset
trainSet <- df[df$study.tag=='sails',]
testSet <- df[df$study.tag=='Omega',]
library(dummies)
admtype<-data.frame(dummy(trainSet$admtype))
admitfrom<-data.frame(dummy(trainSet$admitfrom))
reside<-data.frame(dummy(trainSet$reside))
sex<-data.frame(dummy(trainSet$sex))
trainSet1<-cbind(trainSet[,-c(2:5)],admtype,admitfrom,reside,sex)
y<-trainSet1$status
X<-t(subset(trainSet1,select=-c(status,study.tag)))

###bild a model
library(galgo)
df.nc <- configBB.VarSel(
data=X,
classes=y,
classification.method="nnet",
chromosomeSize=15,
maxSolutions=200,
goalFitness = 0.77,
saveVariable="df.nc",
saveFrequency=30,
saveFile="df.nc.Rdata")
blast(df.nc)
plot(df.nc, type="fitness")# Evolution of the maximum fitness across generations
plot(df.nc, type="confusion")#overall acuracy
plot(df.nc, type="confusion", set=c(1,0), splits=1,
     filter="solutions")#evaluate the error in the first training set
rchr <- lapply(df.nc$bestChromosomes[1:10],
     robustGeneBackwardElimination, df.nc, result="shortest")
fsm <- forwardSelectionModels(df.nc)#Developing Representative Models
heatmapModels(df.nc, fsm, subset=2)#view the best model 
##logistic regression model
library(MASS);
full.model <- glm(status ~., data = trainSet[,-83],family = 'binomial')
step.model <- stepAIC(full.model, direction = "both", 
                      scope = list(lower = ~1),
                      trace = FALSE)
summary(step.model)
modLogit <- glm(status~.,
                data = trainSet[,all.vars(formula(step.model))],
                family = 'binomial')
pred.Logit <- predict(modLogit,testSet,type='response')
rocLogit <- roc(testSet$status,pred.Logit)
ci.auc(rocLogit)
ShowRegTable(modLogit,quote = T)
##univariate analysis in overall cohorts
tableCom <- CreateTableOne(vars=xvars,strata = 'status',dfall)
print(tableCom,quote = T)