library(rstan)
library(dplyr)
library(stringr)
library(ggplot2)
library(reshape2)
library(Hmisc)
library(ggpubr)

## Constrained ("no additive interaction") model fitted with Stan:
## the constraint is exp(beta[3]) = exp(beta[1]) + exp(beta[2]) - 1.
add_code <- "
data{
  int N;
  int P;
  int y[N];
  matrix[N,P] x;
}
parameters{
  real alpha;
  vector[P] beta;
}
transformed parameters{
  vector[P] tbeta = beta;
  // beta[3] is replaced by tbeta[3], so it does not enter the likelihood
  tbeta[3] = log(exp(beta[1]) + exp(beta[2]) - 1);
}
model{
  y ~ bernoulli_logit(alpha + x*tbeta);
}
"
model_obj <- stan_model(model_code = add_code, model_name = 'add')
save(model_obj, file = "model_add.RData")

## Unconstrained estimation (classical logistic regression), consistent with the glm() fit.
addno_code <- "
data{
  int N;
  int P;
  int y[N];
  matrix[N,P] x;
}
parameters{
  real alpha;
  vector[P] beta;
}
model{
  y ~ bernoulli_logit(alpha + x*beta);
}
"

### delta method ###
# beta : coefficient vector of length 3 (intercept omitted)
# bvar : 3 x 3 covariance matrix of beta
delta_method <- function(beta, bvar){
  RERI <- sum(exp(beta)*c(-1,-1,1)) + 1
  AP   <- RERI/exp(beta[3])
  S    <- (exp(beta[3])-1)/(sum(exp(beta[1:2]))-2)
  # gradients of RERI, AP and log(S) with respect to beta
  h1 <- exp(beta)*c(-1,-1,1)
  h2 <- c(-exp(beta[1:2])/exp(beta[3]), (sum(exp(beta[1:2]))-1)/exp(beta[3]))
  h3 <- c(-exp(beta[1:2])/(sum(exp(beta[1:2]))-2), exp(beta[3])/(exp(beta[3])-1))
  h  <- rbind(h1, h2, h3)
  dv <- diag(h %*% bvar %*% t(h))
  point <- c(RERI, AP, S)
  up  <- c(RERI, AP, log(S)) + 1.96*sqrt(dv)
  up[3] <- exp(up[3])
  low <- c(RERI, AP, log(S)) - 1.96*sqrt(dv)
  low[3] <- exp(low[3])
  res <- c(point, low, up)
  names(res) <- c("d_RERI","d_AP","d_S","d_RERI_low","d_AP_low","d_S_low","d_RERI_up","d_AP_up","d_S_up")
  return(res)
}

# x : design matrix without the intercept column
bootstrap_add <- function(x, y, nb = 300){
  boot_fun <- function(){
    # stratified resampling within cases and controls
    indices <- c(sample(which(glm_obj$y == 1), replace = TRUE),
                 sample(which(glm_obj$y == 0), replace = TRUE))
    coef <- update(glm_obj, data = df_b[indices, ])$coef[2:4]
    coef
  }
  df_b <- data.frame(x = x, y = y)
  glm_obj <- glm(y ~ ., data = df_b, family = binomial())
  b_res <- t(replicate(nb, boot_fun()))
  add_res <- as.data.frame(b_res) %>%
    transmute(RERI = exp(x.fx4) - exp(x.fx3) - exp(x.fx2) + 1,
              AP   = RERI/exp(x.fx4),
              # abs() guards against negative values of S: the measure is only
              # consistent when the lowest-risk category is the reference
              S    = abs((exp(x.fx4)-1)/(exp(x.fx3)+exp(x.fx2)-2)))
  boot_res <- as.vector(t(apply(add_res, 2, quantile, probs = c(0.5, 0.025, 0.975), na.rm = TRUE)))
  names(boot_res) <- c("b_RERI","b_AP","b_S","b_RERI_low","b_AP_low","b_S_low","b_RERI_up","b_AP_up","b_S_up")
  return(boot_res)
}
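## Illustrative check of delta_method() (not part of the original analysis;
## the coefficient values and covariance matrix below are made up): with
## OR_A = 2, OR_B = 3 and OR_AB = 4 the joint effect is exactly additive,
## so the point estimates should be RERI = 0, AP = 0 and S = 1.
demo_beta <- log(c(2, 3, 4))
demo_vcov <- diag(0.05, 3)
delta_method(beta = demo_beta, bvar = demo_vcov)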
# beta  : true log odds ratios; the intercept is omitted and the first three
#         elements are the effects of interest
# nsize : the sample size
# nsimulation : the number of simulated data sets
generate_data <- function(beta, nsize, ith = 1, nsimulation = 1000, nb = 300, seed = 123){
  # generate the design matrix x, whose dimension depends on the length of beta
  np <- length(beta)
  set.seed(seed = seed)
  fx <- as.factor(sample(1:4, nsize, replace = TRUE))
  fx_matrix <- model.matrix(~fx)[, -1]
  cx <- matrix(rnorm(nsize*(np-3), mean = 0, sd = 1), nrow = nsize)  # extra continuous covariates, if any
  if(np > 3) x <- cbind(fx_matrix, cx) else x <- fx_matrix
  y <- matrix(rbinom(nsize*nsimulation, 1, prob = plogis(scale(x %*% beta))), ncol = nsimulation)
  result <- lapply(1:nsimulation, function(index){
    dataf <- list(yi = y[, index], x = x)
    glm_fit <- glm(yi ~ x, data = dataf, family = binomial())
    delta_res <- delta_method(beta = glm_fit$coefficients[2:4], bvar = vcov(glm_fit)[2:4, 2:4])
    boot_res  <- bootstrap_add(x = dataf$x, y = dataf$yi, nb = nb)
    stan_data <- list(N = nsize, P = np, y = dataf$yi, x = dataf$x)
    stan_fit  <- optimizing(model_obj, data = stan_data,
                            init = list(alpha = coef(glm_fit)[1], beta = coef(glm_fit)[-1]))
    # the constrained coefficients tbeta occupy positions (np+2):(np+4) of the parameter vector
    res <- c(glm_fit$coefficients[2:4], glm_fit$deviance, stan_fit$par[(np+2):(np+4)], -2*stan_fit$value)
    res_names <- c('glm_a','glm_b','glm_ab','glm_deviance','stan_a','stan_b','stan_ab','stan_deviance')
    names(res) <- res_names
    if(index %% 10 == 0) print(index)
    return(c(res, delta_res, boot_res))
  })
  df <- do.call("rbind", result)
  df_dir <- paste("./result/n", nsize, sep = "")
  if(!dir.exists(df_dir)) dir.create(df_dir)
  save_name <- paste(df_dir, "/simulation_", ith, ".csv", sep = "")
  write.csv(df, save_name)
  return(df)
}

## simulated data
## choose one sample size per run (results are written to ./result/n<nsize>/)
# simulation setting 1
nsize <- 600
# simulation setting 2
# nsize <- 1000
# simulation setting 3
# nsize <- 400
simulate_1 <- generate_data(beta = log(c(4,5,20)), nsize = nsize, ith = 1)
simulate_2 <- generate_data(beta = log(c(4,5,16)), nsize = nsize, ith = 2)
simulate_3 <- generate_data(beta = log(c(4,5,12)), nsize = nsize, ith = 3)
simulate_4 <- generate_data(beta = log(c(4,5,8)),  nsize = nsize, ith = 4)
simulate_5 <- generate_data(beta = log(c(4,5,6)),  nsize = nsize, ith = 5)
simulate_6 <- generate_data(beta = log(c(4,5,4)),  nsize = nsize, ith = 6)
simulate_7 <- generate_data(beta = log(c(4,5,2)),  nsize = nsize, ith = 7)

## read in the simulation results
df_files <- list.files(path = "./result", full.names = TRUE, recursive = TRUE)
# df_file <- df_files[1]

# compute the proportion of simulations in which no interaction is declared
# df_file : path of a result file
# nsize   : sample size used for that file
show_res <- function(df_file, nsize){
  df <- read.csv(df_file)
  as.tbl(df) %>%
    mutate(ic_add  = stan_deviance - glm_deviance,
           aic_add = ic_add < 2,
           bic_add = ic_add < log(nsize),
           hq_add  = ic_add < 2*log(log(nsize)),
           dre_add = d_RERI_low*d_RERI_up < 0,
           dap_add = d_AP_low*d_AP_up < 0,
           ds_add  = (d_S_low-1)*(d_S_up-1) < 0,
           bre_add = b_RERI_low*b_RERI_up < 0,
           bap_add = b_AP_low*b_AP_up < 0,
           bs_add  = (b_S_low-1)*(b_S_up-1) < 0) %>%
    select(ends_with("_add")) %>%
    summarise_all(mean)
}

df_res <- NULL  # accumulator for the per-file summaries
nsizes <- as.numeric(do.call("c", str_extract_all(do.call("rbind", strsplit(df_files, "/"))[, 3], "[0-9]+[0-9]")))
for(i in 1:length(df_files)){
  df_res <- rbind(df_res, show_res(df_files[i], nsizes[i]))
}
df_res <- cbind(df_res, nsizes)

vars <- c(
  AIC = "aic_add", BIC = "bic_add", HQ = "hq_add",
  RERI_Delta = "dre_add", AP_Delta = "dap_add", S_Delta = "ds_add",
  RERI_Bootstrap = "bre_add", AP_Bootstrap = "bap_add", S_Bootstrap = "bs_add"
)
df_plot_alpha <- df_res %>%
  filter(aic_add > 0.7) %>%
  select(-ic_add) %>%
  rename(!!!vars) %>%
  melt(id.vars = c("nsizes")) %>%
  mutate(nsizes = as.factor(nsizes))
plot_alpha <- ggplot(df_plot_alpha, aes(x = variable, y = value, group = nsizes)) +
  geom_line(aes(linetype = nsizes), size = 1) +
  geom_point(size = 2) +
  theme_bw() +
  theme(legend.position = c(0.9, 0.85),
        legend.text = element_text(size = 15),
        legend.key.size = unit(1, 'cm'),
        legend.title = element_text(size = 15),
        axis.text = element_text(size = 15),
        title = element_text(size = 15),
        panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black")) +
  labs(title = " ", x = " ", y = "percentage of no interaction")
ggsave("plot_alpha.jpeg", plot_alpha, width = 14.6, height = 8.8)
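## Worked illustration of the decision rules in show_res() (the numbers below
## are assumed, not simulation output): the constrained additive fit drops one
## parameter, so it is accepted when the deviance increase is smaller than the
## per-parameter penalty of the criterion.
demo_ic <- 1.3    # hypothetical stan_deviance - glm_deviance
demo_n  <- 600
c(AIC = demo_ic < 2,                     # AIC penalty: 2 per parameter
  BIC = demo_ic < log(demo_n),           # BIC penalty: log(n) per parameter
  HQ  = demo_ic < 2*log(log(demo_n)))    # Hannan-Quinn penalty: 2*log(log(n))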
###################################################################
###### regularized logistic regression
mle.logreg <- function(fmla, data, penalty = 0){
  # Define the negative penalised log-likelihood
  logl <- function(theta, x, y, penalty){
    x <- as.matrix(x)
    beta <- theta[1:ncol(x)]
    # Log-likelihood of the Bernoulli distribution, where p is the logistic
    # transformation of a linear combination of predictors: logit(p) = x %*% beta.
    # The penalty pulls the fit towards additivity on the odds-ratio scale.
    pen <- abs(exp(beta[4]) - exp(beta[3]) - exp(beta[2]) + 1)
    pen <- pen*(pen > 0.00001)
    loglik <- sum(-y*log(1 + exp(-(x %*% beta))) - (1-y)*log(1 + exp(x %*% beta))) - penalty*pen
    return(-loglik)
  }
  # Prepare the data
  outcome <- rownames(attr(terms(fmla), "factors"))[1]
  dfrTmp  <- model.frame(data)
  x <- as.matrix(model.matrix(fmla, data = dfrTmp))
  y <- as.matrix(data[, match(outcome, colnames(data))])
  # Initial values for the parameters
  theta.start <- glm(fmla, data, family = binomial())$coef  # rep(0, dim(x)[2])
  # names(theta.start) <- colnames(x)
  # Maximise the penalised likelihood
  mle <- optim(theta.start, logl, x = x, y = y, penalty = penalty, hessian = TRUE)
  out <- list(beta = mle$par, vcov = solve(mle$hessian), ll = 2*mle$value)  # ll: 2 * negative penalised log-likelihood
  return(out)
}
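## Quick self-contained check of mle.logreg() (illustrative; the data below are
## made up for this check and are not part of the simulation study): with
## penalty = 0 no constraint is imposed, so the penalised estimates should be
## close to the ordinary glm() fit.
set.seed(1)
chk_x  <- factor(sample(0:3, 300, replace = TRUE))
chk_y  <- rbinom(300, 1, plogis(-0.5 + 0.8*(chk_x != "0")))
chk_df <- data.frame(y = chk_y, x = chk_x)
chk_fit <- mle.logreg(y ~ x, chk_df, penalty = 0)
round(cbind(penalised = chk_fit$beta,
            glm = coef(glm(y ~ x, data = chk_df, family = binomial()))), 3)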
#################
## without interaction
#################
set.seed(123)
n_rl <- 600
x1 <- factor(sample(0:3, n_rl, rep = TRUE))
# 1.48988 = log(exp(1) + exp(1) - 1), i.e. an exactly additive joint effect
L <- (x1 == 1) + (x1 == 2) + 1.48988*(x1 == 3)
y <- ifelse(runif(n_rl) <= plogis(L), 1, 0)
df_rl <- data.frame(y = y, x = x1)[1:400, ]   # training set
x2 <- model.matrix(y ~ x1)
Xnew <- x2[401:n_rl, ]                        # validation set
Ynew <- y[401:n_rl]

penalty <- seq(0, 0.5, by = .01)
np <- length(penalty)
rbeta <- matrix(, ncol = np, nrow = 4)
colnames(rbeta) <- penalty
gaic <- matrix(, ncol = np, nrow = 3)
for(i in 1:np){
  f <- mle.logreg(y ~ x, df_rl, penalty[i])
  rbeta[, i] <- f$beta
  pred.logit <- Xnew %*% f$beta
  pred <- plogis(pred.logit)
  C.index <- somers2(pred, Ynew)["C"]
  Brier <- mean((pred - Ynew)^2)
  Deviance <- -2*sum(Ynew*log(pred) + (1-Ynew)*log(1-pred))
  gaic[, i] <- c(C.index, Brier, Deviance)
}
# index of the best penalty for each criterion (C-index is maximised, Brier and deviance minimised)
c(C.index = which.max(gaic[1, ]), Brier = which.min(gaic[2, ]), Deviance = which.min(gaic[3, ]))

rbeta_plot <- data.frame(t(rbind(penalty, rbeta)))
colnames(rbeta_plot) <- c("penalty", "theta0", "theta1", "theta2", "theta3")
df_plot <- as.tbl(rbeta_plot) %>%
  mutate(reri = abs(exp(theta3) - exp(theta2) - exp(theta1) + 1)) %>%
  select(-theta0) %>%
  melt(id = c("penalty"))
p_without <- ggplot(df_plot, mapping = aes(x = penalty, y = value, group = variable)) +
  geom_line(aes(linetype = variable), size = 1.2) +
  geom_vline(xintercept = 0.49, linetype = "dashed") +
  xlab(label = expression(lambda)) +
  scale_linetype_manual(values = c("dotted", "dotted", "dotted", "solid")) +
  annotate(geom = "text", x = 0.51, y = rbeta[2, np], label = expression(hat(theta)[1]), size = 8) +
  annotate(geom = "text", x = 0.51, y = rbeta[3, np], label = expression(hat(theta)[2]), size = 8) +
  annotate(geom = "text", x = 0.51, y = rbeta[4, np], label = expression(hat(theta)[3]), size = 8) +
  annotate(geom = "text", x = 0.44, y = 0.12,
           label = expression(abs(e^hat(theta)[3] - e^hat(theta)[1] - e^hat(theta)[2] + 1)), size = 8) +
  scale_x_continuous(limits = c(0, 0.52), breaks = seq(0, 1, 0.1)) +
  theme_classic() +
  theme(legend.position = "none",
        axis.text = element_text(size = 15),
        axis.title = element_text(size = 15),
        plot.margin = unit(rep(1, 4), 'lines'))
ggsave("regularized_without_plot.jpeg", p_without, width = 7.3, height = 8.8)
ggsave("regularized_without_plot.eps",  p_without, width = 7.3, height = 8.8)

#################
## with interaction
#################
set.seed(123)
n_rl <- 600
x1 <- factor(sample(0:3, n_rl, rep = TRUE))
# 1.9 > log(exp(1) + exp(1) - 1), so the joint effect departs from additivity
L <- (x1 == 1) + (x1 == 2) + 1.9*(x1 == 3)
y <- ifelse(runif(n_rl) <= plogis(L), 1, 0)
df_rl <- data.frame(y = y, x = x1)[1:400, ]
x2 <- model.matrix(y ~ x1)
Xnew <- x2[401:n_rl, ]
Ynew <- y[401:n_rl]

penalty <- seq(0, 0.06, by = .005)
np <- length(penalty)
rbeta <- matrix(, ncol = np, nrow = 4)
colnames(rbeta) <- penalty
gaic <- matrix(, ncol = np, nrow = 3)
for(i in 1:np){
  f <- mle.logreg(y ~ x, df_rl, penalty[i])
  rbeta[, i] <- f$beta
  pred.logit <- Xnew %*% f$beta
  pred <- plogis(pred.logit)
  C.index <- somers2(pred, Ynew)["C"]
  Brier <- mean((pred - Ynew)^2)
  Deviance <- -2*sum(Ynew*log(pred) + (1-Ynew)*log(1-pred))
  gaic[, i] <- c(C.index, Brier, Deviance)
}
c(C.index = which.max(gaic[1, ]), Brier = which.min(gaic[2, ]), Deviance = which.min(gaic[3, ]))

rbeta_plot <- data.frame(t(rbind(penalty, rbeta)))
colnames(rbeta_plot) <- c("penalty", "theta0", "theta1", "theta2", "theta3")
df_plot <- as.tbl(rbeta_plot) %>%
  mutate(reri = abs(exp(theta3) - exp(theta2) - exp(theta1) + 1)) %>%
  select(-theta0) %>%
  melt(id = c("penalty"))
p_with <- ggplot(df_plot, mapping = aes(x = penalty, y = value, group = variable)) +
  geom_line(aes(linetype = variable), size = 1.2) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  xlab(label = expression(lambda)) +
  scale_linetype_manual(values = c("dotted", "dotted", "dotted", "solid")) +
  annotate(geom = "text", x = 0.061, y = rbeta[2, np], label = expression(hat(theta)[1]), size = 8) +
  annotate(geom = "text", x = 0.061, y = rbeta[3, np], label = expression(hat(theta)[2]), size = 8) +
  annotate(geom = "text", x = 0.061, y = rbeta[4, np], label = expression(hat(theta)[3]), size = 8) +
  annotate(geom = "text", x = 0.052, y = 0.1,
           label = expression(abs(e^hat(theta)[3] - e^hat(theta)[1] - e^hat(theta)[2] + 1)), size = 8) +
  scale_x_continuous(limits = c(0, 0.062), breaks = seq(0, 1, 0.01)) +
  theme_classic() +
  theme(legend.position = "none",
        axis.text = element_text(size = 15),
        axis.title = element_text(size = 15),
        plot.margin = unit(rep(1, 4), 'lines'))
ggsave("regularized_with_plot.jpeg", p_with, width = 7.3, height = 8.8)  # 14.6
ggsave("regularized_with_plot.eps",  p_with, width = 7.3, height = 8.8)  # 14.6

## two-panel figure combining the without- and with-interaction paths
## (requires p_without and p_with from the sections above)
ggarrange(p_without, p_with, ncol = 2, nrow = 1, labels = c("A", "B"))
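## Reading the with-interaction path (illustrative follow-up, not part of the
## original script): the absolute additivity gap
## |exp(theta3) - exp(theta1) - exp(theta2) + 1| at the smallest and largest
## penalty values, taken from the rbeta matrix filled in the loop above.
reri_gap <- abs(exp(rbeta[4, ]) - exp(rbeta[3, ]) - exp(rbeta[2, ]) + 1)
round(reri_gap[c(1, np)], 3)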