# Fit the tree(s) requested by `fit_methods`.
#
# inp_data    : data set holding predictors and response column(s).
# fit_methods : method code.
#                 7    -> one tree for column `response_var`, via build_tree()
#                 1, 4 -> build_tree()  for every column listed in `imp_var`
#                 2, 5 -> build_tree2() for every column listed in `imp_var`
#                 3, 6 -> build_tree3() for every column listed in `imp_var`
#
# Reads the file-level objects `response_var`, `Num_Var`, `m1` and `imp_var`.
# Returns the build_tree() result for method 7; otherwise a list whose
# element j is the tree fitted with column j as response.
save_tree <- function(inp_data, fit_methods) {
  tree_list <- list()
  if (fit_methods == 7) {
    observed <- !is.na(inp_data[, response_var])
    if (all(observed)) {
      train_data <- inp_data
    } else {
      train_data <- inp_data[observed, , drop = FALSE]
      row.names(train_data) <- seq_len(nrow(train_data))
    }
    X <- train_data[, -response_var, drop = FALSE]
    Y <- train_data[, response_var]
    Num_or_Cat <- if (is.element(response_var, Num_Var)) 1 else 2
    tree_list <- build_tree(X, Y, 1, FALSE, Num_or_Cat)
  } else {
    full_data <- data.frame(inp_data)
    for (j in seq_len(m1)) {
      if (!is.element(j, imp_var)) {
        next
      }
      # Logical indexing on purpose: `x[-which(is.na(col)), ]` selects zero
      # rows when the column happens to contain no missing value.
      train_data <- full_data[!is.na(full_data[, j]), , drop = FALSE]
      row.names(train_data) <- seq_len(nrow(train_data))
      X <- train_data[, -j, drop = FALSE]
      Y <- train_data[, j]
      Num_or_Cat <- if (is.element(j, Num_Var)) 1 else 2
      if (fit_methods == 1 || fit_methods == 4) {
        tree_list[[j]] <- build_tree(X, Y, fit_methods, TRUE, Num_or_Cat)
      } else if (fit_methods == 2 || fit_methods == 5) {
        tree_list[[j]] <- build_tree2(X, Y, fit_methods, Num_or_Cat)
      } else if (fit_methods == 3 || fit_methods == 6) {
        tree_list[[j]] <- build_tree3(X, Y, fit_methods, Num_or_Cat)
      }
    }
  }
  tree_list
}

# Route one observation down a fitted tree and return the prediction stored
# in the node where it ends up.
#
# new.point : the observation, indexed by predictor position.
# treelist  : a build_tree*() result; element 1 is the node table, element 2
#             the per-node split description, element 4 the 0/1 flags that
#             mark categorical predictors.
# Data      : data used to fill in a missing split value (mode of the
#             observed values for a categorical predictor, mean for a
#             numeric one). The fill-in is only used for routing; the entry
#             is set back to NA afterwards.
Predict <- function(new.point, treelist, Data) {
  Tre <- treelist[[1]]
  SPLis <- treelist[[2]]
  CatIn <- treelist[[4]]
  # A tree with a single node comes back from build_tree*() as a plain
  # vector of 10 entries instead of a matrix.
  single_node <- length(Tre) <= 10
  Reg <- 1
  LN <- if (single_node) Tre[9] else Tre[Reg, 9]
  while (LN == 0) {
    SVV <- Tre[Reg, 2]
    was_missing <- is.na(new.point[SVV])
    if (CatIn[SVV] == 1) {
      if (was_missing) {
        new.point[SVV] <- getmode(Data[!is.na(Data[, SVV]), SVV])
      }
      point_level <- levels(Data[, SVV])[as.integer(new.point[SVV])]
      go_left <- is.element(point_level, SPLis[[Reg]])
    } else {
      if (was_missing) {
        new.point[SVV] <- mean(Data[!is.na(Data[, SVV]), SVV])
      }
      go_left <- new.point[SVV] <= Tre[Reg, 3]
    }
    if (was_missing) {
      new.point[SVV] <- NA
    }
    Reg <- if (go_left) Tre[Reg, 4] else Tre[Reg, 5]
    LN <- Tre[Reg, 9]
  }
  if (single_node) Tre[6] else Tre[Reg, 6]
}

# Gini impurity of a count table whose second column holds the class counts.
Imp <- function(C) {
  shares <- C[, 2] / sum(C[, 2])
  1 - sum(shares^2)
}

# Most frequent value of `xx`; the first one encountered wins a tie.
getmode <- function(xx) {
  ux <- unique(xx)
  ux[which.max(tabulate(match(xx, ux)))]
}

# All values of `x` that reach the highest frequency.
Modes <- function(x) {
  ux <- unique(x)
  tab <- tabulate(match(x, ux))
  ux[tab == max(tab)]
}

# Row numbers per level of every categorical column.
#
# CatV   : data frame of factor columns.
# CatVNo : for each column of CatV, the position under which its result is
#          stored in the returned list.
#
# Returns a list where element CatVNo[j] is itself a list with one integer
# vector per level of column j: the rows (NA excluded) that carry that level.
CatIndex <- function(CatV, CatVNo) {
  Index <- list()
  for (j in seq_len(ncol(CatV))) {
    column <- CatV[, j]
    Index[[CatVNo[j]]] <- lapply(
      levels(column),
      function(lev) which(column == lev)
    )
  }
  Index
}

# TRUE for odd integers, FALSE for even ones.
odd <- function(inte) {
  inte %% 2 != 0
}

# Best split over the numeric predictors.
#
# NumVA      : data frame; numeric predictors first, response in the last
#              column.
# DATA1      : for meThod 2 or 3, the predictor values used to check that
#              both sides of a candidate split keep more than the file-level
#              `re_sam_min_obs` observations. Ignored otherwise.
# meThod     : 1 accepts any split that lowers the impurity; 2 and 3 add the
#              `re_sam_min_obs` check.
# NUM_or_CAT : 1 = numeric response (sum of squares), 2 = categorical
#              response (Gini via Imp()).
#
# Per column, rows with a missing value in that column are dropped and the
# column is skipped unless more than 12 rows remain. Cut points are midpoints
# between distinct neighbouring values, with at least 6 rows on each side.
#
# Returns c(column position within NumVA, cut point, impurity reduction), or
# c(0, 0, 0) when nothing qualifies.
BestNum <- function(NumVA, DATA1, meThod, NUM_or_CAT) {
  m2 <- ncol(NumVA) - 1
  if (m2 < 1) {
    # No numeric predictor at all: nothing to split on.
    return(c(0, 0, 0))
  }
  resample <- meThod == 2 || meThod == 3
  if (resample && is.null(dim(DATA1))) {
    # A single predictor arrives as a bare vector; restore the column shape.
    DATA1 <- data.frame(DATA1)
  }

  tmp_num_list <- vector("list", m2)
  for (i in seq_len(m2)) {
    tmp_num_list[[i]] <- c(0, 0, 0)
    if (resample) {
      reference <- DATA1[, i]
      reference <- reference[!is.na(reference)]
    }
    rows <- NumVA[!is.na(NumVA[, i]), , drop = FALSE]
    ni <- nrow(rows)
    if (ni <= 12) {
      next
    }
    rows <- rows[order(rows[, i]), , drop = FALSE]
    x_sorted <- rows[, i]

    if (NUM_or_CAT == 1) {
      Yi <- rows[, m2 + 1] # response Y
      Impurity1 <- sum((Yi - mean(Yi))^2)
    } else if (NUM_or_CAT == 2) {
      Yi <- as.integer(factor(rows[, m2 + 1]))
      CM <- plyr::count(Yi)
      Impurity1 <- Imp(CM)
      # CM1 / CM2 hold the class counts left / right of the scan position.
      CM1 <- matrix(0, nrow = nrow(CM), ncol = 2)
      CM1[Yi[1], 2] <- 1
      CM2 <- CM
      CM2[Yi[1], 2] <- CM2[Yi[1], 2] - 1
    }

    SV1 <- 0
    SP1 <- 0
    ImpRed <- 0
    Index <- 1
    while (Index < (ni - 6)) {
      x_here <- x_sorted[Index]
      x_next <- x_sorted[Index + 1]
      if (round(x_next, 10) != round(x_here, 10) && Index > 5) {
        if (NUM_or_CAT == 1) {
          left <- Yi[1:Index]
          right <- Yi[(Index + 1):ni]
          NImpurity <- sum((left - mean(left))^2) +
            sum((right - mean(right))^2)
        } else if (NUM_or_CAT == 2) {
          NImpurity <- (Index / ni) * Imp(CM1) + (1 - Index / ni) * Imp(CM2)
        }
        cut_point <- (x_here + x_next) / 2
        accept <- NImpurity < Impurity1
        if (accept && resample) {
          accept <- sum(reference <= cut_point) > re_sam_min_obs &&
            sum(reference > cut_point) > re_sam_min_obs
        }
        if (accept) {
          SV1 <- i
          SP1 <- cut_point
          ImpRed <- ImpRed + (Impurity1 - NImpurity)
          Impurity1 <- NImpurity
        }
      }
      Index <- Index + 1
      if (NUM_or_CAT == 2) {
        CM1[Yi[Index], 2] <- CM1[Yi[Index], 2] + 1
        CM2[Yi[Index], 2] <- CM2[Yi[Index], 2] - 1
      }
    }
    tmp_num_list[[i]] <- c(SV1, SP1, ImpRed)
  }

  # Keep the column with the largest reduction; the earliest wins a tie.
  # seq_len(m2)[-1] is empty for a single column, where `2:m2` would count
  # down to 1 and index past the end of the list.
  BNum <- tmp_num_list[[1]]
  for (i in seq_len(m2)[-1]) {
    if (tmp_num_list[[i]][3] > BNum[3]) {
      BNum <- tmp_num_list[[i]]
    }
  }
  BNum
}

# NOTE(review): everything below is the original text of this physical line,
# kept verbatim. BestNum2 continues on the following line, and part of its
# source appears to have been lost before it reached this file.
BestNum2<-function(NumVA,DATA1,meThod,NUM_or_CAT){ NumVA1=NumVA DATA11=DATA1 n2=nrow(NumVA) m2=ncol(NumVA)-1 X1=data.frame(NumVA[,(1:m2)]) SV1=0 SP1=0 ImpRed=0 tmp_num_list=list() PV=c() for (i in 1:m2) { XI=c() NumVA=NumVA1 if(sum(which(is.na(NumVA[,i])))>0){ NumVA=NumVA[-which(is.na(NumVA[,i])),] } ni=nrow(NumVA) if (ni>=chi_min){ quan=quantile(NumVA[,i]) if (length(unique(quan))==1){XI=rep(1,ni)} else{ for (j in 1:ni){ if (NumVA[j,i]<=quan[2]){XI[j]=1} else if (quan[2]0 && sum(zero2)>0){table_i=table_i[-which(zero2==1),-which(zero==1)]} else if (sum(zero)>0 && sum(zero2)==0){table_i=table_i[,-which(zero==1)]} else if (sum(zero)==0 && sum(zero2)>0){table_i=table_i[-which(zero2==1),]} if ((sum(zero==1)<(nc-1)) && (sum(zero2==1)<(nr-1))){ chisq_i=chisq.test(table_i,simulate.p=T) PV[i]=chisq_i$p.value } else{PV[i]=10} } else {PV[i]=10} } if (sum(PV==10)!=length(PV)){ Bestcol=which(PV==min(PV)) for (k in (1:m2)[-Bestcol]){tmp_num_list[[k]]=c(0,0,0,PV[k])} for (k in Bestcol){ if (meThod==5 || meThod==6){ DATA1=DATA11 if(sum(which(is.na(DATA1[,k])))>0){ DATA1=DATA1[-which(is.na(DATA1[,k])),] } } NumVA=NumVA1 if(sum(which(is.na(NumVA[,k])))>0){ NumVA=NumVA[-which(is.na(NumVA[,k])),] } nk=nrow(NumVA) if(nk<=12){tmp_num_list[[k]]=c(0,0,0,PV[k])} if(nk>12){ NImpurity=0 SV1=0 SP1=0 ImpRed=0 NumVA=NumVA[order(NumVA[,k]),] Xi=data.frame(NumVA[,(1:m2)]) if (NUM_or_CAT==1){ Yi=NumVA[,(m2+1)] 
Impurity1=sum((Yi-mean(Yi))^2) } else if (NUM_or_CAT==2){ Yi=as.integer(factor(NumVA[,(m2+1)])) CM=plyr::count(Yi) Impurity1=Imp(CM) } XV=NumVA[1,k] Index=1 if (NUM_or_CAT==2){ CM1=matrix(data=rep(0,nrow(CM)*2),ncol=2) CM1[(Yi[1]),2]=1 CM2=CM CM2[(Yi[1]),2]=CM2[(Yi[1]),2]-1 } while ( Index < nk-6) { if ( round(NumVA[(Index+1),k],10) != round(XV,10) && Index > 5 ){ if (NUM_or_CAT==1){ NImpurity=sum((Yi[1:Index]-mean(Yi[1:Index]))^2)+sum((Yi[(Index+1):nk]-mean(Yi[(Index+1):nk]))^2) } else if (NUM_or_CAT==2){ NImpurity=((Index/nk)*Imp(CM1)+(1-Index/nk)*Imp(CM2)) } if (meThod==4){TF=(NImpurity < Impurity1)} else if (meThod==5 || meThod==6){ sp=(XV+NumVA[(Index+1),k])/2 cond1=(NImpurity < Impurity1) cond2=((sum(DATA1[,k]<=sp)>re_sam_min_obs)&&(sum(DATA1[,k]>sp)>re_sam_min_obs)) TF=(cond1 && cond2) } if ( TF ){ SV1=k SP1=(XV+NumVA[(Index+1),k])/2 ImpRed=ImpRed+(Impurity1-NImpurity) Impurity1=NImpurity } XV=NumVA[(Index+1),k] } XV=NumVA[(Index+1),k] Index=Index+1 if (NUM_or_CAT==2){ CM1[(Yi[Index]),2]=CM1[(Yi[Index]),2]+1 CM2[(Yi[Index]),2]=CM2[(Yi[Index]),2]-1 } } tmp_num_list[[k]]=c(SV1,SP1,ImpRed,PV[k]) } } BNum=tmp_num_list[[1]] for (k in 2:m2){ if (tmp_num_list[[k]][3]>BNum[3]){BNum=tmp_num_list[[k]]} } } else {BNum=c(0,0,0,10)} return(BNum) }

# Best split over the categorical predictors.
#
# CatVAR      : data frame; categorical predictors first, response in the
#               last column (all rows of the training data).
# RegionIndex : row numbers of CatVAR that belong to the node being split.
# CatrList    : per-level row numbers; CatrList[[CaVANo[j]]][[code]] holds
#               the rows whose predictor j has that level code.
# CaVANo      : position of each predictor of CatVAR inside CatrList.
# DATA1       : for meThod 2 or 3, the predictor values used to check that
#               both sides of a candidate split keep more than the file-level
#               `re_sam_min_obs` observations. Ignored otherwise.
# meThod      : 1 accepts any split that lowers the impurity; 2 and 3 add
#               the `re_sam_min_obs` check.
# NUM_or_CAT  : 1 = numeric response (sum of squares), 2 = categorical
#               response (Gini via Imp()).
#
# For each predictor the left side always contains the last level present in
# the node; every subset of the remaining levels (except all of them) is
# tried as an addition, and each improvement over the best so far is kept.
#
# Returns list(predictor position within CatVAR, levels sent left, impurity
# reduction), or list(0, 0, 0) when nothing qualifies.
BestCat <- function(CatVAR, RegionIndex, CatrList, CaVANo, DATA1, meThod,
                    NUM_or_CAT) {
  m2 <- ncol(CatVAR) - 1
  if (m2 < 1) {
    # No categorical predictor at all: nothing to split on.
    return(list(0, 0, 0))
  }
  resample <- meThod == 2 || meThod == 3
  if (resample && is.null(dim(DATA1))) {
    # A single predictor arrives as a bare vector; restore the column shape.
    DATA1 <- data.frame(DATA1)
  }
  X1 <- data.frame(CatVAR[, seq_len(m2), drop = FALSE])
  Y1 <- CatVAR[, m2 + 1]

  # Impurity of a two-way partition of the node.
  split_impurity <- function(left, right) {
    if (NUM_or_CAT == 1) {
      sum((Y1[left] - mean(Y1[left]))^2) +
        sum((Y1[right] - mean(Y1[right]))^2)
    } else if (NUM_or_CAT == 2) {
      total <- length(left) + length(right)
      (length(left) / total) * Imp(plyr::count(Y1[left])) +
        (length(right) / total) * Imp(plyr::count(Y1[right]))
    }
  }

  tmp_cat_list <- vector("list", m2)
  for (j in seq_len(m2)) {
    tmp_cat_list[[j]] <- list(0, 0, 0)
    region <- RegionIndex[!is.na(X1[RegionIndex, j])]
    if (length(region) == 0) {
      next
    }

    if (NUM_or_CAT == 1) {
      Impurity1 <- sum((Y1[region] - mean(Y1[region]))^2)
    } else if (NUM_or_CAT == 2) {
      Impurity1 <- Imp(plyr::count(CatVAR[region, m2 + 1]))
    }

    present_levels <- levels(factor(X1[region, j]))
    NoLvl <- length(present_levels)
    if (NoLvl < 2) {
      # A single level in the node leaves nothing to separate.
      next
    }
    # Level codes (relative to the full factor) that occur in the node, in
    # the same order as `present_levels`.
    present_codes <- sort(unique(as.integer(X1[region, j])))
    rows_with <- function(code) {
      intersect(region, CatrList[[CaVANo[j]]][[code]])
    }
    base_left <- rows_with(max(as.integer(X1[region, j])))

    # Candidate additions to the left side: none first, then every subset of
    # the other levels of size 1 .. NoLvl - 2, in combn() order.
    extras <- list(integer(0))
    if (NoLvl > 2) {
      for (s in seq_len(NoLvl - 2)) {
        extras <- c(extras, utils::combn(NoLvl - 1, s, simplify = FALSE))
      }
    }

    SV1 <- 0
    LCat <- 0
    ImpRed <- 0
    for (extra in extras) {
      left <- base_left
      for (pos in extra) {
        left <- c(left, rows_with(present_codes[pos]))
      }
      right <- setdiff(region, left)
      NImpurity <- split_impurity(left, right)

      left_levels <- c(present_levels[NoLvl], present_levels[extra])
      accept <- NImpurity < Impurity1
      if (accept && resample) {
        right_levels <-
          present_levels[setdiff(seq_len(NoLvl), c(NoLvl, extra))]
        accept <-
          sum(is.element(DATA1[, j], left_levels)) > re_sam_min_obs &&
          sum(is.element(DATA1[, j], right_levels)) > re_sam_min_obs
      }
      if (accept) {
        SV1 <- j
        LCat <- left_levels
        ImpRed <- ImpRed + (Impurity1 - NImpurity)
        Impurity1 <- NImpurity
      }
    }
    tmp_cat_list[[j]] <- list(SV1, LCat, ImpRed)
  }

  # Keep the predictor with the largest reduction; the earliest wins a tie.
  # seq_len(m2)[-1] is empty for a single predictor, where `2:m2` would
  # count down to 1 and index past the end of the list.
  ToReturn <- tmp_cat_list[[1]]
  for (j in seq_len(m2)[-1]) {
    if (tmp_cat_list[[j]][[3]] > ToReturn[[3]]) {
      ToReturn <- tmp_cat_list[[j]]
    }
  }
  ToReturn
}

# NOTE(review): everything below is the original text of this physical line,
# kept verbatim. BestCat2 continues on the following line, and part of its
# source appears to have been lost before it reached this file.
BestCat2<-function(CatVAR,RegionIndex,CatrList,CaVANo,DATA1,meThod,NUM_or_CAT) { RegionIndex1=RegionIndex n2=nrow(CatVAR[RegionIndex,]) m2=ncol(CatVAR)-1 X1=data.frame(CatVAR[,(1:m2)]) Y1=CatVAR[,(m2+1)] ImpRed=0 SV1=0 LCat=0 PV=c() tmp_cat_list=list() for ( i in 1:m2) { RegionIndex=RegionIndex1 if (length(which(is.na(X1[RegionIndex,i])))>0){RegionIndex=RegionIndex[-(which(is.na(X1[RegionIndex,i])))]} if (length(RegionIndex)>=chi_min) { if (NUM_or_CAT==1){ tmp_y=CatVAR[RegionIndex,(m2+1)] YI=c() quany=quantile(tmp_y) if (length(unique(quany))==1){YI=rep(1,length(RegionIndex))} else{ for (j in 1:length(RegionIndex)){ if (tmp_y[j]<=quany[2]){YI[j]=1} else if (quany[2]0 && sum(zero2)>0){table_i=table_i[-which(zero2==1),-which(zero==1)]} else if (sum(zero)>0 && sum(zero2)==0){table_i=table_i[,-which(zero==1)]} else if (sum(zero)==0 && sum(zero2)>0){table_i=table_i[-which(zero2==1),]} if ((sum(zero==1)0){RegionIndex=RegionIndex[-(which(is.na(X1[RegionIndex,j])))]} if (length(RegionIndex)==0){ tmp_cat_list[[j]][[1]]=tmp_cat_list[[j]][[2]]=tmp_cat_list[[j]][[3]]=0 tmp_cat_list[[j]][[4]]=PV[j] } if (length(RegionIndex)>0){ ImpRed=0 SV1=0 LCat=0 if (NUM_or_CAT==1){ Impurity1=sum((Y1[RegionIndex]-mean(Y1[RegionIndex]))^2) } else if (NUM_or_CAT==2){ CM=plyr::count(CatVAR[RegionIndex,(m2+1)]) Impurity1=Imp(CM) } NoLvl=length(levels(factor(X1[RegionIndex,j]))) NoSplit=2^(NoLvl-1)-1 if ( NoSplit > 0 ) { IndexL=intersect(RegionIndex,CatrList[[CaVANo[j]]][[max(as.integer(X1[RegionIndex,j]))]]) IndexR=setdiff(RegionIndex,IndexL) if 
(NUM_or_CAT==1){NImpurity=sum((Y1[IndexL]-mean(Y1[IndexL]))^2)+sum((Y1[IndexR]-mean(Y1[IndexR]))^2)} else if (NUM_or_CAT==2){NImpurity=(length(IndexL)/(length(IndexL)+length(IndexR)))*Imp(plyr::count(Y1[IndexL]))+(length(IndexR)/(length(IndexL)+length(IndexR)))*Imp(plyr::count(Y1[IndexR]))} if (meThod==4){TF=(NImpurity < Impurity1)} else if (meThod==5 || meThod==6){ LCAT=levels(factor(X1[RegionIndex,j]))[NoLvl] RCAT=levels(factor(X1[RegionIndex,j]))[1:(NoLvl-1)] cond1=(NImpurity < Impurity1) cond2=((sum(is.element(DATA1[,j],LCAT))>re_sam_min_obs)&&(sum(is.element(DATA1[,j],RCAT))>re_sam_min_obs)) TF=(cond1 && cond2) } if ( TF ) { SV1=j LCat=levels(factor(X1[RegionIndex,j]))[NoLvl] ImpRed=ImpRed+(Impurity1-NImpurity) Impurity1=NImpurity } if ( NoLvl > 2 ) { for ( s in 1:(NoLvl-2)) { ToAdd=utils::combn((NoLvl-1),s) for ( i in 1:ncol(ToAdd)) { NIndexL=IndexL for ( k in 1:s) { NIndexL=c(NIndexL,intersect(RegionIndex,CatrList[[CaVANo[j]]][[(sort(unique(as.integer(X1[RegionIndex,j]))))[ToAdd[k,i]]]])) } NIndexR=setdiff(RegionIndex,NIndexL) if (NUM_or_CAT==1){NImpurity=sum((Y1[NIndexL]-mean(Y1[NIndexL]))^2)+sum((Y1[NIndexR]-mean(Y1[NIndexR]))^2)} else if (NUM_or_CAT==2){NImpurity=(length(NIndexL)/(length(NIndexL)+length(NIndexR)))*Imp(plyr::count(Y1[NIndexL]))+(length(NIndexR)/(length(NIndexL)+length(NIndexR)))*Imp(plyr::count(Y1[NIndexR]))} if (meThod==4){TF1=(NImpurity < Impurity1)} else if (meThod==5 || meThod==6){ LCAT1=c(levels(factor(X1[RegionIndex,j]))[NoLvl],levels(factor(X1[RegionIndex,j]))[ToAdd[,i]]) RCAT1=levels(factor(X1[RegionIndex,j]))[setdiff((1:NoLvl),c(NoLvl,ToAdd[,i]))] cond11=(NImpurity < Impurity1) cond21=((sum(is.element(DATA1[,j],LCAT1))>re_sam_min_obs)&&(sum(is.element(DATA1[,j],RCAT1))>re_sam_min_obs)) TF1=(cond11 && cond21) } if ( TF1 ) { SV1=j LCat=c(levels(factor(X1[RegionIndex,j]))[NoLvl],levels(factor(X1[RegionIndex,j]))[ToAdd[,i]]) ImpRed=ImpRed+(Impurity1-NImpurity) Impurity1=NImpurity } } } } } tmp_cat_list[[j]][[1]]=SV1 
tmp_cat_list[[j]][[2]]=LCat tmp_cat_list[[j]][[3]]=ImpRed tmp_cat_list[[j]][[4]]=PV[j] } } ToReturn=tmp_cat_list[[1]] for (j in 2:m2){ if (tmp_cat_list[[j]][[3]]>ToReturn[[3]]){ToReturn=tmp_cat_list[[j]]} } } else { ToReturn=list() ToReturn[[1]]=0 ToReturn[[2]]=0 ToReturn[[3]]=0 ToReturn[[4]]=10 } return(ToReturn) }

# Grow a binary tree by repeatedly splitting regions (sets of row numbers).
#
# data.x      : predictors; factor columns are treated as categorical, all
#               other columns as numeric.
# data.y      : response.
# fit_method1 : 1 -> candidate splits from BestNum() / BestCat(), the larger
#                    impurity reduction wins (numeric wins a tie);
#               4 -> candidate splits from BestNum2() / BestCat2(), chosen by
#                    their 4th return entry first; set.seed(seed) is called
#                    before every search.
# pass_or_not : TRUE  -> rows with a missing split value get a fill-in value
#                        (mean or mode of the observed rows of the same
#                        response class inside the region, falling back to
#                        all observed rows of the region) and are passed on
#                        to a child;
#               FALSE -> such rows are left out of both children.
# Num_OR_Cat  : 1 = numeric response, 2 = categorical response.
#
# Reads the file-level objects `node_default` (a region is split only when it
# holds more rows than this) and, for method 4, `seed`.
#
# Returns list(Tree, SPList, Regions, CatInd):
#   Tree    : one row per region with the columns RegNo, SV (split column),
#             SP (cut point; -1 for a categorical split), LLeaf / RLeaf
#             (child regions), Pred, NoObs, ImpRed, Leaf/Node (1 = leaf),
#             Imp. With a single region the row comes back as a plain vector.
#   SPList  : per region, the cut point or the levels sent left (0 for a
#             leaf).
#   Regions : per region, its row numbers.
#   CatInd  : 0/1 per predictor, 1 marking a factor column.
build_tree <- function(data.x, data.y, fit_method1, pass_or_not, Num_OR_Cat) {
  data.xy <- cbind(data.x, data.y)
  N <- nrow(data.x)
  M <- ncol(data.x)

  # is.factor() rather than class(x) == 'factor': an ordered factor has a
  # two-element class vector, which is not a valid `if` condition.
  CatInd <- vapply(
    seq_len(M),
    function(i) if (is.factor(data.x[, i])) 1 else 0,
    numeric(1)
  )
  CatNo <- which(CatInd == 1)
  NumNo <- which(CatInd == 0)
  CatList <- list()
  if (length(CatNo) > 0) {
    CatList <- CatIndex(data.frame(data.x[, CatNo]), CatNo)
  }

  if (Num_OR_Cat == 1) {
    # Decile class of every response value: 1 + number of deciles strictly
    # below it. Used as the "response class" when filling in missing values.
    tmp_catgory <- quantile(data.y, seq(0.1, 1, 0.1))
    tmp_Y <- findInterval(data.y, tmp_catgory, left.open = TRUE) + 1
  }

  # Copy of data.xy in which the missing entries of column SV inside `region`
  # are replaced by centre() of the observed values among rows of the same
  # response class (or of the whole region when that class has none).
  fill_missing <- function(region, SV, centre) {
    filled <- data.xy
    if (Num_OR_Cat == 1) {
      class_of <- tmp_Y
    } else if (Num_OR_Cat == 2) {
      class_of <- data.xy[, M + 1]
    } else {
      return(filled)
    }
    observed <- region[!is.na(data.xy[region, SV])]
    for (k in region) {
      if (is.na(data.xy[k, SV])) {
        # Row numbers, not class labels: the donors are rows of the region.
        peers <- region[which(class_of[region] == class_of[k])]
        peers <- peers[!is.na(data.xy[peers, SV])]
        donors <- if (length(peers) > 0) peers else observed
        filled[k, SV] <- centre(data.xy[donors, SV])
      }
    }
    filled
  }

  # Rows of `index` (a CatIndex() entry for one column) whose level is in SP.
  rows_in_levels <- function(index, column_levels, SP) {
    unlist(index[which(match(column_levels, SP) > 0)])
  }

  Regions <- list(seq_len(N))
  SPList <- list()
  Tree <- matrix(data = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 1), nrow = 1)
  Iter <- 1
  while (Iter <= length(Regions)) {
    region <- Regions[[Iter]]
    n <- length(region)
    X <- data.x[region, , drop = FALSE]
    Y <- data.y[region]
    XY <- cbind(X, Y)

    if (Num_OR_Cat == 1) {
      Prediction <- mean(Y)
      Impurity <- sum((Y - Prediction)^2)
    } else if (Num_OR_Cat == 2) {
      Response <- plyr::count(Y)
      Prediction <- Response$x[which.max(Response$freq)]
      Prediction <- which(levels(data.y) == Prediction)
      Impurity <- Imp(Response)
    }
    leaf_row <- c(Iter, 0, 0, 0, 0, Prediction, n, 0, 1, Impurity)

    # Some predictor must still vary inside the region.
    Cont <- any(vapply(
      seq_len(ncol(X)),
      function(t) length(unique(X[, t])) > 1,
      logical(1)
    ))
    Cond <- (n > node_default) && (length(unique(Y)) > 1) && Cont

    if (!Cond) {
      Tree <- rbind(Tree, leaf_row)
      SPList[[Iter]] <- 0
    } else {
      determ_N_C <- FALSE
      determ_N_C2 <- FALSE
      # A predictor type that is absent contributes the "no split" result of
      # the corresponding search instead of calling it on an empty table.
      if (fit_method1 == 1) {
        BN <- c(0, 0, 0)
        if (length(NumNo) > 0) {
          BN <- BestNum(data.frame(XY[, c(NumNo, (M + 1))]), 0, fit_method1,
                        Num_OR_Cat)
        }
        BC <- list(0, 0, 0)
        if (length(CatNo) > 0) {
          BC <- BestCat(data.frame(data.xy[, c(CatNo, (M + 1))]), region,
                        CatList, CatNo, 0, fit_method1, Num_OR_Cat)
        }
        determ_N_C <- (BN[3] >= BC[[3]])
        determ_N_C2 <- (BC[[3]] > BN[3])
      } else if (fit_method1 == 4) {
        set.seed(seed)
        BN <- c(0, 0, 0, 10)
        if (length(NumNo) > 0) {
          BN <- BestNum2(data.frame(XY[, c(NumNo, (M + 1))]), 0, fit_method1,
                         Num_OR_Cat)
        }
        BC <- list(0, 0, 0, 10)
        if (length(CatNo) > 0) {
          BC <- BestCat2(data.frame(data.xy[, c(CatNo, (M + 1))]), region,
                         CatList, CatNo, 0, fit_method1, Num_OR_Cat)
        }
        determ_N_C <- (((BN[4] <= BC[[4]]) && BN[3] > 0) ||
          (BC[[3]] == 0 && BN[3] > 0))
        determ_N_C2 <- (((BN[4] > BC[[4]]) && BC[[3]] > 0) ||
          (BC[[3]] > 0 && BN[3] == 0))
      }

      if ((BN[3] == 0) && (BC[[3]] == 0)) {
        Tree <- rbind(Tree, leaf_row)
        SPList[[Iter]] <- 0
      } else if (determ_N_C) {
        # Numeric split: rows with value <= SP go left.
        SV <- NumNo[BN[1]]
        SP <- BN[2]
        LB <- length(Regions) + 1
        BB <- length(Regions) + 2
        if (pass_or_not == TRUE) {
          data.xyi <- fill_missing(region, SV, mean)
          Regions[[LB]] <- intersect(which(data.xyi[, SV] <= SP), region)
          Regions[[BB]] <- setdiff(region, Regions[[LB]])
        } else if (pass_or_not == FALSE) {
          Reg_new <- region[!is.na(data.xy[region, SV])]
          Regions[[LB]] <- intersect(
            Reg_new[which(data.xy[Reg_new, SV] <= SP)],
            Reg_new
          )
          Regions[[BB]] <- setdiff(Reg_new, Regions[[LB]])
        }
        Tree <- rbind(
          Tree,
          c(Iter, SV, SP, LB, BB, Prediction, n, BN[3], 0, Impurity)
        )
        SPList[[Iter]] <- SP
      } else if (determ_N_C2) {
        # Categorical split: rows whose level is listed in SP go left.
        SV <- CatNo[BC[[1]]]
        SP <- BC[[2]]
        LB <- length(Regions) + 1
        BB <- length(Regions) + 2
        if (pass_or_not == TRUE) {
          data.xyi1 <- fill_missing(region, SV, getmode)
          data.xyi1 <- data.xyi1[, -(M + 1), drop = FALSE]
          CatList1 <- CatIndex(data.frame(data.xyi1[, CatNo]), CatNo)
          Regions[[LB]] <- intersect(
            region,
            rows_in_levels(CatList1[[SV]], levels(data.xyi1[, SV]), SP)
          )
          Regions[[BB]] <- setdiff(region, Regions[[LB]])
        } else if (pass_or_not == FALSE) {
          Reg_new <- region[!is.na(data.xy[region, SV])]
          Regions[[LB]] <- intersect(
            Reg_new,
            rows_in_levels(CatList[[SV]], levels(data.xy[, SV]), SP)
          )
          Regions[[BB]] <- setdiff(Reg_new, Regions[[LB]])
        }
        Tree <- rbind(
          Tree,
          c(Iter, SV, -1, LB, BB, Prediction, n, BC[[3]], 0, Impurity)
        )
        SPList[[Iter]] <- SP
      }
    }
    Iter <- Iter + 1
  }

  colnames(Tree) <- c('RegNo', 'SV', 'SP', 'LLeaf', 'RLeaf', 'Pred', 'NoObs',
                      'ImpRed', 'Leaf/Node', 'Imp')
  # The first row of Tree is only a placeholder used to start the matrix.
  ToReturn <- list()
  ToReturn[[1]] <- Tree[2:nrow(Tree), ]
  ToReturn[[2]] <- SPList
  ToReturn[[3]] <- Regions
  ToReturn[[4]] <- CatInd
  return(ToReturn)
}

# NOTE(review): everything below is the original text of this physical line,
# kept verbatim. build_tree2 continues on the following lines, and part of
# its source appears to have been lost before it reached this file.
build_tree2<-function(data.x,data.y,MEthod,Num_OR_Cat){ data.xy=cbind(data.x,data.y) Iter=1 CatInd=c() N=nrow(data.x) M=ncol(data.x) for (i in 1:M){ if(class(data.x[,i])=='factor'){ CatInd[i]=1 } else {CatInd[i]=0} } CatNo=which(CatInd==1) if(sum(CatNo)>0){ CatList=CatIndex(data.frame(data.x[,CatNo]),CatNo) } NumNo=which(CatInd==0) Regions=list() Regions[[Iter]]=seq(1:N) SPList=list() Tree=matrix(data=c(0,0,0,0,0,0,0,0,1,1),nrow=1) miss_set=list() right_set=list() while(Iter<=length(Regions)){ IterLB=odd(Iter) n=length(Regions[[Iter]]) X=data.x[Regions[[Iter]],] Y=data.y[Regions[[Iter]]] XY=cbind(X,Y) if (Num_OR_Cat==1){ Prediction=mean(Y) Impurity=sum((Y-Prediction)^2) } else if (Num_OR_Cat==2){ Response=plyr::count(Y) Prediction=Response$x[which.max(Response$freq)] Prediction=which(levels(data.y)==Prediction) Impurity=Imp(Response) } Cont=FALSE for (t in 1:ncol(X)){ Cont = Cont || (length(unique(X[,t])) > 1) } Cond=((n>node_default) && (length(unique(Y))>1) && Cont) if(!Cond){ Tree=rbind(Tree,c(Iter,0,0,0,0,Prediction,n,0,1,Impurity)) SPList[[Iter]]=0 deterNC=deterNC2=0 if (IterLB==F){ prob_set=c() Tree1=Tree[,4:5] u1=Iter while(u1>1){ if (ceiling((which(Tree1==u1))/nrow(Tree1))==1){ v1=which(Tree[,4]==u1)-1 } else {v1=which(Tree[,5]==u1)-1} if (length(miss_set[[v1]]>0)){prob_set=c(prob_set,miss_set[[v1]])} u1=v1 } if (length(prob_set)>0){ right_set[[Iter]]=list() for (i1 in 1:times_default){ iset=c() for (i2 in 1:length(prob_set)){ uu=runif(1) if(uu>0.5){iset=c(iset,prob_set[i2])} } right_set[[Iter]][[i1]]=sort(setdiff(prob_set,iset)) } } else{right_set[[Iter]]=list()} } } else{ if (Iter==1){ if(MEthod==2){ BN=BestNum(data.frame(XY[,c(NumNo,(M+1))]),0,1,Num_OR_Cat) BC=BestCat(data.frame(data.xy[,c(CatNo,(M+1))]),Regions[[Iter]],CatList,CatNo,0,1,Num_OR_Cat) deterNC=(BN[3]>=BC[[3]]) deterNC2=(BC[[3]]>BN[3]) } else if (MEthod==5){ set.seed(seed) BN=BestNum2(data.frame(XY[,c(NumNo,(M+1))]),0,4,Num_OR_Cat) 
BC=BestCat2(data.frame(data.xy[,c(CatNo,(M+1))]),Regions[[Iter]],CatList,CatNo,0,4,Num_OR_Cat) deterNC=(((BN[4]<=BC[[4]])&&BN[3]>0) || (BC[[3]]==0 && BN[3]>0)) deterNC2=(((BN[4]>BC[[4]])&&BC[[3]]>0) || (BC[[3]]>0 && BN[3]==0)) } } else if (IterLB==F && Iter>1){ prob_set=c() Tree1=Tree[,4:5] u1=Iter while(u1>1){ if (ceiling((which(Tree1==u1))/nrow(Tree1))==1){ v1=which(Tree[,4]==u1)-1 } else {v1=which(Tree[,5]==u1)-1} if (length(miss_set[[v1]]>0)){prob_set=c(prob_set,miss_set[[v1]])} u1=v1 } if (length(prob_set)>0){ BNN=list() BCC=list() right_set[[Iter]]=list() for (i1 in 1:times_default){ iset=c() for (i2 in 1:length(prob_set)){ uu=runif(1) if(uu>0.5){iset=c(iset,prob_set[i2])} } Re1=sort(c(iset,Regions[[Iter]])) X11=data.x[Re1,] Y11=data.y[Re1] XY11=cbind(X11,Y11) if (MEthod==2){ BNN[[i1]]=BestNum(data.frame(XY11[,c(NumNo,(M+1))]),data.x[Regions[[Iter]],NumNo],MEthod,Num_OR_Cat) BCC[[i1]]=BestCat(data.frame(data.xy[,c(CatNo,(M+1))]),Re1,CatList,CatNo,data.x[Regions[[Iter]],CatNo],MEthod,Num_OR_Cat) } else if (MEthod==5){ set.seed(seed) BNN[[i1]]=BestNum2(data.frame(XY11[,c(NumNo,(M+1))]),data.x[Regions[[Iter]],NumNo],MEthod,Num_OR_Cat) BCC[[i1]]=BestCat2(data.frame(data.xy[,c(CatNo,(M+1))]),Re1,CatList,CatNo,data.x[Regions[[Iter]],CatNo],MEthod,Num_OR_Cat) } right_set[[Iter]][[i1]]=sort(setdiff(prob_set,iset)) } BN=BNN[[1]] for (i3 in 2:times_default){ if (MEthod==2){ if (BNN[[i3]][3]>BN[3]){BN=BNN[[i3]]} } else if (MEthod==5){ if (BNN[[i3]][4]BN[3]){BN=BNN[[i3]]} } } } BC=BCC[[1]] for (i4 in 2:times_default){ if (MEthod==2){ if (BCC[[i4]][[3]]>BC[[3]]){BC=BCC[[i4]]} } else if (MEthod==5){ if (BCC[[i4]][[4]]BC[[3]]){BC=BCC[[i4]]} } } } if (MEthod==2){ deterNC=(BN[3]>=BC[[3]]) deterNC2=(BC[[3]]>BN[3]) } else if (MEthod==5){ deterNC=(((BN[4]<=BC[[4]])&&BN[3]>0) || (BC[[3]]==0 && BN[3]>0)) deterNC2=(((BN[4]>BC[[4]])&&BC[[3]]>0) || (BC[[3]]>0 && BN[3]==0)) } } else{ right_set[[Iter]]=list() if (MEthod==2){ 
BN=BestNum(data.frame(XY[,c(NumNo,(M+1))]),data.x[Regions[[Iter]],NumNo],MEthod,Num_OR_Cat) BC=BestCat(data.frame(data.xy[,c(CatNo,(M+1))]),Regions[[Iter]],CatList,CatNo,data.x[Regions[[Iter]],CatNo],MEthod,Num_OR_Cat) deterNC=(BN[3]>=BC[[3]]) deterNC2=(BC[[3]]>BN[3]) } else if(MEthod==5){ set.seed(seed) BN=BestNum2(data.frame(XY[,c(NumNo,(M+1))]),data.x[Regions[[Iter]],NumNo],MEthod,Num_OR_Cat) BC=BestCat2(data.frame(data.xy[,c(CatNo,(M+1))]),Regions[[Iter]],CatList,CatNo,data.x[Regions[[Iter]],CatNo],MEthod,Num_OR_Cat) deterNC=(((BN[4]<=BC[[4]])&&BN[3]>0) || (BC[[3]]==0 && BN[3]>0)) deterNC2=(((BN[4]>BC[[4]])&&BC[[3]]>0) || (BC[[3]]>0 && BN[3]==0)) } } } else if (IterLB==T && Iter>1){ if (length(right_set[[Iter-1]])>0){ BNN=list() BCC=list() for (i1 in 1:times_default){ Re1=sort(c(unlist(right_set[[Iter-1]][[i1]]),Regions[[Iter]])) X11=data.x[Re1,] Y11=data.y[Re1] XY11=cbind(X11,Y11) if (MEthod==2){ BNN[[i1]]=BestNum(data.frame(XY11[,c(NumNo,(M+1))]),data.x[Regions[[Iter]],NumNo],MEthod,Num_OR_Cat) BCC[[i1]]=BestCat(data.frame(data.xy[,c(CatNo,(M+1))]),Re1,CatList,CatNo,data.x[Regions[[Iter]],CatNo],MEthod,Num_OR_Cat) } else if (MEthod==5){ set.seed(seed) BNN[[i1]]=BestNum2(data.frame(XY11[,c(NumNo,(M+1))]),data.x[Regions[[Iter]],NumNo],MEthod,Num_OR_Cat) BCC[[i1]]=BestCat2(data.frame(data.xy[,c(CatNo,(M+1))]),Re1,CatList,CatNo,data.x[Regions[[Iter]],CatNo],MEthod,Num_OR_Cat) } } BN=BNN[[1]] for (i3 in 2:times_default){ if (MEthod==2){ if (BNN[[i3]][3]>BN[3]){BN=BNN[[i3]]} } else if (MEthod==5){ if (BNN[[i3]][4]BN[3]){BN=BNN[[i3]]} } } } BC=BCC[[1]] for (i4 in 2:times_default){ if (MEthod==2){ if (BCC[[i4]][[3]]>BC[[3]]){BC=BCC[[i4]]} } else if (MEthod==5){ if (BCC[[i4]][[4]]BC[[3]]){BC=BCC[[i4]]} } } } if (MEthod==2){ deterNC=(BN[3]>=BC[[3]]) deterNC2=(BC[[3]]>BN[3]) } else if (MEthod==5){ deterNC=(((BN[4]<=BC[[4]])&&BN[3]>0) || (BC[[3]]==0 && BN[3]>0)) deterNC2=(((BN[4]>BC[[4]])&&BC[[3]]>0) || (BC[[3]]>0 && BN[3]==0)) } } else{ if (MEthod==2){ 
BN=BestNum(data.frame(XY[,c(NumNo,(M+1))]),data.x[Regions[[Iter]],NumNo],MEthod,Num_OR_Cat) BC=BestCat(data.frame(data.xy[,c(CatNo,(M+1))]),Regions[[Iter]],CatList,CatNo,data.x[Regions[[Iter]],CatNo],MEthod,Num_OR_Cat) deterNC=(BN[3]>=BC[[3]]) deterNC2=(BC[[3]]>BN[3]) } else if (MEthod==5){ set.seed(seed) BN=BestNum2(data.frame(XY[,c(NumNo,(M+1))]),data.x[Regions[[Iter]],NumNo],MEthod,Num_OR_Cat) BC=BestCat2(data.frame(data.xy[,c(CatNo,(M+1))]),Regions[[Iter]],CatList,CatNo,data.x[Regions[[Iter]],CatNo],MEthod,Num_OR_Cat) deterNC=(((BN[4]<=BC[[4]])&&BN[3]>0) || (BC[[3]]==0 && BN[3]>0)) deterNC2=(((BN[4]>BC[[4]])&&BC[[3]]>0) || (BC[[3]]>0 && BN[3]==0)) } } } if((BN[3]==0)&&(BC[[3]]==0)){ Tree=rbind(Tree,c(Iter,0,0,0,0,Prediction,n,0,1,Impurity)) SPList[[Iter]]=0 } else if (deterNC){ SV=NumNo[BN[1]] SP=BN[2] LB=length(Regions)+1 BB=length(Regions)+2 miss_set1=c() miss_pos1=c() for (k in Regions[[Iter]]){ if(is.na(data.xy[k,SV])){ miss_set1=c(miss_set1,k) miss_pos1=c(miss_pos1,which(Regions[[Iter]]==k)) } } if(length(miss_set1)>0){miss_set[[Iter]]=miss_set1} else{miss_set[[Iter]]=list()} if (length(miss_pos1)>0){ Regions[[LB]]=intersect(which(data.xy[,SV]<=SP),Regions[[Iter]]) Regions[[BB]]=setdiff(Regions[[Iter]][-miss_pos1],Regions[[LB]]) } else{ Regions[[LB]]=intersect(which(data.xy[,SV]<=SP),Regions[[Iter]]) Regions[[BB]]=setdiff(Regions[[Iter]],Regions[[LB]]) } Tree=rbind(Tree,c(Iter,SV,SP,LB,BB,Prediction,n,BN[3],0,Impurity)) SPList[[Iter]]=SP } else if (deterNC2) { SV=CatNo[BC[[1]]] SP=BC[[2]] LB=length(Regions)+1 BB=length(Regions)+2 miss_set1=c() miss_pos1=c() for (k in Regions[[Iter]]){ if(is.na(data.xy[k,SV])==TRUE){ miss_set1=c(miss_set1,k) miss_pos1=c(miss_pos1,which(Regions[[Iter]]==k)) } } if(length(miss_set1)>0){miss_set[[Iter]]=miss_set1} else{miss_set[[Iter]]=list()} if (length(miss_pos1)>0){ Regions[[LB]]=intersect(Regions[[Iter]],unlist(CatList[[SV]][which(match(levels(data.xy[,SV]),SP)>0)])) 
Regions[[BB]]=setdiff(Regions[[Iter]][-miss_pos1],Regions[[LB]]) } else{ Regions[[LB]]=intersect(Regions[[Iter]],unlist(CatList[[SV]][which(match(levels(data.xy[,SV]),SP)>0)])) Regions[[BB]]=setdiff(Regions[[Iter]],Regions[[LB]]) } Tree=rbind(Tree,c(Iter,SV,-1,LB,BB,Prediction,n,BC[[3]],0,Impurity)) SPList[[Iter]]=SP } } rm(deterNC) rm(deterNC2) Iter=Iter+1 } colnames(Tree)=c('RegNo','SV','SP','LLeaf','RLeaf','Pred','NoObs','ImpRed','Leaf/Node','Imp') ToReturn=list() ToReturn[[1]]=Tree[2:nrow(Tree),] ToReturn[[2]]=SPList ToReturn[[3]]=Regions ToReturn[[4]]=CatInd return(ToReturn) } build_tree3<-function(data.x,data.y,MEthod,Num_OR_Cat){ data.xy=cbind(data.x,data.y) Iter=1 CatInd=c() N=nrow(data.x) M=ncol(data.x) for (i in 1:M){ if(class(data.x[,i])=='factor'){ CatInd[i]=1 } else {CatInd[i]=0} } CatNo=which(CatInd==1) if(sum(CatNo)>0){ CatList=CatIndex(data.frame(data.x[,CatNo]),CatNo) } NumNo=which(CatInd==0) Regions=list() Regions[[Iter]]=seq(1:N) SPList=list() Tree=matrix(data=c(0,0,0,0,0,0,0,0,1,1),nrow=1) miss_set=list() prob_set=list() LR_set=list() prob_set[[1]]=list() for (ii in 1:times_default){ prob_set[[1]][[ii]]=list() } while(Iter<=length(Regions)){ IterLB=odd(Iter) if(Iter>1 && IterLB==F){ prob_set[[Iter]]=list() prob_set[[Iter+1]]=list() } n=length(Regions[[Iter]]) X=data.x[Regions[[Iter]],] Y=data.y[Regions[[Iter]]] XY=cbind(X,Y) if (Num_OR_Cat==1){ Prediction=mean(Y) Impurity=sum((Y-Prediction)^2) } else if (Num_OR_Cat==2){ Response=plyr::count(Y) Prediction=Response$x[which.max(Response$freq)] Prediction=which(levels(data.y)==Prediction) Impurity=Imp(Response) } Cont=FALSE for (t in 1:ncol(X)){ Cont = Cont || (length(unique(X[,t])) > 1) } Cond=((n>node_default) && (length(unique(Y))>1) && Cont) if(!Cond){ Tree=rbind(Tree,c(Iter,0,0,0,0,Prediction,n,0,1,Impurity)) SPList[[Iter]]=0 deterNC=deterNC2=0 if (IterLB==F){ Tree1=Tree[,4:5] if (ceiling((which(Tree1==Iter))/nrow(Tree1))==1){ v1=which(Tree[,4]==Iter)-1 } else {v1=which(Tree[,5]==Iter)-1} if 
(length(miss_set[[v1]])>0){draw_set=miss_set[[v1]]} else {draw_set=c()} for (i1 in 1:times_default){ iset=c() draw_set1=draw_set draw_set1=c(draw_set1,unlist(prob_set[[v1]][[i1]])) if (length(draw_set1)>0){ for (i2 in 1:length(draw_set1)){ uu=runif(1) if(uu>0.5){iset=c(iset,draw_set1[i2])} } if (length(setdiff(draw_set1,iset))>0){ prob_set[[Iter+1]][[i1]]=sort(setdiff(draw_set1,iset)) } else {prob_set[[Iter+1]][[i1]]=list()} } else {prob_set[[Iter+1]][[i1]]=list()} } } } else{ if (Iter==1){ if(MEthod==3){ BN=BestNum(data.frame(XY[,c(NumNo,(M+1))]),0,1,Num_OR_Cat) BC=BestCat(data.frame(data.xy[,c(CatNo,(M+1))]),Regions[[Iter]],CatList,CatNo,0,1,Num_OR_Cat) deterNC=(BN[3]>=BC[[3]]) deterNC2=(BC[[3]]>BN[3]) } else if (MEthod==6){ set.seed(seed) BN=BestNum2(data.frame(XY[,c(NumNo,(M+1))]),0,4,Num_OR_Cat) BC=BestCat2(data.frame(data.xy[,c(CatNo,(M+1))]),Regions[[Iter]],CatList,CatNo,0,4,Num_OR_Cat) deterNC=(((BN[4]<=BC[[4]])&&BN[3]>0) || (BC[[3]]==0 && BN[3]>0)) deterNC2=(((BN[4]>BC[[4]])&&BC[[3]]>0) || (BC[[3]]>0 && BN[3]==0)) } } else if (IterLB==F && Iter>1){ BNN=list() BCC=list() Tree1=Tree[,4:5] if (ceiling((which(Tree1==Iter))/nrow(Tree1))==1){ v1=which(Tree[,4]==Iter)-1 } else {v1=which(Tree[,5]==Iter)-1} if (length(miss_set[[v1]])>0){draw_set=miss_set[[v1]]} else {draw_set=c()} for (i1 in 1:times_default){ iset=c() draw_set1=draw_set draw_set1=c(draw_set1,unlist(prob_set[[v1]][[i1]])) if (length(draw_set1)>0){ for (i2 in 1:length(draw_set1)){ uu=runif(1) if(uu>0.5){iset=c(iset,draw_set1[i2])} } if (length(iset)>0){ prob_set[[Iter]][[i1]]=iset if (length(setdiff(draw_set1,iset))>0){ prob_set[[Iter+1]][[i1]]=sort(setdiff(draw_set1,iset)) } else {prob_set[[Iter+1]][[i1]]=list()} } else{ prob_set[[Iter]][[i1]]=list() prob_set[[Iter+1]][[i1]]=draw_set1 } } else {prob_set[[Iter]][[i1]]=prob_set[[Iter+1]][[i1]]=list()} Re1=sort(c(iset,Regions[[Iter]],unlist(LR_set[[Iter]][[i1]]))) X11=data.x[Re1,] Y11=data.y[Re1] XY11=cbind(X11,Y11) if (MEthod==3){ 
BNN[[i1]]=BestNum(data.frame(XY11[,c(NumNo,(M+1))]),data.x[Regions[[Iter]],NumNo],MEthod,Num_OR_Cat) BCC[[i1]]=BestCat(data.frame(data.xy[,c(CatNo,(M+1))]),Re1,CatList,CatNo,data.x[Regions[[Iter]],CatNo],MEthod,Num_OR_Cat) } else if (MEthod==6){ set.seed(seed) BNN[[i1]]=BestNum2(data.frame(XY11[,c(NumNo,(M+1))]),data.x[Regions[[Iter]],NumNo],MEthod,Num_OR_Cat) BCC[[i1]]=BestCat2(data.frame(data.xy[,c(CatNo,(M+1))]),Re1,CatList,CatNo,data.x[Regions[[Iter]],CatNo],MEthod,Num_OR_Cat) } } BN=BNN[[1]] for (i3 in 2:times_default){ if (MEthod==3){ if (BNN[[i3]][3]>BN[3]){BN=BNN[[i3]]} } else if (MEthod==6){ if (BNN[[i3]][4]BN[3]){BN=BNN[[i3]]} } } } BC=BCC[[1]] for (i4 in 2:times_default){ if (MEthod==3){ if (BCC[[i4]][[3]]>BC[[3]]){BC=BCC[[i4]]} } else if (MEthod==6){ if (BCC[[i4]][[4]]BC[[3]]){BC=BCC[[i4]]} } } } if (MEthod==3){ deterNC=(BN[3]>=BC[[3]]) deterNC2=(BC[[3]]>BN[3]) } else if (MEthod==6){ deterNC=(((BN[4]<=BC[[4]])&&BN[3]>0) || (BC[[3]]==0 && BN[3]>0)) deterNC2=(((BN[4]>BC[[4]])&&BC[[3]]>0) || (BC[[3]]>0 && BN[3]==0)) } } else if (IterLB==T && Iter>1){ BNN=list() BCC=list() for (i1 in 1:times_default){ Re1=sort(c(unlist(prob_set[[Iter]][[i1]]),Regions[[Iter]],unlist(LR_set[[Iter]][[i1]]))) X11=data.x[Re1,] Y11=data.y[Re1] XY11=cbind(X11,Y11) if (MEthod==3){ BNN[[i1]]=BestNum(data.frame(XY11[,c(NumNo,(M+1))]),data.x[Regions[[Iter]],NumNo],MEthod,Num_OR_Cat) BCC[[i1]]=BestCat(data.frame(data.xy[,c(CatNo,(M+1))]),Re1,CatList,CatNo,data.x[Regions[[Iter]],CatNo],MEthod,Num_OR_Cat) } else if (MEthod==6){ set.seed(seed) BNN[[i1]]=BestNum2(data.frame(XY11[,c(NumNo,(M+1))]),data.x[Regions[[Iter]],NumNo],MEthod,Num_OR_Cat) BCC[[i1]]=BestCat2(data.frame(data.xy[,c(CatNo,(M+1))]),Re1,CatList,CatNo,data.x[Regions[[Iter]],CatNo],MEthod,Num_OR_Cat) } } BN=BNN[[1]] for (i3 in 2:times_default){ if (MEthod==3){ if (BNN[[i3]][3]>BN[3]){BN=BNN[[i3]]} } else if (MEthod==6){ if (BNN[[i3]][4]BN[3]){BN=BNN[[i3]]} } } } BC=BCC[[1]] for (i4 in 2:times_default){ if (MEthod==3){ if 
(BCC[[i4]][[3]]>BC[[3]]){BC=BCC[[i4]]} } else if (MEthod==6){ if (BCC[[i4]][[4]]BC[[3]]){BC=BCC[[i4]]} } } } if (MEthod==3){ deterNC=(BN[3]>=BC[[3]]) deterNC2=(BC[[3]]>BN[3]) } else if (MEthod==6){ deterNC=(((BN[4]<=BC[[4]])&&BN[3]>0) || (BC[[3]]==0 && BN[3]>0)) deterNC2=(((BN[4]>BC[[4]])&&BC[[3]]>0) || (BC[[3]]>0 && BN[3]==0)) } } if((BN[3]==0)&&(BC[[3]]==0)){ Tree=rbind(Tree,c(Iter,0,0,0,0,Prediction,n,0,1,Impurity)) SPList[[Iter]]=0 } else if (deterNC){ SV=NumNo[BN[1]] SP=BN[2] LB=length(Regions)+1 BB=length(Regions)+2 LR_set[[LB]]=LR_set[[BB]]=list() prob_set1=prob_set for (t in 1:times_default){ LR_set[[LB]][[t]]=LR_set[[BB]][[t]]=list() if (length(prob_set1[[Iter]][[t]])>0){ for (tt in 1:length(prob_set1[[Iter]][[t]])){ if (!is.na(data.x[(prob_set1[[Iter]][[t]][[tt]]),SV])){ if ((data.x[(prob_set1[[Iter]][[t]][[tt]]),SV])<=SP){ LR_set[[LB]][[t]]=c(LR_set[[LB]][[t]],prob_set1[[Iter]][[t]][[tt]]) prob_set[[Iter]][[t]]=prob_set[[Iter]][[t]][-which((prob_set[[Iter]][[t]])==prob_set1[[Iter]][[t]][[tt]])] } else{ LR_set[[BB]][[t]]=c(LR_set[[BB]][[t]],prob_set1[[Iter]][[t]][[tt]]) prob_set[[Iter]][[t]]=prob_set[[Iter]][[t]][-which((prob_set[[Iter]][[t]])==prob_set1[[Iter]][[t]][[tt]])] } } } } } for (s in 1:times_default){ if (length(LR_set[[Iter]][[s]])>0){ for (ss in 1:length(LR_set[[Iter]][[s]])){ if (!is.na(data.x[(LR_set[[Iter]][[s]][[ss]]),SV])){ if ((data.x[(LR_set[[Iter]][[s]][[ss]]),SV])<=SP){ LR_set[[LB]][[s]]=c(LR_set[[LB]][[s]],LR_set[[Iter]][[s]][[ss]]) } else { LR_set[[BB]][[s]]=c(LR_set[[BB]][[s]],LR_set[[Iter]][[s]][[ss]]) } } else { prob_set[[Iter]][[s]]=c(prob_set[[Iter]][[s]],LR_set[[Iter]][[s]][[ss]]) } } } } miss_set1=c() miss_pos1=c() for (k in Regions[[Iter]]){ if(is.na(data.xy[k,SV])){ miss_set1=c(miss_set1,k) miss_pos1=c(miss_pos1,which(Regions[[Iter]]==k)) } } if(length(miss_set1)>0){miss_set[[Iter]]=miss_set1} else{miss_set[[Iter]]=list()} if (length(miss_pos1)>0){ Regions[[LB]]=intersect(which(data.xy[,SV]<=SP),Regions[[Iter]]) 
Regions[[BB]]=setdiff(Regions[[Iter]][-miss_pos1],Regions[[LB]]) } else{ Regions[[LB]]=intersect(which(data.xy[,SV]<=SP),Regions[[Iter]]) Regions[[BB]]=setdiff(Regions[[Iter]],Regions[[LB]]) } Tree=rbind(Tree,c(Iter,SV,SP,LB,BB,Prediction,n,BN[3],0,Impurity)) SPList[[Iter]]=SP } else if (deterNC2) { SV=CatNo[BC[[1]]] SP=BC[[2]] LB=length(Regions)+1 BB=length(Regions)+2 LR_set[[LB]]=LR_set[[BB]]=list() prob_set1=prob_set for (t in 1:times_default){ LR_set[[LB]][[t]]=LR_set[[BB]][[t]]=list() if (length(prob_set1[[Iter]][[t]])>0){ for (tt in 1:length(prob_set1[[Iter]][[t]])){ if (!is.na(data.x[(prob_set1[[Iter]][[t]][[tt]]),SV])){ if (is.element((data.x[(prob_set1[[Iter]][[t]][[tt]]),SV]),SP)){ LR_set[[LB]][[t]]=c(LR_set[[LB]][[t]],prob_set1[[Iter]][[t]][[tt]]) prob_set[[Iter]][[t]]=prob_set[[Iter]][[t]][-which((prob_set[[Iter]][[t]])==prob_set1[[Iter]][[t]][[tt]])] } else{ LR_set[[BB]][[t]]=c(LR_set[[BB]][[t]],prob_set1[[Iter]][[t]][[tt]]) prob_set[[Iter]][[t]]=prob_set[[Iter]][[t]][-which((prob_set[[Iter]][[t]])==prob_set1[[Iter]][[t]][[tt]])] } } } } } for (s in 1:times_default){ if (length(LR_set[[Iter]][[s]])>0){ for (ss in 1:length(LR_set[[Iter]][[s]])){ if (!is.na(data.x[(LR_set[[Iter]][[s]][[ss]]),SV])){ if (is.element((data.x[(LR_set[[Iter]][[s]][[ss]]),SV]),SP)){ LR_set[[LB]][[s]]=c(LR_set[[LB]][[s]],LR_set[[Iter]][[s]][[ss]]) } else { LR_set[[BB]][[s]]=c(LR_set[[BB]][[s]],LR_set[[Iter]][[s]][[ss]]) } } else { prob_set[[Iter]][[s]]=c(prob_set[[Iter]][[s]],LR_set[[Iter]][[s]][[ss]]) } } } } miss_set1=c() miss_pos1=c() for (k in Regions[[Iter]]){ if(is.na(data.xy[k,SV])){ miss_set1=c(miss_set1,k) miss_pos1=c(miss_pos1,which(Regions[[Iter]]==k)) } } if(length(miss_set1)>0){miss_set[[Iter]]=miss_set1} else{miss_set[[Iter]]=list()} if (length(miss_pos1)>0){ Regions[[LB]]=intersect(Regions[[Iter]],unlist(CatList[[SV]][which(match(levels(data.xy[,SV]),SP)>0)])) Regions[[BB]]=setdiff(Regions[[Iter]][-miss_pos1],Regions[[LB]]) } else{ 
Regions[[LB]]=intersect(Regions[[Iter]],unlist(CatList[[SV]][which(match(levels(data.xy[,SV]),SP)>0)])) Regions[[BB]]=setdiff(Regions[[Iter]],Regions[[LB]]) } Tree=rbind(Tree,c(Iter,SV,-1,LB,BB,Prediction,n,BC[[3]],0,Impurity)) SPList[[Iter]]=SP } } rm(deterNC) rm(deterNC2) Iter=Iter+1 } colnames(Tree)=c('RegNo','SV','SP','LLeaf','RLeaf','Pred','NoObs','ImpRed','Leaf/Node','Imp') ToReturn=list() ToReturn[[1]]=Tree[2:nrow(Tree),] ToReturn[[2]]=SPList ToReturn[[3]]=Regions ToReturn[[4]]=CatInd return(ToReturn) }

# ---- Missing-data generators ------------------------------------------------
# All three generators read their settings from variables defined outside the
# functions (n1, m1, miss_var, p_default, ...), take the data to be degraded as
# `miss_data`, and return an unnamed list:
#   [[1]] the data with the newly introduced NA cells,
#   [[2]] an n1 x m1 0/1 matrix with 1 in every cell that was set to NA here.
# Cells that are already NA on input are never touched or flagged.

# Private helper: TRUE with probability `prob`.
# Uses the same sample() call as the original code so that seeded runs consume
# the random number stream in exactly the same way.
.miss_draw <- function(prob) {
  sample(c(0, 1), 1, prob = c(prob, 1 - prob)) == 0
}

# Private helper: normal quantile cut points for variable `v`.
# Reads simu_or_dataana, Null_or_not, sigma, mu_vector, small_or_large and
# p_default. Stops with an explicit message for settings the generators do not
# cover (previously these surfaced as "object not found" errors).
.miss_cut_points <- function(v) {
  if (simu_or_dataana == 1 && Null_or_not == 1) {
    spread <- sigma[v]
  } else if (simu_or_dataana == 1 && Null_or_not != 1) {
    # NOTE(review): qnorm() expects a standard deviation; if `sigma` is a
    # covariance matrix, sigma[v, v] is a variance -- confirm.
    spread <- sigma[v, v]
  } else {
    stop("MAR/MNAR cut points are only defined for simu_or_dataana == 1",
         call. = FALSE)
  }

  if (p_default <= 0.2) {
    low_tail <- c(0.1, 0.2, 0.3)
    high_tail <- c(0.7, 0.8, 0.9)
  } else if (p_default <= 0.4) {
    low_tail <- c(0.1, 0.2, 0.3, 0.4, 0.5)
    high_tail <- c(0.5, 0.6, 0.7, 0.8, 0.9)
  } else {
    stop("MAR/MNAR cut points are only defined for p_default <= 0.4",
         call. = FALSE)
  }

  if (small_or_large == 1) {
    qnorm(low_tail, mu_vector[v], spread)
  } else if (small_or_large == 2) {
    qnorm(high_tail, mu_vector[v], spread)
  } else {
    stop("small_or_large must be 1 or 2", call. = FALSE)
  }
}

# Private helper: probability of deleting a cell given the driving `value`.
# `cut_points` must come from .miss_cut_points(). With small_or_large == 1 the
# intervals below the last cut point map to miss_prob[1], miss_prob[2], ...;
# otherwise the intervals above the first cut point do. Everything else gets
# probability 0. A missing driving value also gets probability 0, because the
# interval cannot be determined (the original code crashed in `if` here).
.miss_probability <- function(value, cut_points) {
  if (is.na(value)) {
    return(0)
  }
  interval <- findInterval(value, cut_points)
  if (small_or_large == 1) {
    if (interval < length(cut_points)) miss_prob[interval + 1] else 0
  } else {
    if (interval > 0) miss_prob[interval] else 0
  }
}

#MCAR
# Missing completely at random: every observed cell in the columns listed in
# `miss_var` is set to NA independently with probability `p_default`.
MCAR <- function(miss_data) {
  miss_matrix <- matrix(0, nrow = n1, ncol = m1)
  for (j in miss_var) {
    # seq_len() instead of 1:n1 so that n1 == 0 does not iterate over c(1, 0).
    for (nn in seq_len(n1)) {
      if (!is.na(miss_data[nn, j]) && .miss_draw(p_default)) {
        miss_data[nn, j] <- NA
        miss_matrix[nn, j] <- 1
      }
    }
  }
  list(miss_data, miss_matrix)
}

#MAR
# Missing at random: for each column `s` in `miss_var`, the deletion
# probability of a cell is looked up from the value of the paired column
# miss_relate_var[which(miss_var == s)] in the same row.
MAR <- function(miss_data) {
  miss_matrix <- matrix(0, nrow = n1, ncol = m1)
  for (s in miss_var) {
    v <- miss_relate_var[which(miss_var == s)]
    cut_points <- .miss_cut_points(v)
    for (nn in seq_len(n1)) {
      if (is.na(miss_data[nn, s])) {
        next
      }
      prob_of_missing <- .miss_probability(miss_data[nn, v], cut_points)
      # Only draw when the probability is positive, as the original did.
      if (prob_of_missing > 0 && .miss_draw(prob_of_missing)) {
        miss_data[nn, s] <- NA
        miss_matrix[nn, s] <- 1
      }
    }
  }
  list(miss_data, miss_matrix)
}

#MNAR
# Missing not at random: the deletion probability of a cell depends on the
# cell's own value.
#   * Factor columns: only cells whose value is in `cat_to_miss` can be
#     deleted, with probability n1 * p_default / (number of such cells).
#   * Other columns: same cut-point scheme as MAR, driven by the column itself.
MNAR <- function(miss_data) {
  miss_matrix <- matrix(0, nrow = n1, ncol = m1)
  for (s in miss_var) {
    # is.factor() also accepts ordered factors; class(x) == "factor" yields a
    # length-2 condition for them, which is an error in `if` since R 4.2.
    if (is.factor(miss_data[, s])) {
      cat_to_miss <- if (s == m1 || s == (m1 - 1)) {
        Cat_to_miss[2]
      } else {
        Cat_to_miss[1]
      }

      # Counts are taken over the currently observed values only.
      column <- miss_data[, s]
      observed <- column[!is.na(column)]
      prob_of_miss <- (n1 / sum(observed == cat_to_miss)) * p_default

      if (prob_of_miss >= 1) {
        # Too few cells in the target category: widen the target by the
        # neighbouring category code (+1 or -1 depending on small_or_large).
        step <- if (small_or_large == 1) {
          1
        } else if (small_or_large == 2) {
          -1
        } else {
          0
        }
        if (step != 0) {
          neighbour <- cat_to_miss + step
          prob_of_miss <- (n1 / (sum(observed == cat_to_miss) +
                                   sum(observed == neighbour))) * p_default
          cat_to_miss <- c(cat_to_miss, neighbour)
        }
      }
      # sample() rejects negative weights, so a probability above 1 used to
      # abort the run; cap it so that all eligible cells are deleted instead.
      prob_of_miss <- min(prob_of_miss, 1)

      for (nn in seq_len(n1)) {
        value <- miss_data[nn, s]
        if (!is.na(value) && is.element(value, cat_to_miss) &&
            .miss_draw(prob_of_miss)) {
          miss_data[nn, s] <- NA
          miss_matrix[nn, s] <- 1
        }
      }
    } else {
      cut_points <- .miss_cut_points(s)
      for (nn in seq_len(n1)) {
        value <- miss_data[nn, s]
        if (is.na(value)) {
          next
        }
        prob_of_missing <- .miss_probability(value, cut_points)
        if (prob_of_missing > 0 && .miss_draw(prob_of_missing)) {
          miss_data[nn, s] <- NA
          miss_matrix[nn, s] <- 1
        }
      }
    }
  }
  list(miss_data, miss_matrix)
}