library(glmnet)
#筛选特征变量
eset1<-read.table("data.txt",sep="\t",header=T)
eset1$Label = as.factor(eset1$Label)
x <- data.matrix(eset1[, 1:56])
y <- eset1[, 57]
Lasso <- glmnet(x, y, alpha=1,family = 'binomial')
set.seed(123)
Lasso.cv<-cv.glmnet(x,y,alpha = 1,family = "binomial",type.measure = "auc" )
plot(Lasso.cv)
get_coe <- function(the_fit,the_lamb){
	Coefficients <- coef(the_fit, s = the_lamb)
	Active.Index <- which(Coefficients != 0)
	Active.Coefficients <- Coefficients[Active.Index]
	re <- data.frame(rownames(Coefficients)[Active.Index],Active.Coefficients)
	re <- data.table('var_names'=rownames(Coefficients)[Active.Index],'coef'=Active.Coefficients)
	re$expcoef <- exp(re$coef)
	return(re[order(expcoef)])
}
get_coe(Lasso.cv,Lasso.cv$lambda.min)
coef(Lasso.cv, s = "lambda.1se")
eset55<-select(eset1,c("BFR", "WHR", "TG", "HbA1c", "LDL_C", "TC", "PSQI", "VFI", "Age", "SBP", "FBG", "ALT", "Creatinine", "ECGNOTE1", "ECGNOTE2", "Smoking1", "Smoking2", "Drinking1", "Drinking2","Label"))
#将数据集划分为训练集和测试集
set.seed(123)#设置随机种子
ind <- sample(2, nrow(eset55), replace=TRUE, prob=c(0.7, 0.3))
train_data_new <-eset55[ind==1,]#训练数据
test_data_new <- eset55[ind==2,]#测试数据
#利用LASSO构建模型
x <- as.matrix(train_data_new[, 1:19])
y <- train_data_new[, 20]
Lasso <- glmnet(x, y, alpha=1,family = 'binomial')
Lasso_cv <-cv.glmnet(x,y,alpha = 1,family = "binomial",type.measure = "class" )
pre.train.Lasso = predict(Lasso_cv, newx=as.matrix(train_data_new[,1:19]), type = "response",s = Lasso_cv$lambda.min)
reg_roc_Lasso <- roc(train_data_new$Label ~ pre.train.Lasso)
pre.test.Lasso = predict(Lasso_cv, newx=as.matrix(test_data_new[,1:19]), type = "response",s = Lasso_cv$lambda.min)
reg_roc_Lasso_test <- roc(test_data_new$Label ~ pre.test.Lasso)

#利用svm建立支持向量机分类模型
library(e1071)#加载e1071包
#利用tune.svm()函数，来寻找svm()函数的最优参数。
tuned <- tune.svm(Label ~., data = train_data_new, gamma = 10^(-6:-1), cost = 10^(-2:2))
set.seed(123)
svm.model<-svm(Label~., train_data_new,type="C",kernel = 'radial',gamma=tuned$best.parameters$gamma,cost=tuned$best.parameters$cost,probability = TRUE)
summary(svm.model)
#训练集
pre.train.SVM <- predict(svm.model,newdata = train_data_new)
reg_roc_SVM <- roc(train_data_new$Label ~ as.numeric(as.character(pre.train.SVM)))
#测试集
pre.test.SVM <- predict(svm.model,newdata = test_data_new)
reg_roc_SVM_test <- roc(test_data_new$Label ~ as.numeric(as.character(pre.test.SVM)))
pre.vali.SVM <- predict(svm.model,newdata = vali_data_new)
reg_roc_SVM_vali <- roc(vali_data_new$Label ~ as.numeric(as.character(pre.vali.SVM)))
temp_matrix <- data.frame(y = train_data_new$Label, predictor = reg_roc_SVM$predictor, stringsAsFactors = FALSE);
temp_matrix[temp_matrix$predictor < 0.5, "predictor"] <- 0;
temp_matrix[temp_matrix$predictor >= 0.5, "predictor"] <- 1;
TP <- as.double(nrow(temp_matrix[temp_matrix$y == 1 & temp_matrix$predictor == 1, ]));
FP <- as.double(nrow(temp_matrix[temp_matrix$y == 0 & temp_matrix$predictor == 1, ]));
TN <- as.double(nrow(temp_matrix[temp_matrix$y == 0 & temp_matrix$predictor == 0, ]));
FN <- as.double(nrow(temp_matrix[temp_matrix$y == 1 & temp_matrix$predictor == 0, ]));
MCC <- (TP * TN - FP * FN) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN));
#利用randomForest建立随机森林分类模型
library(randomForest)
n<-length(names(train_data_new))
set.seed(9)
min=100
num=0
for (i in 1:(n-1)){
	mtry_fit<- randomForest(Label~., data=train_data_new, mtry=i)
	err<-mean(mtry_fit$err.rate)
	print(err)
	if(err<min){
		min =err
		num=i
	}
}
print(min)
print(num)
#num即为mtry
#带入mtry，尝试寻找ntree
ntree_fit<-randomForest(Label~.,data=train_data_new,mtry=8,ntree=1000)
plot(ntree_fit)
RF.model<-randomForest(Label~., data = train_data_new,ntree =150,mtry=8,importance=TRUE,proximity=TRUE)
summary(RF.model)
RF.model$importance
pre.train.RF <- predict(RF.model,newdata = train_data_new,type="prob")
reg_roc_RF <- roc(train_data_new$Label ~ pre.train.RF[,1])
temp_matrix <- data.frame(y = train_data_new$Label, predictor = reg_roc_RF$predictor, stringsAsFactors = FALSE);
temp_matrix[temp_matrix$predictor < 0.5, "predictor"] <- 0;
temp_matrix[temp_matrix$predictor >= 0.5, "predictor"] <- 1;
TP <- as.double(nrow(temp_matrix[temp_matrix$y == 1 & temp_matrix$predictor == 1, ]));
FP <- as.double(nrow(temp_matrix[temp_matrix$y == 0 & temp_matrix$predictor == 1, ]));
TN <- as.double(nrow(temp_matrix[temp_matrix$y == 0 & temp_matrix$predictor == 0, ]));
FN <- as.double(nrow(temp_matrix[temp_matrix$y == 1 & temp_matrix$predictor == 0, ]));
MCC <- (TP * TN - FP * FN) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN));
pre.test.RF <- predict(RF.model,newdata = test_data_new,type="prob")
reg_roc_RF_test <- roc(test_data_new$Label ~ pre.test.RF)

library(PredictABEL)
train_data_new$Label<-as.numeric(as.character(train_data_new$Label))
reclassification(data=train_data_new,cOutcome=20,predrisk1 = pre.train.Lasso, predrisk2 = pre.train.SVM,cutoff = c(0,0.5,1))
reclassification(data=train_data_new,cOutcome=20,predrisk1 = pre.train.Lasso, predrisk2 = pre.train.RF,cutoff = c(0,0.5,1))
reclassification(data=train_data_new,cOutcome=20,predrisk1 = pre.train.SVM, predrisk2 = pre.train.RF,cutoff = c(0,0.5,1))
test_data_new$Label<-as.numeric(as.character(test_data_new$Label))
reclassification(data=test_data_new,cOutcome=20,predrisk1 = pre.test.Lasso, predrisk2 = pre.test.SVM,cutoff = c(0,0.5,1))
reclassification(data=test_data_new,cOutcome=20,predrisk1 = pre.test.Lasso, predrisk2 = pre.test.RF,cutoff = c(0,0.5,1))
reclassification(data=test_data_new,cOutcome=20,predrisk1 = pre.test.SVM, predrisk2 = pre.test.RF,cutoff = c(0,0.5,1)