library("mcca") library(zoo) #S3 Infrastructure for Regular and Irregular Time Series (Z's Ordered Observations) library(xts) #eXtensible Time Series library(TTR) #Technical Trading Rules library(quantmod) library(glmnet) library(Matrix) library("caret") library(MASS) library(yardstick) library(keras) library(tensorflow) use_condaenv("r-reticulate") Data_process_t<-function(diffy){ for(i in 1:length(diffy)){ if (diffy[i]< 0){ diffy[i] = 0 } else{ diffy[i]=1 } } return(diffy) } df<-read.csv("Stock_MSFT.csv",encoding = "UTF-8") df<-df[,-1] colnames(df) <- c("Open","High","Low","Close","Volume","numArticles","neutral","positive","negative") simdata<-df n<-length(simdata[,1]) head(simdata) y_price = df$Close OHLC <- simdata[,c("Open","High","Low","Close")] HLC<-simdata[,c("High","Low","Close")] HL<-simdata[,c("High","Low")] C1<-simdata[,"Close"] V<-simdata[,"Volume"] #ADX x1 = ADX(HLC)[,"ADX"] #Aroon x2 = aroon(HL)[,"aroonUp"] x3 = aroon(HL)[,"aroonDn"] x4 = aroon(HL)[,"oscillator"] #ATR x5 = ATR(HLC)[,"atr"] #BBands x6 = BBands(HLC)[,"mavg"] #CCI x7 = CCI(HLC) #chaikinAD x8 = chaikinAD(HLC, V) #chaikinVo x9 = chaikinVolatility(HL) #CLV x10 = CLV(HLC) #CMF x11 = CMF(HLC,V) #CMO x12 = CMO(C1) #dpo V x13 = DPO(V) #DonchianChannel x14 = DonchianChannel(HL)[,"mid"] #DPO P x15 = DPO(C1) #DVI x16 = DVI(C1)[,"dvi"] #dpo V x17<-GMMA(C1)[,"short lag 5"] #KST x18 = KST(C1)[,"kst"] #MACD x19 = MACD(C1, 12, 26, 9, maType="EMA" )[,"macd"] #MFI x20 = MFI(HLC, V) #OBV x21 = OBV(C1, V) #ROC x22 = ROC(C1) #RSI x23 = RSI(C1) #SAR x24 = SAR(HL)[,"sar"] #WMA x25 = WMA(C1) #DEMA x26 = DEMA(C1) #SMA x27 = SMA(C1) #EMA x28 = EMA(C1) #TDI x29 = TDI(C1)[,"tdi"] #VHF x30 = VHF(C1) #EVWMA x31 = EVWMA(C1,V) #ZLEMA x32 = ZLEMA(C1) #WAD x33 = williamsAD(HLC) #HMA x34 = HMA(C1) #SMI x35 = SMI(HLC)[,"SMI"] #AR BR ta.sum<-function(x,t){ ta<-vector() for(i in t:(length(x))){ ta[i] = sum(x[t:i-t]) } return(ta) } arbr<-function(stock){ HO=df$High-df$Open OL=df$Open-df$Low HCY=df$High - (c(df$Close[-1],0)) CYL=c(df$Close[-1],0)-df$Low AR = ta.sum(HO,26)/ta.sum(OL,26) *100 BR = ta.sum(HCY,26)/ta.sum(CYL,26) *100 ARBR=list("AR"=AR,"BR"=BR) return(ARBR) } ar = arbr(df)$AR br = arbr(df)$BR X0<-cbind(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20, x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,ar,br,df$numArticles,df$neutral,df$positive,df$negative) colnames(X0) <- c(paste("X", 1:41, sep = "")) #y_p = (simdata[,"Close"]) Y<-diff((simdata[,"Close"])) y_p<-Data_process_t(Y) x<-as.matrix(X0) x<-scale(x,center =T,scale =T) #df_aapl = cbind(x[-1,],y_p) #write.csv(df_aapl,file = "df_aapl.csv") x_train<-x[(401:1150),] y_train <-y_p[401:1150] x_test<-x[(1151:1400),] y_test <-y_p[1151:1400] x_df = x[(401:1400),] 'library(ggcorrplot) df = read.csv("mb.csv") xdf = df[,5:ncol(df)] xdf1 = subset(xdf,select=-c(effch效率变化,effch纯技术效率指数,techch技术进步)) xdf1 = scale(xdf1) cordf = cor(xdf1) p.mat <- cor_pmat(xdf1) ggcorrplot(cordf,outline.color = "black",hc.order = TRUE,p.mat = p.mat,type = "upper",sig.level = 0.1,legend.title = "Corr", tl.cex = 10,tl.srt = 90,digits = 2) p.mat write.csv(p.mat,file = "p.mat.csv") a = xdf1$数字经济水平.标准化. 
# Correlation structure of the 41 standardized features
library(ggcorrplot)
corhcv <- cor(x_df)
p.mat  <- cor_pmat(x_df)  # significance matrix, kept for reference
ggcorrplot(corhcv, outline.color = "black", hc.order = TRUE, type = "upper",
           legend.title = "Corr", pch.cex = 10, tl.cex = 10, tl.srt = 90,
           pch = 5, digits = 2)

# One-hot labels (kept for reference; the binary models below use y_train
# directly) and 3-D copies of the feature matrices, shaped
# (samples, features, 1) for the LSTM layers.
y_trainls <- to_categorical(y_train)
y_testls  <- to_categorical(y_test)
x_trainls <- x_train
x_testls  <- x_test
dim(x_trainls) <- c(dim(x_trainls)[1], dim(x_trainls)[2], 1)
dim(x_testls)  <- c(dim(x_testls)[1], dim(x_testls)[2], 1)

#SVM
library(e1071)
# The formula interface needs data frames, not matrices.
train_df <- data.frame(x_train, y = as.factor(y_train))
test_df  <- data.frame(x_test)
mac_svm <- vector()
tpb_svm <- list()
# With fixed data, variation across the 30 runs comes only from the internal
# cross-validation used to fit the probability model.
for (i in 1:30) {
  svm_m <- svm(y ~ ., data = train_df, probability = TRUE, kernel = "radial")
  pred_svm <- predict(svm_m, newdata = test_df, probability = TRUE)
  mac_svm[i] <- as.numeric(confusionMatrix(as.factor(pred_svm),
                                           as.factor(y_test))$overall[1])
  tpb_svm[[i]] <- attr(pred_svm, "probabilities")
}
mean(mac_svm)
sd(mac_svm)
max(mac_svm)
confusionMatrix(as.factor(as.numeric(
  tpb_svm[[which(mac_svm == max(mac_svm))[1]]][, 1] > 0.5)),
  as.factor(y_test))

#Random Forest
library(randomForest)
mac_rf <- vector()
tpb_rf <- list()
for (i in 1:30) {
  rf <- randomForest(y ~ ., data = train_df, ntree = 10,
                     mtry = floor(sqrt(ncol(x_train))))
  pred_rf <- predict(rf, newdata = test_df)
  mac_rf[i] <- as.numeric(confusionMatrix(as.factor(pred_rf),
                                          as.factor(y_test))$overall[1])
  tpb_rf[[i]] <- predict(rf, newdata = test_df, type = "prob")
}
max(mac_rf)
mean(mac_rf)
sd(mac_rf)
confusionMatrix(as.factor(as.numeric(
  tpb_rf[[which(mac_rf == max(mac_rf))[1]]][, 1] < 0.5)),
  as.factor(y_test))
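# Hedged helper (not in the original script): e1071 orders the columns of
# attr(pred, "probabilities") by the order classes appear in training, so
# positional [, 1] indexing can silently flip classes between runs. Indexing
# the up-move class by its label "1" is safer for both SVM and random forest:
best_svm_prob <- tpb_svm[[which.max(mac_svm)]][, "1"]
best_rf_prob  <- tpb_rf[[which.max(mac_rf)]][, "1"]
confusionMatrix(as.factor(as.numeric(best_svm_prob > 0.5)), as.factor(y_test))
confusionMatrix(as.factor(as.numeric(best_rf_prob  > 0.5)), as.factor(y_test))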
#LASSO-LSTM
fit1 <- glmnet(x_train, y_train, alpha = 1, family = "binomial")
cv_fit1 <- cv.glmnet(x_train, y_train, alpha = 1, family = "binomial",
                     type.measure = "class")
plot(cv_fit1)
pred.fit1 <- predict(fit1, x_test, type = "class", s = 0.001)  # s = cv_fit1$lambda.min is the CV-selected alternative
confusionMatrix(as.factor(pred.fit1), as.factor(y_test))
plot(fit1, xvar = "lambda")

# Keep only the features whose LASSO coefficients are non-zero at lambda.min.
k2 <- coef(fit1, s = cv_fit1$lambda.min)[-1]
inde <- which(k2 == 0)
if (length(inde) > 0) {
  x_trainv <- x_trainls[, -inde, 1]
  x_testv  <- x_testls[, -inde, 1]
} else {
  x_trainv <- x_trainls[, , 1]
  x_testv  <- x_testls[, , 1]
}
dim(x_trainv) <- c(dim(x_trainv)[1], dim(x_trainv)[2], 1)
dim(x_testv)  <- c(dim(x_testv)[1], dim(x_testv)[2], 1)

# Single dense baseline on the LASSO-selected features (the LSTM variant is
# trained repeatedly in the loop below). Dense layers take the 2-D matrices.
set_random_seed(3507)
model_lstm_lasso <- keras_model_sequential()
model_lstm_lasso %>%
  layer_dense(units = 32, activation = 'relu',
              input_shape = c(dim(x_trainv)[2])) %>%
  layer_dense(units = 16, activation = 'relu') %>%
  layer_dropout(rate = 0.35) %>%
  layer_dense(units = 1, activation = 'sigmoid')
model_lstm_lasso %>% compile(
  loss = 'binary_crossentropy',
  optimizer = optimizer_adam(learning_rate = 0.0015),
  metrics = c('accuracy')
)
model_lstm_lasso %>% fit(x_trainv[, , 1], y_train, epochs = 39, batch_size = 100,
                         validation_data = list(x_testv[, , 1], y_test))
acc_ll  <- model_lstm_lasso %>% evaluate(x_testv[, , 1], y_test)
prob_ll <- model_lstm_lasso %>% predict(x_testv[, , 1])
confusionMatrix(as.factor(as.numeric(prob_ll > 0.5)), as.factor(y_test))

# 30 repeated LASSO-LSTM runs. The final LSTM layer must return a 2-D tensor
# (return_sequences = FALSE) before the dense head.
mac_la  <- vector()
prob_la <- list()
for (i in 1:30) {
  model_lstm_lasso <- keras_model_sequential()
  model_lstm_lasso %>%
    layer_lstm(units = 64, dropout = 0.2, return_sequences = TRUE,
               input_shape = c(dim(x_trainv)[2:3])) %>%
    layer_lstm(units = 32, return_sequences = FALSE) %>%
    layer_dense(units = 32, activation = 'relu') %>%
    layer_dropout(rate = 0.35) %>%
    layer_dense(units = 1, activation = 'sigmoid')
  model_lstm_lasso %>% compile(
    loss = 'binary_crossentropy',
    optimizer = optimizer_adam(learning_rate = 0.0015),
    metrics = c('accuracy')
  )
  model_lstm_lasso %>% fit(x_trainv, y_train, epochs = 39, batch_size = 100,
                           validation_data = list(x_testv, y_test))
  res <- model_lstm_lasso %>% evaluate(x_testv, y_test)
  mac_la[i] <- as.numeric(res[2])
  prob_la[[i]] <- predict(model_lstm_lasso, x_testv)
}

#TI LSTM model: technical indicators only (X1-X37, incl. AR/BR)
ti_train <- x_train[, 1:37]
ti_test  <- x_test[, 1:37]
dim(ti_train) <- c(dim(ti_train)[1], dim(ti_train)[2], 1)
dim(ti_test)  <- c(dim(ti_test)[1], dim(ti_test)[2], 1)

# Single dense baseline; fed the 2-D matrices.
model_ti <- keras_model_sequential()
model_ti %>%
  layer_dense(units = 32, activation = "relu",
              input_shape = c(dim(ti_train)[2])) %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 1, activation = "sigmoid")
model_ti %>% compile(
  loss = 'binary_crossentropy',
  optimizer = optimizer_adam(learning_rate = 0.001),
  metrics = c("accuracy")
)
model_ti %>% fit(ti_train[, , 1], y_train, epochs = 20, batch_size = 100,
                 validation_data = list(ti_test[, , 1], y_test), verbose = 1)
res_ti <- predict(model_ti, ti_test[, , 1])
confusionMatrix(as.factor(as.numeric(res_ti > 0.5)), as.factor(y_test))

# 30 repeated TI-LSTM runs.
ti_acc  <- vector()
ti_prob <- list()
for (i in 1:30) {
  model_ti <- keras_model_sequential()
  model_ti %>%
    layer_lstm(units = 32, return_sequences = TRUE,
               input_shape = c(dim(ti_train)[2:3])) %>%
    layer_lstm(units = 16, return_sequences = FALSE) %>%
    layer_dense(units = 48, activation = "relu") %>%
    layer_dropout(rate = 0.2) %>%
    layer_dense(units = 1, activation = "sigmoid")
  model_ti %>% compile(
    loss = 'binary_crossentropy',
    optimizer = optimizer_adam(learning_rate = 0.003),
    metrics = c("accuracy")
  )
  model_ti %>% fit(ti_train, y_train, epochs = 30, batch_size = 100,
                   validation_data = list(ti_test, y_test), verbose = 1)
  res <- model_ti %>% evaluate(ti_test, y_test)
  ti_acc[i] <- as.numeric(res[2])
  ti_prob[[i]] <- predict(model_ti, ti_test)
}
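# Optional (an assumption, not part of the original training setup): every run
# above trains for a fixed epoch count. keras' early-stopping callback can cut
# a run short once validation loss stops improving, which tends to reduce the
# run-to-run variance the 30-repeat loops are measuring:
es <- callback_early_stopping(monitor = "val_loss", patience = 5,
                              restore_best_weights = TRUE)
# Pass it to any of the fit() calls via: fit(..., callbacks = list(es))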
#SI LSTM model: sentiment indicators only (X38-X41)
si_train <- x_train[, 38:41]
si_test  <- x_test[, 38:41]
dim(si_train) <- c(dim(si_train)[1], dim(si_train)[2], 1)
dim(si_test)  <- c(dim(si_test)[1], dim(si_test)[2], 1)
si_acc  <- vector()
si_prob <- list()

# Single dense baseline; fed the 2-D matrices.
model_si <- keras_model_sequential()
model_si %>%
  layer_dense(units = 32, activation = "relu",
              input_shape = c(dim(si_train)[2])) %>%
  layer_dense(units = 32, activation = "relu") %>%
  layer_dropout(rate = 0.25) %>%
  layer_dense(units = 1, activation = "sigmoid")
model_si %>% compile(
  loss = 'binary_crossentropy',
  optimizer = optimizer_adam(learning_rate = 0.001),
  metrics = c("accuracy")
)
model_si %>% fit(si_train[, , 1], y_train, epochs = 20, batch_size = 100,
                 validation_data = list(si_test[, , 1], y_test), verbose = 1)
res_si <- predict(model_si, si_test[, , 1])
confusionMatrix(as.factor(as.numeric(res_si > 0.5)), as.factor(y_test))

# 30 repeated SI-LSTM runs.
for (i in 1:30) {
  model_si <- keras_model_sequential()
  model_si %>%
    layer_lstm(units = 32, return_sequences = TRUE,
               input_shape = c(dim(si_train)[2:3])) %>%
    layer_lstm(units = 16, return_sequences = FALSE) %>%
    layer_dense(units = 48, activation = "relu") %>%
    layer_dropout(rate = 0.2) %>%
    layer_dense(units = 1, activation = "sigmoid")
  model_si %>% compile(
    loss = 'binary_crossentropy',
    optimizer = optimizer_adam(learning_rate = 0.003),
    metrics = c("accuracy")
  )
  model_si %>% fit(si_train, y_train, epochs = 30, batch_size = 100,
                   validation_data = list(si_test, y_test), verbose = 1)
  res <- model_si %>% evaluate(si_test, y_test)
  si_acc[i] <- as.numeric(res[2])
  si_prob[[i]] <- predict(model_si, si_test)
}
confusionMatrix(as.factor(as.numeric(si_prob[[which.max(si_acc)]] > 0.5)),
                as.factor(y_test))

###LSTM (full feature set)
# Single dense baseline; fed the 2-D matrices.
set_random_seed(3)
model_l <- keras_model_sequential()
model_l %>%
  layer_dense(units = 16, activation = "relu",
              input_shape = c(dim(x_trainls)[2])) %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 16, activation = "relu") %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 1, activation = "sigmoid")
model_l %>% compile(
  loss = 'binary_crossentropy',
  optimizer = optimizer_adam(learning_rate = 0.001),
  metrics = c("accuracy")
)
model_l %>% fit(x_trainls[, , 1], y_train, epochs = 30, batch_size = 100,
                validation_data = list(x_testls[, , 1], y_test), verbose = 1)
bacc_lstm  <- model_l %>% evaluate(x_testls[, , 1], y_test)
bprob_lstm <- predict(model_l, x_testls[, , 1])
confusionMatrix(as.factor(as.numeric(bprob_lstm > 0.5)), as.factor(y_test))

# 30 repeated full-feature LSTM runs.
acc_l  <- vector()
prob_l <- list()
for (i in 1:30) {
  model_l <- keras_model_sequential()
  model_l %>%
    layer_lstm(units = 32, return_sequences = TRUE,
               input_shape = c(dim(x_trainls)[2:3])) %>%
    layer_lstm(units = 16, return_sequences = FALSE) %>%
    layer_dense(units = 16, activation = "relu") %>%
    layer_dropout(rate = 0.3) %>%
    layer_dense(units = 1, activation = "sigmoid")
  model_l %>% compile(
    loss = 'binary_crossentropy',
    optimizer = optimizer_adam(learning_rate = 0.001),
    metrics = c("accuracy")
  )
  model_l %>% fit(x_trainls, y_train, epochs = 25, batch_size = 100,
                  validation_data = list(x_testls, y_test), verbose = 1)
  res <- model_l %>% evaluate(x_testls, y_test)
  acc_l[i] <- as.numeric(res[2])
  prob_l[[i]] <- predict(model_l, x_testls)
}

# Mean and standard deviation of test accuracy over the 30 runs, per model.
mean(mac_rf);  sd(mac_rf)
mean(mac_svm); sd(mac_svm)
mean(mac_la);  sd(mac_la)
mean(acc_l);   sd(acc_l)
mean(ti_acc);  sd(ti_acc)
mean(si_acc);  sd(si_acc)

# ROC curves for the best run of each model.
library(precrec)
library(ROCit)
roc_ls  <- rocit(as.numeric(bprob_lstm), as.factor(y_test), method = "bin")
roc_la  <- rocit(as.numeric(prob_ll), as.factor(y_test), method = "bin")
roc_rf  <- rocit(as.numeric(tpb_rf[[which.max(mac_rf)]][, 1]),
                 as.factor(y_test), method = "bin")
roc_svm <- rocit(as.numeric(tpb_svm[[which.max(mac_svm)]][, 1]),
                 as.factor(y_test), method = "bin")
roc_si  <- rocit(as.numeric(si_prob[[which.max(si_acc)]]),
                 as.factor(y_test), method = "bin")
roc_ti  <- rocit(as.numeric(ti_prob[[which.max(ti_acc)]]),
                 as.factor(y_test), method = "bin")

plot(roc_la, col = c(4, 2), YIndex = FALSE, legend = FALSE)
lines(roc_ls$TPR ~ roc_ls$FPR, col = 3, lwd = 2)
lines(roc_rf$TPR ~ roc_rf$FPR, col = 6, lwd = 2)
lines(roc_svm$TPR ~ roc_svm$FPR, col = 7, lwd = 2)
lines(roc_si$TPR ~ roc_si$FPR, col = 5, lwd = 2)  # was plotted against roc_svm$FPR
lines(roc_ti$TPR ~ roc_ti$FPR, col = 8, lwd = 2)  # was plotted against roc_svm$FPR
legend("bottomright", col = c(7, 6, 3, 4, 2, 5, 8),
       c("SVM", "RF", "LSTM", "LASSO-LSTM", "Chance Line", "LSTMSI", "LSTMTI"),
       text.width = 0.3, lty = c(1, 1, 1, 1, 2, 1, 1),
       lwd = 2, border = FALSE, bty = "o")
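# A minimal sketch (assumption: rocit objects expose an $AUC element, per the
# ROCit documentation) to tabulate the area under each curve next to the plot:
aucs <- sapply(list(SVM = roc_svm, RF = roc_rf, LSTM = roc_ls,
                    `LASSO-LSTM` = roc_la, LSTMSI = roc_si, LSTMTI = roc_ti),
               function(r) r$AUC)
print(round(aucs, 3))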
#Plot LSTM training histories
# Assumes the fit() calls above were stored, e.g. mod_history <- model_l %>% fit(...),
# and mod_history1 / mod_history2 likewise for the LASSO-LSTM and EN-LSTM runs.
library(ggplot2)
library(grid)
vplayout <- function(x, y) viewport(layout.pos.row = x, layout.pos.col = y)
p1 <- plot(mod_history) + theme_bw() + ggtitle("LSTM") +
  theme(legend.position = "top", legend.direction = "horizontal",
        plot.title = element_text(hjust = 0.5),
        legend.key.size = unit(15, "pt"), legend.title = element_blank())
p2 <- plot(mod_history1) + theme_bw() + ggtitle("LASSO-LSTM") +
  theme(legend.position = "top", legend.direction = "horizontal",
        plot.title = element_text(hjust = 0.5),
        legend.key.size = unit(15, "pt"), legend.title = element_blank())
p3 <- plot(mod_history2) + theme_bw() + ggtitle("EN-LSTM") +
  theme(legend.position = "top", legend.direction = "horizontal",
        plot.title = element_text(hjust = 0.5),
        legend.key.size = unit(15, "pt"), legend.title = element_blank())
grid.newpage()
pushViewport(viewport(layout = grid.layout(1, 3)))
print(p1, vp = vplayout(1, 1))
print(p2, vp = vplayout(1, 2))
print(p3, vp = vplayout(1, 3))

#Plot coefficient paths for LASSO, elastic-net, and ridge penalties
mfit1 <- glmnet(x_train, y_train, alpha = 1, family = "multinomial",
                type.multinomial = "grouped")
mfit2 <- glmnet(x_train, y_train, alpha = 0.5, family = "multinomial",
                type.multinomial = "grouped")
mfit3 <- glmnet(x_train, y_train, alpha = 0, family = "multinomial",
                type.multinomial = "grouped")  # assumed ridge counterpart; mfit3 was plotted but never defined
plot(mfit1, xvar = "lambda")
plot(mfit2, xvar = "lambda")
plot(mfit3, xvar = "lambda")

#Plot stock price
Sys.setlocale("LC_TIME", "English")
# Read into a separate object so the MSFT data frame `df` is not overwritten.
aapl <- read.csv("AAPL.csv")
stock <- xts(aapl[, -1], order.by = as.Date(as.character(aapl[, 1])),
             descr = "AAPL")
chartSeries(stock[, 2:4], name = "AAPL", line.type = "l", bar.type = "ohlc",
            theme = "white",
            TA = "addVo();addSMA(5);addSMA();addSMA(20);addBBands();")

# Bayesian hyper-parameter search for XGBoost.
library(xgboost)
library(rBayesianOptimization)
# Assumed setup: dtrain and cv_folds are used below but were never defined in
# the original script.
dtrain <- xgb.DMatrix(data = x_train, label = y_train)
cv_folds <- KFold(y_train, nfolds = 5, stratified = TRUE, seed = 0)

xgb_cv_bayes <- function(max_depth, min_child_weight, subsample) {
  cv <- xgb.cv(params = list(booster = "gbtree", eta = 0.01,
                             max_depth = max_depth,
                             min_child_weight = min_child_weight,
                             subsample = subsample,
                             colsample_bytree = 0.3,
                             lambda = 1, alpha = 0,
                             objective = "binary:logistic",
                             eval_metric = "auc"),
               data = dtrain, nround = 100, folds = cv_folds,
               prediction = TRUE, showsd = TRUE,
               early_stopping_rounds = 5, maximize = TRUE, verbose = 0)
  list(Score = cv$evaluation_log$test_auc_mean[cv$best_iteration],
       Pred = cv$pred)
}

OPT_Res <- BayesianOptimization(xgb_cv_bayes,
                                bounds = list(max_depth = c(2L, 6L),
                                              min_child_weight = c(1L, 10L),
                                              subsample = c(0.5, 0.8)),
                                init_grid_dt = NULL, init_points = 10,
                                n_iter = 20, acq = "ucb", kappa = 2.576,
                                eps = 0.0, verbose = TRUE)

##Paired Wilcoxon signed-rank tests: LASSO-LSTM vs. each baseline over the 30 runs
wilcox.test(mac_la, mac_rf, paired = TRUE, conf.int = TRUE)
wilcox.test(mac_la, mac_svm, paired = TRUE, conf.int = TRUE)
wilcox.test(mac_la, acc_l, paired = TRUE, conf.int = TRUE)
wilcox.test(mac_la, ti_acc, paired = TRUE, conf.int = TRUE)
wilcox.test(mac_la, si_acc, paired = TRUE, conf.int = TRUE)
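# A minimal sketch (not in the original analysis): with five paired tests
# against the same LASSO-LSTM baseline, adjusting for multiple comparisons is
# prudent, e.g. with a Holm correction:
pvals <- sapply(list(mac_rf, mac_svm, acc_l, ti_acc, si_acc),
                function(b) wilcox.test(mac_la, b, paired = TRUE)$p.value)
p.adjust(pvals, method = "holm")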