#!/usr/bin/env python
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier as GBDT
from xgboost import XGBClassifier as XGB
from xgboost import plot_importance
from sklearn.model_selection import cross_val_score, GridSearchCV,StratifiedShuffleSplit
from sklearn.metrics import f1_score,accuracy_score,recall_score,\
precision_score,confusion_matrix,roc_auc_score,matthews_corrcoef,roc_curve
from sklearn.preprocessing import MinMaxScaler
import time
import pickle
import matplotlib.pyplot as plt

'''
This script trains the final model on the top 10 of the 65 dimensional features.
The top 10 features were selected via feature_importances_ from XGBoost.
The best model is XGBoost with the following parameters:
colsample_bytree=0.2, learning_rate=0.04, max_depth=4, n_estimators=100, and random_state=5.
'''

def get_model(estimator, parameters, X_train, y_train, scoring):
    '''
    Grid search over `parameters` with stratified shuffle-split CV and
    return the best estimator (refit on the full training set, which is
    GridSearchCV's default behavior).
    '''
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=1)
    model = GridSearchCV(estimator, param_grid=parameters, cv=cv, scoring=scoring)
    model.fit(X_train, y_train)
    return model.best_estimator_
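
# Illustrative call to get_model (a sketch, commented out; the grid below is a
# placeholder, not the tuned settings reported in the header docstring):
# param = {"max_depth": [4, 6], "n_estimators": [100, 200]}
# best_clf = get_model(XGB(random_state=1), param, trainX, trainY, scoring="roc_auc")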

def scores(y_test, y_pred, th=0.5):
    '''
    Evaluate predicted probabilities against the true labels.
    Class labels are obtained by thresholding `y_pred` at `th`;
    AUC is computed on the raw probabilities.
    '''
    y_predlabel = [(0 if item < th else 1) for item in y_pred]
    tn, fp, fn, tp = confusion_matrix(y_test, y_predlabel).flatten()
    SPE = tn * 1. / (tn + fp)  # specificity
    MCC = matthews_corrcoef(y_test, y_predlabel)
    Recall = recall_score(y_test, y_predlabel)
    Precision = precision_score(y_test, y_predlabel)
    F1 = f1_score(y_test, y_predlabel)
    Acc = accuracy_score(y_test, y_predlabel)
    AUC = roc_auc_score(y_test, y_pred)
    return [Recall, SPE, Precision, F1, MCC, Acc, AUC, tp, fn, tn, fp]
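
# Quick sanity check for scores() on toy values: thresholding [0.1, 0.6, 0.4, 0.9]
# at 0.5 gives predictions [0, 1, 0, 1] against labels [0, 0, 1, 1], i.e.
# tp=fn=tn=fp=1, so Recall=SPE=Precision=F1=Acc=0.5, MCC=0, and AUC=0.75:
# print(scores([0, 0, 1, 1], [0.1, 0.6, 0.4, 0.9]))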


# Column 0 of each CSV is the label; columns 1+ are the 65 features.
train = pd.read_csv("../data/Train-50bp-sameTranscript-Result.csv", sep=",")
trainX, trainY = train.values[:, 1:], train.values[:, 0]

test = pd.read_csv("../data/Independent.csv", sep=",")
testX, testY = test.values[:, 1:], test.values[:, 0]
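
# Optional sanity check on the expected layout (a sketch, commented out):
# print(trainX.shape, testX.shape)   # feature blocks should have 65 columns
# print(np.unique(trainY))           # binary labels expected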




# Fit a default XGBoost model on all 65 features to rank them by importance.
model = XGB(random_state=1)
model.fit(trainX, trainY)
# Feature indices sorted by importance score, most important first.
importance = model.feature_importances_
importance_descend = sorted(range(len(importance)), key=lambda k: importance[k], reverse=True)
print(importance_descend)
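
# To map the ranked indices back to column names (the +1 offset skips the
# label in column 0, mirroring the subset-selection step further below):
# print(train.columns[[j + 1 for j in importance_descend]])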

# summarize feature importance
# for i,v in enumerate(importance):
# 	print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
# plt.bar([x for x in range(len(importance))], importance)
# plt.show()
# plot_importance(model)
# plt.savefig('feature_importance.png',dpi=300)
# plot_importance(model,importance_type='gain')
# plt.show()
# plt.savefig('feature_importance_gain.png',dpi=300)

# start = time.time()

# Random Forest
# param={"n_estimators":[300,400,500],"max_depth":[4,6,8,10,12]}
# clf=get_model(RF(random_state=1, n_jobs = 4),param,trainX,trainY,scoring="roc_auc")
# print(clf)

# SVM
# param={"C":[0.5,1,2,4,6,8],"gamma":[0.001,0.01,0.1,1,10]}
# clf=get_model(SVC(probability=True),param,trainX,trainY,scoring="roc_auc")
# print(clf)

# Multilayer Perceptron
# param={"activation":['identity','logistic','tanh','relu'],"alpha":[0.0001,0.001,0.1]}
# param={"activation":['tanh'],"alpha":[0.35,0.4,0.45]}
# clf=get_model(MLPClassifier(max_iter=500,random_state=1),param,trainX,trainY,scoring="roc_auc")
# print(clf)

# Linear Discriminant Analysis
# param={"solver":['lsqr','eigen'],"shrinkage":[None,'auto',0.5]}  # 'eigen' assumed intended; the original listed 'lsqr' twice
# clf=get_model(LDA(n_components=None),param,trainX,trainY,scoring="roc_auc")
# print(clf)

# GaussianNB
# param={"priors":[None]}
# clf=get_model(GaussianNB(),param,trainX,trainY,scoring="roc_auc")
# print(clf)

# Gradient Boosting Decision Trees
# param={"n_estimators":[50,100,200,300,400],"learning_rate":[0.01,0.02,0.04,0.06,0.08,0.1,0.2,0.4]}
# clf=get_model(GBDT(random_state=1),param,trainX,trainY,scoring="roc_auc")
# print(clf)

# XGBoost
# param={"colsample_bytree":[0.2,0.4,0.6,0.8,1],"learning_rate":[0.004,0.04,0.4],"max_depth":[4,6,8,10,12,20]}#train auc: 0.8354154795821461;test auc: 0.7400324422718398;the grid search has costed 306.0738196372986 seconds.
# # param={"colsample_bytree":[0.2,0.6,1],"learning_rate":[0.004,0.04,0.4],"max_depth":[4,12,20]}#train auc: 0.8354154795821461;test auc: 0.7400324422718398;the grid search has costed 99.7772889137268 seconds.
# param = {"colsample_bytree": [0.2], "learning_rate": [0.04], "max_depth": [4], 'n_estimators': [100]}  # train auc:0.8156054131054132; test auc:0.7903032838625387;the grid search has costed 20.41343331336975 seconds.
# clf = get_model(XGB(n_jobs=-1, random_state=1), param, trainX, trainY, scoring="roc_auc")

# end = time.time()

# print('the grid search has costed', end - start,'seconds.')
# print(clf)

# clf = XGB(colsample_bytree=0.2, learning_rate=0.04, max_depth=4)
# clf = RF(random_state=1, n_jobs = 4)
def train_test_model(clf, trainX, trainY, testX, testY):
    '''
    Scale features to [0, 1], estimate AUC by stratified shuffle-split
    cross-validation on the training set, then refit on the full training
    set and score on the independent test set.
    Returns (mean cross-validation AUC, independent-test AUC).
    '''
    # Note: the scaler is fit on the full training set before cross-validation.
    scaler = MinMaxScaler()
    trainX = scaler.fit_transform(trainX)
    testX = scaler.transform(testX)
    aucs = cross_val_score(clf, trainX, trainY,
                           cv=StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=1),
                           scoring="roc_auc", n_jobs=-1)

    clf.fit(trainX, trainY)

    pTest = clf.predict_proba(testX)[:, 1]
    y_pred = clf.predict(testX)

    score = scores(testY, pTest)
    # print("test auc:", score[6])
    # Write per-sample results: true label, predicted label, predicted probability.
    with open('../results/test1031.txt', 'w') as f:
        for i, j, k in zip(testY, y_pred, pTest):
            f.write(str(i) + '\t' + str(j) + '\t' + str(k) + '\n')

    return aucs.mean(), score[6]

# result_list = []
# with open('../results/feature_subset_result_100.txt','w') as fp:
#     for i in range(len(importance_descend)):
#         feature_subset_index = [j+1 for j in importance_descend[:(i+1)]]
#         # print(feature_subset_index)
#         # print(train.columns[feature_subset_index])
#         trainX,trainY=train[train.columns[feature_subset_index]].values,train.values[:,0]
#         testX, testY = test[test.columns[feature_subset_index]].values, test.values[:, 0]
#         # print(train.columns[feature_subset_index])
#         clf = XGB(colsample_bytree=0.2, learning_rate=0.04, max_depth=4,n_estimators=100,random_state=5)
#         train_auc, test_auc = train_test_model(clf,trainX,trainY,testX,testY)
#         result_list.append([feature_subset_index,train_auc,test_auc])
#         fp.write('\t'.join(map(str,[feature_subset_index,train_auc,test_auc]))+'\n')
# # print(result_list)



# Select the top 10 features (offset by 1 because column 0 is the label).
feature_subset_index = [j + 1 for j in importance_descend[:10]]
print(train.columns[feature_subset_index])
trainX, trainY = train[train.columns[feature_subset_index]].values, train.values[:, 0]
testX, testY = test[test.columns[feature_subset_index]].values, test.values[:, 0]
clf = XGB(colsample_bytree=0.2, learning_rate=0.04, max_depth=4, n_estimators=100, random_state=5)
train_auc, test_auc = train_test_model(clf, trainX, trainY, testX, testY)
print(train_auc, test_auc)  # mean cross-validation AUC, independent-test AUC
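
# Full metric report on the independent set (Recall, SPE, Precision, F1, MCC,
# Acc, AUC, tp, fn, tn, fp), a sketch reusing scores(); train_test_model
# scales internally, so the same scaling is reproduced here:
# scaler = MinMaxScaler().fit(trainX)
# print(scores(testY, clf.predict_proba(scaler.transform(testX))[:, 1]))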

# # save the model to disk
# filename = '../results/finalized_model_top10features1031.pkl'
# with open(filename, 'wb') as f:
#     pickle.dump(clf, f)
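
# # To reload the saved model later (a sketch; assumes the pickle above was written):
# # with open(filename, 'rb') as f:
# #     clf_loaded = pickle.load(f)
# # print(clf_loaded)  # remember: the model was fit on MinMax-scaled features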