{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df = pd.read_csv(\"../input/ptdata/mixdata.csv\") ","metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df.describe()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#df.drop(df.columns[[7]], axis=1, inplace=True)\n#df.drop(df.columns[[8]], axis=1, inplace=True)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"dataset2 = df.drop(columns = ['complexity'])\ndataset2.corrwith(df.complexity).plot.bar(\n figsize = (20, 10), title = \"Correlation with complexitylevel\", fontsize = 20,\n rot = 45, grid = True)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df.describe()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df.drop(df.columns[[9]], axis=1, inplace=True)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df.head()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df[2:18822] ","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import seaborn as sns\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import seaborn as sns\nimport matplotlib.pyplot as plt\nno_id_data = df.copy()\nno_id_data.drop(\"id\", axis = 1, inplace = True)\nsns.heatmap(data = no_id_data.corr(), annot = True)\nplt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\nfrom sklearn.metrics import classification_report\nfrom sklearn.model_selection import train_test_split","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"X = df.iloc[:, :-1].values\ny = df.iloc[:, 
{"cell_type":"code","source":"from sklearn.linear_model import LogisticRegression\n\nclassifier = LogisticRegression()\nclassifier.fit(X_train, y_train)\n\nlog_pred = classifier.predict(X_test)\n\n# Summary of the predictions made by the classifier\nprint(classification_report(y_test, log_pred))\nprint(confusion_matrix(y_test, log_pred))\n\n# Accuracy score\nfrom sklearn.metrics import accuracy_score\nprint('accuracy is', accuracy_score(y_test, log_pred))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from sklearn.metrics import roc_curve, auc\nfrom sklearn.multiclass import OneVsRestClassifier\nfrom sklearn.preprocessing import label_binarize\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\n\nimport numpy as np # linear algebra\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\ndf = pd.read_csv(\"../input/ptdata/mixdata.csv\")\n\nX = df.iloc[:, :1].values  # NOTE: only the first column is used as the feature here\ny = df.iloc[:, 2].values\n\ny = label_binarize(y, classes=[0, 1, 2, 3, 4, 5, 6])\nn_classes = 7\nmy_classes = [\"Plain_English\", \"Very_Easy\", \"Fairly_Difficult\", \"Fairly_Easy\", \"Difficult\", \"Easy\", \"Very_Difficult\"]\n\n# shuffle and split training and test sets\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)\n\n# One-vs-rest logistic regression; classifiers such as LinearSVC can be\n# swapped in here (note that KNN and GaussianNB expose predict_proba\n# rather than decision_function)\nclf = OneVsRestClassifier(LogisticRegression(random_state=0))\ny_score = clf.fit(X_train, y_train).decision_function(X_test)\n\n# Compute ROC curve and ROC area for each class\nfpr = dict()\ntpr = dict()\nroc_auc = dict()\nfor i in range(n_classes):\n    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])\n    roc_auc[i] = auc(fpr[i], tpr[i])\n\n# Plot of a ROC curve for each class\nfor i in range(n_classes):\n    plt.figure()\n    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])\n    plt.plot([0, 1], [0, 1], 'k--')\n    plt.xlim([0.0, 1.0])\n    plt.ylim([0.0, 1.05])\n    plt.xlabel('False Positive Rate')\n    plt.ylabel('True Positive Rate')\n    plt.title('Receiver Operating Characteristic: ' + my_classes[i])\n    plt.legend(loc=\"lower right\")\n    plt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},
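{"cell_type":"markdown","source":"Rather than reading seven separate curves, the per-class results above can be condensed into single numbers. A minimal sketch using the binarized `y_test` and the `y_score` matrix from the previous cell; micro-averaging pools all decisions, while macro-averaging weights the classes equally:","metadata":{}},{"cell_type":"code","source":"# Summarise the one-vs-rest ROC results above as two averages.\nfrom sklearn.metrics import roc_auc_score\n\nprint('micro-average ROC AUC: {:.3f}'.format(roc_auc_score(y_test, y_score, average='micro')))\nprint('macro-average ROC AUC: {:.3f}'.format(roc_auc_score(y_test, y_score, average='macro')))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},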
{"cell_type":"code","source":"#roctry\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\npd.DataFrame({'Data Type': df.dtypes,\n              'Null Value': pd.isnull(df).any(),\n              'Unique Values': [df[column].nunique() for column in df.columns]})","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#roctry\nfrom sklearn.model_selection import train_test_split\n\n# Use the unique labels in a deterministic order (a Python set has no\n# stable ordering, so the category codes would not be reproducible)\nunique_cat = df['Class_Label'].unique()\ndf['Class_Label_'] = pd.Categorical(df['Class_Label'], categories=unique_cat).codes\n\ny = df['Class_Label_']\nX = df.drop(['Class_Label', 'Class_Label_', 'id'], axis=1)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#roctry\nfrom sklearn.decomposition import PCA\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import StandardScaler\n\npca_pipe = Pipeline(steps=[('norm', StandardScaler()), ('pca', PCA(n_components=2))]).fit(X_train)\npc = pca_pipe.transform(X_train)\npc_1, pc_2 = list(zip(*pc))\n\ngroups = pd.DataFrame({'pc_1': pc_1, 'pc_2': pc_2, 'y': y_train}).groupby('y')\n\nplt.figure(figsize=(10, 8))\nfor group, color, name in zip(df['Class_Label_'].unique(),\n                              ['#0080ff', '#ff6600', '#9966ff', '#DFFF00', '#40E0D0', '#000000', '#00FF00'],\n                              df['Class_Label'].unique()):\n    pc_group = groups.get_group(group)\n    plt.scatter(pc_group.loc[:, 'pc_1'], pc_group.loc[:, 'pc_2'], color=color, label=name)\nplt.xlabel('PC 1')\nplt.ylabel('PC 2')\nplt.legend()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#roctry\nfrom sklearn.model_selection import validation_curve\nfrom sklearn.preprocessing import MinMaxScaler, PolynomialFeatures\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import roc_auc_score\n\nclassifier = LogisticRegression()\nclassifier_pipe = Pipeline(steps=[('poly', PolynomialFeatures(degree=2)),\n                                  ('norm', MinMaxScaler()),\n                                  ('classifier', classifier)])\n\nparam_name = 'C'\nparam_range = np.logspace(-4, 1, 50)\n\ntrain_score, test_score = validation_curve(classifier_pipe, X_train, y_train, scoring='accuracy',\n                                           param_name='classifier__{}'.format(param_name),\n                                           param_range=param_range)\n\ntrain_max = list(map(np.max, train_score))\ntrain_min = list(map(np.min, train_score))\ntrain_mean = list(map(np.mean, train_score))\n\ntest_max = list(map(np.max, test_score))\ntest_min = list(map(np.min, test_score))\ntest_mean = list(map(np.mean, test_score))\n\nplt.figure(figsize=(10, 8))\nfor min_val, max_val, mean_val, color, name in zip([train_min, test_min],\n                                                   [train_max, test_max],\n                                                   [train_mean, test_mean],\n                                                   ['blue', 'orange'],\n                                                   ['Train Score', 'Test Score']):\n    plt.plot(param_range, mean_val, color=color, label=name)\n    plt.fill_between(param_range, max_val, min_val, color=color, alpha=0.2)\n\nplt.xlabel('Hyper Parameter ({})'.format(param_name))\nplt.ylabel('Score (Accuracy)')\nplt.legend()\nsns.despine()\n\n# Fit once, then score each class one-vs-rest; the class names follow the\n# order of unique_cat, which defined the category codes above\nclassifier_pipe.fit(X_train, y_train)\nprobas = classifier_pipe.predict_proba(X_test)\n\ndef multi_class_roc(class_code):\n    return roc_auc_score(np.where(y_test == class_code, 1, 0), probas[:, class_code])\n\nclass_aucs = [multi_class_roc(code) for code in range(len(unique_cat))]\nfor name, score in zip(unique_cat, class_aucs):\n    print('roc auc score for {} = {}'.format(name, score))\nprint('average roc auc score = {}'.format(np.mean(class_aucs)))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},
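{"cell_type":"markdown","source":"The validation curve above suggests a useful range for `C`; `GridSearchCV` automates picking the best value from it. A minimal sketch over the same pipeline and parameter range (`cv=5` and accuracy scoring are assumptions, not tuned choices):","metadata":{}},{"cell_type":"code","source":"# Pick C by cross-validated grid search over the range plotted above\n# (a sketch; cv=5 is an assumption).\nfrom sklearn.model_selection import GridSearchCV\n\ngrid = GridSearchCV(classifier_pipe,\n                    param_grid={'classifier__C': param_range},\n                    scoring='accuracy', cv=5)\ngrid.fit(X_train, y_train)\nprint('best C = {:.4f}, CV accuracy = {:.3f}'.format(\n    grid.best_params_['classifier__C'], grid.best_score_))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},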
{"cell_type":"code","source":"#try\nfrom sklearn.linear_model import LogisticRegression\n\nclassifier = LogisticRegression()\nclassifier.fit(X_train, y_train)\n\nlog_pred = classifier.predict(X_test)\n\n# Summary of the predictions made by the classifier\nprint(classification_report(y_test, log_pred))\n\n# Accuracy score\nfrom sklearn.metrics import accuracy_score\nprint('accuracy is', accuracy_score(y_test, log_pred))\n\ncm = confusion_matrix(y_test, log_pred)\nprint('Confusion Matrix')\nprint(cm)\ndisp = ConfusionMatrixDisplay(confusion_matrix=cm)\ndisp.plot(cmap=plt.cm.Blues, values_format='g')\nplt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# K-Nearest Neighbours\nfrom sklearn.neighbors import KNeighborsClassifier\n\nclassifier = KNeighborsClassifier(n_neighbors=8)\nclassifier.fit(X_train, y_train)\n\nknn_pred = classifier.predict(X_test)\n\n# Summary of the predictions made by the classifier\nprint(classification_report(y_test, knn_pred))\nprint(confusion_matrix(y_test, knn_pred))\n\n# Accuracy score\nfrom sklearn.metrics import accuracy_score\nprint('accuracy is', accuracy_score(y_test, knn_pred))","metadata":{"collapsed":true,"jupyter":{"outputs_hidden":true},"trusted":true},"execution_count":null,"outputs":[]},
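{"cell_type":"markdown","source":"The choice of `n_neighbors=8` above is arbitrary. A quick cross-validated sweep shows how accuracy varies with k; a minimal sketch (the odd-k range and `cv=5` are assumptions):","metadata":{}},{"cell_type":"code","source":"# Sweep k and report mean 5-fold CV accuracy for each value (a sketch).\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.neighbors import KNeighborsClassifier\n\nfor k in range(1, 16, 2):\n    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k),\n                             X_train, y_train, cv=5, scoring='accuracy')\n    print('k={:2d}: mean CV accuracy = {:.3f}'.format(k, scores.mean()))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},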
{"cell_type":"code","source":"from sklearn.metrics import roc_curve, auc\nfrom sklearn.multiclass import OneVsRestClassifier\nfrom sklearn.preprocessing import label_binarize\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.metrics import accuracy_score\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\ndf = pd.read_csv(\"../input/ptdata/mixdata.csv\")\n\nX = df.iloc[:, :1].values  # NOTE: only the first column is used as the feature here\ny = df.iloc[:, 2].values\n\ny = label_binarize(y, classes=[0, 1, 2, 3, 4, 5, 6])\nn_classes = 7\nmy_classes = [\"Plain_English\", \"Very_Easy\", \"Fairly_Difficult\", \"Fairly_Easy\", \"Difficult\", \"Easy\", \"Very_Difficult\"]\n\n# shuffle and split training and test sets\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)\n\n# one-vs-rest KNN; unlike LinearSVC it has no decision_function, so the\n# ROC curves below use predict_proba instead\nknn_9 = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=9))\nknn_9.fit(X_train, y_train)\n\n# predict on the test set\ny_pred_9 = knn_9.predict(X_test)\nprint('Model accuracy score with k=9 : {0:0.4f}'.format(accuracy_score(y_test, y_pred_9)))\n\ny_pred_train = knn_9.predict(X_train)\nprint('Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))\n\ny_scores = knn_9.predict_proba(X_test)\n\n# Compute ROC curve and ROC area for each class\nfpr = dict()\ntpr = dict()\nroc_auc = dict()\nfor i in range(n_classes):\n    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_scores[:, i])\n    roc_auc[i] = auc(fpr[i], tpr[i])\n\n# Plot of a ROC curve for each class\nfor i in range(n_classes):\n    plt.figure()\n    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])\n    plt.plot([0, 1], [0, 1], 'k--')\n    plt.xlim([0.0, 1.0])\n    plt.ylim([0.0, 1.05])\n    plt.xlabel('False Positive Rate')\n    plt.ylabel('True Positive Rate')\n    plt.title('Receiver Operating Characteristic: ' + my_classes[i])\n    plt.legend(loc=\"lower right\")\n    plt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Support Vector Machine\nfrom sklearn.svm import SVC\n\nclassifier = SVC()\nclassifier.fit(X_train, y_train)\n\nsvc_pred = classifier.predict(X_test)\n\n# Summary of the predictions made by the classifier\nprint(classification_report(y_test, svc_pred))\nprint(confusion_matrix(y_test, svc_pred))\n\n# Accuracy score\nfrom sklearn.metrics import accuracy_score\nprint('accuracy is', accuracy_score(y_test, svc_pred))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from sklearn.metrics import roc_curve, auc\nfrom sklearn.multiclass import OneVsRestClassifier\nfrom sklearn.svm import LinearSVC\nfrom sklearn.preprocessing import label_binarize\nfrom sklearn.model_selection import train_test_split\n\nimport numpy as np # linear algebra\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\ndf = pd.read_csv(\"../input/ptdata/mixdata.csv\")\n\nX = df.iloc[:, :1].values  # NOTE: only the first column is used as the feature here\ny = df.iloc[:, 2].values\n\ny = label_binarize(y, classes=[0, 1, 2, 3, 4, 5, 6])\nn_classes = 7\nmy_classes = [\"Plain_English\", \"Very_Easy\", \"Fairly_Difficult\", \"Fairly_Easy\", \"Difficult\", \"Easy\", \"Very_Difficult\"]\n\n# shuffle and split training and test sets\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)\n\n# classifier\nclf = OneVsRestClassifier(LinearSVC(random_state=0))\ny_score = clf.fit(X_train, y_train).decision_function(X_test)\n\n# Compute ROC curve and ROC area for each class\nfpr = dict()\ntpr = dict()\nroc_auc = dict()\nfor i in range(n_classes):\n    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])\n    roc_auc[i] = auc(fpr[i], tpr[i])\n\n# Plot of a ROC curve for each class\nfor i in range(n_classes):\n    plt.figure()\n    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])\n    plt.plot([0, 1], [0, 1], 'k--')\n    plt.xlim([0.0, 1.0])\n    plt.ylim([0.0, 1.05])\n    plt.xlabel('False Positive Rate')\n    plt.ylabel('True Positive Rate')\n    plt.title('Receiver Operating Characteristic: ' + my_classes[i])\n    plt.legend(loc=\"lower right\")\n    plt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Decision Tree\nfrom sklearn.tree import DecisionTreeClassifier\n\nclassifier = DecisionTreeClassifier()\nclassifier.fit(X_train, y_train)\n\ny_pred = classifier.predict(X_test)\n\n# Summary of the predictions made by the classifier\nprint(classification_report(y_test, y_pred))\nprint(confusion_matrix(y_test, y_pred))\n\n# Accuracy score\nfrom sklearn.metrics import accuracy_score\nprint('accuracy is', accuracy_score(y_test, y_pred))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},
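{"cell_type":"markdown","source":"A fitted decision tree exposes `feature_importances_`, which indicates which features drive the splits. A minimal sketch using the tree fitted in the cell above; features are reported by column index, since how `X` was built varies between cells:","metadata":{}},{"cell_type":"code","source":"# Rank the features used by the decision tree above (a sketch).\nimport numpy as np\n\nimportances = classifier.feature_importances_\nfor idx in np.argsort(importances)[::-1]:\n    print('feature {:d}: importance = {:.3f}'.format(idx, importances[idx]))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},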
{"cell_type":"code","source":"# Gaussian Naive Bayes\nfrom sklearn.naive_bayes import GaussianNB\n\nclassifier = GaussianNB()\nclassifier.fit(X_train, y_train)\n\nnv_pred = classifier.predict(X_test)\n\n# Summary of the predictions made by the classifier\nprint(classification_report(y_test, nv_pred))\n\n# Accuracy score\nfrom sklearn.metrics import accuracy_score\nprint('accuracy is', accuracy_score(y_test, nv_pred))\n\ncm = confusion_matrix(y_test, nv_pred)\nprint('Confusion Matrix')\nprint(cm)\ndisp = ConfusionMatrixDisplay(confusion_matrix=cm)\ndisp.plot(cmap=plt.cm.Blues, values_format='g')\nplt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Random Forest\nfrom sklearn.ensemble import RandomForestClassifier\n\nclf = RandomForestClassifier(n_estimators=100)\nclf.fit(X_train, y_train)\npred = clf.predict(X_test)\n\nfrom sklearn.metrics import classification_report, confusion_matrix\nprint(confusion_matrix(y_test, pred))\nprint(classification_report(y_test, pred))\n\nfrom sklearn.metrics import accuracy_score as asc\nprint(asc(y_test, pred))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#tryyy\nfrom sklearn.metrics import accuracy_score, confusion_matrix\n\nmatrix_1 = confusion_matrix(y_test, log_pred)\nmatrix_2 = confusion_matrix(y_test, svc_pred)\nmatrix_3 = confusion_matrix(y_test, knn_pred)\nmatrix_4 = confusion_matrix(y_test, nv_pred)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#try\nlabels = ['Plain_English', 'Very_Easy', 'Fairly_Difficult', 'Fairly_Easy', 'Difficult', 'Easy', 'Very_Difficult']\n\ndf_1, df_2, df_3, df_4 = (pd.DataFrame(m, index=labels, columns=labels)\n                          for m in (matrix_1, matrix_2, matrix_3, matrix_4))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},
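{"cell_type":"markdown","source":"The labelled DataFrames built above are convenient for seaborn heatmaps; normalising each row turns raw counts into per-class recall. A minimal sketch for the logistic-regression matrix `df_1`:","metadata":{}},{"cell_type":"code","source":"# Row-normalise the logistic-regression confusion matrix so each row sums\n# to 1 (i.e. per-class recall), then plot it with the labels from df_1.\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nnormed = df_1.div(df_1.sum(axis=1), axis=0)\nplt.figure(figsize=(10, 8))\nsns.heatmap(normed, annot=True, fmt='.2f', cmap='Blues')\nplt.title('Row-normalised confusion matrix (logistic regression)')\nplt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},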
{"cell_type":"code","source":"#try\ntarget_names = ['Plain_English', 'Very_Easy', 'Fairly_Difficult', 'Fairly_Easy', 'Difficult', 'Easy', 'Very_Difficult']\n\nfor pred, name in [(log_pred, 'logistic regression'),\n                   (svc_pred, 'support vector machine'),\n                   (knn_pred, 'k-nearest neighbours'),\n                   (nv_pred, 'naive Bayes')]:\n    cm = confusion_matrix(y_test, pred)\n    print('Confusion Matrix')\n    print(cm)\n    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)\n    disp.plot(cmap=plt.cm.Blues, values_format='g')\n    plt.title('confusion_matrix ({})'.format(name))\n    plt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"X = df.iloc[:, 1:-1].values\ny = df.iloc[:, -1].values","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from sklearn.preprocessing import LabelEncoder\nfrom keras.utils import to_categorical\n\nlabel_encoder_y = LabelEncoder()\ny = label_encoder_y.fit_transform(y)\ny = to_categorical(y)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from sklearn.preprocessing import StandardScaler\n\n# NOTE: fitting the scaler on the full dataset before splitting leaks test\n# statistics into training; fitting on X_train only would be stricter\nstandard_scaler = StandardScaler()\nX = standard_scaler.fit_transform(X)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from sklearn.model_selection import train_test_split\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from keras.models import Sequential\nfrom keras.layers import Dense, Dropout\n\nmodel = Sequential()\nmodel.add(Dense(64, input_dim=8, activation='relu'))\n#model.add(Dense(64, activation='relu'))\n#model.add(Dense(128, activation='relu'))\n#model.add(Dense(64, activation='relu'))\nmodel.add(Dense(64, activation='relu'))\nmodel.add(Dropout(0.4))\n#model.add(Dense(16, activation='relu'))\nmodel.add(Dense(7, activation='softmax'))\nmodel.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\nmodel.summary()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"model.fit(X_train, y_train, batch_size=64, epochs=10, verbose=0)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"model.fit(X_train, y_train, epochs=20)\naccuracy = model.evaluate(X_test, y_test)[1]\nprint(\"Accuracy: {}\".format(accuracy))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},
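{"cell_type":"markdown","source":"Training for a fixed 10 + 20 epochs risks over- or under-fitting. Keras' `EarlyStopping` callback stops when validation loss stalls; a minimal sketch (the `patience` value and 20% validation split are assumptions, not tuned settings):","metadata":{}},{"cell_type":"code","source":"# Stop training when validation loss stops improving (a sketch; patience\n# and validation_split are assumptions).\nfrom keras.callbacks import EarlyStopping\n\nearly_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)\nhistory = model.fit(X_train, y_train, epochs=50, batch_size=64,\n                    validation_split=0.2, callbacks=[early_stop], verbose=0)\nprint('stopped after {} epochs'.format(len(history.history['loss'])))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},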
{"cell_type":"code","source":"#tryroc\nfrom sklearn.metrics import roc_curve, auc\n\n# model.predict already returns class probabilities for a softmax output\ny_scores = model.predict(X_test)\n\n# Compute ROC curve and ROC area for each class\nn_classes = 7\nmy_classes = [\"Plain_English\", \"Very_Easy\", \"Fairly_Difficult\", \"Fairly_Easy\", \"Difficult\", \"Easy\", \"Very_Difficult\"]\nfpr = dict()\ntpr = dict()\nroc_auc = dict()\nfor i in range(n_classes):\n    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_scores[:, i])\n    roc_auc[i] = auc(fpr[i], tpr[i])\n\n# Plot of a ROC curve for each class\nfor i in range(n_classes):\n    plt.figure()\n    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])\n    plt.plot([0, 1], [0, 1], 'k--')\n    plt.xlim([0.0, 1.0])\n    plt.ylim([0.0, 1.05])\n    plt.xlabel('False Positive Rate')\n    plt.ylabel('True Positive Rate')\n    plt.title('Receiver Operating Characteristic: ' + my_classes[i])\n    plt.legend(loc=\"lower right\")\n    plt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import sklearn.metrics as metrics\n\ncm = metrics.confusion_matrix(y_test.argmax(axis=1), y_scores.argmax(axis=1))\nprint('Confusion Matrix')\nprint(cm)\ndisp = ConfusionMatrixDisplay(confusion_matrix=cm)\ndisp.plot(cmap=plt.cm.Blues, values_format='g')\nplt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(classification_report(y_test.argmax(axis=1), y_scores.argmax(axis=1)))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# scores for the precision-recall analysis below\nfrom sklearn.metrics import fbeta_score\nfrom sklearn.metrics import precision_score\nfrom sklearn.metrics import recall_score\n\ny_score = model.predict(X_test)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from keras.models import load_model\n\n# Load the saved model and predict the accuracy\n#ann_model = load_model('ANN_Traffic_Model.h5')\nmodel.summary()\nresults = model.evaluate(X_test, y_test)\nprint(\"Accuracy of the Model %.2f%%\" % (results[1] * 100))\n\nfrom sklearn.metrics import classification_report\n\ny_pred = model.predict(X_test, batch_size=64, verbose=1)\ny_pred_bool = np.argmax(y_pred, axis=1)\n\n# classification_report expects integer labels on both sides, so the\n# one-hot y_test is collapsed with argmax as well\nprint(classification_report(y_test.argmax(axis=1), y_pred_bool))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},
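{"cell_type":"markdown","source":"The commented-out `load_model` call above suggests persisting the network. Saving to HDF5 and reloading round-trips the architecture and weights; a minimal sketch (the filename is an assumption):","metadata":{}},{"cell_type":"code","source":"# Persist the trained network and check the reloaded copy scores the same\n# (a sketch; the .h5 filename is an assumption).\nfrom keras.models import load_model\n\nmodel.save('readability_ann.h5')\nrestored = load_model('readability_ann.h5')\nprint(\"Restored model accuracy %.2f%%\" % (restored.evaluate(X_test, y_test)[1] * 100))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},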
recall curve\")\nplt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from sklearn.metrics import precision_recall_curve\nfrom sklearn.metrics import average_precision_score\n\n\n#print(y_score[:, 0])\n\n\n# For each class\nprecision = dict()\nrecall = dict()\naverage_precision = dict()\nfor i in range(7):\n print(i)\n precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_score[:, i])\n average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i])\n\n# A \"micro-average\": quantifying score on all classes jointly\nprecision[\"micro\"], recall[\"micro\"], _ = precision_recall_curve(y_test.ravel(),\n y_score.ravel())\naverage_precision[\"micro\"] = average_precision_score(y_test, y_score,\n average=\"micro\")\nprint('Average precision score, micro-averaged over all classes: {0:0.2f}'\n .format(average_precision[\"micro\"]))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from itertools import cycle\n# setup plot details\ncolors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])\n\nplt.figure(figsize=(7, 8))\nf_scores = np.linspace(0.2, 0.8, num=4)\nlines = []\nlabels = []\nfor f_score in f_scores:\n x = np.linspace(0.01, 1)\n y = f_score * x / (2 * x - f_score)\n l, = plt.plot(x[y >= 0], y[y >= 0], color='gray', alpha=0.2)\n plt.annotate('f1={0:0.1f}'.format(f_score), xy=(0.9, y[45] + 0.02))\n\nlines.append(l)\nlabels.append('iso-f1 curves')\nl, = plt.plot(recall[\"micro\"], precision[\"micro\"], color='gold', lw=2)\nlines.append(l)\nlabels.append('micro-average Precision-recall (area = {0:0.2f})'\n ''.format(average_precision[\"micro\"]))\n\nfor i, color in zip(range(7), colors):\n l, = plt.plot(recall[i], precision[i], color=color, lw=2)\n lines.append(l)\n labels.append('Precision-recall for class {0} (area = {1:0.2f})'\n ''.format(i, average_precision[i]))\n\nfig = plt.gcf()\nfig.subplots_adjust(bottom=0.25)\nplt.xlim([0.0, 1.0])\nplt.ylim([0.0, 1.05])\nplt.xlabel('Recall')\nplt.ylabel('Precision')\nplt.title('Extension of Precision-Recall curve to multi-class')\nplt.legend(lines, labels, loc=(0, -.38), prop=dict(size=14))\n\n\nplt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]}