{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "f5451a89-b314-4647-a4af-faf2d38e9968", "metadata": {}, "outputs": [], "source": [ "#DATA \n", "\n", "#Ferenc et al’s Datase http://www.inf.u-szeged.hu/~ferenc/papers/JSVulnerabilityDataSet/\n", "#Viszkok et al’s Dataset: https://security.snyk.io/\n", "#Apache Tomcat Dataset: https://github.com/palmafr/MDPIData2022/tree/main/datasets\n", "\n", "#Types of Vulnerabilities (Multi-Classes) Dataset ganesh2021predicting" ] }, { "cell_type": "code", "execution_count": null, "id": "40e7fae4-1de7-40cf-9d7e-ac9cfe618fab", "metadata": {}, "outputs": [], "source": [ "#CODE\n", "#Machine learning for Binary classes\n", " # compare ensemble to each baseline classifier\n", "from numpy import mean\n", "from numpy import std\n", "from sklearn.datasets import make_classification\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.model_selection import RepeatedStratifiedKFold\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.svm import SVC\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.ensemble import StackingClassifier\n", "from matplotlib import pyplot\n", "import pandas as pd\n", "from sklearn import svm\n", "from sklearn.model_selection import GridSearchCV\n", "import os\n", "import matplotlib.pyplot as plt\n", "#from skimage.transform import resize\n", "import imread\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import classification_report,accuracy_score,\n", "confusion_matrix\n", "import pickle\n", "# loading library\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.ensemble import BaggingClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB\n", "from sklearn import svm\n", "from imread import imread, imsave\n", "from PIL import Image\n", " import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import MinMaxScaler\n", "from keras.models import Model\n", "from keras.layers import Input\n", "import seaborn as sns\n", "from keras.layers.core import Activation, Dropout, Dense\n", "from sklearn import preprocessing\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import classification_report\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "from sklearn.model_selection import train_test_split\n", "import sys\n", "import os\n", "from math import log\n", "import scipy as sp\n", "from imblearn.over_sampling import RandomOverSampler, SMOTE\n", "from imblearn.under_sampling import RandomUnderSampler\n", "\n", "import matplotlib.pyplot as plt\n", "from keras.optimizers import SGD\n", "import tensorflow as tf \n", "from keras.models import Sequential\n", "from keras.layers import Dropout, Dense, Conv1D, Flatten, MaxPooling1D\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.datasets import load_iris\n", "from numpy import unique\n", "from keras.layers import Dense, Input, LSTM, Dropout, SimpleRNN, Embedding,\n", "Reshape\n", "from sklearn.metrics import confusion_matrix\n", "from collections import Counter\n", "from xgboost import XGBClassifier\n", "\n", "\n", "df = pd.read_csv('/Users/abdullah/Desktop/Folder/Research/JSVulnerabilityDataSet-1.0.csv')\n", "#df = pd.read_csv('/Users/abdullah/Desktop/Folder/Research/MDPIData2022-main/datasets/tomcat-final.csv')\n", "#df = pd.read_csv('/Users/abdullah/Desktop/Folder/Research/MDPIData2022-main/datasets/struts-final.csv')\n", "\n", "\n", "\n", "import matplotlib.pyplot as plt\n", "df.columns\n", "\n", "\n", "sns.countplot(x='Vuln', data=df)\n", "\n", "X = df[[ 'CC', 'CCL', 'CCO', 'CI', 'CLC', 'CLLC', 'McCC',\n", " 'NL', 'NLE', 'CD', 'CLOC', 'DLOC', 'TCD', 'TCLOC',\n", " 'LLOC', 'LOC', 'NOS','NUMPAR', 'TLLOC', 'TLOC', \n", " 'TNOS', 'HOR_D', 'HOR_T', 'HON_D', 'HON_T',\n", " 'HLEN', 'HVOC', 'HDIFF', 'HVOL', 'HEFF', \n", " 'HBUGS', 'HTIME', 'CYCL', 'PARAMS', \n", " 'CYCL_DENS']].values\n", "\n", "\n", "\n", "#y = df['Vuln']\n", "y=df['Vuln']\n", "# label_encoder object knows how to understand word labels.\n", "label_encoder = preprocessing.LabelEncoder()\n", "\n", "# Encode labels in column 'species'.\n", "y = label_encoder.fit_transform(y)\n", "\n", "counter = Counter(y)\n", "print(counter)\n", "\n", "#ros = RandomOverSampler(sampling_strategy={ 0: 10629,1: 10629},random_state=42) # String\n", "#X, y = ros.fit_resample(X, y)\n", "\n", "#rus = RandomUnderSampler(sampling_strategy={ 0: 1496,1: 1496},random_state=42) # String\n", "#X, y = rus.fit_resample(X, y)\n", "\n", "#smote = SMOTE() #SMOTE(\"minority\")\n", "#X, y= smote.fit_resample(X, y)\n", "\n", "\n", "\n", "counter = Counter(y)\n", "print(counter)\n", "\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X,y, \n", "test_size=0.33,shuffle=True, random_state=42, stratify=y)\n", "X_test1=X_test\n", "X_train1=X_train\n", "y_test1=y_test\n", "y_train1=y_train\n", "\n", "\n", "from tensorflow.keras.utils import to_categorical\n", "y_train = to_categorical(y_train)\n", "y_test = to_categorical(y_test)\n", "\n", "\n", "\n", "from sklearn.preprocessing import MinMaxScaler\n", "scaler = MinMaxScaler()\n", "scaler.fit(X_train)\n", "\n", "Xtrain_scaled = scaler.transform(X_train)\n", "Xtest_scaled = scaler.transform(X_test)\n", "\n", "Xtrain_scaled.shape\n", "\n", "\n", "\n", "#X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.15,\n", "random_state=42,stratify=y)\n", "print('Splitted Successfully')\n", "\n", "\n", "# get the dataset\n", "#def get_dataset():\n", "#\tX, y = make_classification(n_samples=1000, n_features=20, \n", "n_informative=15, n_redundant=5, random_state=1)\n", "#\treturn X, y\n", " \n", "# get a stacking ensemble of models\n", "def get_stacking():\n", "\t# define the base models\n", "\tlevel0 = list()\n", "\tlevel0.append(('cart', DecisionTreeClassifier()))\n", "\tlevel0.append(('lr', LogisticRegression(solver='liblinear', \n", " max_iter=1000)))\n", "\tlevel0.append(('bayes', GaussianNB()))\n", "\tlevel0.append(('xgboost', XGBClassifier()))\n", "\tlevel0.append(('bagging', BaggingClassifier()))\n", "\tlevel0.append(('RF', RandomForestClassifier()))\n", "\tlevel0.append(('knn', KNeighborsClassifier()))\n", "\tlevel0.append(('svm', SVC()))\n", "\n", "\t# define meta learner model\n", "\tlevel1 = LogisticRegression(solver='liblinear',max_iter=1000)\n", "\t# define the stacking ensemble\n", "\tmodel = StackingClassifier(estimators=level0, final_estimator=\n", " level1, cv=10)\n", "\treturn model\n", " \n", "# get a list of models to evaluate\n", "def get_models():\n", "\tmodels = dict()\n", "\tmodels['cart'] = DecisionTreeClassifier()\n", "\tmodels['lr'] = LogisticRegression(solver='liblinear',\n", " max_iter=1000)\n", "\tmodels['bayes'] = GaussianNB()\n", "\tmodels['xgboost'] = XGBClassifier()\n", "\tmodels['bagging']=BaggingClassifier()\n", "\tmodels['rf']=RandomForestClassifier()\n", "\tmodels['knn'] = KNeighborsClassifier()\n", "\tmodels['svm'] = SVC()\n", "\tmodels['stacking'] = get_stacking()\n", "\n", "\n", "\treturn models\n", " \n", "from sklearn.metrics import classification_report, accuracy_score, \n", "make_scorer\n", "\n", "# evaluate a give model using cross-validation\n", "def evaluate_model(model, X, y,name):\n", "\tcv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, \n", " random_state=1)\n", "\tprint(\"=================================\"+name+\"==============================================\")\n", "\t#scores = cross_val_score(model, X, y, \n", " scoring=make_scorer(classification_report_with_accuracy_score)\n", " , cv=cv, n_jobs=-1, error_score='raise')\n", "\tscores = cross_val_score(model, X, y, scoring=\"f1\", cv=cv, \n", " n_jobs=-1, error_score='raise')\n", "\treturn scores\n", "\n", "\n", "# get the models to evaluate\n", "models = get_models()\n", "# evaluate the models and store results\n", "results, names = list(), list()\n", "for name, model in models.items():\n", "\t#scores = evaluate_model(model, X, y,name)\n", "\t#results.append(scores)\n", "\t#names.append(name)\n", "\tprint(\"=================================\"+name+\"==============================================\")\n", "\t#print(name)\n", "\tcv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, \n", " random_state=42)\n", "\t#cvs1 = cross_val_score(model, X, y, scoring=\n", " make_scorer(classification_report_with_accuracy_score),\n", " cv=cv, n_jobs=-1, error_score='raise').mean()\n", "\tfor score in [\"roc_auc\", \"f1\", \"precision\", \"recall\", \"accuracy\"]:\n", "\t\tcvs = cross_val_score(model, X, y, scoring=score, cv=cv,\n", " n_jobs=-1, error_score='raise').mean()\n", "\t\tprint(score + \" : \"+ str(cvs))\n", "\t#print(cvs1) \n", "\tprint('\\n')\n", "pyplot.boxplot(results, labels=names, showmeans=True)\n", "pyplot.show()\n", "\\end{lstlisting}\n", "}\n" ] }, { "cell_type": "code", "execution_count": null, "id": "94b77efa-a205-4b49-bb11-64ee390d8a33", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "c73f1b02-5be5-4d84-b746-68f52ed94257", "metadata": {}, "outputs": [], "source": [ "#CODE\n", "\n", "#Machine learning for Multi-classes\n", "# compare ensemble to each baseline classifier\n", "from numpy import mean\n", "from numpy import std\n", "from sklearn.datasets import make_classification\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.model_selection import RepeatedStratifiedKFold\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.svm import SVC\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.ensemble import StackingClassifier\n", "from matplotlib import pyplot\n", "import pandas as pd\n", "from sklearn import svm\n", "from sklearn.model_selection import GridSearchCV\n", "import os\n", "import matplotlib.pyplot as plt\n", "#from skimage.transform import resize\n", "import imread\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import \n", "classification_report,accuracy_score,confusion_matrix\n", "import pickle\n", "# loading library\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.ensemble import BaggingClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB\n", "from sklearn import svm\n", "from imread import imread, imsave\n", "from PIL import Image\n", " import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import MinMaxScaler\n", "from keras.models import Model\n", "from keras.layers import Input\n", "import seaborn as sns\n", "from keras.layers.core import Activation, Dropout, Dense\n", "from sklearn import preprocessing\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import classification_report\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "from sklearn.model_selection import train_test_split\n", "import sys\n", "import os\n", "from math import log\n", "import scipy as sp\n", "from imblearn.over_sampling import RandomOverSampler, SMOTE\n", "from imblearn.under_sampling import RandomUnderSampler\n", "\n", "import matplotlib.pyplot as plt\n", "from keras.optimizers import SGD\n", "import tensorflow as tf \n", "from keras.models import Sequential\n", "from keras.layers import Dropout, Dense, Conv1D, Flatten, MaxPooling1D\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.datasets import load_iris\n", "from numpy import unique\n", "from keras.layers import Dense, Input, LSTM, Dropout, SimpleRNN, Embedding, Reshape\n", "from sklearn.metrics import confusion_matrix\n", "from collections import Counter\n", "from xgboost import XGBClassifier\n", "import matplotlib.pyplot as plt\n", "\n", "df =pd.read_csv('/Users/abdullah/Desktop/Folder/Research/Vulnerability_Research/Vul_Datasets/Code_Metrics_Datasets/Multi-Classes/PU_Dataset.csv')\n", "df1 =pd.read_csv('/Users/abdullah/Desktop/Folder/Research/Vulnerability_Research/Vul_Datasets/Code_Metrics_Datasets/Multi-Classes/cwe119_GCDFile.csv')\n", "df2 =pd.read_csv('/Users/abdullah/Desktop/Folder/Research/Vulnerability_Research/Vul_Datasets/Code_Metrics_Datasets/Multi-Classes/cwe399_cgd.csv')\n", "df3 =pd.read_csv('/Users/abdullah/Desktop/Folder/Research/Vulnerability_Research/Vul_Datasets/Code_Metrics_Datasets/Multi-Classes/API_function_call.csv')\n", "df4 =pd.read_csv('/Users/abdullah/Desktop/Folder/Research/Vulnerability_Research/Vul_Datasets/Code_Metrics_Datasets/Multi-Classes/Array_usage.csv')\n", "df5 =pd.read_csv('/Users/abdullah/Desktop/Folder/Research/Vulnerability_Research/Vul_Datasets/Code_Metrics_Datasets/Multi-Classes/Arithmetic_expression.csv')\n", "\n", "import matplotlib.pyplot as plt\n", "df.columns\n", "\n", "\n", "sns.countplot(x='IsVulnerable', data=df)\n", "\n", "X = df[['Lines_Of_Program','Physic_Lines',\n", " 'n1_Number_Of_Distinct_Operators',\n", " 'n2_Number_Of_Distinct_Operands','n_Program_Vocabulary',\n", " 'N1_Total_Number_Of_Operators','N2_Total_Number_Of_Operands',\n", " 'N_Program_Length','B_Number_of_Delivered_Bugs_1',\n", " 'B_Number_of_Delivered_Bugs_2', \n", " 'D_Difficulty','E_Effort','T_Time_Required_To_Program',\n", " 'V_Volume','_N_Calculated_Program_Length',\n", " 'McCab_Number']].values\n", "\n", "\n", "y= df['IsVulnerable'].values\n", "\n", "\n", "\n", "print(len(y))\n", "\n", "\n", "\n", "\n", "import matplotlib.pyplot as plt1\n", "df1.columns\n", "\n", "\n", "sns.countplot(x='IsVulnerable', data=df1)\n", "\n", "X1 = df1[['Lines_Of_Program','Physic_Lines',\n", " 'n1_Number_Of_Distinct_Operators',\n", " 'n2_Number_Of_Distinct_Operands','n_Program_Vocabulary',\n", " 'N1_Total_Number_Of_Operators','N2_Total_Number_Of_Operands',\n", " 'N_Program_Length','B_Number_of_Delivered_Bugs_1',\n", " 'B_Number_of_Delivered_Bugs_2','D_Difficulty','E_Effort',\n", " 'T_Time_Required_To_Program','V_Volume',\n", " '_N_Calculated_Program_Length',\n", " 'McCab_Number']].values\n", " \n", "y1= df1['IsVulnerable'].values\n", "\n", "\n", "print(len(y1))\n", "\n", "\n", "X2 = df2[['Lines_Of_Program','Physic_Lines',\n", " 'n1_Number_Of_Distinct_Operators',\n", " 'n2_Number_Of_Distinct_Operands','n_Program_Vocabulary',\n", " 'N1_Total_Number_Of_Operators','N2_Total_Number_Of_Operands',\n", " 'N_Program_Length','B_Number_of_Delivered_Bugs_1',\n", " 'B_Number_of_Delivered_Bugs_2','D_Difficulty',\n", " 'E_Effort','T_Time_Required_To_Program','V_Volume',\n", " '_N_Calculated_Program_Length',\n", " 'McCab_Number']].values\n", "\n", "\n", "y2= df2['IsVulnerable'].values\n", "\n", "\n", "\n", "\n", "\n", "\n", "print(len(y2))\n", "\n", "\n", "\n", "X3 = df3[['Lines_Of_Program','Physic_Lines',\n", " 'n1_Number_Of_Distinct_Operators','n2_Number_Of_Distinct_Operands',\n", " 'n_Program_Vocabulary','N1_Total_Number_Of_Operators',\n", " 'N2_Total_Number_Of_Operands','N_Program_Length',\n", " 'B_Number_of_Delivered_Bugs_1','B_Number_of_Delivered_Bugs_2',\n", " 'D_Difficulty','E_Effort','T_Time_Required_To_Program','V_Volume',\n", " '_N_Calculated_Program_Length','McCab_Number']].values\n", "\n", "y3= df3['IsVulnerable'].values\n", "\n", "print(len(y3))\n", "\n", "\n", "X4 = df4[['Lines_Of_Program','Physic_Lines',\n", " 'n1_Number_Of_Distinct_Operators',\n", " 'n2_Number_Of_Distinct_Operands','n_Program_Vocabulary',\n", " 'N1_Total_Number_Of_Operators','N2_Total_Number_Of_Operands',\n", " 'N_Program_Length','B_Number_of_Delivered_Bugs_1',\n", " 'B_Number_of_Delivered_Bugs_2','D_Difficulty',\n", " 'E_Effort','T_Time_Required_To_Program','V_Volume',\n", " '_N_Calculated_Program_Length','McCab_Number']].values\n", "\n", "\n", "y4= df4['IsVulnerable'].values\n", "\n", "X5 = df5[['Lines_Of_Program','Physic_Lines',\n", " 'n1_Number_Of_Distinct_Operators',\n", " 'n2_Number_Of_Distinct_Operands',\n", " 'n_Program_Vocabulary','N1_Total_Number_Of_Operators',\n", " 'N2_Total_Number_Of_Operands','N_Program_Length',\n", " 'B_Number_of_Delivered_Bugs_1','B_Number_of_Delivered_Bugs_2',\n", " 'D_Difficulty','E_Effort','T_Time_Required_To_Program',\n", " 'V_Volume','_N_Calculated_Program_Length',\n", " 'McCab_Number']].values\n", "\n", "\n", "y5= df5['IsVulnerable'].values\n", "\n", "\n", "print(len(y5))\n", "\n", "\n", "import numpy as np\n", "X=np.concatenate((X,X1,X2,X3,X4,X5))\n", "y=np.concatenate((y,y1,y2,y3,y4,y5))\n", "\n", "import numpy\n", "print(numpy.unique(y))\n", "\n", "# label_encoder object knows how to understand word labels.\n", "label_encoder = preprocessing.LabelEncoder()\n", "\n", "# Encode labels in column 'species'.\n", "y = label_encoder.fit_transform(y)\n", "print(numpy.unique(y))\n", "\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33,shuffle=True, random_state=42, stratify=y)\n", "\n", "\n", "\n", "\n", "\n", "from tensorflow.keras.utils import to_categorical\n", "y_train = to_categorical(y_train)\n", "y_test = to_categorical(y_test)\n", "\n", "\n", "\n", "from sklearn.preprocessing import MinMaxScaler\n", "scaler = MinMaxScaler()\n", "scaler.fit(X_train)\n", "Xtrain_scaled = scaler.transform(X_train)\n", "Xtest_scaled = scaler.transform(X_test)\n", "Xtrain_scaled.shape\n", "print('Splitted Successfully')\n", "\n", " \n", "# get a stacking ensemble of models\n", "def get_stacking():\n", "\t# define the base models\n", "\tlevel0 = list()\n", "\tlevel0.append(('cart', DecisionTreeClassifier()))\n", "\tlevel0.append(('lr', LogisticRegression(solver='liblinear', \n", " max_iter=1000)))\n", "\tlevel0.append(('bayes', GaussianNB()))\n", "\tlevel0.append(('xgboost', XGBClassifier()))\n", "\tlevel0.append(('bagging', BaggingClassifier()))\n", "\tlevel0.append(('RF', RandomForestClassifier()))\n", "\tlevel0.append(('knn', KNeighborsClassifier()))\n", "\tlevel0.append(('svm', SVC()))\n", "\n", "\t# define meta learner model\n", "\tlevel1 = LogisticRegression(solver='liblinear',max_iter=1000)\n", "\t# define the stacking ensemble\n", "\tmodel = StackingClassifier(estimators=level0, \n", " final_estimator=level1, cv=10) \n", " return model\n", " \n", "# get a list of models to evaluate\n", "def get_models():\n", "\tmodels = dict()\n", "\tmodels['cart'] = DecisionTreeClassifier()\n", "\tmodels['lr'] = LogisticRegression(solver='liblinear',max_iter=1000)\n", "\tmodels['bayes'] = GaussianNB()\n", "\tmodels['xgboost'] = XGBClassifier()\n", "\tmodels['bagging']=BaggingClassifier()\n", "\tmodels['rf']=RandomForestClassifier()\n", "\tmodels['knn'] = KNeighborsClassifier()\n", "\tmodels['svm'] = SVC()\n", "\tmodels['stacking'] = get_stacking()\n", "\n", "\n", "\treturn models\n", " \n", "from sklearn.metrics import classification_report, \n", "accuracy_score, make_scorer\n", "\n", "# evaluate a give model using cross-validation\n", "def evaluate_model(model, X, y,name):\n", "\tcv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n", "\tprint(\"=================================\"+name+\"==== ==========================================\")\n", "\tscores = cross_val_score(model, X, y, scoring=\"f1\",cv=cv, n_jobs=-1, error_score='raise')\n", "\treturn scores\n", "\n", "\n", "# get the models to evaluate\n", "models = get_models()\n", "# evaluate the models and store results\n", "results, names = list(), list()\n", "for name, model in models.items():\n", "\n", "\tprint(\"=================================\"+name+\"==============================================\")\n", "\n", "\tcv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3,\n", " random_state=42)\n", "\tfor score in [\"roc_auc\", \"f1\", \"precision\", \n", " \"recall\", \"accuracy\"]:\n", "\t\tcvs = cross_val_score(model, X, y, \n", " scoring=score, cv=cv, n_jobs=-1, \n", " error_score='raise').mean()\n", "\t\tprint(score + \" : \"+ str(cvs))\n", "\t#print(cvs1) \n", "\tprint('\\n')\n", "pyplot.boxplot(results, labels=names, showmeans=True)\n", "pyplot.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "bba40cc1-ebef-4314-b32d-aa54b22248c3", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "f256d51a-12ac-46c3-8b8a-87496a3a24ce", "metadata": {}, "outputs": [], "source": [ "#CODE\n", "\n", "#Deep learning for Binary classes\n", "\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import MinMaxScaler\n", "from keras.models import Model\n", "from keras.layers import Input\n", "import seaborn as sns\n", "from keras.layers.core import Activation, Dropout, Dense\n", "from sklearn import preprocessing\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import classification_report\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "from sklearn.model_selection import train_test_split\n", "import sys\n", "import os\n", "from math import log\n", "import scipy as sp\n", "from imblearn.over_sampling import RandomOverSampler, SMOTE\n", "from imblearn.under_sampling import RandomUnderSampler\n", "\n", "import matplotlib.pyplot as plt\n", "from keras.optimizers import SGD\n", "import tensorflow as tf \n", "from keras.models import Sequential\n", "from keras.layers import Dropout, Dense, Conv1D, Flatten, MaxPooling1D\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.datasets import load_iris\n", "from numpy import unique\n", "from keras.layers import Dense, Input, LSTM, Dropout, SimpleRNN,\n", "Embedding, Reshape\n", "from sklearn.metrics import confusion_matrix\n", "from collections import Counter\n", "\n", "df = pd.read_csv('/Users/abdullah/Desktop/Folder/Research/\n", "JSVulnerabilityDataSet-1.0.csv')\n", "import matplotlib.pyplot as plt\n", "sns.countplot(x='Vuln', data=df)\n", "\n", "X = df[[ 'CC', 'CCL', 'CCO', 'CI', 'CLC', 'CLLC', 'McCC',\n", " 'NL', 'NLE', 'CD', 'CLOC', 'DLOC', 'TCD', 'TCLOC',\n", " 'LLOC', 'LOC', 'NOS','NUMPAR', 'TLLOC', 'TLOC', \n", " 'TNOS', 'HOR_D', 'HOR_T', 'HON_D', 'HON_T','HLEN',\n", " 'HVOC', 'HDIFF', 'HVOL', 'HEFF', 'HBUGS', 'HTIME',\n", " 'CYCL', 'PARAMS', 'CYCL_DENS']].values\n", "\n", "y = df['Vuln']\n", "\n", "y\n", "\n", "# label_encoder object knows how to understand word labels.\n", "label_encoder = preprocessing.LabelEncoder()\n", "\n", "# Encode labels in column 'species'.\n", "y = label_encoder.fit_transform(y)\n", "\n", "counter = Counter(y)\n", "print(counter)\n", "\n", "#ros = RandomOverSampler(sampling_strategy={ 0: 10629,1: 10629}, random_state=42) # String\n", "#X, y = ros.fit_resample(X, y)\n", "\n", "#rus = RandomUnderSampler(sampling_strategy={ 0: 1496,1: 1496}, random_state=42) # String\n", "#X, y = rus.fit_resample(X, y)\n", "\n", "#smote = SMOTE() #SMOTE(\"minority\")\n", "#X, y= smote.fit_resample(X, y)\n", "\n", "\n", "\n", "counter = Counter(y)\n", "print(counter)\n", "\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X,y, \n", "test_size=0.33,shuffle=True, random_state=42, stratify=y)\n", "X_test1=X_test\n", "X_train1=X_train\n", "y_test1=y_test\n", "y_train1=y_train\n", "\n", "\n", "from tensorflow.keras.utils import to_categorical\n", "y_train = to_categorical(y_train)\n", "y_test = to_categorical(y_test)\n", "\n", "\n", "\n", "from sklearn.preprocessing import MinMaxScaler\n", "scaler = MinMaxScaler()\n", "scaler.fit(X_train)\n", "\n", "Xtrain_scaled = scaler.transform(X_train)\n", "Xtest_scaled = scaler.transform(X_test)\n", "\n", "Xtrain_scaled.shape\n", "\n", "\n", "num_classes = 2\n", "\n", "def model_VGG16(learning_rate=0.001, momentum=0.9):\n", " model = Sequential()\n", " \n", " \n", " model.add(Conv1D(64, 2, activation=\"relu\", input_shape=(X.shape[1],1)))\n", " model.add(Conv1D(64, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D())\n", " \n", " model.add(Conv1D(128,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128, 2, activation='relu', \n", " kernel_initializer='he_uniform',\n", " padding='same'))\n", " model.add(MaxPooling1D())\n", " \n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',\n", " padding='same'))\n", " model.add(MaxPooling1D())\n", " \n", " model.add(Conv1D(512,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',\n", " padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',\n", " padding='same'))\n", " model.add(MaxPooling1D())\n", " \n", " \n", " model.add(Conv1D(512,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',\n", " padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',\n", " padding='same'))\n", " model.add(MaxPooling1D())\n", " \n", " \n", " model.add(Flatten())\n", " #malware_model.add(Dropout(0.5))\n", " \n", " model.add(Dense(4096, activation='relu', \n", " kernel_initializer='he_uniform'))\n", " model.add(Dropout(0.5))\n", " model.add(Dense(4096, activation='relu', \n", " kernel_initializer='he_uniform'))\n", " model.add(Dropout(0.5))\n", " model.add(Dense(1000, activation='relu', \n", " kernel_initializer='he_uniform'))\n", "\n", " model.add(Dense(num_classes, activation='sigmoid'))\n", "\n", " # compile model\n", " opt = SGD(lr=learning_rate, momentum=momentum)\n", " #model.compil(optimizer=opt, \n", " loss='categorical_crossentropy', \n", " metrics=[tf.keras.metrics.Precision(), \n", " tf.keras.metrics.Recall(), 'accuracy'])\n", " model.compile(loss='binary_crossentropy', \n", " optimizer=opt, metrics=['accuracy'])\n", " \n", " return model\n", "\n", "\n", "\n", "def model_VGG19(learning_rate=0.001, momentum=0.9):\n", " \n", " \n", " model = Sequential()\n", " \n", " model.add(Conv1D(64, 2, activation=\"relu\", \n", " input_shape=(X.shape[1],1)))\n", " model.add(Conv1D(64, 2, activation='relu', \n", " kernel_initializer='he_uniform',\n", " padding='same'))\n", " model.add(MaxPooling1D())\n", " \n", " model.add(Conv1D(128,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128, 2, activation='relu', \n", " kernel_initializer='he_uniform', padding='same'))\n", " model.add(MaxPooling1D())\n", " \n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform', padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',\n", " padding='same'))\n", " model.add(MaxPooling1D())\n", " \n", " model.add(Conv1D(512,2, activation='relu', \n", " \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform', padding='same'))\n", " model.add(MaxPooling1D())\n", " \n", " \n", " model.add(Conv1D(512,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform', padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform', padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D())\n", " \n", " \n", " model.add(Flatten())\n", " #malware_model.add(Dropout(0.5))\n", " \n", " model.add(Dense(4096, activation='relu', \n", " kernel_initializer='he_uniform'))\n", " model.add(Dropout(0.5))\n", " model.add(Dense(4096, activation='relu', \n", " kernel_initializer='he_uniform'))\n", " model.add(Dropout(0.5))\n", " model.add(Dense(1000, activation='relu', \n", " kernel_initializer='he_uniform'))\n", "\n", " model.add(Dense(num_classes, activation='sigmoid'))\n", "\n", " # compile model\n", " opt = SGD(lr=learning_rate, momentum=momentum)\n", " #malware_model.compile(optimizer=opt, \n", " loss='categorical_crossentropy', \n", " metrics=[tf.keras.metrics.Precision(), \n", " tf.keras.metrics.Recall(), 'accuracy'])\n", " model.compile(loss='binary_crossentropy', \n", " optimizer=opt, metrics=['accuracy'])\n", "\n", " return model\n", "\n", "\n", "\n", "def model_AlexNet(learning_rate=0.001, momentum=0.9):\n", " model = Sequential()\n", " \n", " model.add(Conv1D(96, 11, activation=\"relu\", \n", " input_shape=(X.shape[1],1)))\n", " model.add(MaxPooling1D())\n", " \n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D())\n", " \n", " model.add(Conv1D(384,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(384, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(384, 2, activation='relu', \n", " kernel_initializer='he_uniform', padding='same'))\n", " model.add(MaxPooling1D())\n", " \n", " \n", " model.add(Flatten()) \n", " model.add(Dense(4096, activation='relu', \n", " kernel_initializer='he_uniform'))\n", " model.add(Dropout(0.5))\n", " model.add(Dense(4096, activation='relu', \n", " kernel_initializer='he_uniform'))\n", " model.add(Dropout(0.5))\n", " model.add(Dense(1000, activation='relu', \n", " kernel_initializer='he_uniform'))\n", " model.add(Dense(num_classes, activation='sigmoid'))\n", "\n", " # compile model\n", " opt = SGD(lr=learning_rate, momentum=momentum)\n", " #malware_model.compile(optimizer=opt, \n", " loss='categorical_crossentropy', \n", " metrics=[tf.keras.metrics.Precision(),\n", " tf.keras.metrics.Recall(), 'accuracy'])\n", " model.compile(loss='binary_crossentropy', \n", " optimizer=opt, metrics=['accuracy'])\n", " return model\n", "\n", "\n", "def model_Resent(learning_rate=0.001, momentum=0.9):\n", " model = Sequential()\n", " \n", " model.add(Conv1D(64, 2, activation=\"relu\", \n", " input_shape=(X.shape[1],1)))\n", " model.add(Dropout(0.5))\n", " model.add(MaxPooling1D())\n", " \n", " model.add(Conv1D(64, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(64, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(64, 2, activation='relu',\n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(64, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " \n", " model.add(Conv1D(128,2, activation='relu',\n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Dropout(0.5))\n", "\n", " model.add(Conv1D(128, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " \n", " model.add(Conv1D(256, 2, activation='relu', kernel_initializer='he_uniform',\n", " padding='same'))\n", " model.add(Dropout(0.5))\n", "\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " \n", " model.add(Conv1D(512,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Dropout(0.5))\n", "\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", "\n", " model.add(MaxPooling1D())\n", " \n", " \n", " model.add(Flatten())\n", " model.add(Dropout(0.5))\n", "\n", " model.add(Dense(1000, activation='relu', \n", " kernel_initializer='he_uniform'))\n", "\n", " model.add(Dense(num_classes, activation='sigmoid'))\n", "\n", " # compile model\n", " opt = SGD(lr=learning_rate, momentum=momentum)\n", " #malware_model.compile(optimizer=opt, \n", " loss='categorical_crossentropy', \n", " metrics=[tf.keras.metrics.Precision(), \n", " tf.keras.metrics.Recall(), 'accuracy'])\n", " model.compile(loss='binary_crossentropy', \n", " optimizer=opt, metrics=['accuracy'])\n", "\n", " return model\n", "\n", "def model_LSTM(learning_rate=0.001, momentum=0.9):\n", "\n", " input_layer = Input(shape=(X.shape[1],1))\n", " \n", " conv1 = Conv1D(filters=35,\n", " kernel_size=8,\n", " strides=1,\n", " activation='relu')(input_layer)\n", " pool1 = MaxPooling1D(pool_size=4)(conv1)\n", " lstm1 = LSTM(35)(pool1)\n", " output_layer = Dense(2, activation='sigmoid')(lstm1)\n", " model = Model(inputs=input_layer, outputs=output_layer)\n", " opt = SGD(lr=learning_rate, momentum=momentum)\n", " model.compile(loss='binary_crossentropy',\n", " optimizer=opt, metrics=['acc'])\n", " \n", " return model\n", " \n", "\n", "\n", "print(unique(y_test))\n", "Vuln_model1 = model_VGG16()\n", "Vuln_model2 = model_VGG19()\n", "Vuln_model3 = model_AlexNet()\n", "Vuln_model4 = model_Resent()\n", "Vuln_model5 = model_LSTM()\n", "\n", "\n", "\n", "\n", "y_train_new = np.argmax(y_train, axis=1)\n", "\n", "y_train_new\n", "\n", "from sklearn.utils import class_weight\n", "from sklearn.utils import compute_class_weight\n", "\n", "class_weights = compute_class_weight(\n", " class_weight = \"balanced\",\n", " classes = np.unique(y_train_new),\n", " y = y_train_new \n", " )\n", "class_weights = dict(zip(np.unique(y_train_new), class_weights))\n", "class_weights\n", "\n", "\n", "\n", "\n", "history1 = Vuln_model1.fit(x= Xtrain_scaled, y=y_train, batch_size=128, epochs=1000, verbose=1, validation_split=0.33,class_weight=class_weights)\n", "scores1 = Vuln_model1.evaluate(x= Xtest_scaled, \n", "y=y_test, verbose=1)\n", "print('Final Vuln_model1 (VGG16) accuracy: ', scores1[1])\n", "\n", "plt.plot(history1.history['accuracy'])\n", "plt.plot(history1.history['val_accuracy'])\n", "\n", "plt.title('VGG16 model accuracy')\n", "plt.ylabel('accuracy')\n", "plt.xlabel('epoch')\n", "plt.legend(['train','test'], loc='upper left')\n", "plt.show()\n", "\n", "plt.plot(history1.history['loss'])\n", "plt.plot(history1.history['val_loss'])\n", "\n", "plt.title('VGG16 model loss')\n", "plt.ylabel('loss')\n", "plt.xlabel('epoch')\n", "plt.legend(['train','test'], loc='upper left')\n", "plt.show()\n", "\n", "\n", "classes = Vuln_model1.predict(x= Xtest_scaled)\n", "y_classes = classes.argmax(axis=-1)\n", "true_classes = y_test.argmax(axis=-1)\n", "\n", "print(classification_report(true_classes, y_classes,digits=6))\n", "\n", "cf_matrix = confusion_matrix(true_classes, y_classes)\n", "print(cf_matrix)\n", "\n", "\n", "\n", "history2 = Vuln_model2.fit(x= Xtrain_scaled,y=y_train, batch_size=128, epochs=1000, verbose=1, validation_split=0.33,class_weight=class_weights)\n", "scores2 = Vuln_model2.evaluate(x= Xtest_scaled, \n", "y=y_test, verbose=1)\n", "print('Final Vuln_model2 (VGG19) accuracy: ', scores1[1])\n", "\n", "import matplotlib.pyplot as plt2\n", "\n", "\n", "print(\"Test Score:\", scores2[0])\n", "print(\"Test Accuracy:\", scores2[1])\n", "\n", "plt2.plot(history2.history['accuracy'])\n", "plt2.plot(history2.history['val_accuracy'])\n", "\n", "plt2.title('VGG19 model accuracy')\n", "plt2.ylabel('accuracy')\n", "plt2.xlabel('epoch')\n", "plt2.legend(['train','test'], loc='upper left')\n", "plt2.show()\n", "\n", "plt2.plot(history2.history['loss'])\n", "plt2.plot(history2.history['val_loss'])\n", "\n", "plt2.title('VGG19 model loss')\n", "plt2.ylabel('loss')\n", "plt2.xlabel('epoch')\n", "plt2.legend(['train','test'], loc='upper left')\n", "plt2.show()\n", "\n", "\n", "classes = Vuln_model2.predict(x= Xtest_scaled)\n", "y_classes = classes.argmax(axis=-1)\n", "true_classes = y_test.argmax(axis=-1)\n", "\n", "print(classification_report(true_classes, y_classes,digits=6))\n", "\n", "\n", "cf_matrix = confusion_matrix(true_classes, y_classes)\n", "print(cf_matrix)\n", "\n", "'''\n", "group_names1 = ['True Neg','False Pos','False Neg','True Pos']\n", "group_counts1 = [\"{0:0.0f}\".format(value) for value in\n", " cf_matrix.flatten()]\n", "group_percentages1 = [\"{0:.2%}\".format(value) for value in\n", " cf_matrix.flatten()/np.sum(cf_matrix)]\n", "labels1 = [f\"{v1}\\n{v2}\\n{v3}\" for v1, v2, v3 in\n", " zip(group_names1,group_counts1,group_percentages1)]\n", "labels1 = np.asarray(labels1).reshape(2,2)\n", "sns.heatmap(cf_matrix, annot=labels1, fmt='', cmap='Blues')\n", "'''\n", "\n", "\n", "\n", "import matplotlib.pyplot as plt3\n", "\n", "\n", "history3 = Vuln_model3.fit(x= Xtrain_scaled, y=y_train, batch_size=128, epochs=1000, verbose=1, validation_split=0.33,class_weight=class_weights)\n", "scores3 = Vuln_model3.evaluate(x= Xtest_scaled,\n", "y=y_test, verbose=1)\n", "print('Final Vuln_model3 (AlexNet) accuracy: ', scores3[1])\n", "\n", "\n", "print(\"Test Score:\", scores3[0])\n", "print(\"Test Accuracy:\", scores3[1])\n", "\n", "plt3.plot(history3.history['accuracy'])\n", "plt3.plot(history3.history['val_accuracy'])\n", "\n", "plt3.title('AlexNet model accuracy')\n", "plt3.ylabel('accuracy')\n", "plt3.xlabel('epoch')\n", "plt3.legend(['train','test'], loc='upper left')\n", "plt3.show()\n", "\n", "plt3.plot(history3.history['loss'])\n", "plt3.plot(history3.history['val_loss'])\n", "\n", "plt3.title('AlexNet model loss')\n", "plt3.ylabel('loss')\n", "plt3.xlabel('epoch')\n", "plt3.legend(['train','test'], loc='upper left')\n", "plt3.show()\n", "\n", "\n", "classes = Vuln_model3.predict(x= Xtest_scaled)\n", "y_classes = classes.argmax(axis=-1)\n", "true_classes = y_test.argmax(axis=-1)\n", "print(classification_report(true_classes, y_classes,digits=6))\n", "\n", "\n", "\n", "cf_matrix = confusion_matrix(true_classes, y_classes)\n", "print(cf_matrix)\n", "\n", "import matplotlib.pyplot as plt4\n", "\n", "history4 = Vuln_model4.fit(x= Xtrain_scaled,y=y_train, batch_size=128, epochs=1000, verbose=1, validation_split=0.33,class_weight=class_weights)\n", "scores4 = Vuln_model4.evaluate(x= Xtest_scaled,\n", "y=y_test, verbose=1)\n", "print('Final Vuln_model4 (Resent) accuracy: ', scores4[1])\n", "\n", "\n", "print(\"Test Score:\", scores4[0])\n", "print(\"Test Accuracy:\", scores4[1])\n", "\n", "plt4.plot(history4.history['accuracy'])\n", "plt4.plot(history4.history['val_accuracy'])\n", "\n", "plt4.title('Resent model accuracy')\n", "plt4.ylabel('accuracy')\n", "plt4.xlabel('epoch')\n", "plt4.legend(['train','test'], loc='upper left')\n", "plt4.show()\n", "\n", "plt4.plot(history4.history['loss'])\n", "plt4.plot(history4.history['val_loss'])\n", "\n", "plt4.title('Resent model loss')\n", "plt4.ylabel('loss')\n", "plt4.xlabel('epoch')\n", "plt4.legend(['train','test'], loc='upper left')\n", "plt4.show()\n", "\n", "\n", "classes = Vuln_model4.predict(x= Xtest_scaled)\n", "y_classes = classes.argmax(axis=-1)\n", "true_classes = y_test.argmax(axis=-1)\n", "print(classification_report(true_classes, y_classes,digits=6))\n", "\n", "cf_matrix = confusion_matrix(true_classes, y_classes)\n", "print(cf_matrix)\n", "\n", "history5 = Vuln_model5.fit(x= X_train, y=y_train,batch_size=128, epochs=1000, verbose=1,validation_split=0.33,class_weight=class_weights)\n", "scores5 = Vuln_model5.evaluate(x= X_test, \n", "y=y_test, verbose=1)\n", "print('Final Vuln_model5 (LSTM) accuracy: ', scores5[1])\n", "\n", "import matplotlib.pyplot as plt5\n", "print(\"Test Score:\", scores5[0])\n", "print(\"Test Accuracy:\", scores5[1])\n", "\n", "plt5.plot(history5.history['acc'])\n", "plt5.plot(history5.history['val_acc'])\n", "\n", "plt5.title('LSTM model accuracy')\n", "plt5.ylabel('accuracy')\n", "plt5.xlabel('epoch')\n", "plt5.legend(['train','test'], loc='upper left')\n", "plt5.show()\n", "\n", "plt5.plot(history5.history['loss'])\n", "plt5.plot(history5.history['val_loss'])\n", "\n", "plt5.title('LSTM model loss')\n", "plt5.ylabel('loss')\n", "plt5.xlabel('epoch')\n", "plt5.legend(['train','test'], loc='upper left')\n", "plt5.show()\n", "\n", "\n", "classes = Vuln_model5.predict(x= X_test)\n", "y_classes = classes.argmax(axis=-1)\n", "true_classes = y_test.argmax(axis=-1)\n", "\n", "print(classification_report(true_classes, y_classes,digits=6))\n", "\n", "\n", "\n", "cf_matrix = confusion_matrix(true_classes, y_classes)\n", "print(cf_matrix)\n", "\n", "group_names = ['True Neg','False Pos','False Neg','True Pos']\n", "group_counts = [\"{0:0.0f}\".format(value) for value in\n", " cf_matrix.flatten()]\n", "group_percentages = [\"{0:.2%}\".format(value) for value in\n", " cf_matrix.flatten()/np.sum(cf_matrix)]\n", "labels = [f\"{v1}\\n{v2}\\n{v3}\" for v1, v2, v3 in\n", " zip(group_names,group_counts,group_percentages)]\n", "labels = np.asarray(labels).reshape(2,2)\n", "sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')\n", "\n", "\n", "\n", "\n", "Vuln_model1.save('/Users/abdullah/models/Model1.h5')\n", "Vuln_model2.save('/Users/abdullah/models/Model2.h5')\n", "Vuln_model3.save('/Users/abdullah/models/Model3.h5')\n", "Vuln_model4.save('/Users/abdullah/models/Model4.h5')\n", "Vuln_model5.save('/Users/abdullah/models/Model5.h5')\n", "\n", "\n", "\n", "from keras.models import load_model\n", "\n", "# load models from file\n", "def load_all_models(n_models):\n", "\tall_models = list()\n", "\tfor i in range(n_models):\n", "\t\t# define filename for this ensemble\n", "\t\tfilename = '/Users/abdullah/models/Model' + str(i+1) + '.h5'\n", "\t\t# load model from file\n", "\t\tmodel = load_model(filename)\n", "\t\t# add to list of members\n", "\t\tall_models.append(model)\n", "\t\tprint('>loaded \\%s' \\% filename)\n", "\treturn all_models\n", "\n", "\n", "# load all models\n", "n_members = 5\n", "members = load_all_models(n_members)\n", "print('Loaded \\%d models' \\% len(members))\n", "\n", "\n", " # create stacked model input dataset as outputs from the ensemble\n", "def stacked_dataset(members, inputX):\n", "\tstackX = None\n", "\tfor model in members:\n", "\t\t# make prediction\n", "\t\tyhat = model.predict(inputX, verbose=0)\n", "\t\t# stack predictions into [rows, members, probabilities]\n", "\t\tif stackX is None:\n", "\t\t\tstackX = yhat\n", "\t\telse:\n", "\t\t\tstackX = dstack((stackX, yhat))\n", "\t# flatten predictions to [rows, members x probabilities]\n", "\tstackX = stackX.reshape((stackX.shape[0], \n", " stackX.shape[1]*stackX.shape[2]))\n", "\treturn stackX\n", "\n", "\n", "\n", "\n", " # fit a model based on the outputs from the ensemble members\n", "def fit_stacked_model(members, inputX, inputy):\n", "\t# create dataset using ensemble\n", "\tstackedX = stacked_dataset(members, inputX)\n", "\t# fit standalone model\n", "\tmodel = LogisticRegression()\n", "\tmodel.fit(stackedX, inputy)\n", "\treturn model\n", "\n", "\n", "# fit stacked model using the ensemble\n", "from numpy import dstack\n", "from sklearn.linear_model import LogisticRegression\n", "y_test=np.argmax(y_test, axis=1)\n", "\n", "model = fit_stacked_model(members, Xtest_scaled, y_test)\n", "\n", "\n", "\n", "# make a prediction with the stacked model\n", "\n", "def stacked_prediction(members, model, inputX):\n", "\t# create dataset using ensemble\n", "\tstackedX = stacked_dataset(members, inputX)\n", "\t# make a prediction\n", "\tyhat = model.predict(stackedX)\n", "\treturn yhat\n", "\n", "# evaluate model on test set\n", "from sklearn.metrics import accuracy_score\n", "yhat = stacked_prediction(members, model, Xtest_scaled)\n", "\n", "acc = accuracy_score(y_test, yhat)\n", "print('Stacked Test Accuracy: \\%.3f' \\% acc)\n", "print(len(yhat))\n", "\n", "\n", "print(classification_report(y_test, yhat,digits=6))\n", "\n", "cf_matrix = confusion_matrix(y_test, yhat)\n", "print(cf_matrix)" ] }, { "cell_type": "code", "execution_count": null, "id": "f625d9f0-910a-4d29-8d9f-344d1ea9d93f", "metadata": {}, "outputs": [], "source": [ "#CODE\n", "\n", "#Deep learning for Multi-classes\n", "\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import MinMaxScaler\n", "from keras.models import Model\n", "from keras.layers import Input\n", "import seaborn as sns\n", "from keras.layers.core import Activation, Dropout, Dense\n", "from sklearn import preprocessing\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import classification_report\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "from sklearn.model_selection import train_test_split\n", "import sys\n", "import os\n", "from math import log\n", "import scipy as sp\n", "from imblearn.over_sampling import RandomOverSampler, SMOTE\n", "from imblearn.under_sampling import RandomUnderSampler\n", "\n", "import matplotlib.pyplot as plt\n", "from keras.optimizers import SGD\n", "import tensorflow as tf \n", "from keras.models import Sequential\n", "from keras.layers import Dropout, Dense, Conv1D, Flatten, MaxPooling1D\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.datasets import load_iris\n", "from numpy import unique\n", "from keras.layers import Dense, Input, LSTM, Dropout, SimpleRNN, \n", "Embedding, Reshape\n", "from sklearn.metrics import confusion_matrix\n", "from collections import Counter\n", "\n", "df = pd.read_csv('/Users/abdullah/Desktop/Folder/Research/Vulnerability_Research/Vul_Datasets/Code_Metrics_Datasets/Multi-Classes/PU_Dataset.csv')\n", "df1 = pd.read_csv('/Users/abdullah/Desktop/Folder/Research/Vulnerability_Research/Vul_Datasets/Code_Metrics_Datasets/Multi-Classes/cwe119_GCDFile.csv')\n", "df2 = pd.read_csv('/Users/abdullah/Desktop/Folder/Research/Vulnerability_Research/Vul_Datasets/Code_Metrics_Datasets/Multi-Classes/cwe399_cgd.csv')\n", "df3 = pd.read_csv('/Users/abdullah/Desktop/Folder/Research/Vulnerability_Research/Vul_Datasets/Code_Metrics_Datasets/Multi-Classes/API_function_call.csv')\n", "df4 = pd.read_csv('/Users/abdullah/Desktop/Folder/Research/Vulnerability_Research/Vul_Datasets/Code_Metrics_Datasets/Multi-Classes/Array_usage.csv')\n", "df5 = pd.read_csv('/Users/abdullah/Desktop/Folder/Research/Vulnerability_Research/Vul_Datasets/Code_Metrics_Datasets/Multi-Classes/Arithmetic_expression.csv')\n", "\n", "\n", "\n", "\n", "\n", "\n", "import matplotlib.pyplot as plt\n", "df.columns\n", "\n", "\n", "sns.countplot(x='IsVulnerable', data=df)\n", "\n", "X = df[['Lines_Of_Program','Physic_Lines',\n", " 'n1_Number_Of_Distinct_Operators',\n", " 'n2_Number_Of_Distinct_Operands','n_Program_Vocabulary',\n", " 'N1_Total_Number_Of_Operators','N2_Total_Number_Of_Operands',\n", " 'N_Program_Length','B_Number_of_Delivered_Bugs_1',\n", " 'B_Number_of_Delivered_Bugs_2',\n", " 'D_Difficulty','E_Effort','T_Time_Required_To_Program',\n", " 'V_Volume','_N_Calculated_Program_Length',\n", " 'McCab_Number']].values\n", "\n", "\n", "y= df['IsVulnerable'].values\n", "\n", "\n", "\n", "print(len(y))\n", "\n", "\n", "\n", "\n", "import matplotlib.pyplot as plt1\n", "df1.columns\n", "\n", "\n", "sns.countplot(x='IsVulnerable', data=df1)\n", "\n", "X1 = df1[['Lines_Of_Program','Physic_Lines',\n", " 'n1_Number_Of_Distinct_Operators',\n", " 'n2_Number_Of_Distinct_Operands','n_Program_Vocabulary',\n", " 'N1_Total_Number_Of_Operators','N2_Total_Number_Of_Operands',\n", " 'N_Program_Length','B_Number_of_Delivered_Bugs_1',\n", " 'B_Number_of_Delivered_Bugs_2','D_Difficulty',\n", " 'E_Effort','T_Time_Required_To_Program','V_Volume',\n", " '_N_Calculated_Program_Length','McCab_Number']].values\n", "\n", "\n", "y1= df1['IsVulnerable'].values\n", "\n", "\n", "print(len(y1))\n", "\n", "\n", "\n", "\n", "X2 = df2[['Lines_Of_Program','Physic_Lines',\n", " 'n1_Number_Of_Distinct_Operators',\n", " 'n2_Number_Of_Distinct_Operands','n_Program_Vocabulary',\n", " 'N1_Total_Number_Of_Operators','N2_Total_Number_Of_Operands',\n", " 'N_Program_Length','B_Number_of_Delivered_Bugs_1',\n", " 'B_Number_of_Delivered_Bugs_2', 'D_Difficulty','E_Effort',\n", " 'T_Time_Required_To_Program','V_Volume',\n", " '_N_Calculated_Program_Length','McCab_Number']].values\n", "\n", "\n", "y2= df2['IsVulnerable'].values\n", "\n", "\n", "\n", "\n", "\n", "\n", "print(len(y2))\n", "\n", "\n", "\n", "X3 = df3[['Lines_Of_Program','Physic_Lines',\n", " 'n1_Number_Of_Distinct_Operators',\n", " 'n2_Number_Of_Distinct_Operands','n_Program_Vocabulary',\n", " 'N1_Total_Number_Of_Operators','N2_Total_Number_Of_Operands',\n", " 'N_Program_Length','B_Number_of_Delivered_Bugs_1',\n", " 'B_Number_of_Delivered_Bugs_2','D_Difficulty','E_Effort',\n", " 'T_Time_Required_To_Program','V_Volume',\n", " '_N_Calculated_Program_Length','McCab_Number']].values\n", "\n", "\n", "\n", "y3= df3['IsVulnerable'].values\n", "\n", "\n", "print(len(y3))\n", "\n", "\n", "\n", "\n", "\n", "X4 = df4[['Lines_Of_Program','Physic_Lines',\n", " 'n1_Number_Of_Distinct_Operators',\n", " 'n2_Number_Of_Distinct_Operands','n_Program_Vocabulary',\n", " 'N1_Total_Number_Of_Operators','N2_Total_Number_Of_Operands',\n", " 'N_Program_Length','B_Number_of_Delivered_Bugs_1',\n", " 'B_Number_of_Delivered_Bugs_2',\n", " 'D_Difficulty','E_Effort',\n", " 'T_Time_Required_To_Program','V_Volume',\n", " '_N_Calculated_Program_Length','McCab_Number']].values\n", "\n", "\n", "y4= df4['IsVulnerable'].values\n", "\n", "\n", "\n", "\n", "\n", "X5 = df5[['Lines_Of_Program','Physic_Lines',\n", " 'n1_Number_Of_Distinct_Operators',\n", " 'n2_Number_Of_Distinct_Operands','n_Program_Vocabulary',\n", " 'N1_Total_Number_Of_Operators',\n", " 'N2_Total_Number_Of_Operands','N_Program_Length',\n", " 'B_Number_of_Delivered_Bugs_1',\n", " 'B_Number_of_Delivered_Bugs_2',\n", " 'D_Difficulty','E_Effort',\n", " 'T_Time_Required_To_Program','V_Volume',\n", " '_N_Calculated_Program_Length','McCab_Number']].values\n", "\n", "\n", "y5= df5['IsVulnerable'].values\n", "\n", "\n", "print(len(y5))\n", "\n", "\n", "import numpy as np\n", "X=np.concatenate((X,X1,X2,X3,X4,X5))\n", "y=np.concatenate((y,y1,y2,y3,y4,y5))\n", "\n", "\n", "\n", "# label_encoder object knows how to understand word labels.\n", "label_encoder = preprocessing.LabelEncoder()\n", "\n", "# Encode labels in column 'species'.\n", "y = label_encoder.fit_transform(y)\n", "\n", "counter = Counter(y)\n", "print(counter)\n", "\n", "#ros = RandomOverSampler(sampling_strategy={ 0: 85399,1: 85399}, random_state=42) # String\n", "#X, y = ros.fit_resample(X, y)\n", "\n", "#rus = RandomUnderSampler(sampling_strategy={ 0: 3475,1: 3475,2:3475,3:3475,4:3475,5:3475,6:3475}, random_state=42) # String\n", "#X, y = rus.fit_resample(X, y)\n", "\n", "#smote = SMOTE() #SMOTE(\"minority\")\n", "#X, y= smote.fit_resample(X, y)\n", "\n", "#rus = RandomUnderSampler(sampling_strategy={0:15000,1: 9952,3:13603,2:10440,4:7285,5:3475,6:10926}, random_state=42) # String\n", "#X, y = rus.fit_resample(X, y)\n", "\n", "\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X,y, \n", "test_size=0.33,shuffle=True, random_state=42, stratify=y)\n", "X_test1=X_test\n", "X_train1=X_train\n", "y_test1=y_test\n", "y_train1=y_train\n", "\n", "\n", "from tensorflow.keras.utils import to_categorical\n", "y_train = to_categorical(y_train)\n", "y_test = to_categorical(y_test)\n", "\n", "\n", "\n", "from sklearn.preprocessing import MinMaxScaler\n", "scaler = MinMaxScaler()\n", "scaler.fit(X_train)\n", "\n", "Xtrain_scaled = scaler.transform(X_train)\n", "Xtest_scaled = scaler.transform(X_test)\n", "\n", "Xtrain_scaled.shape\n", "\n", "\n", "num_classes = 7\n", "\n", "def model_VGG16(learning_rate=0.001, momentum=0.9):\n", " model = Sequential()\n", " \n", " \n", " model.add(Conv1D(64, 2, activation=\"relu\", \n", " input_shape=(X.shape[1],1)))\n", " model.add(Conv1D(64, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " model.add(Conv1D(128,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu',\n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " model.add(Conv1D(512,2, activation='relu',\n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " \n", " model.add(Conv1D(512,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " \n", " model.add(Flatten())\n", " #malware_model.add(Dropout(0.5))\n", " \n", " model.add(Dense(4096, activation='relu', \n", " kernel_initializer='he_uniform'))\n", " model.add(Dropout(0.5))\n", " model.add(Dense(4096, activation='relu', \n", " kernel_initializer='he_uniform'))\n", " model.add(Dropout(0.5))\n", " model.add(Dense(1000, activation='relu', \n", " kernel_initializer='he_uniform'))\n", "\n", " model.add(Dense(num_classes, activation='softmax'))\n", "\n", " # compile model\n", " opt = SGD(lr=learning_rate, momentum=momentum)\n", " #malware_model.compile(optimizer=opt, \n", " loss='categorical_crossentropy',\n", " metrics=[tf.keras.metrics.Precision(), \n", " tf.keras.metrics.Recall(), 'accuracy'])\n", " model.compile(loss='categorical_crossentropy',\n", " optimizer=opt, metrics=['accuracy'])\n", " \n", " return model\n", "\n", "\n", "\n", "def model_VGG19(learning_rate=0.01, momentum=0.9):\n", " \n", " \n", " model = Sequential()\n", " \n", " model.add(Conv1D(64, 2, activation=\"relu\", \n", " input_shape=(X.shape[1],1)))\n", " model.add(Conv1D(64, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " model.add(Conv1D(128,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128, 2, activation='relu',\n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " model.add(Conv1D(512,2, activation='relu', \n", " kernel_initializer='he_uniform', padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " \n", " model.add(Conv1D(512,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',\n", " padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " \n", " model.add(Flatten())\n", " #malware_model.add(Dropout(0.5))\n", " \n", " model.add(Dense(4096, activation='relu', \n", " kernel_initializer='he_uniform'))\n", " model.add(Dropout(0.5))\n", " model.add(Dense(4096, activation='relu', \n", " kernel_initializer='he_uniform'))\n", " model.add(Dropout(0.5))\n", " model.add(Dense(1000, activation='relu', \n", " kernel_initializer='he_uniform'))\n", "\n", " model.add(Dense(num_classes, activation='softmax'))\n", "\n", " # compile model\n", " opt = SGD(lr=learning_rate, momentum=momentum)\n", " #malware_model.compile(optimizer=opt, \n", " loss='categorical_crossentropy', \n", " metrics=[tf.keras.metrics.Precision(), \n", " tf.keras.metrics.Recall(), 'accuracy'])\n", " model.compile(loss='categorical_crossentropy', \n", " optimizer=opt, metrics=['accuracy'])\n", "\n", " return model\n", "\n", "\n", "\n", "def model_AlexNet(learning_rate=0.01, momentum=0.9):\n", " model = Sequential()\n", " \n", " model.add(Conv1D(96, 11, activation=\"relu\", \n", " input_shape=(X.shape[1],1)))\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " model.add(Conv1D(384,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(384, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(384, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " \n", " model.add(Flatten())\n", " #malware_model.add(Dropout(0.5))\n", " \n", " model.add(Dense(4096, activation='relu', \n", " kernel_initializer='he_uniform'))\n", " model.add(Dropout(0.5))\n", " model.add(Dense(4096, activation='relu', \n", " kernel_initializer='he_uniform'))\n", " model.add(Dropout(0.5))\n", " model.add(Dense(1000, activation='relu', \n", " kernel_initializer='he_uniform'))\n", "\n", " model.add(Dense(num_classes, activation='softmax'))\n", "\n", " # compile model\n", " opt = SGD(lr=learning_rate, momentum=momentum)\n", " #malware_model.compile(optimizer=opt, \n", " loss='categorical_crossentropy',\n", " metrics=[tf.keras.metrics.Precision(), \n", " tf.keras.metrics.Recall(), 'accuracy'])\n", " model.compile(loss='categorical_crossentropy', \n", " optimizer=opt, metrics=['accuracy'])\n", "\n", " return model\n", "\n", "\n", "def model_Resent(learning_rate=0.01, momentum=0.9):\n", " \n", " \n", " model = Sequential()\n", " \n", " model.add(Conv1D(64, 2, activation=\"relu\", \n", " input_shape=(X.shape[1],1)))\n", " model.add(Dropout(0.5))\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " model.add(Conv1D(64, 2, activation='relu',\n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(64, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(64, 2, activation='relu',\n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(64, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " \n", " model.add(Conv1D(128,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Dropout(0.5))\n", "\n", " model.add(Conv1D(128, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128,2, activation='relu', \n", " kernel_initializer='he_uniform', padding='same'))\n", " model.add(Conv1D(128, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128,2, activation='relu',\n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(128, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " \n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Dropout(0.5))\n", "\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu',\n", " kernel_initializer='he_uniform', padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(256, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " \n", " model.add(Conv1D(512,2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Dropout(0.5))\n", "\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512,2, activation='relu',\n", " kernel_initializer='he_uniform',padding='same'))\n", " model.add(Conv1D(512, 2, activation='relu', \n", " kernel_initializer='he_uniform',padding='same'))\n", "\n", " model.add(MaxPooling1D(pool_size=1))\n", " \n", " \n", " model.add(Flatten())\n", " model.add(Dropout(0.5))\n", "\n", " model.add(Dense(1000, activation='relu', \n", " kernel_initializer='he_uniform'))\n", "\n", " model.add(Dense(num_classes, activation='softmax'))\n", "\n", " # compile model\n", " opt = SGD(lr=learning_rate, momentum=momentum)\n", " #malware_model.compile(optimizer=opt, \n", " loss='categorical_crossentropy', \n", " metrics=[tf.keras.metrics.Precision(), \n", " tf.keras.metrics.Recall(), 'accuracy'])\n", " model.compile(loss='categorical_crossentropy', \n", " optimizer=opt, metrics=['accuracy'])\n", "\n", " return model\n", "\n", "def model_LSTM(learning_rate=0.001, momentum=0.9):\n", "\n", " input_layer = Input(shape=(X.shape[1],1))\n", " \n", " conv1 = Conv1D(filters=35,\n", " kernel_size=8,\n", " strides=1,\n", " activation='relu')(input_layer)\n", " pool1 = MaxPooling1D(pool_size=4)(conv1)\n", " lstm1 = LSTM(35)(pool1)\n", " output_layer = Dense(7, activation='softmax')(lstm1)\n", " model = Model(inputs=input_layer, outputs=output_layer)\n", " opt = SGD(lr=learning_rate, momentum=momentum)\n", " model.compile(loss='categorical_crossentropy',\n", " optimizer=opt, metrics=['acc'])\n", " \n", " return model\n", " \n", "\n", "\n", "print(unique(y_test))\n", "Vuln_model1 = model_VGG16()\n", "Vuln_model2 = model_VGG19()\n", "Vuln_model3 = model_AlexNet()\n", "Vuln_model4 = model_Resent()\n", "Vuln_model5 = model_LSTM()\n", "\n", "\n", "\n", "\n", "y_train_new = np.argmax(y_train, axis=1)\n", "\n", "y_train_new\n", "\n", "from sklearn.utils import class_weight\n", "from sklearn.utils import compute_class_weight\n", "\n", "class_weights = \n", "compute_class_weight(\n", " \n", " class_weight = \"balanced\",\n", " classes = np.unique(y_train_new),\n", " y = y_train_new \n", " )\n", "class_weights = dict(zip(np.unique(y_train_new), class_weights))\n", "class_weights\n", "\n", "\n", "\n", "\n", "history1 = Vuln_model1.fit(x= Xtrain_scaled, y=y_train,batch_size=128, epochs=50, verbose=1, validation_split=0.33,class_weight=class_weights)\n", "scores1 = Vuln_model1.evaluate(x= Xtest_scaled, y=y_test, verbose=1)\n", "print('Final Vuln_model1 (VGG16) accuracy: ', scores1[1])\n", "\n", "\n", "\n", "\n", "\n", "plt.plot(history1.history['accuracy'])\n", "plt.plot(history1.history['val_accuracy'])\n", "\n", "plt.title('VGG16 model accuracy')\n", "plt.ylabel('accuracy')\n", "plt.xlabel('epoch')\n", "plt.legend(['train','test'], loc='upper left')\n", "plt.show()\n", "\n", "plt.plot(history1.history['loss'])\n", "plt.plot(history1.history['val_loss'])\n", "\n", "plt.title('VGG16 model loss')\n", "plt.ylabel('loss')\n", "plt.xlabel('epoch')\n", "plt.legend(['train','test'], loc='upper left')\n", "plt.show()\n", "\n", "\n", "classes = Vuln_model1.predict(x= Xtest_scaled)\n", "y_classes = classes.argmax(axis=-1)\n", "true_classes = y_test.argmax(axis=-1)\n", "\n", "print(classification_report(true_classes, y_classes,digits=6))\n", "\n", "cf_matrix = confusion_matrix(true_classes, y_classes)\n", "print(cf_matrix)\n", "\n", "history2 = Vuln_model2.fit(x= Xtrain_scaled, y=y_train,batch_size=128, epochs=20, verbose=1, validation_split=0.33,class_weight=class_weights)\n", "scores2 = Vuln_model2.evaluate(x= Xtest_scaled,y=y_test, verbose=1)\n", "print('Final Vuln_model2 (VGG19) accuracy: ', scores1[1])\n", "\n", "import matplotlib.pyplot as plt2\n", "\n", "\n", "plt2.plot(history2.history['accuracy'])\n", "plt2.plot(history2.history['val_accuracy'])\n", "\n", "plt2.title('VGG19 model accuracy')\n", "plt2.ylabel('accuracy')\n", "plt2.xlabel('epoch')\n", "plt2.legend(['train','test'], loc='upper left')\n", "plt2.show()\n", "\n", "plt2.plot(history2.history['loss'])\n", "plt2.plot(history2.history['val_loss'])\n", "\n", "plt2.title('VGG19 model loss')\n", "plt2.ylabel('loss')\n", "plt2.xlabel('epoch')\n", "plt2.legend(['train','test'], loc='upper left')\n", "plt2.show()\n", "\n", "\n", "classes = Vuln_model2.predict(x= Xtest_scaled)\n", "y_classes = classes.argmax(axis=-1)\n", "true_classes = y_test.argmax(axis=-1)\n", "\n", "print(classification_report(true_classes, y_classes,digits=6))\n", "\n", "\n", "cf_matrix = confusion_matrix(true_classes, y_classes)\n", "print(cf_matrix)\n", "\n", "import matplotlib.pyplot as plt3\n", "\n", "\n", "history3 = Vuln_model3.fit(x= Xtrain_scaled, y=y_train, batch_size=128, epochs=20, verbose=1, validation_split=0.33,class_weight=class_weights)\n", "scores3 = Vuln_model3.evaluate(x= Xtest_scaled, y=y_test, verbose=1)\n", "print('Final Vuln_model3 (AlexNet) accuracy: ', scores3[1])\n", "\n", "\n", "\n", "plt3.plot(history3.history['accuracy'])\n", "plt3.plot(history3.history['val_accuracy'])\n", "\n", "plt3.title('AlexNet model accuracy')\n", "plt3.ylabel('accuracy')\n", "plt3.xlabel('epoch')\n", "plt3.legend(['train','test'], loc='upper left')\n", "plt3.show()\n", "\n", "plt3.plot(history3.history['loss'])\n", "plt3.plot(history3.history['val_loss'])\n", "\n", "plt3.title('AlexNet model loss')\n", "plt3.ylabel('loss')\n", "plt3.xlabel('epoch')\n", "plt3.legend(['train','test'], loc='upper left')\n", "plt3.show()\n", "\n", "\n", "classes = Vuln_model3.predict(x= Xtest_scaled)\n", "y_classes = classes.argmax(axis=-1)\n", "true_classes = y_test.argmax(axis=-1)\n", "print(classification_report(true_classes, y_classes,digits=6))\n", "\n", "\n", "\n", "cf_matrix = confusion_matrix(true_classes, y_classes)\n", "print(cf_matrix)\n", "\n", "\n", "import matplotlib.pyplot as plt4\n", "\n", "\n", "history4 = Vuln_model4.fit(x= Xtrain_scaled,y=y_train, batch_size=128, epochs=20, verbose=1, validation_split=0.33,class_weight=class_weights)\n", "scores4 = Vuln_model4.evaluate(x= Xtest_scaled, y=y_test, verbose=1)\n", "print('Final Vuln_model4 (Resent) accuracy: ', scores4[1])\n", "\n", "\n", "plt4.plot(history4.history['accuracy'])\n", "plt4.plot(history4.history['val_accuracy'])\n", "\n", "plt4.title('Resent model accuracy')\n", "plt4.ylabel('accuracy')\n", "plt4.xlabel('epoch')\n", "plt4.legend(['train','test'], loc='upper left')\n", "plt4.show()\n", "\n", "plt4.plot(history4.history['loss'])\n", "plt4.plot(history4.history['val_loss'])\n", "\n", "plt4.title('Resent model loss')\n", "plt4.ylabel('loss')\n", "plt4.xlabel('epoch')\n", "plt4.legend(['train','test'], loc='upper left')\n", "plt4.show()\n", "\n", "\n", "classes = Vuln_model4.predict(x= Xtest_scaled)\n", "y_classes = classes.argmax(axis=-1)\n", "true_classes = y_test.argmax(axis=-1)\n", "print(classification_report(true_classes, y_classes,digits=6))\n", "\n", "\n", "\n", "cf_matrix = confusion_matrix(true_classes, y_classes)\n", "print(cf_matrix)\n", "\n", "history5 = Vuln_model5.fit(x= X_train, y=y_train,batch_size=128, epochs=20, verbose=1, validation_split=0.33,class_weight=class_weights)\n", "scores5 = Vuln_model5.evaluate(x= X_test,y=y_test, verbose=1)\n", "print('Final Vuln_model5 (LSTM) accuracy: ', scores5[1])\n", "\n", "import matplotlib.pyplot as plt5\n", "\n", "plt5.plot(history5.history['acc'])\n", "plt5.plot(history5.history['val_acc'])\n", "\n", "plt5.title('LSTM model accuracy')\n", "plt5.ylabel('accuracy')\n", "plt5.xlabel('epoch')\n", "plt5.legend(['train','test'], loc='upper left')\n", "plt5.show()\n", "\n", "plt5.plot(history5.history['loss'])\n", "plt5.plot(history5.history['val_loss'])\n", "\n", "plt5.title('LSTM model loss')\n", "plt5.ylabel('loss')\n", "plt5.xlabel('epoch')\n", "plt5.legend(['train','test'], loc='upper left')\n", "plt5.show()\n", "\n", "\n", "classes = Vuln_model5.predict(x= X_test)\n", "y_classes = classes.argmax(axis=-1)\n", "true_classes = y_test.argmax(axis=-1)\n", "\n", "print(classification_report(true_classes, y_classes,digits=6))\n", "\n", "\n", "\n", "cf_matrix = confusion_matrix(true_classes, y_classes)\n", "print(cf_matrix)\n", "\n", "\n", "\n", "\n", "Vuln_model1.save('/Users/abdullah/models/Model1.h5')\n", "Vuln_model2.save('/Users/abdullah/models/Model2.h5')\n", "Vuln_model3.save('/Users/abdullah/models/Model3.h5')\n", "Vuln_model4.save('/Users/abdullah/models/Model4.h5')\n", "Vuln_model5.save('/Users/abdullah/models/Model5.h5')\n", "\n", "\n", "\n", "from keras.models import load_model\n", "\n", "# load models from file\n", "def load_all_models(n_models):\n", "\tall_models = list()\n", "\tfor i in range(n_models):\n", "\t\t# define filename for this ensemble\n", "\t\tfilename = '/Users/abdullah/models/Model' + str(i+1) + '.h5'\n", "\t\t# load model from file\n", "\t\tmodel = load_model(filename)\n", "\t\t# add to list of members\n", "\t\tall_models.append(model)\n", "\t\tprint('>loaded \\%s' \\% filename)\n", "\treturn all_models\n", "\n", "\n", "# load all models\n", "n_members = 5\n", "members = load_all_models(n_members)\n", "print('Loaded \\%d models' \\% len(members))\n", "\n", "\n", " # create stacked model input dataset as outputs from the ensemble\n", "def stacked_dataset(members, inputX):\n", "\tstackX = None\n", "\tfor model in members:\n", "\t\t# make prediction\n", "\t\tyhat = model.predict(inputX, verbose=0)\n", "\t\t# stack predictions into [rows, members, probabilities]\n", "\t\tif stackX is None:\n", "\t\t\tstackX = yhat\n", "\t\telse:\n", "\t\t\tstackX = dstack((stackX, yhat))\n", "\t# flatten predictions to [rows, members x probabilities]\n", "\tstackX = stackX.reshape((stackX.shape[0],\n", " stackX.shape[1]*stackX.shape[2]))\n", "\treturn stackX\n", "\n", "\n", "\n", "\n", " # fit a model based on the outputs from the ensemble members\n", "def fit_stacked_model(members, inputX, inputy):\n", "\t# create dataset using ensemble\n", "\tstackedX = stacked_dataset(members, inputX)\n", "\t# fit standalone model\n", "\tmodel = LogisticRegression()\n", "\tmodel.fit(stackedX, inputy)\n", "\treturn model\n", "\n", "\n", "# fit stacked model using the ensemble\n", "from numpy import dstack\n", "from sklearn.linear_model import LogisticRegression\n", "y_test=np.argmax(y_test, axis=1)\n", "\n", "model = fit_stacked_model(members, Xtest_scaled, y_test)\n", "\n", "\n", "\n", "# make a prediction with the stacked model\n", "\n", "def stacked_prediction(members, model, inputX):\n", "\t# create dataset using ensemble\n", "\tstackedX = stacked_dataset(members, inputX)\n", "\t# make a prediction\n", "\tyhat = model.predict(stackedX)\n", "\treturn yhat\n", "\n", "# evaluate model on test set\n", "from sklearn.metrics import accuracy_score\n", "yhat = stacked_prediction(members, model, Xtest_scaled)\n", "\n", "acc = accuracy_score(y_test, yhat)\n", "print('Stacked Test Accuracy: \\%.3f' \\% acc)\n", "print(len(yhat))\n", "\n", "print(classification_report(y_test, yhat,digits=6))\n", "\n", "cf_matrix = confusion_matrix(y_test, yhat)\n", "print(cf_matrix)\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.5" } }, "nbformat": 4, "nbformat_minor": 5 }