{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "source": [ "import tensorflow as tf\n", "import numpy as np\n", "import pandas as pd\n", "from tensorflow import keras\n", "from tensorflow.keras import layers\n", "from keras import losses\n", "from keras import optimizers\n", "from keras import metrics\n", "import math\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ], "metadata": { "id": "sIDDU2PYPdH_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "CSV_HEADER = [\n", " \"duration\",\n", " \"protocol_type\",\n", " \"service\",\n", " \"flag\",\n", " \"src_bytes\",\n", " \"dst_bytes\",\n", " \"land\",\n", " \"wrong_fragment\",\n", " \"urgent\",\n", " \"hot\",\n", " \"num_failed_logins\",\n", " \"logged_in\",\n", " \"num_compromised\",\n", " \"root_shell\",\n", " \"su_attempted\",\n", " \"num_root\",\n", " \"num_file_creations\",\n", " \"num_shells\",\n", " \"num_access_files\",\n", " \"num_outbound_cmds\",\n", " \"is_host_login\",\n", " \"is_guest_login\",\n", " \"count\",\n", " \"srv_count\",\n", " \"serror_rate\",\n", " \"srv_serror_rate\",\n", " \"rerror_rate\",\n", " \"srv_rerror_rate\",\n", " \"same_srv_rate\",\n", " \"diff_srv_rate\",\n", " \"srv_diff_host_rate\",\n", " \"dst_host_count\",\n", " \"dst_host_srv_count\",\n", " \"dst_host_same_srv_rate\",\n", " \"dst_host_diff_srv_rate\",\n", " \"dst_host_same_src_port_rate\",\n", " \"dst_host_srv_diff_host_rate\",\n", " \"dst_host_serror_rate\",\n", " \"dst_host_srv_serror_rate\",\n", " \"dst_host_rerror_rate\",\n", " \"dst_host_srv_rerror_rate\",\n", " \"class\"\n", "]\n", "\n", "\n", "train_data = pd.read_csv(\"train.csv\", header=None, names=CSV_HEADER)\n", "\n", "test_data = pd.read_csv(\"test.csv\", header=None, names=CSV_HEADER)\n", "\n", "print(f\"Train dataset shape: {train_data.shape}\")\n", "print(f\"Test dataset shape: {test_data.shape}\")\n", "train_data['class'] = train_data['class'].str.replace(r\"^(.(?:53: FutureWarning: The default value of regex will change from True to False in a future version.\n", " train_data['class'] = train_data['class'].str.replace(r\"^(.(?:54: FutureWarning: The default value of regex will change from True to False in a future version.\n", " test_data['class'] = test_data['class'].str.replace(r\"^(.(?" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "## Removing outliers and duplicates" ], "metadata": { "id": "wHhIvDl9V5kl" } }, { "cell_type": "code", "source": [ "def Remove_Outlier_Indices(df):\n", " Q1 = df.quantile(0.02)\n", " Q3 = df.quantile(0.98)\n", " IQR = Q3 - Q1\n", " #trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR)))\n", " trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)\n", " return trueList\n", "\n", "nonOutlierList = Remove_Outlier_Indices(train_data)\n", "new_train_data = train_data[nonOutlierList]\n", "\n", "nonOutlierList = Remove_Outlier_Indices(test_data)\n", "new_test_data = test_data[nonOutlierList]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8GZwWkuSV5HT", "outputId": "a0f476a0-e5da-483d-cab3-396ba3a213d2" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":2: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " Q1 = df.quantile(0.02)\n", ":3: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " Q3 = df.quantile(0.98)\n", ":6: FutureWarning: Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`\n", " trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Merging train and test datasets" ], "metadata": { "id": "uNzhq0uHKkUB" } }, { "cell_type": "code", "source": [ "frames = [new_train_data, new_test_data]\n", "df = pd.concat(frames)\n", "df = df.reset_index(drop=True)\n", "df" ], "metadata": { "id": "3kxP6kBRGX2y", "colab": { "base_uri": "https://localhost:8080/", "height": 444 }, "outputId": "775127dd-2714-47e8-b1eb-60695ad07e48" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " duration protocol_type service flag src_bytes dst_bytes land \\\n", "0 0 1 20 9 491 0 0 \n", "1 0 2 44 9 146 0 0 \n", "2 0 1 49 5 0 0 0 \n", "3 0 1 24 9 232 8153 0 \n", "4 0 1 24 9 199 420 0 \n", "... ... ... ... ... ... ... ... \n", "139899 0 0 14 9 1032 0 0 \n", "139900 0 1 49 9 794 333 0 \n", "139901 0 1 22 9 317 938 0 \n", "139902 0 2 11 9 42 42 0 \n", "139903 0 1 52 1 0 0 0 \n", "\n", " wrong_fragment urgent hot ... dst_host_srv_count \\\n", "0 0 0 0 ... 25 \n", "1 0 0 0 ... 1 \n", "2 0 0 0 ... 26 \n", "3 0 0 0 ... 255 \n", "4 0 0 0 ... 255 \n", "... ... ... ... ... ... \n", "139899 0 0 0 ... 255 \n", "139900 0 0 0 ... 141 \n", "139901 0 0 0 ... 255 \n", "139902 0 0 0 ... 252 \n", "139903 0 0 0 ... 21 \n", "\n", " dst_host_same_srv_rate dst_host_diff_srv_rate \\\n", "0 0.17 0.03 \n", "1 0.00 0.60 \n", "2 0.10 0.05 \n", "3 1.00 0.00 \n", "4 1.00 0.00 \n", "... ... ... \n", "139899 1.00 0.00 \n", "139900 0.72 0.06 \n", "139901 1.00 0.00 \n", "139902 0.99 0.01 \n", "139903 0.08 0.03 \n", "\n", " dst_host_same_src_port_rate dst_host_srv_diff_host_rate \\\n", "0 0.17 0.00 \n", "1 0.88 0.00 \n", "2 0.00 0.00 \n", "3 0.03 0.04 \n", "4 0.00 0.00 \n", "... ... ... \n", "139899 1.00 0.00 \n", "139900 0.01 0.01 \n", "139901 0.01 0.01 \n", "139902 0.00 0.00 \n", "139903 0.00 0.00 \n", "\n", " dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate \\\n", "0 0.00 0.00 0.05 \n", "1 0.00 0.00 0.00 \n", "2 1.00 1.00 0.00 \n", "3 0.03 0.01 0.00 \n", "4 0.00 0.00 0.00 \n", "... ... ... ... \n", "139899 0.00 0.00 0.00 \n", "139900 0.01 0.00 0.00 \n", "139901 0.01 0.00 0.00 \n", "139902 0.00 0.00 0.00 \n", "139903 0.00 0.00 0.44 \n", "\n", " dst_host_srv_rerror_rate class \n", "0 0.00 normal \n", "1 0.00 normal \n", "2 0.00 anomaly \n", "3 0.01 normal \n", "4 0.00 normal \n", "... ... ... \n", "139899 0.00 anomaly \n", "139900 0.00 normal \n", "139901 0.00 normal \n", "139902 0.00 normal \n", "139903 1.00 anomaly \n", "\n", "[139904 rows x 42 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
durationprotocol_typeserviceflagsrc_bytesdst_byteslandwrong_fragmenturgenthot...dst_host_srv_countdst_host_same_srv_ratedst_host_diff_srv_ratedst_host_same_src_port_ratedst_host_srv_diff_host_ratedst_host_serror_ratedst_host_srv_serror_ratedst_host_rerror_ratedst_host_srv_rerror_rateclass
00120949100000...250.170.030.170.000.000.000.050.00normal
10244914600000...10.000.600.880.000.000.000.000.00normal
201495000000...260.100.050.000.001.001.000.000.00anomaly
30124923281530000...2551.000.000.030.040.030.010.000.01normal
4012491994200000...2551.000.000.000.000.000.000.000.00normal
..................................................................
13989900149103200000...2551.000.001.000.000.000.000.000.00anomaly
139900014997943330000...1410.720.060.010.010.010.000.000.00normal
139901012293179380000...2551.000.000.010.010.010.000.000.00normal
1399020211942420000...2520.990.010.000.000.000.000.000.00normal
13990301521000000...210.080.030.000.000.000.000.441.00anomaly
\n", "

139904 rows × 42 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 4 } ] }, { "cell_type": "markdown", "source": [ "## Feature Selection" ], "metadata": { "id": "FiXD9jj7JzAt" } }, { "cell_type": "code", "source": [ "from sklearn.linear_model import Lasso\n", "from sklearn.model_selection import GridSearchCV, KFold\n", "X = df.drop(['class'], axis=1)\n", "Y = df[\"class\"].astype('category').cat.codes\n", "# # parameters to be tested on GridSearchCV\n", "# params = {\"alpha\":np.arange(0.00001, 10, 500)}\n", "\n", "# # Number of Folds and adding the random state for replication\n", "# kf=KFold(n_splits=5,shuffle=True, random_state=42)\n", "\n", "# # Initializing the Model\n", "# lasso = Lasso()\n", "\n", "# # GridSearchCV with model, params and folds.\n", "# lasso_cv=GridSearchCV(lasso, param_grid=params, cv=kf)\n", "# lasso_cv.fit(X, Y)\n", "\n" ], "metadata": { "id": "vkQIwJLqxnb2" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# calling the model with the best parameter\n", "lasso1 = Lasso(alpha=0.00001)\n", "lasso1.fit(X, Y)\n", "\n", "# Using np.abs() to make coefficients positive.\n", "lasso1_coef = np.abs(lasso1.coef_)\n", "# Subsetting the features which has more than 0.001 importance.\n", "# feature_subset=np.array(names)[lasso1_coef>0.14]\n", "# print(\"Selected Feature Columns: {}\".format(feature_subset))" ], "metadata": { "id": "CcnIqyU6u_lU", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "2bbfacac-d731-41c8-d062-c66b16160a76" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 8.040e+01, tolerance: 3.492e+00\n", " model = cd_fast.enet_coordinate_descent(\n" ] } ] }, { "cell_type": "code", "source": [ "names=df.drop(\"class\", axis=1).columns\n", "feature_subset=np.array(names)[lasso1_coef>0.173]\n", "feature_subset" ], "metadata": { "id": "tE0wNJm-CXLz", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "077ac555-eaed-4d47-cf27-014c65442773" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['protocol_type', 'num_failed_logins', 'logged_in',\n", " 'is_guest_login', 'rerror_rate', 'srv_rerror_rate',\n", " 'same_srv_rate', 'dst_host_same_src_port_rate',\n", " 'dst_host_srv_diff_host_rate', 'dst_host_srv_serror_rate'],\n", " dtype=object)" ] }, "metadata": {}, "execution_count": 9 } ] }, { "cell_type": "code", "source": [ "m=np.array(lasso1_coef)[lasso1_coef>0.17]\n", "m" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "thxrpXYpCjLG", "outputId": "aadeb4bc-fa89-4041-b635-930d423f94b5" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([0.26677555, 0.58052921, 0.18609507, 0.64464029, 0.17964476,\n", " 0.22037623, 0.3956496 , 0.20942421, 0.33044222, 0.20789329,\n", " 0.17277382])" ] }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [ "import matplotlib.pyplot as plt\n", "# plotting the Column Names and Importance of Columns.\n", "plt.bar(names, lasso1_coef)\n", "plt.xticks(rotation=90)\n", "plt.grid()\n", "plt.title(\"Feature Selection Based on Lasso\")\n", "plt.xlabel(\"Features\")\n", "plt.ylabel(\"Importance\")\n", "plt.ylim(0, 0.15)\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 659 }, "id": "BiLBVbqxA-EN", "outputId": "f1d883c5-1cbd-43c7-848a-e479a4f03ccc" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "columns = []\n", "for k in feature_subset:\n", " columns.append(str(k))\n", "columns" ], "metadata": { "id": "tUrifaWBvZpx", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "24dd6a69-4636-47fc-c722-86a19004ecda" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['protocol_type',\n", " 'num_failed_logins',\n", " 'logged_in',\n", " 'is_guest_login',\n", " 'rerror_rate',\n", " 'srv_rerror_rate',\n", " 'same_srv_rate',\n", " 'dst_host_same_src_port_rate',\n", " 'dst_host_srv_diff_host_rate',\n", " 'dst_host_srv_serror_rate']" ] }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "code", "source": [ "X = df.drop(['class'], axis=1)\n", "Y = df[\"class\"].astype('category').cat.codes\n", "principalDf = pd.DataFrame(data = X\n", " , columns = columns)\n", "finalDf = pd.concat([principalDf, df[\"class\"]], axis = 1)\n", "finalDf" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 444 }, "id": "RzFbtWLlsmn0", "outputId": "1be552ec-8a93-4215-d063-4a436956afb7" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " protocol_type num_failed_logins logged_in is_guest_login \\\n", "0 1 0 0 0 \n", "1 2 0 0 0 \n", "2 1 0 0 0 \n", "3 1 0 1 0 \n", "4 1 0 1 0 \n", "... ... ... ... ... \n", "139899 0 0 0 0 \n", "139900 1 0 1 0 \n", "139901 1 0 1 0 \n", "139902 2 0 0 0 \n", "139903 1 0 0 0 \n", "\n", " rerror_rate srv_rerror_rate same_srv_rate \\\n", "0 0.0 0.0 1.00 \n", "1 0.0 0.0 0.08 \n", "2 0.0 0.0 0.05 \n", "3 0.0 0.0 1.00 \n", "4 0.0 0.0 1.00 \n", "... ... ... ... \n", "139899 0.0 0.0 1.00 \n", "139900 0.0 0.0 1.00 \n", "139901 0.0 0.0 1.00 \n", "139902 0.0 0.0 1.00 \n", "139903 1.0 1.0 0.25 \n", "\n", " dst_host_same_src_port_rate dst_host_srv_diff_host_rate \\\n", "0 0.17 0.00 \n", "1 0.88 0.00 \n", "2 0.00 0.00 \n", "3 0.03 0.04 \n", "4 0.00 0.00 \n", "... ... ... \n", "139899 1.00 0.00 \n", "139900 0.01 0.01 \n", "139901 0.01 0.01 \n", "139902 0.00 0.00 \n", "139903 0.00 0.00 \n", "\n", " dst_host_srv_serror_rate class \n", "0 0.00 normal \n", "1 0.00 normal \n", "2 1.00 anomaly \n", "3 0.01 normal \n", "4 0.00 normal \n", "... ... ... \n", "139899 0.00 anomaly \n", "139900 0.00 normal \n", "139901 0.00 normal \n", "139902 0.00 normal \n", "139903 0.00 anomaly \n", "\n", "[139904 rows x 11 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
protocol_typenum_failed_loginslogged_inis_guest_loginrerror_ratesrv_rerror_ratesame_srv_ratedst_host_same_src_port_ratedst_host_srv_diff_host_ratedst_host_srv_serror_rateclass
010000.00.01.000.170.000.00normal
120000.00.00.080.880.000.00normal
210000.00.00.050.000.001.00anomaly
310100.00.01.000.030.040.01normal
410100.00.01.000.000.000.00normal
....................................
13989900000.00.01.001.000.000.00anomaly
13990010100.00.01.000.010.010.00normal
13990110100.00.01.000.010.010.00normal
13990220000.00.01.000.000.000.00normal
13990310001.01.00.250.000.000.00anomaly
\n", "

139904 rows × 11 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "markdown", "source": [ "# Model" ], "metadata": { "id": "B6B5ZuR5J5WQ" } }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split\n", "train_data, test_data = train_test_split(finalDf, test_size=0.25)\n", "train_data_file = \"train_data.csv\"\n", "test_data_file = \"test_data.csv\"\n", "\n", "train_data.to_csv(train_data_file, index=False, header=False)\n", "test_data.to_csv(test_data_file, index=False, header=False)" ], "metadata": { "id": "_1N70b_DJb2m" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "CSV_HEADER = []\n", "for x in columns:\n", " CSV_HEADER.append(x)\n", "CSV_HEADER.append(\"class\")\n", "\n", "# A list of the numerical feature names.\n", "NUMERIC_FEATURE_NAMES = columns\n", "# A dictionary of the categorical features and their vocabulary.\n", "CATEGORICAL_FEATURES_WITH_VOCABULARY = {\n", "}\n", "# A list of the columns to ignore from the dataset.\n", "IGNORE_COLUMN_NAMES = []\n", "# A list of the categorical feature names.\n", "CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())\n", "# A list of all the input features.\n", "FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES\n", "# A list of column default values for each feature.\n", "COLUMN_DEFAULTS = [\n", " [0.0] if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else [\"NA\"]\n", " for feature_name in CSV_HEADER\n", "]\n", "# The name of the target feature.\n", "TARGET_FEATURE_NAME = \"class\"\n", "# A list of the labels of the target features.\n", "TARGET_LABELS = [\"normal\", \"anomaly\"]\n", "\n", "from tensorflow.keras.layers import StringLookup\n", "\n", "target_label_lookup = StringLookup(\n", " vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0\n", ")\n", "\n", "\n", "def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):\n", " dataset = tf.data.experimental.make_csv_dataset(\n", " csv_file_path,\n", " batch_size=batch_size,\n", " column_names=CSV_HEADER,\n", " column_defaults=COLUMN_DEFAULTS,\n", " label_name=TARGET_FEATURE_NAME,\n", " num_epochs=1,\n", " header=False,\n", " na_value=\"?\",\n", " shuffle=shuffle,\n", " ).map(lambda features, target: (features, target_label_lookup(target)))\n", " return dataset.cache()\n", "\n", "def create_model_inputs():\n", " inputs = {}\n", " for feature_name in FEATURE_NAMES:\n", " if feature_name in NUMERIC_FEATURE_NAMES:\n", " inputs[feature_name] = layers.Input(\n", " name=feature_name, shape=(), dtype=tf.float32\n", " )\n", " else:\n", " inputs[feature_name] = layers.Input(\n", " name=feature_name, shape=(), dtype=tf.string\n", " )\n", " return inputs\n", "\n", "def encode_inputs(inputs):\n", " encoded_features = []\n", " for feature_name in inputs:\n", " if feature_name in CATEGORICAL_FEATURE_NAMES:\n", " vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]\n", " #print(vocabulary)\n", " # Create a lookup to convert a string values to an integer indices.\n", " # Since we are not using a mask token, nor expecting any out of vocabulary\n", " # (oov) token, we set mask_token to None and num_oov_indices to 0.\n", " lookup = StringLookup(\n", " vocabulary=vocabulary, mask_token=None, num_oov_indices=0\n", " )\n", " # Convert the string input values into integer indices.\n", " value_index = lookup(inputs[feature_name])\n", " embedding_dims = int(math.sqrt(lookup.vocabulary_size()))\n", " # Create an embedding layer with the specified dimensions.\n", " embedding = layers.Embedding(\n", " input_dim=lookup.vocabulary_size(), output_dim=embedding_dims\n", " )\n", " # Convert the index values to embedding representations.\n", " encoded_feature = embedding(value_index)\n", " else:\n", " # Use the numerical features as-is.\n", " encoded_feature = inputs[feature_name]\n", " if inputs[feature_name].shape[-1] is None:\n", " encoded_feature = tf.expand_dims(encoded_feature, -1)\n", "\n", " encoded_features.append(encoded_feature)\n", "\n", " encoded_features = layers.concatenate(encoded_features)\n", " return encoded_features\n", "\n", "class NeuralDecisionTree(keras.Model):\n", " def __init__(self, depth, num_features, used_features_rate, num_classes):\n", " super().__init__()\n", " self.depth = depth\n", " self.num_leaves = 2 ** depth\n", " self.num_classes = num_classes\n", "\n", " # Create a mask for the randomly selected features.\n", " num_used_features = int(num_features * used_features_rate)\n", " one_hot = np.eye(num_features)\n", " sampled_feature_indicies = np.random.choice(\n", " np.arange(num_features), num_used_features, replace=False\n", " )\n", " self.used_features_mask = one_hot[sampled_feature_indicies]\n", "\n", " # Initialize the weights of the classes in leaves.\n", " self.pi = tf.Variable(\n", " initial_value=tf.random_normal_initializer()(\n", " shape=[self.num_leaves, self.num_classes]\n", " ),\n", " dtype=\"float32\",\n", " trainable=True,\n", " )\n", "\n", " # Initialize the stochastic routing layer.\n", " self.decision_fn = layers.Dense(\n", " units=self.num_leaves, activation=\"sigmoid\", name=\"decision\"\n", " )\n", "\n", " def call(self, features):\n", " batch_size = tf.shape(features)[0]\n", "\n", " # Apply the feature mask to the input features.\n", " features = tf.matmul(\n", " features, self.used_features_mask, transpose_b=True\n", " ) # [batch_size, num_used_features]\n", " # Compute the routing probabilities.\n", " decisions = tf.expand_dims(\n", " self.decision_fn(features), axis=2\n", " ) # [batch_size, num_leaves, 1]\n", " # Concatenate the routing probabilities with their complements.\n", " decisions = layers.concatenate(\n", " [decisions, 1 - decisions], axis=2\n", " ) # [batch_size, num_leaves, 2]\n", "\n", " mu = tf.ones([batch_size, 1, 1])\n", "\n", " begin_idx = 1\n", " end_idx = 2\n", " # Traverse the tree in breadth-first order.\n", " for level in range(self.depth):\n", " mu = tf.reshape(mu, [batch_size, -1, 1]) # [batch_size, 2 ** level, 1]\n", " mu = tf.tile(mu, (1, 1, 2)) # [batch_size, 2 ** level, 2]\n", " level_decisions = decisions[\n", " :, begin_idx:end_idx, :\n", " ] # [batch_size, 2 ** level, 2]\n", " mu = mu * level_decisions # [batch_size, 2**level, 2]\n", " begin_idx = end_idx\n", " end_idx = begin_idx + 2 ** (level + 1)\n", "\n", " mu = tf.reshape(mu, [batch_size, self.num_leaves]) # [batch_size, num_leaves]\n", " probabilities = keras.activations.softmax(self.pi) # [num_leaves, num_classes]\n", " outputs = tf.matmul(mu, probabilities) # [batch_size, num_classes]\n", " return outputs\n", "\n", "class NeuralDecisionForest(keras.Model):\n", " def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):\n", " super().__init__()\n", " self.ensemble = []\n", " # Initialize the ensemble by adding NeuralDecisionTree instances.\n", " # Each tree will have its own randomly selected input features to use.\n", " for _ in range(num_trees):\n", " self.ensemble.append(\n", " NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)\n", " )\n", "\n", " def call(self, inputs):\n", " # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.\n", " batch_size = tf.shape(inputs)[0]\n", " outputs = tf.zeros([batch_size, num_classes])\n", "\n", " # Aggregate the outputs of trees in the ensemble.\n", " for tree in self.ensemble:\n", " outputs += tree(inputs)\n", " # Divide the outputs by the ensemble size to get the average.\n", " outputs /= len(self.ensemble)\n", " return outputs\n", "learning_rate = 0.01\n", "batch_size = 128\n", "num_epochs = 10\n", "\n", "\n", "def run_experiment(model):\n", "\n", " # model.compile(\n", " # optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n", " # loss=keras.losses.SparseCategoricalCrossentropy(),\n", " # metrics=[keras.metrics.SparseCategoricalAccuracy()],\n", " # )\n", " model.compile(\n", " optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n", " loss=keras.losses.SparseCategoricalCrossentropy(),\n", " metrics=[metrics.SparseCategoricalAccuracy()],\n", " )\n", " print(\"Start training the model...\")\n", " train_dataset = get_dataset_from_csv(\n", " train_data_file, shuffle=True, batch_size=batch_size\n", " )\n", "\n", " model.fit(train_dataset, epochs=num_epochs)\n", " print(\"Model training finished\")\n", "\n", " print(\"Evaluating the model on the test data...\")\n", " test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)\n", "\n", " _, accuracy = model.evaluate(test_dataset)\n", " print(f\"Test accuracy: {round(accuracy * 100, 2)}%\")\n", " return model" ], "metadata": { "id": "PGCEwlOPPpEP", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "6b1d9165-a8ed-41a9-fb06-eda1b67a9118" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/numpy/core/numeric.py:2463: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", " return bool(asarray(a1 == a2).all())\n" ] } ] }, { "cell_type": "code", "source": [ "num_trees = 25\n", "depth = 5\n", "used_features_rate = 0.5\n", "num_classes = len(TARGET_LABELS)\n", "\n", "\n", "def create_forest_model():\n", " inputs = create_model_inputs()\n", " features = encode_inputs(inputs)\n", " features = layers.BatchNormalization()(features)\n", " num_features = features.shape[1]\n", "\n", " forest_model = NeuralDecisionForest(\n", " num_trees, depth, num_features, used_features_rate, num_classes\n", " )\n", "\n", " outputs = forest_model(features)\n", " model = keras.Model(inputs=inputs, outputs=outputs)\n", " return model\n", "\n", "\n", "forest_model = create_forest_model()\n", "\n", "finalModel = run_experiment(forest_model)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vLjxBfBIQUKR", "outputId": "5afeb048-e506-4273-ed9c-c38f4472311a" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Start training the model...\n", "Epoch 1/10\n", "820/820 [==============================] - 53s 27ms/step - loss: 0.1932 - sparse_categorical_accuracy: 0.9393\n", "Epoch 2/10\n", "820/820 [==============================] - 21s 26ms/step - loss: 0.1443 - sparse_categorical_accuracy: 0.9520\n", "Epoch 3/10\n", "820/820 [==============================] - 21s 26ms/step - loss: 0.1412 - sparse_categorical_accuracy: 0.9543\n", "Epoch 4/10\n", "820/820 [==============================] - 20s 24ms/step - loss: 0.1396 - sparse_categorical_accuracy: 0.9562\n", "Epoch 5/10\n", "820/820 [==============================] - 21s 26ms/step - loss: 0.1386 - sparse_categorical_accuracy: 0.9577\n", "Epoch 6/10\n", "820/820 [==============================] - 19s 23ms/step - loss: 0.1381 - sparse_categorical_accuracy: 0.9583\n", "Epoch 7/10\n", "820/820 [==============================] - 20s 25ms/step - loss: 0.1376 - sparse_categorical_accuracy: 0.9587\n", "Epoch 8/10\n", "820/820 [==============================] - 20s 25ms/step - loss: 0.1373 - sparse_categorical_accuracy: 0.9590\n", "Epoch 9/10\n", "820/820 [==============================] - 20s 25ms/step - loss: 0.1371 - sparse_categorical_accuracy: 0.9592\n", "Epoch 10/10\n", "820/820 [==============================] - 20s 24ms/step - loss: 0.1369 - sparse_categorical_accuracy: 0.9594\n", "Model training finished\n", "Evaluating the model on the test data...\n", "274/274 [==============================] - 6s 11ms/step - loss: 0.1370 - sparse_categorical_accuracy: 0.9594\n", "Test accuracy: 95.94%\n" ] } ] }, { "cell_type": "code", "source": [ "test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)\n", "colnames=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'target']\n", "data = pd.read_csv(\"test_data.csv\", names=colnames, header=None)\n", "data['target'].replace('normal', 0,inplace=True)\n", "data['target'].replace('anomaly', 1,inplace=True)\n", "y_test = data['target'].values # as a numpy array\n", "from sklearn.metrics import confusion_matrix\n", "y_prediction = finalModel.predict(test_dataset)\n", "y_prediction = np.argmax (y_prediction, axis = 1)\n", "result = confusion_matrix(y_test, y_prediction , normalize='pred')\n", "print(result)\n", "TP = result[0][0]\n", "FP = result[0][1]\n", "TN = result[1][1]\n", "FN = result[1][0]\n", "ACC = (TP+TN)/(TP+TN+FP+FN)\n", "PR = TP/(TP+FP) #precision\n", "TPR = TP/(TP+FN) #Recall or True positive rate\n", "FPR = FP/(FP+TN)\n", "F1Score = 2*(PR*TPR)/(PR+TPR)\n", "print(\"ACC: \" + str(ACC))\n", "print(\"PR: \" + str(PR))\n", "print(\"TPR: \" + str(TPR))\n", "print(\"FPR: \" + str(FPR))\n", "print(\"F1Score: \" + str(F1Score))\n" ], "metadata": { "id": "Bzegny-zWVlM", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "46a090ad-fcf8-4ba8-843f-2af61f213fd5" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "274/274 [==============================] - 6s 12ms/step\n", "[[0.94727873 0.02660822]\n", " [0.05272127 0.97339178]]\n", "ACC: 0.9603352535958944\n", "PR: 0.9726783245865884\n", "TPR: 0.9472787304292257\n", "FPR: 0.02660822323743672\n", "F1Score: 0.9598105187022743\n" ] } ] }, { "cell_type": "code", "source": [ "import matplotlib.pyplot as plt\n", "import numpy\n", "from sklearn import metrics\n", "\n", "\n", "cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = result, display_labels = [True, False])\n", "\n", "cm_display.plot()\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 449 }, "id": "_SfKSvXHKd7z", "outputId": "cdc42ecf-3c02-4e95-9478-6a31db155e79" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "result" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "FfPwUmlCPlXc", "outputId": "4d17fb2d-be11-4d12-860e-93b74d991411" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0.94727873, 0.02660822],\n", " [0.05272127, 0.97339178]])" ] }, "metadata": {}, "execution_count": 19 } ] }, { "cell_type": "code", "source": [ "import numpy as np\n", "result = np.array( [[0.94727873, 0.02660822],\n", " [0.05272127, 0.97339178]] )\n", "result" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "d7w5H-XkPl4y", "outputId": "296a6792-cc0d-4f8e-f418-b75aa9b7afd5" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0.94727873, 0.02660822],\n", " [0.05272127, 0.97339178]])" ] }, "metadata": {}, "execution_count": 1 } ] }, { "cell_type": "markdown", "source": [ "## CISIDS2017" ], "metadata": { "id": "lx4RpN47ybt6" } }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "kHU-PAI6yhyz", "outputId": "4ed1ac5a-4ce4-48e3-fd44-8526e3179c58" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ] }, { "cell_type": "code", "source": [ "import pandas as pd\n", "import glob\n", "import os\n", "\n", "path = r'C:\\DRO\\DCL_rawdata_files' # use your path\n", "all_files = glob.glob(os.path.join(path , \"/content/drive/MyDrive/datasets/CISIDS2017/*.csv\"))\n", "\n", "li = []\n", "\n", "for filename in all_files:\n", " df = pd.read_csv(filename, index_col=None, header=0)\n", " li.append(df)\n", "\n", "frame = pd.concat(li, axis=0, ignore_index=True)" ], "metadata": { "id": "orUywltc2yaK" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "column_headers = list(frame.columns.values)\n", "column_headers" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "eJcGQKBn4Abs", "outputId": "886afb27-efba-4e72-dad4-a2fc4632607b" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[' Destination Port',\n", " ' Flow Duration',\n", " ' Total Fwd Packets',\n", " ' Total Backward Packets',\n", " 'Total Length of Fwd Packets',\n", " ' Total Length of Bwd Packets',\n", " ' Fwd Packet Length Max',\n", " ' Fwd Packet Length Min',\n", " ' Fwd Packet Length Mean',\n", " ' Fwd Packet Length Std',\n", " 'Bwd Packet Length Max',\n", " ' Bwd Packet Length Min',\n", " ' Bwd Packet Length Mean',\n", " ' Bwd Packet Length Std',\n", " 'Flow Bytes/s',\n", " ' Flow Packets/s',\n", " ' Flow IAT Mean',\n", " ' Flow IAT Std',\n", " ' Flow IAT Max',\n", " ' Flow IAT Min',\n", " 'Fwd IAT Total',\n", " ' Fwd IAT Mean',\n", " ' Fwd IAT Std',\n", " ' Fwd IAT Max',\n", " ' Fwd IAT Min',\n", " 'Bwd IAT Total',\n", " ' Bwd IAT Mean',\n", " ' Bwd IAT Std',\n", " ' Bwd IAT Max',\n", " ' Bwd IAT Min',\n", " 'Fwd PSH Flags',\n", " ' Bwd PSH Flags',\n", " ' Fwd URG Flags',\n", " ' Bwd URG Flags',\n", " ' Fwd Header Length',\n", " ' Bwd Header Length',\n", " 'Fwd Packets/s',\n", " ' Bwd Packets/s',\n", " ' Min Packet Length',\n", " ' Max Packet Length',\n", " ' Packet Length Mean',\n", " ' Packet Length Std',\n", " ' Packet Length Variance',\n", " 'FIN Flag Count',\n", " ' SYN Flag Count',\n", " ' RST Flag Count',\n", " ' PSH Flag Count',\n", " ' ACK Flag Count',\n", " ' URG Flag Count',\n", " ' CWE Flag Count',\n", " ' ECE Flag Count',\n", " ' Down/Up Ratio',\n", " ' Average Packet Size',\n", " ' Avg Fwd Segment Size',\n", " ' Avg Bwd Segment Size',\n", " ' Fwd Header Length.1',\n", " 'Fwd Avg Bytes/Bulk',\n", " ' Fwd Avg Packets/Bulk',\n", " ' Fwd Avg Bulk Rate',\n", " ' Bwd Avg Bytes/Bulk',\n", " ' Bwd Avg Packets/Bulk',\n", " 'Bwd Avg Bulk Rate',\n", " 'Subflow Fwd Packets',\n", " ' Subflow Fwd Bytes',\n", " ' Subflow Bwd Packets',\n", " ' Subflow Bwd Bytes',\n", " 'Init_Win_bytes_forward',\n", " ' Init_Win_bytes_backward',\n", " ' act_data_pkt_fwd',\n", " ' min_seg_size_forward',\n", " 'Active Mean',\n", " ' Active Std',\n", " ' Active Max',\n", " ' Active Min',\n", " 'Idle Mean',\n", " ' Idle Std',\n", " ' Idle Max',\n", " ' Idle Min',\n", " ' Label']" ] }, "metadata": {}, "execution_count": 5 } ] }, { "cell_type": "code", "source": [ "frame[' Label'] = frame[' Label'].str.replace(r\"^(.(?:1: FutureWarning: The default value of regex will change from True to False in a future version.\n", " frame[' Label'] = frame[' Label'].str.replace(r\"^(.(? (Q3 + 1.5 * IQR)))\n", " trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)\n", " return trueList\n", "\n", "nonOutlierList = Remove_Outlier_Indices(df)\n", "new_data = df[nonOutlierList]\n", "\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "f8da8baa-d15e-412d-905e-e27926307110", "id": "i_PXbZyt5-fR" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":2: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " Q1 = df.quantile(0.02)\n", ":3: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " Q3 = df.quantile(0.98)\n", ":6: FutureWarning: Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`\n", " trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)\n" ] } ] }, { "cell_type": "code", "source": [ "df = new_data\n", "df = df.reset_index(drop=True)\n", "del new_data\n", "del nonOutlierList\n", "del li\n", "del frame" ], "metadata": { "id": "FSPL95xg5-fT" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "dqn7YpLGXhEW" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from sklearn.linear_model import Lasso\n", "from sklearn.model_selection import GridSearchCV, KFold\n", "X = df.drop([' Label'], axis=1)\n", "Y = df[\" Label\"].astype('category').cat.codes\n", "# parameters to be tested on GridSearchCV\n", "params = {\"alpha\":np.arange(0.00001, 10, 500)}\n", "\n", "# Number of Folds and adding the random state for replication\n", "kf=KFold(n_splits=5,shuffle=True, random_state=42)\n", "\n", "# Initializing the Model\n", "lasso = Lasso()\n", "\n", "# GridSearchCV with model, params and folds.\n", "lasso_cv=GridSearchCV(lasso, param_grid=params, cv=kf)\n", "lasso_cv.fit(X, Y)\n", "\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 346 }, "id": "b_-yg9HYXhv7", "outputId": "538a8f92-3dd4-419e-f12f-79ff3ac17f37" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.965e+04, tolerance: 3.205e+01\n", " model = cd_fast.enet_coordinate_descent(\n", "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.967e+04, tolerance: 3.207e+01\n", " model = cd_fast.enet_coordinate_descent(\n", "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.962e+04, tolerance: 3.203e+01\n", " model = cd_fast.enet_coordinate_descent(\n", "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.967e+04, tolerance: 3.203e+01\n", " model = cd_fast.enet_coordinate_descent(\n", "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.959e+04, tolerance: 3.204e+01\n", " model = cd_fast.enet_coordinate_descent(\n", "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 7.455e+04, tolerance: 4.006e+01\n", " model = cd_fast.enet_coordinate_descent(\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),\n", " estimator=Lasso(), param_grid={'alpha': array([1.e-05])})" ], "text/html": [ "
GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),\n",
              "             estimator=Lasso(), param_grid={'alpha': array([1.e-05])})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [ "from sklearn.linear_model import Lasso\n", "from sklearn.model_selection import GridSearchCV, KFold\n", "X = df.drop([' Label'], axis=1)\n", "Y = df[\" Label\"].astype('category').cat.codes\n", "# calling the model with the best parameter\n", "lasso1 = Lasso(alpha=0.00001)\n", "lasso1.fit(X, Y)\n", "\n", "# Using np.abs() to make coefficients positive.\n", "lasso1_coef = np.abs(lasso1.coef_)\n", "# Subsetting the features which has more than 0.001 importance.\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "N1SR7E6EXhv7", "outputId": "3c918ac7-b837-4b8d-82a8-0cf7a03cfbc5" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 7.455e+04, tolerance: 4.006e+01\n", " model = cd_fast.enet_coordinate_descent(\n" ] } ] }, { "cell_type": "code", "source": [ "names=df.drop(\" Label\", axis=1).columns" ], "metadata": { "id": "fb0YK8mtv7Ph" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "feature_subset=np.array(names)[lasso1_coef>0.01]\n", "feature_subset" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "hsra5JiBXhv8", "outputId": "8c61c043-2898-487b-8f06-9bfead8bd784" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([' Total Fwd Packets', ' Total Backward Packets', 'Fwd PSH Flags',\n", " 'FIN Flag Count', ' SYN Flag Count', ' PSH Flag Count',\n", " ' ACK Flag Count', ' URG Flag Count', ' Down/Up Ratio',\n", " ' Subflow Bwd Packets'], dtype=object)" ] }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "source": [ "m=np.array(lasso1_coef)[lasso1_coef>0.01]\n", "m" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "022906ca-f74e-42ab-c43a-0b6b2845217b", "id": "UB68DTTvXhv9" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([0.02261965, 0.01475288, 0.05149803, 0.28977241, 0.0593355 ,\n", " 0.4697659 , 0.38491962, 0.30526729, 0.07180872, 0.0207763 ])" ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "code", "source": [ "import matplotlib.pyplot as plt\n", "# plotting the Column Names and Importance of Columns.\n", "plt.bar(names, lasso1_coef)\n", "plt.xticks(rotation=90)\n", "plt.grid()\n", "plt.title(\"Feature Selection Based on Lasso\")\n", "plt.xlabel(\"Features\")\n", "plt.ylabel(\"Importance\")\n", "plt.ylim(0, 0.15)\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 656 }, "outputId": "5a2c0486-7235-4def-d6c0-2b1a79343932", "id": "Xm5vrrslXhv-" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "columns = []\n", "for k in feature_subset:\n", " columns.append(str(k))\n", "columns" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "6913a1e0-8ef7-419b-b02b-4dd52835d442", "id": "PXkTjydrXhv-" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[' Total Fwd Packets',\n", " ' Total Backward Packets',\n", " 'Fwd PSH Flags',\n", " 'FIN Flag Count',\n", " ' SYN Flag Count',\n", " ' PSH Flag Count',\n", " ' ACK Flag Count',\n", " ' URG Flag Count',\n", " ' Down/Up Ratio',\n", " ' Subflow Bwd Packets']" ] }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "code", "source": [ "X = df.drop([' Label'], axis=1)\n", "Y = df[\" Label\"].astype('category').cat.codes\n", "principalDf = pd.DataFrame(data = X\n", " , columns = columns)\n", "finalDf = pd.concat([principalDf, df[\" Label\"]], axis = 1)\n", "finalDf" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 441 }, "outputId": "2289a865-9d89-46db-cf32-53232ebe3f43", "id": "8wP2orQHXhv_" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Total Fwd Packets Total Backward Packets Fwd PSH Flags \\\n", "0 2 0 0 \n", "1 1 1 0 \n", "2 1 1 0 \n", "3 1 1 0 \n", "4 2 0 0 \n", "... ... ... ... \n", "2539413 4 2 0 \n", "2539414 2 2 0 \n", "2539415 2 1 1 \n", "2539416 6 2 0 \n", "2539417 4 2 0 \n", "\n", " FIN Flag Count SYN Flag Count PSH Flag Count ACK Flag Count \\\n", "0 0 0 0 1 \n", "1 0 0 0 1 \n", "2 0 0 0 1 \n", "3 0 0 0 1 \n", "4 0 0 0 1 \n", "... ... ... ... ... \n", "2539413 0 0 0 0 \n", "2539414 0 0 0 0 \n", "2539415 0 1 0 1 \n", "2539416 0 0 0 0 \n", "2539417 0 0 0 0 \n", "\n", " URG Flag Count Down/Up Ratio Subflow Bwd Packets Label \n", "0 0 0 0 BENIGN \n", "1 1 1 1 BENIGN \n", "2 1 1 1 BENIGN \n", "3 1 1 1 BENIGN \n", "4 0 0 0 BENIGN \n", "... ... ... ... ... \n", "2539413 0 0 2 BENIGN \n", "2539414 0 1 2 BENIGN \n", "2539415 0 0 1 BENIGN \n", "2539416 0 0 2 BENIGN \n", "2539417 0 0 2 BENIGN \n", "\n", "[2539418 rows x 11 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Total Fwd PacketsTotal Backward PacketsFwd PSH FlagsFIN Flag CountSYN Flag CountPSH Flag CountACK Flag CountURG Flag CountDown/Up RatioSubflow Bwd PacketsLabel
02000001000BENIGN
11100001111BENIGN
21100001111BENIGN
31100001111BENIGN
42000001000BENIGN
....................................
25394134200000002BENIGN
25394142200000012BENIGN
25394152110101001BENIGN
25394166200000002BENIGN
25394174200000002BENIGN
\n", "

2539418 rows × 11 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 18 } ] }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split\n", "train_data, test_data = train_test_split(finalDf, test_size=0.25)\n", "train_data_file = \"train_data.csv\"\n", "test_data_file = \"test_data.csv\"\n", "\n", "train_data.to_csv(train_data_file, index=False, header=False)\n", "test_data.to_csv(test_data_file, index=False, header=False)" ], "metadata": { "id": "c85wOCBv5-fX" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "del finalDf\n", "del principalDf\n", "del train_data\n", "del test_data" ], "metadata": { "id": "szYkC6TjPd0U" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "CSV_HEADER = []\n", "for x in columns:\n", " CSV_HEADER.append(x)\n", "CSV_HEADER.append(\" Label\")\n", "\n", "# A list of the numerical feature names.\n", "NUMERIC_FEATURE_NAMES = columns\n", "# A dictionary of the categorical features and their vocabulary.\n", "CATEGORICAL_FEATURES_WITH_VOCABULARY = {\n", "}\n", "# A list of the columns to ignore from the dataset.\n", "IGNORE_COLUMN_NAMES = []\n", "# A list of the categorical feature names.\n", "CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())\n", "# A list of all the input features.\n", "FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES\n", "# A list of column default values for each feature.\n", "COLUMN_DEFAULTS = [\n", " [0.0] if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else [\"NA\"]\n", " for feature_name in CSV_HEADER\n", "]\n", "# The name of the target feature.\n", "TARGET_FEATURE_NAME = \" Label\"\n", "# A list of the labels of the target features.\n", "TARGET_LABELS = [\"BENIGN\", \"ANOMALY\"]\n", "\n", "from tensorflow.keras.layers import StringLookup\n", "\n", "target_label_lookup = StringLookup(\n", " vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0\n", ")\n", "\n", "\n", "def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):\n", " dataset = tf.data.experimental.make_csv_dataset(\n", " csv_file_path,\n", " batch_size=batch_size,\n", " column_names=CSV_HEADER,\n", " column_defaults=COLUMN_DEFAULTS,\n", " label_name=TARGET_FEATURE_NAME,\n", " num_epochs=1,\n", " header=False,\n", " na_value=\"?\",\n", " shuffle=shuffle,\n", " ).map(lambda features, target: (features, target_label_lookup(target)))\n", " return dataset.cache()\n", "\n", "def create_model_inputs():\n", " inputs = {}\n", " for feature_name in FEATURE_NAMES:\n", " if feature_name in NUMERIC_FEATURE_NAMES:\n", " inputs[feature_name] = layers.Input(\n", " name=feature_name, shape=(), dtype=tf.float32\n", " )\n", " else:\n", " inputs[feature_name] = layers.Input(\n", " name=feature_name, shape=(), dtype=tf.string\n", " )\n", " return inputs\n", "\n", "def encode_inputs(inputs):\n", " encoded_features = []\n", " for feature_name in inputs:\n", " if feature_name in CATEGORICAL_FEATURE_NAMES:\n", " vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]\n", " #print(vocabulary)\n", " # Create a lookup to convert a string values to an integer indices.\n", " # Since we are not using a mask token, nor expecting any out of vocabulary\n", " # (oov) token, we set mask_token to None and num_oov_indices to 0.\n", " lookup = StringLookup(\n", " vocabulary=vocabulary, mask_token=None, num_oov_indices=0\n", " )\n", " # Convert the string input values into integer indices.\n", " value_index = lookup(inputs[feature_name])\n", " embedding_dims = int(math.sqrt(lookup.vocabulary_size()))\n", " # Create an embedding layer with the specified dimensions.\n", " embedding = layers.Embedding(\n", " input_dim=lookup.vocabulary_size(), output_dim=embedding_dims\n", " )\n", " # Convert the index values to embedding representations.\n", " encoded_feature = embedding(value_index)\n", " else:\n", " # Use the numerical features as-is.\n", " encoded_feature = inputs[feature_name]\n", " if inputs[feature_name].shape[-1] is None:\n", " encoded_feature = tf.expand_dims(encoded_feature, -1)\n", "\n", " encoded_features.append(encoded_feature)\n", "\n", " encoded_features = layers.concatenate(encoded_features)\n", " return encoded_features\n", "\n", "class NeuralDecisionTree(keras.Model):\n", " def __init__(self, depth, num_features, used_features_rate, num_classes):\n", " super().__init__()\n", " self.depth = depth\n", " self.num_leaves = 2 ** depth\n", " self.num_classes = num_classes\n", "\n", " # Create a mask for the randomly selected features.\n", " num_used_features = int(num_features * used_features_rate)\n", " one_hot = np.eye(num_features)\n", " sampled_feature_indicies = np.random.choice(\n", " np.arange(num_features), num_used_features, replace=False\n", " )\n", " self.used_features_mask = one_hot[sampled_feature_indicies]\n", "\n", " # Initialize the weights of the classes in leaves.\n", " self.pi = tf.Variable(\n", " initial_value=tf.random_normal_initializer()(\n", " shape=[self.num_leaves, self.num_classes]\n", " ),\n", " dtype=\"float32\",\n", " trainable=True,\n", " )\n", "\n", " # Initialize the stochastic routing layer.\n", " self.decision_fn = layers.Dense(\n", " units=self.num_leaves, activation=\"sigmoid\", name=\"decision\"\n", " )\n", "\n", " def call(self, features):\n", " batch_size = tf.shape(features)[0]\n", "\n", " # Apply the feature mask to the input features.\n", " features = tf.matmul(\n", " features, self.used_features_mask, transpose_b=True\n", " ) # [batch_size, num_used_features]\n", " # Compute the routing probabilities.\n", " decisions = tf.expand_dims(\n", " self.decision_fn(features), axis=2\n", " ) # [batch_size, num_leaves, 1]\n", " # Concatenate the routing probabilities with their complements.\n", " decisions = layers.concatenate(\n", " [decisions, 1 - decisions], axis=2\n", " ) # [batch_size, num_leaves, 2]\n", "\n", " mu = tf.ones([batch_size, 1, 1])\n", "\n", " begin_idx = 1\n", " end_idx = 2\n", " # Traverse the tree in breadth-first order.\n", " for level in range(self.depth):\n", " mu = tf.reshape(mu, [batch_size, -1, 1]) # [batch_size, 2 ** level, 1]\n", " mu = tf.tile(mu, (1, 1, 2)) # [batch_size, 2 ** level, 2]\n", " level_decisions = decisions[\n", " :, begin_idx:end_idx, :\n", " ] # [batch_size, 2 ** level, 2]\n", " mu = mu * level_decisions # [batch_size, 2**level, 2]\n", " begin_idx = end_idx\n", " end_idx = begin_idx + 2 ** (level + 1)\n", "\n", " mu = tf.reshape(mu, [batch_size, self.num_leaves]) # [batch_size, num_leaves]\n", " probabilities = keras.activations.softmax(self.pi) # [num_leaves, num_classes]\n", " outputs = tf.matmul(mu, probabilities) # [batch_size, num_classes]\n", " return outputs\n", "\n", "class NeuralDecisionForest(keras.Model):\n", " def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):\n", " super().__init__()\n", " self.ensemble = []\n", " # Initialize the ensemble by adding NeuralDecisionTree instances.\n", " # Each tree will have its own randomly selected input features to use.\n", " for _ in range(num_trees):\n", " self.ensemble.append(\n", " NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)\n", " )\n", "\n", " def call(self, inputs):\n", " # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.\n", " batch_size = tf.shape(inputs)[0]\n", " outputs = tf.zeros([batch_size, num_classes])\n", "\n", " # Aggregate the outputs of trees in the ensemble.\n", " for tree in self.ensemble:\n", " outputs += tree(inputs)\n", " # Divide the outputs by the ensemble size to get the average.\n", " outputs /= len(self.ensemble)\n", " return outputs\n", "learning_rate = 0.01\n", "batch_size = 128\n", "num_epochs = 10\n", "\n", "\n", "def run_experiment(model):\n", "\n", " # model.compile(\n", " # optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n", " # loss=keras.losses.SparseCategoricalCrossentropy(),\n", " # metrics=[keras.metrics.SparseCategoricalAccuracy()],\n", " # )\n", " model.compile(\n", " optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n", " loss=keras.losses.SparseCategoricalCrossentropy(),\n", " metrics=[metrics.SparseCategoricalAccuracy()],\n", " )\n", " print(\"Start training the model...\")\n", " train_dataset = get_dataset_from_csv(\n", " train_data_file, shuffle=True, batch_size=batch_size\n", " )\n", "\n", " model.fit(train_dataset, epochs=num_epochs)\n", " print(\"Model training finished\")\n", "\n", " print(\"Evaluating the model on the test data...\")\n", " test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)\n", "\n", " _, accuracy = model.evaluate(test_dataset)\n", " print(f\"Test accuracy: {round(accuracy * 100, 2)}%\")\n", " return model" ], "metadata": { "id": "8txIkhqk5-fX", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "0e116a14-d48d-4309-c4d1-8a5925355182" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/numpy/core/numeric.py:2463: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", " return bool(asarray(a1 == a2).all())\n" ] } ] }, { "cell_type": "code", "source": [ "num_trees = 25\n", "depth = 5\n", "used_features_rate = 0.5\n", "num_classes = len(TARGET_LABELS)\n", "\n", "\n", "def create_forest_model():\n", " inputs = create_model_inputs()\n", " features = encode_inputs(inputs)\n", " features = layers.BatchNormalization()(features)\n", " num_features = features.shape[1]\n", "\n", " forest_model = NeuralDecisionForest(\n", " num_trees, depth, num_features, used_features_rate, num_classes\n", " )\n", "\n", " outputs = forest_model(features)\n", " model = keras.Model(inputs=inputs, outputs=outputs)\n", " return model\n", "\n", "\n", "forest_model = create_forest_model()\n", "\n", "finalModel = run_experiment(forest_model)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "9eed7729-1873-423a-87c3-6ad24d4d8610", "id": "cEIPaJMi5-fa" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Start training the model...\n", "Epoch 1/10\n", "14880/14880 [==============================] - 288s 18ms/step - loss: 0.1936 - sparse_categorical_accuracy: 0.9490\n", "Epoch 2/10\n", "14880/14880 [==============================] - 235s 16ms/step - loss: 0.1866 - sparse_categorical_accuracy: 0.9514\n", "Epoch 3/10\n", "14880/14880 [==============================] - 234s 16ms/step - loss: 0.1856 - sparse_categorical_accuracy: 0.9518\n", "Epoch 4/10\n", "14880/14880 [==============================] - 233s 16ms/step - loss: 0.1851 - sparse_categorical_accuracy: 0.9520\n", "Epoch 5/10\n", "14880/14880 [==============================] - 233s 16ms/step - loss: 0.1848 - sparse_categorical_accuracy: 0.9521\n", "Epoch 6/10\n", "14880/14880 [==============================] - 238s 16ms/step - loss: 0.1849 - sparse_categorical_accuracy: 0.9521\n", "Epoch 7/10\n", "14880/14880 [==============================] - 242s 16ms/step - loss: 0.1847 - sparse_categorical_accuracy: 0.9522\n", "Epoch 8/10\n", "14880/14880 [==============================] - 239s 16ms/step - loss: 0.1844 - sparse_categorical_accuracy: 0.9522\n", "Epoch 9/10\n", "14880/14880 [==============================] - 237s 16ms/step - loss: 0.1843 - sparse_categorical_accuracy: 0.9523\n", "Epoch 10/10\n", "14880/14880 [==============================] - 235s 16ms/step - loss: 0.1842 - sparse_categorical_accuracy: 0.9523\n", "Model training finished\n", "Evaluating the model on the test data...\n", "4960/4960 [==============================] - 45s 9ms/step - loss: 0.1749 - sparse_categorical_accuracy: 0.9573\n", "Test accuracy: 95.73%\n" ] } ] }, { "cell_type": "code", "source": [ "test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)\n", "colnames=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'target']\n", "data = pd.read_csv(\"test_data.csv\", names=colnames, header=None)\n", "data['target'].replace('BENIGN', 0,inplace=True)\n", "data['target'].replace('ANOMALY', 1,inplace=True)\n", "y_test = data['target'].values # as a numpy array\n", "from sklearn.metrics import confusion_matrix\n", "y_prediction = finalModel.predict(test_dataset)\n", "y_prediction = np.argmax (y_prediction, axis = 1)\n", "result = confusion_matrix(y_test, y_prediction , normalize='pred')\n", "print(result)\n", "TP = result[0][0]\n", "FP = result[0][1]\n", "TN = result[1][1]\n", "FN = result[1][0]\n", "ACC = (TP+TN)/(TP+TN+FP+FN)\n", "PR = TP/(TP+FP) #precision\n", "TPR = TP/(TP+FN) #Recall or True positive rate\n", "FPR = FP/(FP+TN)\n", "F1Score = 2*(PR*TPR)/(PR+TPR)\n", "print(\"ACC: \" + str(ACC))\n", "print(\"PR: \" + str(PR))\n", "print(\"TPR: \" + str(TPR))\n", "print(\"FPR: \" + str(FPR))\n", "print(\"F1Score: \" + str(F1Score))\n", "import matplotlib.pyplot as plt\n", "import numpy\n", "from sklearn import metrics\n", "\n", "\n", "cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = result, display_labels = [True, False])\n", "\n", "cm_display.plot()\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 588 }, "id": "ocxti1_vYBlQ", "outputId": "b86f0f50-f4e2-42af-d940-814bac6bbbfd" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "4960/4960 [==============================] - 50s 10ms/step\n", "[[0.95495116 0.03088329]\n", " [0.04504884 0.96911671]]\n", "ACC: 0.9620339331438207\n", "PR: 0.9686729438493482\n", "TPR: 0.954951157412199\n", "FPR: 0.030883291124557774\n", "F1Score: 0.9617631098260437\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "## UNSW-NB15" ], "metadata": { "id": "tsKelzsw3X0i" } }, { "cell_type": "code", "source": [ "import tensorflow as tf\n", "import numpy as np\n", "import pandas as pd\n", "from tensorflow import keras\n", "from tensorflow.keras import layers\n", "from keras import losses\n", "from keras import optimizers\n", "from keras import metrics\n", "import math\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ], "metadata": { "id": "0siibgmK3fuV" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "db864d4a-e708-4c09-e9c1-cfd142798120", "id": "AgJ1lpVg3X0j" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ] }, { "cell_type": "code", "source": [ "CSV_HEADER = [\n", "\"srcip\",\n", "\"sport\",\n", "\"dstip\",\n", "\"dsport\",\n", "\"proto\",\n", "\"state\",\n", "\"dur\",\n", "\"sbytes\",\n", "\"dbytes\",\n", "\"sttl\",\n", "\"dttl\",\n", "\"sloss\",\n", "\"dloss\",\n", "\"service\",\n", "\"Sload\",\n", "\"Dload\",\n", "\"Spkts\",\n", "\"Dpkts\",\n", "\"swin\",\n", "\"dwin\",\n", "\"stcpb\",\n", "\"dtcpb\",\n", "\"smeansz\",\n", "\"dmeansz\",\n", "\"trans_depth\",\n", "\"res_bdy_len\",\n", "\"Sjit\",\n", "\"Djit\",\n", "\"Stime\",\n", "\"Ltime\",\n", "\"Sintpkt\",\n", "\"Dintpkt\",\n", "\"tcprtt\",\n", "\"synack\",\n", "\"ackdat\",\n", "\"is_sm_ips_ports\",\n", "\"ct_state_ttl\",\n", "\"ct_flw_http_mthd\",\n", "\"is_ftp_login\",\n", "\"ct_ftp_cmd\",\n", "\"ct_srv_src\",\n", "\"ct_srv_dst\",\n", "\"ct_dst_ltm\",\n", "\"ct_src_ ltm\",\n", "\"ct_src_dport_ltm\",\n", "\"ct_dst_sport_ltm\",\n", "\"ct_dst_src_ltm\",\n", "\"attack_cat\",\n", "\"Label\"\n", "]" ], "metadata": { "id": "i0XhepFL3iCE" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import pandas as pd\n", "import glob\n", "import os\n", "\n", "path = r'C:\\DRO\\DCL_rawdata_files' # use your path\n", "all_files = glob.glob(os.path.join(path , \"/content/drive/MyDrive/datasets/UNSW-NB15/*.csv\"))\n", "\n", "li = []\n", "\n", "for filename in all_files:\n", " df = pd.read_csv(filename, index_col=None, header=None, names=CSV_HEADER,low_memory=False)\n", " li.append(df)\n", "\n", "frame = pd.concat(li, axis=0, ignore_index=True)\n", "pd.set_option('display.max_columns', None)\n", "frame" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 461 }, "id": "pJZ6lcuN3X0k", "outputId": "cc9cd371-c37c-4e25-eaa7-240b9fbcf4db" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " srcip sport dstip dsport proto state dur \\\n", "0 59.166.0.9 7045 149.171.126.7 25 tcp FIN 0.201886 \n", "1 59.166.0.9 9685 149.171.126.2 80 tcp FIN 5.864748 \n", "2 59.166.0.2 1421 149.171.126.4 53 udp CON 0.001391 \n", "3 59.166.0.2 21553 149.171.126.2 25 tcp FIN 0.053948 \n", "4 59.166.0.8 45212 149.171.126.4 53 udp CON 0.000953 \n", "... ... ... ... ... ... ... ... \n", "2540042 59.166.0.8 12520 149.171.126.6 31010 tcp FIN 0.020383 \n", "2540043 59.166.0.0 18895 149.171.126.9 80 tcp FIN 1.402957 \n", "2540044 59.166.0.0 30103 149.171.126.5 5190 tcp FIN 0.007108 \n", "2540045 59.166.0.6 30388 149.171.126.5 111 udp CON 0.004435 \n", "2540046 59.166.0.0 6055 149.171.126.5 54145 tcp FIN 0.072974 \n", "\n", " sbytes dbytes sttl dttl sloss dloss service Sload \\\n", "0 37552 3380 31 29 18 8 smtp 1.459438e+06 \n", "1 19410 1087890 31 29 2 370 http 2.640454e+04 \n", "2 146 178 31 29 0 0 dns 4.198418e+05 \n", "3 37812 3380 31 29 19 8 smtp 5.503374e+06 \n", "4 146 178 31 29 0 0 dns 6.128017e+05 \n", "... ... ... ... ... ... ... ... ... \n", "2540042 320 1874 31 29 1 2 - 1.047932e+05 \n", "2540043 19410 1087890 31 29 2 370 http 1.103783e+05 \n", "2540044 2158 2464 31 29 6 6 - 2.328644e+06 \n", "2540045 568 304 31 29 0 0 - 7.684329e+05 \n", "2540046 4238 60788 31 29 7 30 - 4.582454e+05 \n", "\n", " Dload Spkts Dpkts swin dwin stcpb dtcpb \\\n", "0 1.307669e+05 52 42 255 255 1422136554 3572668484 \n", "1 1.481983e+06 364 746 255 255 389619597 394688654 \n", "2 5.118620e+05 2 2 0 0 0 0 \n", "3 4.893601e+05 54 42 255 255 4047523379 1903327524 \n", "4 7.471144e+05 2 2 0 0 0 0 \n", "... ... ... ... ... ... ... ... \n", "2540042 6.436736e+05 6 8 255 255 3208686479 3225486168 \n", "2540043 6.195098e+06 364 746 255 255 283296697 2429736754 \n", "2540044 2.658413e+06 24 24 255 255 703293844 2848960529 \n", "2540045 4.112740e+05 4 4 0 0 0 0 \n", "2540046 6.571546e+06 72 72 255 255 1003293149 1003585034 \n", "\n", " smeansz dmeansz trans_depth res_bdy_len Sjit Djit \\\n", "0 722 80 0 0 456.043567 15.530109 \n", "1 53 1458 1 0 1031.366423 690.219581 \n", "2 73 89 0 0 0.000000 0.000000 \n", "3 700 80 0 0 65.909688 3.155258 \n", "4 73 89 0 0 0.000000 0.000000 \n", "... ... ... ... ... ... ... \n", "2540042 53 234 0 0 212.810729 3.079195 \n", "2540043 53 1458 1 3924 203.808900 114.173588 \n", "2540044 90 103 0 0 17.627831 0.432619 \n", "2540045 142 76 0 0 1.638604 1.390643 \n", "2540046 59 844 0 0 62.045310 61.899776 \n", "\n", " Stime Ltime Sintpkt Dintpkt tcprtt synack \\\n", "0 1424250009 1424250009 3.943843 4.912488 0.000590 0.000473 \n", "1 1424250003 1424250009 16.155447 7.871279 0.000771 0.000638 \n", "2 1424250009 1424250009 0.009000 0.002000 0.000000 0.000000 \n", "3 1424250009 1424250009 1.011547 1.302561 0.000674 0.000540 \n", "4 1424250009 1424250009 0.009000 0.004000 0.000000 0.000000 \n", "... ... ... ... ... ... ... \n", "2540042 1421955842 1421955842 4.007400 2.027429 0.006386 0.006189 \n", "2540043 1421955841 1421955842 3.864028 1.882421 0.000712 0.000550 \n", "2540044 1421955842 1421955842 0.274261 0.285478 0.000657 0.000532 \n", "2540045 1421955842 1421955842 1.165667 0.987333 0.000000 0.000000 \n", "2540046 1421955842 1421955842 1.022690 0.997042 0.002317 0.002173 \n", "\n", " ackdat is_sm_ips_ports ct_state_ttl ct_flw_http_mthd \\\n", "0 0.000117 0 0 NaN \n", "1 0.000133 0 0 1.0 \n", "2 0.000000 0 0 NaN \n", "3 0.000134 0 0 NaN \n", "4 0.000000 0 0 NaN \n", "... ... ... ... ... \n", "2540042 0.000197 0 0 0.0 \n", "2540043 0.000162 0 0 4.0 \n", "2540044 0.000125 0 0 0.0 \n", "2540045 0.000000 0 0 0.0 \n", "2540046 0.000144 0 0 0.0 \n", "\n", " is_ftp_login ct_ftp_cmd ct_srv_src ct_srv_dst ct_dst_ltm \\\n", "0 NaN 2 2 7 \n", "1 NaN 3 1 4 \n", "2 NaN 3 5 2 \n", "3 NaN 1 1 4 \n", "4 NaN 2 5 2 \n", "... ... ... ... ... ... \n", "2540042 0.0 0 8 20 7 \n", "2540043 0.0 0 1 1 2 \n", "2540044 0.0 0 13 13 6 \n", "2540045 0.0 0 10 13 6 \n", "2540046 0.0 0 13 13 6 \n", "\n", " ct_src_ ltm ct_src_dport_ltm ct_dst_sport_ltm ct_dst_src_ltm \\\n", "0 4 1 1 3 \n", "1 4 1 1 1 \n", "2 7 1 1 4 \n", "3 7 1 1 3 \n", "4 1 1 1 2 \n", "... ... ... ... ... \n", "2540042 5 1 1 4 \n", "2540043 7 2 2 2 \n", "2540044 7 2 1 2 \n", "2540045 5 1 1 3 \n", "2540046 7 1 1 2 \n", "\n", " attack_cat Label \n", "0 NaN 0 \n", "1 NaN 0 \n", "2 NaN 0 \n", "3 NaN 0 \n", "4 NaN 0 \n", "... ... ... \n", "2540042 NaN 0 \n", "2540043 NaN 0 \n", "2540044 NaN 0 \n", "2540045 NaN 0 \n", "2540046 NaN 0 \n", "\n", "[2540047 rows x 49 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
srcipsportdstipdsportprotostatedursbytesdbytessttldttlslossdlossserviceSloadDloadSpktsDpktsswindwinstcpbdtcpbsmeanszdmeansztrans_depthres_bdy_lenSjitDjitStimeLtimeSintpktDintpkttcprttsynackackdatis_sm_ips_portsct_state_ttlct_flw_http_mthdis_ftp_loginct_ftp_cmdct_srv_srcct_srv_dstct_dst_ltmct_src_ ltmct_src_dport_ltmct_dst_sport_ltmct_dst_src_ltmattack_catLabel
059.166.0.97045149.171.126.725tcpFIN0.2018863755233803129188smtp1.459438e+061.307669e+055242255255142213655435726684847228000456.04356715.530109142425000914242500093.9438434.9124880.0005900.0004730.00011700NaNNaN2274113NaN0
159.166.0.99685149.171.126.280tcpFIN5.86474819410108789031292370http2.640454e+041.481983e+06364746255255389619597394688654531458101031.366423690.2195811424250003142425000916.1554477.8712790.0007710.0006380.000133001.0NaN3144111NaN0
259.166.0.21421149.171.126.453udpCON0.001391146178312900dns4.198418e+055.118620e+052200007389000.0000000.000000142425000914242500090.0090000.0020000.0000000.0000000.00000000NaNNaN3527114NaN0
359.166.0.221553149.171.126.225tcpFIN0.0539483781233803129198smtp5.503374e+064.893601e+05544225525540475233791903327524700800065.9096883.155258142425000914242500091.0115471.3025610.0006740.0005400.00013400NaNNaN1147113NaN0
459.166.0.845212149.171.126.453udpCON0.000953146178312900dns6.128017e+057.471144e+052200007389000.0000000.000000142425000914242500090.0090000.0040000.0000000.0000000.00000000NaNNaN2521112NaN0
......................................................................................................................................................
254004259.166.0.812520149.171.126.631010tcpFIN0.0203833201874312912-1.047932e+056.436736e+0568255255320868647932254861685323400212.8107293.079195142195584214219558424.0074002.0274290.0063860.0061890.000197000.00.0082075114NaN0
254004359.166.0.018895149.171.126.980tcpFIN1.40295719410108789031292370http1.103783e+056.195098e+06364746255255283296697242973675453145813924203.808900114.173588142195584114219558423.8640281.8824210.0007120.0005500.000162004.00.001127222NaN0
254004459.166.0.030103149.171.126.55190tcpFIN0.00710821582464312966-2.328644e+062.658413e+0624242552557032938442848960529901030017.6278310.432619142195584214219558420.2742610.2854780.0006570.0005320.000125000.00.00131367212NaN0
254004559.166.0.630388149.171.126.5111udpCON0.004435568304312900-7.684329e+054.112740e+0544000014276001.6386041.390643142195584214219558421.1656670.9873330.0000000.0000000.000000000.00.00101365113NaN0
254004659.166.0.06055149.171.126.554145tcpFIN0.0729744238607883129730-4.582454e+056.571546e+06727225525510032931491003585034598440062.04531061.899776142195584214219558421.0226900.9970420.0023170.0021730.000144000.00.00131367112NaN0
\n", "

2540047 rows × 49 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 4 } ] }, { "cell_type": "code", "source": [ "frame.srcip = frame.srcip.astype('category').cat.codes\n", "frame.dstip = frame.dstip.astype('category').cat.codes\n", "frame.proto = frame.proto.astype('category').cat.codes\n", "frame.state = frame.state.astype('category').cat.codes\n", "frame.service = frame.service.astype('category').cat.codes\n", "frame.ct_flw_http_mthd = frame.ct_flw_http_mthd.astype('category').cat.codes\n", "frame.is_ftp_login = frame.is_ftp_login.astype('category').cat.codes\n", "frame.ct_ftp_cmd = frame.ct_ftp_cmd.astype('category').cat.codes\n", "frame['Label'] = frame['Label'].astype(str)\n", "frame['Label'] = frame['Label'].str.replace(\"1\", \"anomaly\")\n", "frame['Label'] = frame['Label'].str.replace(\"0\", \"normal\")\n", "frame = frame.drop('attack_cat', axis=1)\n", "frame" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 461 }, "id": "CO6IvWfH3rxv", "outputId": "9a235628-04c3-485f-aa89-83fa59bb4d25" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " srcip sport dstip dsport proto state dur sbytes dbytes \\\n", "0 42 7045 25 25 114 5 0.201886 37552 3380 \n", "1 42 9685 20 80 114 5 5.864748 19410 1087890 \n", "2 35 1421 22 53 120 2 0.001391 146 178 \n", "3 35 21553 20 25 114 5 0.053948 37812 3380 \n", "4 41 45212 22 53 120 2 0.000953 146 178 \n", "... ... ... ... ... ... ... ... ... ... \n", "2540042 41 12520 24 31010 114 5 0.020383 320 1874 \n", "2540043 33 18895 27 80 114 5 1.402957 19410 1087890 \n", "2540044 33 30103 23 5190 114 5 0.007108 2158 2464 \n", "2540045 39 30388 23 111 120 2 0.004435 568 304 \n", "2540046 33 6055 23 54145 114 5 0.072974 4238 60788 \n", "\n", " sttl dttl sloss dloss service Sload Dload Spkts \\\n", "0 31 29 18 8 9 1.459438e+06 1.307669e+05 52 \n", "1 31 29 2 370 5 2.640454e+04 1.481983e+06 364 \n", "2 31 29 0 0 2 4.198418e+05 5.118620e+05 2 \n", "3 31 29 19 8 9 5.503374e+06 4.893601e+05 54 \n", "4 31 29 0 0 2 6.128017e+05 7.471144e+05 2 \n", "... ... ... ... ... ... ... ... ... \n", "2540042 31 29 1 2 0 1.047932e+05 6.436736e+05 6 \n", "2540043 31 29 2 370 5 1.103783e+05 6.195098e+06 364 \n", "2540044 31 29 6 6 0 2.328644e+06 2.658413e+06 24 \n", "2540045 31 29 0 0 0 7.684329e+05 4.112740e+05 4 \n", "2540046 31 29 7 30 0 4.582454e+05 6.571546e+06 72 \n", "\n", " Dpkts swin dwin stcpb dtcpb smeansz dmeansz \\\n", "0 42 255 255 1422136554 3572668484 722 80 \n", "1 746 255 255 389619597 394688654 53 1458 \n", "2 2 0 0 0 0 73 89 \n", "3 42 255 255 4047523379 1903327524 700 80 \n", "4 2 0 0 0 0 73 89 \n", "... ... ... ... ... ... ... ... \n", "2540042 8 255 255 3208686479 3225486168 53 234 \n", "2540043 746 255 255 283296697 2429736754 53 1458 \n", "2540044 24 255 255 703293844 2848960529 90 103 \n", "2540045 4 0 0 0 0 142 76 \n", "2540046 72 255 255 1003293149 1003585034 59 844 \n", "\n", " trans_depth res_bdy_len Sjit Djit Stime \\\n", "0 0 0 456.043567 15.530109 1424250009 \n", "1 1 0 1031.366423 690.219581 1424250003 \n", "2 0 0 0.000000 0.000000 1424250009 \n", "3 0 0 65.909688 3.155258 1424250009 \n", "4 0 0 0.000000 0.000000 1424250009 \n", "... ... ... ... ... ... \n", "2540042 0 0 212.810729 3.079195 1421955842 \n", "2540043 1 3924 203.808900 114.173588 1421955841 \n", "2540044 0 0 17.627831 0.432619 1421955842 \n", "2540045 0 0 1.638604 1.390643 1421955842 \n", "2540046 0 0 62.045310 61.899776 1421955842 \n", "\n", " Ltime Sintpkt Dintpkt tcprtt synack ackdat \\\n", "0 1424250009 3.943843 4.912488 0.000590 0.000473 0.000117 \n", "1 1424250009 16.155447 7.871279 0.000771 0.000638 0.000133 \n", "2 1424250009 0.009000 0.002000 0.000000 0.000000 0.000000 \n", "3 1424250009 1.011547 1.302561 0.000674 0.000540 0.000134 \n", "4 1424250009 0.009000 0.004000 0.000000 0.000000 0.000000 \n", "... ... ... ... ... ... ... \n", "2540042 1421955842 4.007400 2.027429 0.006386 0.006189 0.000197 \n", "2540043 1421955842 3.864028 1.882421 0.000712 0.000550 0.000162 \n", "2540044 1421955842 0.274261 0.285478 0.000657 0.000532 0.000125 \n", "2540045 1421955842 1.165667 0.987333 0.000000 0.000000 0.000000 \n", "2540046 1421955842 1.022690 0.997042 0.002317 0.002173 0.000144 \n", "\n", " is_sm_ips_ports ct_state_ttl ct_flw_http_mthd is_ftp_login \\\n", "0 0 0 -1 -1 \n", "1 0 0 1 -1 \n", "2 0 0 -1 -1 \n", "3 0 0 -1 -1 \n", "4 0 0 -1 -1 \n", "... ... ... ... ... \n", "2540042 0 0 0 0 \n", "2540043 0 0 4 0 \n", "2540044 0 0 0 0 \n", "2540045 0 0 0 0 \n", "2540046 0 0 0 0 \n", "\n", " ct_ftp_cmd ct_srv_src ct_srv_dst ct_dst_ltm ct_src_ ltm \\\n", "0 8 2 2 7 4 \n", "1 8 3 1 4 4 \n", "2 8 3 5 2 7 \n", "3 8 1 1 4 7 \n", "4 8 2 5 2 1 \n", "... ... ... ... ... ... \n", "2540042 0 8 20 7 5 \n", "2540043 0 1 1 2 7 \n", "2540044 0 13 13 6 7 \n", "2540045 0 10 13 6 5 \n", "2540046 0 13 13 6 7 \n", "\n", " ct_src_dport_ltm ct_dst_sport_ltm ct_dst_src_ltm Label \n", "0 1 1 3 normal \n", "1 1 1 1 normal \n", "2 1 1 4 normal \n", "3 1 1 3 normal \n", "4 1 1 2 normal \n", "... ... ... ... ... \n", "2540042 1 1 4 normal \n", "2540043 2 2 2 normal \n", "2540044 2 1 2 normal \n", "2540045 1 1 3 normal \n", "2540046 1 1 2 normal \n", "\n", "[2540047 rows x 48 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
srcipsportdstipdsportprotostatedursbytesdbytessttldttlslossdlossserviceSloadDloadSpktsDpktsswindwinstcpbdtcpbsmeanszdmeansztrans_depthres_bdy_lenSjitDjitStimeLtimeSintpktDintpkttcprttsynackackdatis_sm_ips_portsct_state_ttlct_flw_http_mthdis_ftp_loginct_ftp_cmdct_srv_srcct_srv_dstct_dst_ltmct_src_ ltmct_src_dport_ltmct_dst_sport_ltmct_dst_src_ltmLabel
0427045252511450.201886375523380312918891.459438e+061.307669e+055242255255142213655435726684847228000456.04356715.530109142425000914242500093.9438434.9124880.0005900.0004730.00011700-1-182274113normal
1429685208011455.8647481941010878903129237052.640454e+041.481983e+06364746255255389619597394688654531458101031.366423690.2195811424250003142425000916.1554477.8712790.0007710.0006380.000133001-183144111normal
2351421225312020.00139114617831290024.198418e+055.118620e+052200007389000.0000000.000000142425000914242500090.0090000.0020000.0000000.0000000.00000000-1-183527114normal
33521553202511450.053948378123380312919895.503374e+064.893601e+05544225525540475233791903327524700800065.9096883.155258142425000914242500091.0115471.3025610.0006740.0005400.00013400-1-181147113normal
44145212225312020.00095314617831290026.128017e+057.471144e+052200007389000.0000000.000000142425000914242500090.0090000.0040000.0000000.0000000.00000000-1-182521112normal
...................................................................................................................................................
25400424112520243101011450.020383320187431291201.047932e+056.436736e+0568255255320868647932254861685323400212.8107293.079195142195584214219558424.0074002.0274290.0063860.0061890.0001970000082075114normal
25400433318895278011451.4029571941010878903129237051.103783e+056.195098e+06364746255255283296697242973675453145813924203.808900114.173588142195584114219558423.8640281.8824210.0007120.0005500.000162004001127222normal
2540044333010323519011450.0071082158246431296602.328644e+062.658413e+0624242552557032938442848960529901030017.6278310.432619142195584214219558420.2742610.2854780.0006570.0005320.00012500000131367212normal
254004539303882311112020.00443556830431290007.684329e+054.112740e+0544000014276001.6386041.390643142195584214219558421.1656670.9873330.0000000.0000000.00000000000101365113normal
2540046336055235414511450.072974423860788312973004.582454e+056.571546e+06727225525510032931491003585034598440062.04531061.899776142195584214219558421.0226900.9970420.0023170.0021730.00014400000131367112normal
\n", "

2540047 rows × 48 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 5 } ] }, { "cell_type": "code", "source": [ "column_headers = list(frame.columns.values)\n", "column_headers" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "76ffaf32-9bd4-432d-ba1e-0a85afcbe633", "id": "g9uSi28h3X0k" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['srcip',\n", " 'sport',\n", " 'dstip',\n", " 'dsport',\n", " 'proto',\n", " 'state',\n", " 'dur',\n", " 'sbytes',\n", " 'dbytes',\n", " 'sttl',\n", " 'dttl',\n", " 'sloss',\n", " 'dloss',\n", " 'service',\n", " 'Sload',\n", " 'Dload',\n", " 'Spkts',\n", " 'Dpkts',\n", " 'swin',\n", " 'dwin',\n", " 'stcpb',\n", " 'dtcpb',\n", " 'smeansz',\n", " 'dmeansz',\n", " 'trans_depth',\n", " 'res_bdy_len',\n", " 'Sjit',\n", " 'Djit',\n", " 'Stime',\n", " 'Ltime',\n", " 'Sintpkt',\n", " 'Dintpkt',\n", " 'tcprtt',\n", " 'synack',\n", " 'ackdat',\n", " 'is_sm_ips_ports',\n", " 'ct_state_ttl',\n", " 'ct_flw_http_mthd',\n", " 'is_ftp_login',\n", " 'ct_ftp_cmd',\n", " 'ct_srv_src',\n", " 'ct_srv_dst',\n", " 'ct_dst_ltm',\n", " 'ct_src_ ltm',\n", " 'ct_src_dport_ltm',\n", " 'ct_dst_sport_ltm',\n", " 'ct_dst_src_ltm',\n", " 'Label']" ] }, "metadata": {}, "execution_count": 6 } ] }, { "cell_type": "code", "source": [ "CSV_HEADER = column_headers\n", "df = frame\n", "def Remove_Outlier_Indices(df):\n", " Q1 = df.quantile(0.02)\n", " Q3 = df.quantile(0.98)\n", " IQR = Q3 - Q1\n", " #trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR)))\n", " trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)\n", " return trueList\n", "\n", "nonOutlierList = Remove_Outlier_Indices(df)\n", "new_data = df[nonOutlierList]\n", "\n", "df = new_data\n", "df = df.reset_index(drop=True)\n", "del new_data\n", "del nonOutlierList\n", "del li\n", "del frame\n", "import gc\n", "gc.collect()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "62ee996a-146b-44da-9ead-aa5b88c614b0", "id": "JfHf8zgY3X0l" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":4: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " Q1 = df.quantile(0.02)\n", ":5: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " Q3 = df.quantile(0.98)\n", ":8: FutureWarning: Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`\n", " trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "31" ] }, "metadata": {}, "execution_count": 7 } ] }, { "cell_type": "code", "source": [ "from sklearn.linear_model import Lasso\n", "from sklearn.model_selection import GridSearchCV, KFold\n", "X = df.drop([' Label'], axis=1)\n", "Y = df[\" Label\"].astype('category').cat.codes\n", "# parameters to be tested on GridSearchCV\n", "params = {\"alpha\":np.arange(0.00001, 10, 500)}\n", "\n", "# Number of Folds and adding the random state for replication\n", "kf=KFold(n_splits=5,shuffle=True, random_state=42)\n", "\n", "# Initializing the Model\n", "lasso = Lasso()\n", "\n", "# GridSearchCV with model, params and folds.\n", "lasso_cv=GridSearchCV(lasso, param_grid=params, cv=kf)\n", "lasso_cv.fit(X, Y)\n", "\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 346 }, "outputId": "538a8f92-3dd4-419e-f12f-79ff3ac17f37", "id": "cIYk9ZnY3X0m" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.965e+04, tolerance: 3.205e+01\n", " model = cd_fast.enet_coordinate_descent(\n", "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.967e+04, tolerance: 3.207e+01\n", " model = cd_fast.enet_coordinate_descent(\n", "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.962e+04, tolerance: 3.203e+01\n", " model = cd_fast.enet_coordinate_descent(\n", "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.967e+04, tolerance: 3.203e+01\n", " model = cd_fast.enet_coordinate_descent(\n", "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.959e+04, tolerance: 3.204e+01\n", " model = cd_fast.enet_coordinate_descent(\n", "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 7.455e+04, tolerance: 4.006e+01\n", " model = cd_fast.enet_coordinate_descent(\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),\n", " estimator=Lasso(), param_grid={'alpha': array([1.e-05])})" ], "text/html": [ "
GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),\n",
              "             estimator=Lasso(), param_grid={'alpha': array([1.e-05])})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [ "from sklearn.linear_model import Lasso\n", "from sklearn.model_selection import GridSearchCV, KFold\n", "X = df.drop(['Label'], axis=1)\n", "Y = df[\"Label\"].astype('category').cat.codes\n", "# calling the model with the best parameter\n", "lasso1 = Lasso(alpha=0.00001)\n", "lasso1.fit(X, Y)\n", "\n", "# Using np.abs() to make coefficients positive.\n", "lasso1_coef = np.abs(lasso1.coef_)\n", "# Subsetting the features which has more than 0.001 importance.\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "f1e5fa37-4c6a-4998-b267-aadcc9e5bea9", "id": "DkvVFpIP3X0m" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.146e+04, tolerance: 2.565e+01\n", " model = cd_fast.enet_coordinate_descent(\n" ] } ] }, { "cell_type": "code", "source": [ "names=df.drop(\"Label\", axis=1).columns" ], "metadata": { "id": "eG_vL8HU3X0m" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "feature_subset=np.array(names)[lasso1_coef>0.003]\n", "feature_subset" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "67af06bf-5e83-4649-ff31-fd3cefbedee2", "id": "nm5knwtG3X0m" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['proto', 'state', 'dur', 'sloss', 'service', 'trans_depth',\n", " 'synack', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login'],\n", " dtype=object)" ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "code", "source": [ "m=np.array(lasso1_coef)[lasso1_coef>0.003]\n", "m" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "de6ce49c-4e5b-41e1-cce2-f2081d65e777", "id": "OSo02gS13X0n" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([3.58566963e-03, 3.25063436e-02, 9.61164559e-03, 5.40550719e-03,\n", " 4.20648889e-03, 3.34519850e-02, 3.83053988e+00, 3.90737301e-01,\n", " 3.78285090e-03, 1.22391985e-02])" ] }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "source": [ "import matplotlib.pyplot as plt\n", "# plotting the Column Names and Importance of Columns.\n", "plt.bar(names, lasso1_coef)\n", "plt.xticks(rotation=90)\n", "plt.grid()\n", "plt.title(\"Feature Selection Based on Lasso\")\n", "plt.xlabel(\"Features\")\n", "plt.ylabel(\"Importance\")\n", "plt.ylim(0, 0.15)\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 577 }, "outputId": "8886487e-073a-46a3-ef27-33c0295a2fe5", "id": "sv01YtY-3X0n" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "columns = []\n", "for k in feature_subset:\n", " columns.append(str(k))\n", "columns" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "c07cb763-7fb2-403b-9a72-3968332617f8", "id": "Hos6VC0I3X0n" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['proto',\n", " 'state',\n", " 'dur',\n", " 'sloss',\n", " 'service',\n", " 'trans_depth',\n", " 'synack',\n", " 'ct_state_ttl',\n", " 'ct_flw_http_mthd',\n", " 'is_ftp_login']" ] }, "metadata": {}, "execution_count": 16 } ] }, { "cell_type": "code", "source": [ "X = df.drop(['Label'], axis=1)\n", "Y = df[\"Label\"].astype('category').cat.codes\n", "principalDf = pd.DataFrame(data = X\n", " , columns = columns)\n", "finalDf = pd.concat([principalDf, df[\"Label\"]], axis = 1)\n", "finalDf" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "outputId": "d90a896b-2d5b-4834-b156-97ee35493074", "id": "rLz6kZRM3X0n" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " proto state dur sloss service trans_depth synack \\\n", "0 114 5 0.201886 18 9 0 0.000473 \n", "1 114 5 5.864748 2 5 1 0.000638 \n", "2 120 2 0.001391 0 2 0 0.000000 \n", "3 114 5 0.053948 19 9 0 0.000540 \n", "4 120 2 0.000953 0 2 0 0.000000 \n", "... ... ... ... ... ... ... ... \n", "2438669 114 5 0.020383 1 0 0 0.006189 \n", "2438670 114 5 1.402957 2 5 1 0.000550 \n", "2438671 114 5 0.007108 6 0 0 0.000532 \n", "2438672 120 2 0.004435 0 0 0 0.000000 \n", "2438673 114 5 0.072974 7 0 0 0.002173 \n", "\n", " ct_state_ttl ct_flw_http_mthd is_ftp_login Label \n", "0 0 -1 -1 normal \n", "1 0 1 -1 normal \n", "2 0 -1 -1 normal \n", "3 0 -1 -1 normal \n", "4 0 -1 -1 normal \n", "... ... ... ... ... \n", "2438669 0 0 0 normal \n", "2438670 0 4 0 normal \n", "2438671 0 0 0 normal \n", "2438672 0 0 0 normal \n", "2438673 0 0 0 normal \n", "\n", "[2438674 rows x 11 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
protostatedurslossservicetrans_depthsynackct_state_ttlct_flw_http_mthdis_ftp_loginLabel
011450.20188618900.0004730-1-1normal
111455.8647482510.00063801-1normal
212020.0013910200.0000000-1-1normal
311450.05394819900.0005400-1-1normal
412020.0009530200.0000000-1-1normal
....................................
243866911450.0203831000.006189000normal
243867011451.4029572510.000550040normal
243867111450.0071086000.000532000normal
243867212020.0044350000.000000000normal
243867311450.0729747000.002173000normal
\n", "

2438674 rows × 11 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split\n", "train_data, test_data = train_test_split(finalDf, test_size=0.25)\n", "train_data_file = \"train_data.csv\"\n", "test_data_file = \"test_data.csv\"\n", "\n", "train_data.to_csv(train_data_file, index=False, header=False)\n", "test_data.to_csv(test_data_file, index=False, header=False)" ], "metadata": { "id": "u0wGu11w3X0o" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "del finalDf\n", "del principalDf\n", "del train_data\n", "del test_data" ], "metadata": { "id": "gskeDt0-3X0o" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "CSV_HEADER = []\n", "for x in columns:\n", " CSV_HEADER.append(x)\n", "CSV_HEADER.append(\"Label\")\n", "\n", "# A list of the numerical feature names.\n", "NUMERIC_FEATURE_NAMES = columns\n", "# A dictionary of the categorical features and their vocabulary.\n", "CATEGORICAL_FEATURES_WITH_VOCABULARY = {\n", "}\n", "# A list of the columns to ignore from the dataset.\n", "IGNORE_COLUMN_NAMES = []\n", "# A list of the categorical feature names.\n", "CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())\n", "# A list of all the input features.\n", "FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES\n", "# A list of column default values for each feature.\n", "COLUMN_DEFAULTS = [\n", " [0.0] if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else [\"NA\"]\n", " for feature_name in CSV_HEADER\n", "]\n", "# The name of the target feature.\n", "TARGET_FEATURE_NAME = \"Label\"\n", "# A list of the labels of the target features.\n", "TARGET_LABELS = [\"normal\", \"anomaly\"]\n", "\n", "from tensorflow.keras.layers import StringLookup\n", "\n", "target_label_lookup = StringLookup(\n", " vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0\n", ")\n", "\n", "\n", "def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):\n", " dataset = tf.data.experimental.make_csv_dataset(\n", " csv_file_path,\n", " batch_size=batch_size,\n", " column_names=CSV_HEADER,\n", " column_defaults=COLUMN_DEFAULTS,\n", " label_name=TARGET_FEATURE_NAME,\n", " num_epochs=1,\n", " header=False,\n", " na_value=\"?\",\n", " shuffle=shuffle,\n", " ).map(lambda features, target: (features, target_label_lookup(target)))\n", " return dataset.cache()\n", "\n", "def create_model_inputs():\n", " inputs = {}\n", " for feature_name in FEATURE_NAMES:\n", " if feature_name in NUMERIC_FEATURE_NAMES:\n", " inputs[feature_name] = layers.Input(\n", " name=feature_name, shape=(), dtype=tf.float32\n", " )\n", " else:\n", " inputs[feature_name] = layers.Input(\n", " name=feature_name, shape=(), dtype=tf.string\n", " )\n", " return inputs\n", "\n", "def encode_inputs(inputs):\n", " encoded_features = []\n", " for feature_name in inputs:\n", " if feature_name in CATEGORICAL_FEATURE_NAMES:\n", " vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]\n", " #print(vocabulary)\n", " # Create a lookup to convert a string values to an integer indices.\n", " # Since we are not using a mask token, nor expecting any out of vocabulary\n", " # (oov) token, we set mask_token to None and num_oov_indices to 0.\n", " lookup = StringLookup(\n", " vocabulary=vocabulary, mask_token=None, num_oov_indices=0\n", " )\n", " # Convert the string input values into integer indices.\n", " value_index = lookup(inputs[feature_name])\n", " embedding_dims = int(math.sqrt(lookup.vocabulary_size()))\n", " # Create an embedding layer with the specified dimensions.\n", " embedding = layers.Embedding(\n", " input_dim=lookup.vocabulary_size(), output_dim=embedding_dims\n", " )\n", " # Convert the index values to embedding representations.\n", " encoded_feature = embedding(value_index)\n", " else:\n", " # Use the numerical features as-is.\n", " encoded_feature = inputs[feature_name]\n", " if inputs[feature_name].shape[-1] is None:\n", " encoded_feature = tf.expand_dims(encoded_feature, -1)\n", "\n", " encoded_features.append(encoded_feature)\n", "\n", " encoded_features = layers.concatenate(encoded_features)\n", " return encoded_features\n", "\n", "class NeuralDecisionTree(keras.Model):\n", " def __init__(self, depth, num_features, used_features_rate, num_classes):\n", " super().__init__()\n", " self.depth = depth\n", " self.num_leaves = 2 ** depth\n", " self.num_classes = num_classes\n", "\n", " # Create a mask for the randomly selected features.\n", " num_used_features = int(num_features * used_features_rate)\n", " one_hot = np.eye(num_features)\n", " sampled_feature_indicies = np.random.choice(\n", " np.arange(num_features), num_used_features, replace=False\n", " )\n", " self.used_features_mask = one_hot[sampled_feature_indicies]\n", "\n", " # Initialize the weights of the classes in leaves.\n", " self.pi = tf.Variable(\n", " initial_value=tf.random_normal_initializer()(\n", " shape=[self.num_leaves, self.num_classes]\n", " ),\n", " dtype=\"float32\",\n", " trainable=True,\n", " )\n", "\n", " # Initialize the stochastic routing layer.\n", " self.decision_fn = layers.Dense(\n", " units=self.num_leaves, activation=\"sigmoid\", name=\"decision\"\n", " )\n", "\n", " def call(self, features):\n", " batch_size = tf.shape(features)[0]\n", "\n", " # Apply the feature mask to the input features.\n", " features = tf.matmul(\n", " features, self.used_features_mask, transpose_b=True\n", " ) # [batch_size, num_used_features]\n", " # Compute the routing probabilities.\n", " decisions = tf.expand_dims(\n", " self.decision_fn(features), axis=2\n", " ) # [batch_size, num_leaves, 1]\n", " # Concatenate the routing probabilities with their complements.\n", " decisions = layers.concatenate(\n", " [decisions, 1 - decisions], axis=2\n", " ) # [batch_size, num_leaves, 2]\n", "\n", " mu = tf.ones([batch_size, 1, 1])\n", "\n", " begin_idx = 1\n", " end_idx = 2\n", " # Traverse the tree in breadth-first order.\n", " for level in range(self.depth):\n", " mu = tf.reshape(mu, [batch_size, -1, 1]) # [batch_size, 2 ** level, 1]\n", " mu = tf.tile(mu, (1, 1, 2)) # [batch_size, 2 ** level, 2]\n", " level_decisions = decisions[\n", " :, begin_idx:end_idx, :\n", " ] # [batch_size, 2 ** level, 2]\n", " mu = mu * level_decisions # [batch_size, 2**level, 2]\n", " begin_idx = end_idx\n", " end_idx = begin_idx + 2 ** (level + 1)\n", "\n", " mu = tf.reshape(mu, [batch_size, self.num_leaves]) # [batch_size, num_leaves]\n", " probabilities = keras.activations.softmax(self.pi) # [num_leaves, num_classes]\n", " outputs = tf.matmul(mu, probabilities) # [batch_size, num_classes]\n", " return outputs\n", "\n", "class NeuralDecisionForest(keras.Model):\n", " def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):\n", " super().__init__()\n", " self.ensemble = []\n", " # Initialize the ensemble by adding NeuralDecisionTree instances.\n", " # Each tree will have its own randomly selected input features to use.\n", " for _ in range(num_trees):\n", " self.ensemble.append(\n", " NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)\n", " )\n", "\n", " def call(self, inputs):\n", " # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.\n", " batch_size = tf.shape(inputs)[0]\n", " outputs = tf.zeros([batch_size, num_classes])\n", "\n", " # Aggregate the outputs of trees in the ensemble.\n", " for tree in self.ensemble:\n", " outputs += tree(inputs)\n", " # Divide the outputs by the ensemble size to get the average.\n", " outputs /= len(self.ensemble)\n", " return outputs\n", "learning_rate = 0.01\n", "batch_size = 128\n", "num_epochs = 10\n", "\n", "\n", "def run_experiment(model):\n", "\n", " # model.compile(\n", " # optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n", " # loss=keras.losses.SparseCategoricalCrossentropy(),\n", " # metrics=[keras.metrics.SparseCategoricalAccuracy()],\n", " # )\n", " model.compile(\n", " optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n", " loss=keras.losses.SparseCategoricalCrossentropy(),\n", " metrics=[metrics.SparseCategoricalAccuracy()],\n", " )\n", " print(\"Start training the model...\")\n", " train_dataset = get_dataset_from_csv(\n", " train_data_file, shuffle=True, batch_size=batch_size\n", " )\n", "\n", " model.fit(train_dataset, epochs=num_epochs)\n", " print(\"Model training finished\")\n", "\n", " print(\"Evaluating the model on the test data...\")\n", " test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)\n", "\n", " _, accuracy = model.evaluate(test_dataset)\n", " print(f\"Test accuracy: {round(accuracy * 100, 2)}%\")\n", " return model" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "c2f09d8c-a13d-4bed-b751-02913e8c56cd", "id": "kvpabeu43X0o" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/numpy/core/numeric.py:2463: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", " return bool(asarray(a1 == a2).all())\n" ] } ] }, { "cell_type": "code", "source": [ "num_trees = 25\n", "depth = 5\n", "used_features_rate = 0.5\n", "num_classes = len(TARGET_LABELS)\n", "\n", "\n", "def create_forest_model():\n", " inputs = create_model_inputs()\n", " features = encode_inputs(inputs)\n", " features = layers.BatchNormalization()(features)\n", " num_features = features.shape[1]\n", "\n", " forest_model = NeuralDecisionForest(\n", " num_trees, depth, num_features, used_features_rate, num_classes\n", " )\n", "\n", " outputs = forest_model(features)\n", " model = keras.Model(inputs=inputs, outputs=outputs)\n", " return model\n", "\n", "\n", "forest_model = create_forest_model()\n", "\n", "finalModel = run_experiment(forest_model)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "56d0e878-b296-4b96-c931-f1c6c4cbaa42", "id": "BTvfogDW3X0p" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Start training the model...\n", "Epoch 1/10\n", "14290/14290 [==============================] - 368s 24ms/step - loss: 0.0615 - sparse_categorical_accuracy: 0.9885\n", "Epoch 2/10\n", "14290/14290 [==============================] - 304s 21ms/step - loss: 0.0574 - sparse_categorical_accuracy: 0.9892\n", "Epoch 3/10\n", "14290/14290 [==============================] - 305s 21ms/step - loss: 0.0571 - sparse_categorical_accuracy: 0.9892\n", "Epoch 4/10\n", "14290/14290 [==============================] - 303s 21ms/step - loss: 0.0570 - sparse_categorical_accuracy: 0.9893\n", "Epoch 5/10\n", "14290/14290 [==============================] - 302s 21ms/step - loss: 0.0570 - sparse_categorical_accuracy: 0.9892\n", "Epoch 6/10\n", "14290/14290 [==============================] - 302s 21ms/step - loss: 0.0570 - sparse_categorical_accuracy: 0.9893\n", "Epoch 7/10\n", "14290/14290 [==============================] - 304s 21ms/step - loss: 0.0570 - sparse_categorical_accuracy: 0.9893\n", "Epoch 8/10\n", "14290/14290 [==============================] - 305s 21ms/step - loss: 0.0569 - sparse_categorical_accuracy: 0.9894\n", "Epoch 9/10\n", "14290/14290 [==============================] - 308s 22ms/step - loss: 0.0569 - sparse_categorical_accuracy: 0.9894\n", "Epoch 10/10\n", "14290/14290 [==============================] - 306s 21ms/step - loss: 0.0569 - sparse_categorical_accuracy: 0.9894\n", "Model training finished\n", "Evaluating the model on the test data...\n", "4764/4764 [==============================] - 49s 10ms/step - loss: 0.0564 - sparse_categorical_accuracy: 0.9897\n", "Test accuracy: 98.97%\n" ] } ] }, { "cell_type": "code", "source": [ "test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)\n", "colnames=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'target']\n", "data = pd.read_csv(\"test_data.csv\", names=colnames, header=None)\n", "data['target'].replace('normal', 0,inplace=True)\n", "data['target'].replace('anomaly', 1,inplace=True)\n", "y_test = data['target'].values # as a numpy array\n", "from sklearn.metrics import confusion_matrix\n", "y_prediction = finalModel.predict(test_dataset)\n", "y_prediction = np.argmax (y_prediction, axis = 1)\n", "result = confusion_matrix(y_test, y_prediction , normalize='pred')\n", "print(result)\n", "TP = result[0][0]\n", "FP = result[0][1]\n", "TN = result[1][1]\n", "FN = result[1][0]\n", "ACC = (TP+TN)/(TP+TN+FP+FN)\n", "PR = TP/(TP+FP) #precision\n", "TPR = TP/(TP+FN) #Recall or True positive rate\n", "FPR = FP/(FP+TN)\n", "F1Score = 2*(PR*TPR)/(PR+TPR)\n", "print(\"ACC: \" + str(ACC))\n", "print(\"PR: \" + str(PR))\n", "print(\"TPR: \" + str(TPR))\n", "print(\"FPR: \" + str(FPR))\n", "print(\"F1Score: \" + str(F1Score))\n", "import matplotlib.pyplot as plt\n", "import numpy\n", "from sklearn import metrics\n", "\n", "\n", "cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = result, display_labels = [True, False])\n", "\n", "cm_display.plot()\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "54ec1211-4d37-4baa-d57e-d74780e3d603", "id": "nPIZFS1A3X0p" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "4764/4764 [==============================] - 46s 9ms/step\n", "[[0.99779348 0.06634595]\n", " [0.00220652 0.93365405]]\n", "ACC: 0.965723761422259\n", "PR: 0.9376529508800495\n", "TPR: 0.997793476119445\n", "FPR: 0.066345953274927\n", "F1Score: 0.966788834039377\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] } ] }