{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "source": [ "import tensorflow as tf\n", "import numpy as np\n", "import pandas as pd\n", "from tensorflow import keras\n", "from tensorflow.keras import layers\n", "from keras import losses\n", "from keras import optimizers\n", "from keras import metrics\n", "import math\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ], "metadata": { "id": "sIDDU2PYPdH_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "CSV_HEADER = [\n", " \"duration\",\n", " \"protocol_type\",\n", " \"service\",\n", " \"flag\",\n", " \"src_bytes\",\n", " \"dst_bytes\",\n", " \"land\",\n", " \"wrong_fragment\",\n", " \"urgent\",\n", " \"hot\",\n", " \"num_failed_logins\",\n", " \"logged_in\",\n", " \"num_compromised\",\n", " \"root_shell\",\n", " \"su_attempted\",\n", " \"num_root\",\n", " \"num_file_creations\",\n", " \"num_shells\",\n", " \"num_access_files\",\n", " \"num_outbound_cmds\",\n", " \"is_host_login\",\n", " \"is_guest_login\",\n", " \"count\",\n", " \"srv_count\",\n", " \"serror_rate\",\n", " \"srv_serror_rate\",\n", " \"rerror_rate\",\n", " \"srv_rerror_rate\",\n", " \"same_srv_rate\",\n", " \"diff_srv_rate\",\n", " \"srv_diff_host_rate\",\n", " \"dst_host_count\",\n", " \"dst_host_srv_count\",\n", " \"dst_host_same_srv_rate\",\n", " \"dst_host_diff_srv_rate\",\n", " \"dst_host_same_src_port_rate\",\n", " \"dst_host_srv_diff_host_rate\",\n", " \"dst_host_serror_rate\",\n", " \"dst_host_srv_serror_rate\",\n", " \"dst_host_rerror_rate\",\n", " \"dst_host_srv_rerror_rate\",\n", " \"class\"\n", "]\n", "\n", "\n", "train_data = pd.read_csv(\"train.csv\", header=None, names=CSV_HEADER)\n", "\n", "test_data = pd.read_csv(\"test.csv\", header=None, names=CSV_HEADER)\n", "\n", "print(f\"Train dataset shape: {train_data.shape}\")\n", "print(f\"Test dataset shape: {test_data.shape}\")\n", "train_data['class'] = train_data['class'].str.replace(r\"^(.(?:53: FutureWarning: The default value of regex will change from True to False in a future version.\n", " train_data['class'] = train_data['class'].str.replace(r\"^(.(?:54: FutureWarning: The default value of regex will change from True to False in a future version.\n", " test_data['class'] = test_data['class'].str.replace(r\"^(.(?" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "## Removing outliers and duplicates" ], "metadata": { "id": "wHhIvDl9V5kl" } }, { "cell_type": "code", "source": [ "def Remove_Outlier_Indices(df):\n", " Q1 = df.quantile(0.02)\n", " Q3 = df.quantile(0.98)\n", " IQR = Q3 - Q1\n", " #trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR)))\n", " trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)\n", " return trueList\n", "\n", "nonOutlierList = Remove_Outlier_Indices(train_data)\n", "new_train_data = train_data[nonOutlierList]\n", "\n", "nonOutlierList = Remove_Outlier_Indices(test_data)\n", "new_test_data = test_data[nonOutlierList]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8GZwWkuSV5HT", "outputId": "ab0a253e-f92e-46e7-9e93-b72402784899" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":2: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " Q1 = df.quantile(0.02)\n", ":3: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " Q3 = df.quantile(0.98)\n", ":6: FutureWarning: Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`\n", " trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Merging train and test datasets" ], "metadata": { "id": "uNzhq0uHKkUB" } }, { "cell_type": "code", "source": [ "frames = [new_train_data, new_test_data]\n", "df = pd.concat(frames)\n", "df = df.reset_index(drop=True)\n", "df" ], "metadata": { "id": "3kxP6kBRGX2y", "colab": { "base_uri": "https://localhost:8080/", "height": 444 }, "outputId": "8f225d41-1967-4ae7-a5d1-5d8b58d2b26e" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " duration protocol_type service flag src_bytes dst_bytes land \\\n", "0 0 1 20 9 491 0 0 \n", "1 0 2 44 9 146 0 0 \n", "2 0 1 49 5 0 0 0 \n", "3 0 1 24 9 232 8153 0 \n", "4 0 1 24 9 199 420 0 \n", "... ... ... ... ... ... ... ... \n", "139899 0 0 14 9 1032 0 0 \n", "139900 0 1 49 9 794 333 0 \n", "139901 0 1 22 9 317 938 0 \n", "139902 0 2 11 9 42 42 0 \n", "139903 0 1 52 1 0 0 0 \n", "\n", " wrong_fragment urgent hot ... dst_host_srv_count \\\n", "0 0 0 0 ... 25 \n", "1 0 0 0 ... 1 \n", "2 0 0 0 ... 26 \n", "3 0 0 0 ... 255 \n", "4 0 0 0 ... 255 \n", "... ... ... ... ... ... \n", "139899 0 0 0 ... 255 \n", "139900 0 0 0 ... 141 \n", "139901 0 0 0 ... 255 \n", "139902 0 0 0 ... 252 \n", "139903 0 0 0 ... 21 \n", "\n", " dst_host_same_srv_rate dst_host_diff_srv_rate \\\n", "0 0.17 0.03 \n", "1 0.00 0.60 \n", "2 0.10 0.05 \n", "3 1.00 0.00 \n", "4 1.00 0.00 \n", "... ... ... \n", "139899 1.00 0.00 \n", "139900 0.72 0.06 \n", "139901 1.00 0.00 \n", "139902 0.99 0.01 \n", "139903 0.08 0.03 \n", "\n", " dst_host_same_src_port_rate dst_host_srv_diff_host_rate \\\n", "0 0.17 0.00 \n", "1 0.88 0.00 \n", "2 0.00 0.00 \n", "3 0.03 0.04 \n", "4 0.00 0.00 \n", "... ... ... \n", "139899 1.00 0.00 \n", "139900 0.01 0.01 \n", "139901 0.01 0.01 \n", "139902 0.00 0.00 \n", "139903 0.00 0.00 \n", "\n", " dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate \\\n", "0 0.00 0.00 0.05 \n", "1 0.00 0.00 0.00 \n", "2 1.00 1.00 0.00 \n", "3 0.03 0.01 0.00 \n", "4 0.00 0.00 0.00 \n", "... ... ... ... \n", "139899 0.00 0.00 0.00 \n", "139900 0.01 0.00 0.00 \n", "139901 0.01 0.00 0.00 \n", "139902 0.00 0.00 0.00 \n", "139903 0.00 0.00 0.44 \n", "\n", " dst_host_srv_rerror_rate class \n", "0 0.00 normal \n", "1 0.00 normal \n", "2 0.00 anomaly \n", "3 0.01 normal \n", "4 0.00 normal \n", "... ... ... \n", "139899 0.00 anomaly \n", "139900 0.00 normal \n", "139901 0.00 normal \n", "139902 0.00 normal \n", "139903 1.00 anomaly \n", "\n", "[139904 rows x 42 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
durationprotocol_typeserviceflagsrc_bytesdst_byteslandwrong_fragmenturgenthot...dst_host_srv_countdst_host_same_srv_ratedst_host_diff_srv_ratedst_host_same_src_port_ratedst_host_srv_diff_host_ratedst_host_serror_ratedst_host_srv_serror_ratedst_host_rerror_ratedst_host_srv_rerror_rateclass
00120949100000...250.170.030.170.000.000.000.050.00normal
10244914600000...10.000.600.880.000.000.000.000.00normal
201495000000...260.100.050.000.001.001.000.000.00anomaly
30124923281530000...2551.000.000.030.040.030.010.000.01normal
4012491994200000...2551.000.000.000.000.000.000.000.00normal
..................................................................
13989900149103200000...2551.000.001.000.000.000.000.000.00anomaly
139900014997943330000...1410.720.060.010.010.010.000.000.00normal
139901012293179380000...2551.000.000.010.010.010.000.000.00normal
1399020211942420000...2520.990.010.000.000.000.000.000.00normal
13990301521000000...210.080.030.000.000.000.000.441.00anomaly
\n", "

139904 rows × 42 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 5 } ] }, { "cell_type": "markdown", "source": [ "## Feature Selection" ], "metadata": { "id": "FiXD9jj7JzAt" } }, { "cell_type": "code", "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "X = df.drop(['class'], axis=1)\n", "Y = df[\"class\"].astype('category').cat.codes\n", "clf = RandomForestClassifier()\n", "clf.fit(X, Y)\n", "features = pd.Series(clf.feature_importances_, index=X.columns)\n", "features.sort_values(ascending=False, inplace=True)\n", "print(features.head(10))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vkQIwJLqxnb2", "outputId": "c71170f5-2ec3-4fa7-e5c3-0c2eef6e236c" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "src_bytes 0.170986\n", "dst_bytes 0.136790\n", "flag 0.109430\n", "diff_srv_rate 0.071831\n", "logged_in 0.052381\n", "same_srv_rate 0.050228\n", "dst_host_srv_count 0.044186\n", "dst_host_diff_srv_rate 0.041677\n", "dst_host_same_srv_rate 0.037905\n", "service 0.037454\n", "dtype: float64\n" ] } ] }, { "cell_type": "code", "source": [ "fs = features.head(10).to_dict()\n", "fs" ], "metadata": { "id": "CcnIqyU6u_lU", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "fc65c693-356c-4641-b5f9-7ae6cfa56958" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'src_bytes': 0.17098620255574568,\n", " 'dst_bytes': 0.13679024895951913,\n", " 'flag': 0.10942976466676348,\n", " 'diff_srv_rate': 0.07183130071546076,\n", " 'logged_in': 0.05238089001369002,\n", " 'same_srv_rate': 0.05022756247592041,\n", " 'dst_host_srv_count': 0.04418600834209949,\n", " 'dst_host_diff_srv_rate': 0.04167726706300874,\n", " 'dst_host_same_srv_rate': 0.037905206562235054,\n", " 'service': 0.037454366858942253}" ] }, "metadata": {}, "execution_count": 7 } ] }, { "cell_type": "code", "source": [ "columns = []\n", "for k in fs:\n", " columns.append(str(k))\n", "columns" ], "metadata": { "id": "tUrifaWBvZpx", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "385f28a5-ab97-4248-c34e-7d306c58d4b4" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['src_bytes',\n", " 'dst_bytes',\n", " 'flag',\n", " 'diff_srv_rate',\n", " 'logged_in',\n", " 'same_srv_rate',\n", " 'dst_host_srv_count',\n", " 'dst_host_diff_srv_rate',\n", " 'dst_host_same_srv_rate',\n", " 'service']" ] }, "metadata": {}, "execution_count": 8 } ] }, { "cell_type": "code", "source": [ "X = df.drop(['class'], axis=1)\n", "Y = df[\"class\"].astype('category').cat.codes\n", "principalDf = pd.DataFrame(data = X\n", " , columns = columns)\n", "finalDf = pd.concat([principalDf, df[\"class\"]], axis = 1)\n", "finalDf" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "RzFbtWLlsmn0", "outputId": "5b44b483-bf9a-429e-b4bf-36e2f6b91afd" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " src_bytes dst_bytes flag diff_srv_rate logged_in same_srv_rate \\\n", "0 491 0 9 0.00 0 1.00 \n", "1 146 0 9 0.15 0 0.08 \n", "2 0 0 5 0.07 0 0.05 \n", "3 232 8153 9 0.00 1 1.00 \n", "4 199 420 9 0.00 1 1.00 \n", "... ... ... ... ... ... ... \n", "139899 1032 0 9 0.00 0 1.00 \n", "139900 794 333 9 0.00 1 1.00 \n", "139901 317 938 9 0.00 1 1.00 \n", "139902 42 42 9 0.00 0 1.00 \n", "139903 0 0 1 1.00 0 0.25 \n", "\n", " dst_host_srv_count dst_host_diff_srv_rate dst_host_same_srv_rate \\\n", "0 25 0.03 0.17 \n", "1 1 0.60 0.00 \n", "2 26 0.05 0.10 \n", "3 255 0.00 1.00 \n", "4 255 0.00 1.00 \n", "... ... ... ... \n", "139899 255 0.00 1.00 \n", "139900 141 0.06 0.72 \n", "139901 255 0.00 1.00 \n", "139902 252 0.01 0.99 \n", "139903 21 0.03 0.08 \n", "\n", " service class \n", "0 20 normal \n", "1 44 normal \n", "2 49 anomaly \n", "3 24 normal \n", "4 24 normal \n", "... ... ... \n", "139899 14 anomaly \n", "139900 49 normal \n", "139901 22 normal \n", "139902 11 normal \n", "139903 52 anomaly \n", "\n", "[139904 rows x 11 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
src_bytesdst_bytesflagdiff_srv_ratelogged_insame_srv_ratedst_host_srv_countdst_host_diff_srv_ratedst_host_same_srv_rateserviceclass
0491090.0001.00250.030.1720normal
1146090.1500.0810.600.0044normal
20050.0700.05260.050.1049anomaly
3232815390.0011.002550.001.0024normal
419942090.0011.002550.001.0024normal
....................................
1398991032090.0001.002550.001.0014anomaly
13990079433390.0011.001410.060.7249normal
13990131793890.0011.002550.001.0022normal
139902424290.0001.002520.010.9911normal
1399030011.0000.25210.030.0852anomaly
\n", "

139904 rows × 11 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 9 } ] }, { "cell_type": "markdown", "source": [ "# Model" ], "metadata": { "id": "B6B5ZuR5J5WQ" } }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split\n", "train_data, test_data = train_test_split(finalDf, test_size=0.25)\n", "train_data_file = \"train_data.csv\"\n", "test_data_file = \"test_data.csv\"\n", "\n", "train_data.to_csv(train_data_file, index=False, header=False)\n", "test_data.to_csv(test_data_file, index=False, header=False)" ], "metadata": { "id": "_1N70b_DJb2m" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "CSV_HEADER = []\n", "for x in columns:\n", " CSV_HEADER.append(x)\n", "CSV_HEADER.append(\"class\")\n", "\n", "# A list of the numerical feature names.\n", "NUMERIC_FEATURE_NAMES = columns\n", "# A dictionary of the categorical features and their vocabulary.\n", "CATEGORICAL_FEATURES_WITH_VOCABULARY = {\n", "}\n", "# A list of the columns to ignore from the dataset.\n", "IGNORE_COLUMN_NAMES = []\n", "# A list of the categorical feature names.\n", "CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())\n", "# A list of all the input features.\n", "FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES\n", "# A list of column default values for each feature.\n", "COLUMN_DEFAULTS = [\n", " [0.0] if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else [\"NA\"]\n", " for feature_name in CSV_HEADER\n", "]\n", "# The name of the target feature.\n", "TARGET_FEATURE_NAME = \"class\"\n", "# A list of the labels of the target features.\n", "TARGET_LABELS = [\"normal\", \"anomaly\"]\n", "\n", "from tensorflow.keras.layers import StringLookup\n", "\n", "target_label_lookup = StringLookup(\n", " vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0\n", ")\n", "\n", "\n", "def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):\n", " dataset = tf.data.experimental.make_csv_dataset(\n", " csv_file_path,\n", " batch_size=batch_size,\n", " column_names=CSV_HEADER,\n", " column_defaults=COLUMN_DEFAULTS,\n", " label_name=TARGET_FEATURE_NAME,\n", " num_epochs=1,\n", " header=False,\n", " na_value=\"?\",\n", " shuffle=shuffle,\n", " ).map(lambda features, target: (features, target_label_lookup(target)))\n", " return dataset.cache()\n", "\n", "def create_model_inputs():\n", " inputs = {}\n", " for feature_name in FEATURE_NAMES:\n", " if feature_name in NUMERIC_FEATURE_NAMES:\n", " inputs[feature_name] = layers.Input(\n", " name=feature_name, shape=(), dtype=tf.float32\n", " )\n", " else:\n", " inputs[feature_name] = layers.Input(\n", " name=feature_name, shape=(), dtype=tf.string\n", " )\n", " return inputs\n", "\n", "def encode_inputs(inputs):\n", " encoded_features = []\n", " for feature_name in inputs:\n", " if feature_name in CATEGORICAL_FEATURE_NAMES:\n", " vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]\n", " #print(vocabulary)\n", " # Create a lookup to convert a string values to an integer indices.\n", " # Since we are not using a mask token, nor expecting any out of vocabulary\n", " # (oov) token, we set mask_token to None and num_oov_indices to 0.\n", " lookup = StringLookup(\n", " vocabulary=vocabulary, mask_token=None, num_oov_indices=0\n", " )\n", " # Convert the string input values into integer indices.\n", " value_index = lookup(inputs[feature_name])\n", " embedding_dims = int(math.sqrt(lookup.vocabulary_size()))\n", " # Create an embedding layer with the specified dimensions.\n", " embedding = layers.Embedding(\n", " input_dim=lookup.vocabulary_size(), output_dim=embedding_dims\n", " )\n", " # Convert the index values to embedding representations.\n", " encoded_feature = embedding(value_index)\n", " else:\n", " # Use the numerical features as-is.\n", " encoded_feature = inputs[feature_name]\n", " if inputs[feature_name].shape[-1] is None:\n", " encoded_feature = tf.expand_dims(encoded_feature, -1)\n", "\n", " encoded_features.append(encoded_feature)\n", "\n", " encoded_features = layers.concatenate(encoded_features)\n", " return encoded_features\n", "\n", "class NeuralDecisionTree(keras.Model):\n", " def __init__(self, depth, num_features, used_features_rate, num_classes):\n", " super().__init__()\n", " self.depth = depth\n", " self.num_leaves = 2 ** depth\n", " self.num_classes = num_classes\n", "\n", " # Create a mask for the randomly selected features.\n", " num_used_features = int(num_features * used_features_rate)\n", " one_hot = np.eye(num_features)\n", " sampled_feature_indicies = np.random.choice(\n", " np.arange(num_features), num_used_features, replace=False\n", " )\n", " self.used_features_mask = one_hot[sampled_feature_indicies]\n", "\n", " # Initialize the weights of the classes in leaves.\n", " self.pi = tf.Variable(\n", " initial_value=tf.random_normal_initializer()(\n", " shape=[self.num_leaves, self.num_classes]\n", " ),\n", " dtype=\"float32\",\n", " trainable=True,\n", " )\n", "\n", " # Initialize the stochastic routing layer.\n", " self.decision_fn = layers.Dense(\n", " units=self.num_leaves, activation=\"sigmoid\", name=\"decision\"\n", " )\n", "\n", " def call(self, features):\n", " batch_size = tf.shape(features)[0]\n", "\n", " # Apply the feature mask to the input features.\n", " features = tf.matmul(\n", " features, self.used_features_mask, transpose_b=True\n", " ) # [batch_size, num_used_features]\n", " # Compute the routing probabilities.\n", " decisions = tf.expand_dims(\n", " self.decision_fn(features), axis=2\n", " ) # [batch_size, num_leaves, 1]\n", " # Concatenate the routing probabilities with their complements.\n", " decisions = layers.concatenate(\n", " [decisions, 1 - decisions], axis=2\n", " ) # [batch_size, num_leaves, 2]\n", "\n", " mu = tf.ones([batch_size, 1, 1])\n", "\n", " begin_idx = 1\n", " end_idx = 2\n", " # Traverse the tree in breadth-first order.\n", " for level in range(self.depth):\n", " mu = tf.reshape(mu, [batch_size, -1, 1]) # [batch_size, 2 ** level, 1]\n", " mu = tf.tile(mu, (1, 1, 2)) # [batch_size, 2 ** level, 2]\n", " level_decisions = decisions[\n", " :, begin_idx:end_idx, :\n", " ] # [batch_size, 2 ** level, 2]\n", " mu = mu * level_decisions # [batch_size, 2**level, 2]\n", " begin_idx = end_idx\n", " end_idx = begin_idx + 2 ** (level + 1)\n", "\n", " mu = tf.reshape(mu, [batch_size, self.num_leaves]) # [batch_size, num_leaves]\n", " probabilities = keras.activations.softmax(self.pi) # [num_leaves, num_classes]\n", " outputs = tf.matmul(mu, probabilities) # [batch_size, num_classes]\n", " return outputs\n", "\n", "class NeuralDecisionForest(keras.Model):\n", " def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):\n", " super().__init__()\n", " self.ensemble = []\n", " # Initialize the ensemble by adding NeuralDecisionTree instances.\n", " # Each tree will have its own randomly selected input features to use.\n", " for _ in range(num_trees):\n", " self.ensemble.append(\n", " NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)\n", " )\n", "\n", " def call(self, inputs):\n", " # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.\n", " batch_size = tf.shape(inputs)[0]\n", " outputs = tf.zeros([batch_size, num_classes])\n", "\n", " # Aggregate the outputs of trees in the ensemble.\n", " for tree in self.ensemble:\n", " outputs += tree(inputs)\n", " # Divide the outputs by the ensemble size to get the average.\n", " outputs /= len(self.ensemble)\n", " return outputs\n", "learning_rate = 0.01\n", "batch_size = 128\n", "num_epochs = 10\n", "\n", "\n", "def run_experiment(model):\n", "\n", " # model.compile(\n", " # optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n", " # loss=keras.losses.SparseCategoricalCrossentropy(),\n", " # metrics=[keras.metrics.SparseCategoricalAccuracy()],\n", " # )\n", " model.compile(\n", " optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n", " loss=keras.losses.SparseCategoricalCrossentropy(),\n", " metrics=[metrics.SparseCategoricalAccuracy()],\n", " )\n", " print(\"Start training the model...\")\n", " train_dataset = get_dataset_from_csv(\n", " train_data_file, shuffle=True, batch_size=batch_size\n", " )\n", "\n", " model.fit(train_dataset, epochs=num_epochs)\n", " print(\"Model training finished\")\n", "\n", " print(\"Evaluating the model on the test data...\")\n", " test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)\n", "\n", " _, accuracy = model.evaluate(test_dataset)\n", " print(f\"Test accuracy: {round(accuracy * 100, 2)}%\")\n", " return model" ], "metadata": { "id": "PGCEwlOPPpEP", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "64231f85-224e-4f74-a503-7635e02f0837" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/numpy/core/numeric.py:2463: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", " return bool(asarray(a1 == a2).all())\n" ] } ] }, { "cell_type": "code", "source": [ "num_trees = 25\n", "depth = 5\n", "used_features_rate = 0.5\n", "num_classes = len(TARGET_LABELS)\n", "\n", "\n", "def create_forest_model():\n", " inputs = create_model_inputs()\n", " features = encode_inputs(inputs)\n", " features = layers.BatchNormalization()(features)\n", " num_features = features.shape[1]\n", "\n", " forest_model = NeuralDecisionForest(\n", " num_trees, depth, num_features, used_features_rate, num_classes\n", " )\n", "\n", " outputs = forest_model(features)\n", " model = keras.Model(inputs=inputs, outputs=outputs)\n", " return model\n", "\n", "\n", "forest_model = create_forest_model()\n", "\n", "finalModel = run_experiment(forest_model)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vLjxBfBIQUKR", "outputId": "2edcd248-dae4-48e6-9f14-02351e014f56" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Start training the model...\n", "Epoch 1/10\n", "820/820 [==============================] - 76s 44ms/step - loss: 0.2196 - sparse_categorical_accuracy: 0.9087\n", "Epoch 2/10\n", "820/820 [==============================] - 18s 22ms/step - loss: 0.1533 - sparse_categorical_accuracy: 0.9303\n", "Epoch 3/10\n", "820/820 [==============================] - 16s 20ms/step - loss: 0.1423 - sparse_categorical_accuracy: 0.9361\n", "Epoch 4/10\n", "820/820 [==============================] - 18s 22ms/step - loss: 0.1370 - sparse_categorical_accuracy: 0.9392\n", "Epoch 5/10\n", "820/820 [==============================] - 17s 21ms/step - loss: 0.1336 - sparse_categorical_accuracy: 0.9417\n", "Epoch 6/10\n", "820/820 [==============================] - 21s 25ms/step - loss: 0.1315 - sparse_categorical_accuracy: 0.9439\n", "Epoch 7/10\n", "820/820 [==============================] - 18s 22ms/step - loss: 0.1298 - sparse_categorical_accuracy: 0.9454\n", "Epoch 8/10\n", "820/820 [==============================] - 17s 21ms/step - loss: 0.1286 - sparse_categorical_accuracy: 0.9464\n", "Epoch 9/10\n", "820/820 [==============================] - 18s 22ms/step - loss: 0.1276 - sparse_categorical_accuracy: 0.9471\n", "Epoch 10/10\n", "820/820 [==============================] - 17s 20ms/step - loss: 0.1262 - sparse_categorical_accuracy: 0.9484\n", "Model training finished\n", "Evaluating the model on the test data...\n", "274/274 [==============================] - 6s 14ms/step - loss: 0.1234 - sparse_categorical_accuracy: 0.9498\n", "Test accuracy: 94.98%\n" ] } ] }, { "cell_type": "code", "source": [ "test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)\n", "colnames=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'target']\n", "data = pd.read_csv(\"test_data.csv\", names=colnames, header=None)\n", "data['target'].replace('normal', 0,inplace=True)\n", "data['target'].replace('anomaly', 1,inplace=True)\n", "y_test = data['target'].values # as a numpy array\n", "from sklearn.metrics import confusion_matrix\n", "y_prediction = finalModel.predict(test_dataset)\n", "y_prediction = np.argmax (y_prediction, axis = 1)\n", "result = confusion_matrix(y_test, y_prediction , normalize='pred')\n", "print(result)\n", "TP = result[0][0]\n", "FP = result[0][1]\n", "TN = result[1][1]\n", "FN = result[1][0]\n", "ACC = (TP+TN)/(TP+TN+FP+FN)\n", "PR = TP/(TP+FP) #precision\n", "TPR = TP/(TP+FN) #Recall or True positive rate\n", "FPR = FP/(FP+TN)\n", "F1Score = 2*(PR*TPR)/(PR+TPR)\n", "print(\"ACC: \" + str(ACC))\n", "print(\"PR: \" + str(PR))\n", "print(\"TPR: \" + str(TPR))\n", "print(\"FPR: \" + str(FPR))\n", "print(\"F1Score: \" + str(F1Score))\n" ], "metadata": { "id": "cIZKjFUeWRnr", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "cf4272ac-7502-4367-b5be-d6882609f39b" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "274/274 [==============================] - 5s 9ms/step\n", "[[0.92530258 0.02028617]\n", " [0.07469742 0.97971383]]\n", "ACC: 0.9525082065813029\n", "PR: 0.9785465208664409\n", "TPR: 0.9253025816840683\n", "FPR: 0.02028616852146264\n", "F1Score: 0.9511800287561489\n" ] } ] }, { "cell_type": "code", "source": [ "import matplotlib.pyplot as plt\n", "import numpy\n", "from sklearn import metrics\n", "\n", "\n", "cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = result, display_labels = [True, False])\n", "\n", "cm_display.plot()\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 449 }, "id": "0go6QErNNH9Z", "outputId": "c6f718b2-dd4f-41be-fbae-7d4d3a244252" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "## CISIDS2017" ], "metadata": { "id": "lx4RpN47ybt6" } }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "kHU-PAI6yhyz", "outputId": "4e95ed59-4691-4fcc-cf81-e13ee713dfdd" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ] }, { "cell_type": "code", "source": [ "import pandas as pd\n", "import glob\n", "import os\n", "\n", "path = r'C:\\DRO\\DCL_rawdata_files' # use your path\n", "all_files = glob.glob(os.path.join(path , \"/content/drive/MyDrive/datasets/CISIDS2017/*.csv\"))\n", "\n", "li = []\n", "\n", "for filename in all_files:\n", " df = pd.read_csv(filename, index_col=None, header=0)\n", " li.append(df)\n", "\n", "frame = pd.concat(li, axis=0, ignore_index=True)" ], "metadata": { "id": "orUywltc2yaK" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "column_headers = list(frame.columns.values)\n", "column_headers" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "eJcGQKBn4Abs", "outputId": "da7510b0-433e-4504-80e6-1923149d460d" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[' Destination Port',\n", " ' Flow Duration',\n", " ' Total Fwd Packets',\n", " ' Total Backward Packets',\n", " 'Total Length of Fwd Packets',\n", " ' Total Length of Bwd Packets',\n", " ' Fwd Packet Length Max',\n", " ' Fwd Packet Length Min',\n", " ' Fwd Packet Length Mean',\n", " ' Fwd Packet Length Std',\n", " 'Bwd Packet Length Max',\n", " ' Bwd Packet Length Min',\n", " ' Bwd Packet Length Mean',\n", " ' Bwd Packet Length Std',\n", " 'Flow Bytes/s',\n", " ' Flow Packets/s',\n", " ' Flow IAT Mean',\n", " ' Flow IAT Std',\n", " ' Flow IAT Max',\n", " ' Flow IAT Min',\n", " 'Fwd IAT Total',\n", " ' Fwd IAT Mean',\n", " ' Fwd IAT Std',\n", " ' Fwd IAT Max',\n", " ' Fwd IAT Min',\n", " 'Bwd IAT Total',\n", " ' Bwd IAT Mean',\n", " ' Bwd IAT Std',\n", " ' Bwd IAT Max',\n", " ' Bwd IAT Min',\n", " 'Fwd PSH Flags',\n", " ' Bwd PSH Flags',\n", " ' Fwd URG Flags',\n", " ' Bwd URG Flags',\n", " ' Fwd Header Length',\n", " ' Bwd Header Length',\n", " 'Fwd Packets/s',\n", " ' Bwd Packets/s',\n", " ' Min Packet Length',\n", " ' Max Packet Length',\n", " ' Packet Length Mean',\n", " ' Packet Length Std',\n", " ' Packet Length Variance',\n", " 'FIN Flag Count',\n", " ' SYN Flag Count',\n", " ' RST Flag Count',\n", " ' PSH Flag Count',\n", " ' ACK Flag Count',\n", " ' URG Flag Count',\n", " ' CWE Flag Count',\n", " ' ECE Flag Count',\n", " ' Down/Up Ratio',\n", " ' Average Packet Size',\n", " ' Avg Fwd Segment Size',\n", " ' Avg Bwd Segment Size',\n", " ' Fwd Header Length.1',\n", " 'Fwd Avg Bytes/Bulk',\n", " ' Fwd Avg Packets/Bulk',\n", " ' Fwd Avg Bulk Rate',\n", " ' Bwd Avg Bytes/Bulk',\n", " ' Bwd Avg Packets/Bulk',\n", " 'Bwd Avg Bulk Rate',\n", " 'Subflow Fwd Packets',\n", " ' Subflow Fwd Bytes',\n", " ' Subflow Bwd Packets',\n", " ' Subflow Bwd Bytes',\n", " 'Init_Win_bytes_forward',\n", " ' Init_Win_bytes_backward',\n", " ' act_data_pkt_fwd',\n", " ' min_seg_size_forward',\n", " 'Active Mean',\n", " ' Active Std',\n", " ' Active Max',\n", " ' Active Min',\n", " 'Idle Mean',\n", " ' Idle Std',\n", " ' Idle Max',\n", " ' Idle Min',\n", " ' Label']" ] }, "metadata": {}, "execution_count": 3 } ] }, { "cell_type": "code", "source": [ "frame[' Label'] = frame[' Label'].str.replace(r\"^(.(?:1: FutureWarning: The default value of regex will change from True to False in a future version.\n", " frame[' Label'] = frame[' Label'].str.replace(r\"^(.(? (Q3 + 1.5 * IQR)))\n", " trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)\n", " return trueList\n", "\n", "nonOutlierList = Remove_Outlier_Indices(df)\n", "new_data = df[nonOutlierList]\n", "\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "96cb862f-57f8-4621-e93a-f1d9d3994ae4", "id": "i_PXbZyt5-fR" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":2: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " Q1 = df.quantile(0.02)\n", ":3: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " Q3 = df.quantile(0.98)\n", ":6: FutureWarning: Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`\n", " trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)\n" ] } ] }, { "cell_type": "code", "source": [ "df = new_data\n", "df = df.reset_index(drop=True)\n", "del new_data\n", "del nonOutlierList\n", "del li\n", "del frame" ], "metadata": { "id": "FSPL95xg5-fT" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "EeRnvhzX5-fV" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "X = df.drop([' Label'], axis=1)\n", "Y = df[\" Label\"].astype('category').cat.codes\n", "clf = RandomForestClassifier()\n", "clf.fit(X, Y)\n", "features = pd.Series(clf.feature_importances_, index=X.columns)\n", "features.sort_values(ascending=False, inplace=True)\n", "print(features.head(10))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "ce896ae8-6065-417d-a3dc-796d8befbd13", "id": "uYcOxDMSW0qE" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " Packet Length Variance 0.081247\n", " Packet Length Std 0.064861\n", " Packet Length Mean 0.056147\n", " Average Packet Size 0.050787\n", " Destination Port 0.049960\n", "Bwd Packet Length Max 0.043204\n", " Avg Bwd Segment Size 0.037427\n", "Total Length of Fwd Packets 0.034475\n", " Bwd Packet Length Mean 0.032600\n", "Init_Win_bytes_forward 0.030282\n", "dtype: float64\n" ] } ] }, { "cell_type": "code", "source": [ "fs = features.head(10).to_dict()\n", "fs" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "16d8a6f1-c828-4acf-8614-d93af9375a38", "id": "M3KuY2m0W0qF" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{' Packet Length Variance': 0.08124749509423052,\n", " ' Packet Length Std': 0.06486087933746576,\n", " ' Packet Length Mean': 0.05614707208016595,\n", " ' Average Packet Size': 0.050786574411919236,\n", " ' Destination Port': 0.049960097719106485,\n", " 'Bwd Packet Length Max': 0.04320449616955384,\n", " ' Avg Bwd Segment Size': 0.037427049333999665,\n", " 'Total Length of Fwd Packets': 0.0344753595505703,\n", " ' Bwd Packet Length Mean': 0.03260012604646854,\n", " 'Init_Win_bytes_forward': 0.03028173100385773}" ] }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [ "columns = []\n", "for k in fs:\n", " columns.append(str(k))\n", "columns" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "8aa71c49-3bf6-424c-dffd-892fbfa3ef84", "id": "Ds6Ia777W0qG" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[' Packet Length Variance',\n", " ' Packet Length Std',\n", " ' Packet Length Mean',\n", " ' Average Packet Size',\n", " ' Destination Port',\n", " 'Bwd Packet Length Max',\n", " ' Avg Bwd Segment Size',\n", " 'Total Length of Fwd Packets',\n", " ' Bwd Packet Length Mean',\n", " 'Init_Win_bytes_forward']" ] }, "metadata": {}, "execution_count": 11 } ] }, { "cell_type": "code", "source": [ "principalDf = pd.DataFrame(data = X\n", " , columns = columns)\n", "finalDf = pd.concat([principalDf, df[\" Label\"]], axis = 1)\n", "finalDf" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 441 }, "outputId": "ad223296-2c36-4346-eb6b-50fb72488f8d", "id": "Js17hn8LW0qI" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Packet Length Variance Packet Length Std Packet Length Mean \\\n", "0 0.000000 0.000000 6.000000 \n", "1 0.000000 0.000000 6.000000 \n", "2 0.000000 0.000000 6.000000 \n", "3 0.000000 0.000000 6.000000 \n", "4 0.000000 0.000000 6.000000 \n", "... ... ... ... \n", "2539413 548.571429 23.421602 41.714286 \n", "2539414 5796.300000 76.133435 97.600000 \n", "2539415 267.333333 16.350331 17.000000 \n", "2539416 1792.000000 42.332021 53.333333 \n", "2539417 1037.142857 32.204702 65.857143 \n", "\n", " Average Packet Size Destination Port Bwd Packet Length Max \\\n", "0 9.000000 54865 0 \n", "1 9.000000 55054 6 \n", "2 9.000000 55055 6 \n", "3 9.000000 46236 6 \n", "4 9.000000 54863 0 \n", "... ... ... ... \n", "2539413 48.666667 53 76 \n", "2539414 122.000000 53 181 \n", "2539415 22.666667 58030 6 \n", "2539416 60.000000 53 128 \n", "2539417 76.833333 53 113 \n", "\n", " Avg Bwd Segment Size Total Length of Fwd Packets \\\n", "0 0.0 12 \n", "1 6.0 6 \n", "2 6.0 6 \n", "3 6.0 6 \n", "4 0.0 12 \n", "... ... ... \n", "2539413 76.0 112 \n", "2539414 181.0 84 \n", "2539415 6.0 31 \n", "2539416 128.0 192 \n", "2539417 113.0 188 \n", "\n", " Bwd Packet Length Mean Init_Win_bytes_forward Label \n", "0 0.0 33 BENIGN \n", "1 6.0 29 BENIGN \n", "2 6.0 29 BENIGN \n", "3 6.0 31 BENIGN \n", "4 0.0 32 BENIGN \n", "... ... ... ... \n", "2539413 76.0 -1 BENIGN \n", "2539414 181.0 -1 BENIGN \n", "2539415 6.0 1006 BENIGN \n", "2539416 128.0 -1 BENIGN \n", "2539417 113.0 -1 BENIGN \n", "\n", "[2539418 rows x 11 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Packet Length VariancePacket Length StdPacket Length MeanAverage Packet SizeDestination PortBwd Packet Length MaxAvg Bwd Segment SizeTotal Length of Fwd PacketsBwd Packet Length MeanInit_Win_bytes_forwardLabel
00.0000000.0000006.0000009.0000005486500.0120.033BENIGN
10.0000000.0000006.0000009.0000005505466.066.029BENIGN
20.0000000.0000006.0000009.0000005505566.066.029BENIGN
30.0000000.0000006.0000009.0000004623666.066.031BENIGN
40.0000000.0000006.0000009.0000005486300.0120.032BENIGN
....................................
2539413548.57142923.42160241.71428648.666667537676.011276.0-1BENIGN
25394145796.30000076.13343597.600000122.00000053181181.084181.0-1BENIGN
2539415267.33333316.35033117.00000022.6666675803066.0316.01006BENIGN
25394161792.00000042.33202153.33333360.00000053128128.0192128.0-1BENIGN
25394171037.14285732.20470265.85714376.83333353113113.0188113.0-1BENIGN
\n", "

2539418 rows × 11 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split\n", "train_data, test_data = train_test_split(finalDf, test_size=0.25)\n", "train_data_file = \"train_data.csv\"\n", "test_data_file = \"test_data.csv\"\n", "\n", "train_data.to_csv(train_data_file, index=False, header=False)\n", "test_data.to_csv(test_data_file, index=False, header=False)" ], "metadata": { "id": "c85wOCBv5-fX" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "del finalDf\n", "del principalDf\n", "del train_data\n", "del test_data\n", "del clf\n", "del X\n", "del Y" ], "metadata": { "id": "szYkC6TjPd0U" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import tensorflow as tf\n", "import numpy as np\n", "import pandas as pd\n", "from tensorflow import keras\n", "from tensorflow.keras import layers\n", "from keras import losses\n", "from keras import optimizers\n", "from keras import metrics\n", "import math\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "CSV_HEADER = []\n", "for x in columns:\n", " CSV_HEADER.append(x)\n", "CSV_HEADER.append(\" Label\")\n", "\n", "# A list of the numerical feature names.\n", "NUMERIC_FEATURE_NAMES = columns\n", "# A dictionary of the categorical features and their vocabulary.\n", "CATEGORICAL_FEATURES_WITH_VOCABULARY = {\n", "}\n", "# A list of the columns to ignore from the dataset.\n", "IGNORE_COLUMN_NAMES = []\n", "# A list of the categorical feature names.\n", "CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())\n", "# A list of all the input features.\n", "FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES\n", "# A list of column default values for each feature.\n", "COLUMN_DEFAULTS = [\n", " [0.0] if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else [\"NA\"]\n", " for feature_name in CSV_HEADER\n", "]\n", "# The name of the target feature.\n", "TARGET_FEATURE_NAME = \" Label\"\n", "# A list of the labels of the target features.\n", "TARGET_LABELS = [\"BENIGN\", \"ANOMALY\"]\n", "\n", "from tensorflow.keras.layers import StringLookup\n", "\n", "target_label_lookup = StringLookup(\n", " vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0\n", ")\n", "\n", "\n", "def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):\n", " dataset = tf.data.experimental.make_csv_dataset(\n", " csv_file_path,\n", " batch_size=batch_size,\n", " column_names=CSV_HEADER,\n", " column_defaults=COLUMN_DEFAULTS,\n", " label_name=TARGET_FEATURE_NAME,\n", " num_epochs=1,\n", " header=False,\n", " na_value=\"?\",\n", " shuffle=shuffle,\n", " ).map(lambda features, target: (features, target_label_lookup(target)))\n", " return dataset.cache()\n", "\n", "def create_model_inputs():\n", " inputs = {}\n", " for feature_name in FEATURE_NAMES:\n", " if feature_name in NUMERIC_FEATURE_NAMES:\n", " inputs[feature_name] = layers.Input(\n", " name=feature_name, shape=(), dtype=tf.float32\n", " )\n", " else:\n", " inputs[feature_name] = layers.Input(\n", " name=feature_name, shape=(), dtype=tf.string\n", " )\n", " return inputs\n", "\n", "def encode_inputs(inputs):\n", " encoded_features = []\n", " for feature_name in inputs:\n", " if feature_name in CATEGORICAL_FEATURE_NAMES:\n", " vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]\n", " #print(vocabulary)\n", " # Create a lookup to convert a string values to an integer indices.\n", " # Since we are not using a mask token, nor expecting any out of vocabulary\n", " # (oov) token, we set mask_token to None and num_oov_indices to 0.\n", " lookup = StringLookup(\n", " vocabulary=vocabulary, mask_token=None, num_oov_indices=0\n", " )\n", " # Convert the string input values into integer indices.\n", " value_index = lookup(inputs[feature_name])\n", " embedding_dims = int(math.sqrt(lookup.vocabulary_size()))\n", " # Create an embedding layer with the specified dimensions.\n", " embedding = layers.Embedding(\n", " input_dim=lookup.vocabulary_size(), output_dim=embedding_dims\n", " )\n", " # Convert the index values to embedding representations.\n", " encoded_feature = embedding(value_index)\n", " else:\n", " # Use the numerical features as-is.\n", " encoded_feature = inputs[feature_name]\n", " if inputs[feature_name].shape[-1] is None:\n", " encoded_feature = tf.expand_dims(encoded_feature, -1)\n", "\n", " encoded_features.append(encoded_feature)\n", "\n", " encoded_features = layers.concatenate(encoded_features)\n", " return encoded_features\n", "\n", "class NeuralDecisionTree(keras.Model):\n", " def __init__(self, depth, num_features, used_features_rate, num_classes):\n", " super().__init__()\n", " self.depth = depth\n", " self.num_leaves = 2 ** depth\n", " self.num_classes = num_classes\n", "\n", " # Create a mask for the randomly selected features.\n", " num_used_features = int(num_features * used_features_rate)\n", " one_hot = np.eye(num_features)\n", " sampled_feature_indicies = np.random.choice(\n", " np.arange(num_features), num_used_features, replace=False\n", " )\n", " self.used_features_mask = one_hot[sampled_feature_indicies]\n", "\n", " # Initialize the weights of the classes in leaves.\n", " self.pi = tf.Variable(\n", " initial_value=tf.random_normal_initializer()(\n", " shape=[self.num_leaves, self.num_classes]\n", " ),\n", " dtype=\"float32\",\n", " trainable=True,\n", " )\n", "\n", " # Initialize the stochastic routing layer.\n", " self.decision_fn = layers.Dense(\n", " units=self.num_leaves, activation=\"sigmoid\", name=\"decision\"\n", " )\n", "\n", " def call(self, features):\n", " batch_size = tf.shape(features)[0]\n", "\n", " # Apply the feature mask to the input features.\n", " features = tf.matmul(\n", " features, self.used_features_mask, transpose_b=True\n", " ) # [batch_size, num_used_features]\n", " # Compute the routing probabilities.\n", " decisions = tf.expand_dims(\n", " self.decision_fn(features), axis=2\n", " ) # [batch_size, num_leaves, 1]\n", " # Concatenate the routing probabilities with their complements.\n", " decisions = layers.concatenate(\n", " [decisions, 1 - decisions], axis=2\n", " ) # [batch_size, num_leaves, 2]\n", "\n", " mu = tf.ones([batch_size, 1, 1])\n", "\n", " begin_idx = 1\n", " end_idx = 2\n", " # Traverse the tree in breadth-first order.\n", " for level in range(self.depth):\n", " mu = tf.reshape(mu, [batch_size, -1, 1]) # [batch_size, 2 ** level, 1]\n", " mu = tf.tile(mu, (1, 1, 2)) # [batch_size, 2 ** level, 2]\n", " level_decisions = decisions[\n", " :, begin_idx:end_idx, :\n", " ] # [batch_size, 2 ** level, 2]\n", " mu = mu * level_decisions # [batch_size, 2**level, 2]\n", " begin_idx = end_idx\n", " end_idx = begin_idx + 2 ** (level + 1)\n", "\n", " mu = tf.reshape(mu, [batch_size, self.num_leaves]) # [batch_size, num_leaves]\n", " probabilities = keras.activations.softmax(self.pi) # [num_leaves, num_classes]\n", " outputs = tf.matmul(mu, probabilities) # [batch_size, num_classes]\n", " return outputs\n", "\n", "class NeuralDecisionForest(keras.Model):\n", " def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):\n", " super().__init__()\n", " self.ensemble = []\n", " # Initialize the ensemble by adding NeuralDecisionTree instances.\n", " # Each tree will have its own randomly selected input features to use.\n", " for _ in range(num_trees):\n", " self.ensemble.append(\n", " NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)\n", " )\n", "\n", " def call(self, inputs):\n", " # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.\n", " batch_size = tf.shape(inputs)[0]\n", " outputs = tf.zeros([batch_size, num_classes])\n", "\n", " # Aggregate the outputs of trees in the ensemble.\n", " for tree in self.ensemble:\n", " outputs += tree(inputs)\n", " # Divide the outputs by the ensemble size to get the average.\n", " outputs /= len(self.ensemble)\n", " return outputs\n", "learning_rate = 0.01\n", "batch_size = 128\n", "num_epochs = 10\n", "\n", "\n", "def run_experiment(model):\n", "\n", " # model.compile(\n", " # optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n", " # loss=keras.losses.SparseCategoricalCrossentropy(),\n", " # metrics=[keras.metrics.SparseCategoricalAccuracy()],\n", " # )\n", " model.compile(\n", " optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n", " loss=keras.losses.SparseCategoricalCrossentropy(),\n", " metrics=[metrics.SparseCategoricalAccuracy()],\n", " )\n", " print(\"Start training the model...\")\n", " train_dataset = get_dataset_from_csv(\n", " train_data_file, shuffle=True, batch_size=batch_size\n", " )\n", "\n", " model.fit(train_dataset, epochs=num_epochs)\n", " print(\"Model training finished\")\n", "\n", " print(\"Evaluating the model on the test data...\")\n", " test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)\n", "\n", " _, accuracy = model.evaluate(test_dataset)\n", " print(f\"Test accuracy: {round(accuracy * 100, 2)}%\")\n", " return model" ], "metadata": { "id": "8txIkhqk5-fX", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "64697009-140c-4682-aeea-7ab2060fe6f6" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/numpy/core/numeric.py:2463: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", " return bool(asarray(a1 == a2).all())\n" ] } ] }, { "cell_type": "code", "source": [ "num_trees = 25\n", "depth = 5\n", "used_features_rate = 0.5\n", "num_classes = len(TARGET_LABELS)\n", "\n", "\n", "def create_forest_model():\n", " inputs = create_model_inputs()\n", " features = encode_inputs(inputs)\n", " features = layers.BatchNormalization()(features)\n", " num_features = features.shape[1]\n", "\n", " forest_model = NeuralDecisionForest(\n", " num_trees, depth, num_features, used_features_rate, num_classes\n", " )\n", "\n", " outputs = forest_model(features)\n", " model = keras.Model(inputs=inputs, outputs=outputs)\n", " return model\n", "\n", "\n", "forest_model = create_forest_model()\n", "\n", "finalModel = run_experiment(forest_model)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "3843437d-e139-434d-9d9c-ef64cdf93845", "id": "cEIPaJMi5-fa" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Start training the model...\n", "Epoch 1/10\n", "14880/14880 [==============================] - 474s 29ms/step - loss: 0.2110 - sparse_categorical_accuracy: 0.8957\n", "Epoch 2/10\n", "14880/14880 [==============================] - 354s 24ms/step - loss: 0.1903 - sparse_categorical_accuracy: 0.9018\n", "Epoch 3/10\n", "14880/14880 [==============================] - 353s 24ms/step - loss: 0.1861 - sparse_categorical_accuracy: 0.9025\n", "Epoch 4/10\n", "14880/14880 [==============================] - 379s 25ms/step - loss: 0.1827 - sparse_categorical_accuracy: 0.9029\n", "Epoch 5/10\n", "14880/14880 [==============================] - 375s 25ms/step - loss: 0.1809 - sparse_categorical_accuracy: 0.9032\n", "Epoch 6/10\n", "14880/14880 [==============================] - 378s 25ms/step - loss: 0.1792 - sparse_categorical_accuracy: 0.9037\n", "Epoch 7/10\n", "14880/14880 [==============================] - 373s 25ms/step - loss: 0.1784 - sparse_categorical_accuracy: 0.9039\n", "Epoch 8/10\n", "14880/14880 [==============================] - 374s 25ms/step - loss: 0.1780 - sparse_categorical_accuracy: 0.9040\n", "Epoch 9/10\n", "14880/14880 [==============================] - 375s 25ms/step - loss: 0.1776 - sparse_categorical_accuracy: 0.9041\n", "Epoch 10/10\n", "14880/14880 [==============================] - 375s 25ms/step - loss: 0.1773 - sparse_categorical_accuracy: 0.9042\n", "Model training finished\n", "Evaluating the model on the test data...\n", "4960/4960 [==============================] - 62s 12ms/step - loss: 0.1709 - sparse_categorical_accuracy: 0.9068\n", "Test accuracy: 90.68%\n" ] } ] }, { "cell_type": "code", "source": [ "test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)\n", "colnames=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'target']\n", "data = pd.read_csv(\"test_data.csv\", names=colnames, header=None)\n", "data['target'].replace('BENIGN', 0,inplace=True)\n", "data['target'].replace('ANOMALY', 1,inplace=True)\n", "y_test = data['target'].values # as a numpy array\n", "from sklearn.metrics import confusion_matrix\n", "y_prediction = finalModel.predict(test_dataset)\n", "y_prediction = np.argmax (y_prediction, axis = 1)\n", "result = confusion_matrix(y_test, y_prediction , normalize='pred')\n", "print(result)\n", "TP = result[0][0]\n", "FP = result[0][1]\n", "TN = result[1][1]\n", "FN = result[1][0]\n", "ACC = (TP+TN)/(TP+TN+FP+FN)\n", "PR = TP/(TP+FP) #precision\n", "TPR = TP/(TP+FN) #Recall or True positive rate\n", "FPR = FP/(FP+TN)\n", "F1Score = 2*(PR*TPR)/(PR+TPR)\n", "print(\"ACC: \" + str(ACC))\n", "print(\"PR: \" + str(PR))\n", "print(\"TPR: \" + str(TPR))\n", "print(\"FPR: \" + str(FPR))\n", "print(\"F1Score: \" + str(F1Score))\n", "import matplotlib.pyplot as plt\n", "import numpy\n", "from sklearn import metrics\n", "\n", "\n", "cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = result, display_labels = [True, False])\n", "\n", "cm_display.plot()\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "EJpkW1Ibmy7w", "outputId": "782a7032-fb76-4a75-add6-f0e2d3099b1c" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "4960/4960 [==============================] - 59s 12ms/step\n", "[[0.90649143 0.090948 ]\n", " [0.09350857 0.909052 ]]\n", "ACC: 0.9077717104763482\n", "PR: 0.9088185189042338\n", "TPR: 0.906491425484989\n", "FPR: 0.09094800453229258\n", "F1Score: 0.9076534806135734\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "## UNSW-NB15" ], "metadata": { "id": "XXzn-n1vx0F_" } }, { "cell_type": "code", "source": [ "import tensorflow as tf\n", "import numpy as np\n", "import pandas as pd\n", "from tensorflow import keras\n", "from tensorflow.keras import layers\n", "from keras import losses\n", "from keras import optimizers\n", "from keras import metrics\n", "import math\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ], "metadata": { "id": "_8cYJEktx9_V" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "095fab46-02b6-4b4b-f4de-6e6afb89fb90", "id": "l68KcWcMx0GM" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ] }, { "cell_type": "code", "source": [ "CSV_HEADER = [\n", "\"srcip\",\n", "\"sport\",\n", "\"dstip\",\n", "\"dsport\",\n", "\"proto\",\n", "\"state\",\n", "\"dur\",\n", "\"sbytes\",\n", "\"dbytes\",\n", "\"sttl\",\n", "\"dttl\",\n", "\"sloss\",\n", "\"dloss\",\n", "\"service\",\n", "\"Sload\",\n", "\"Dload\",\n", "\"Spkts\",\n", "\"Dpkts\",\n", "\"swin\",\n", "\"dwin\",\n", "\"stcpb\",\n", "\"dtcpb\",\n", "\"smeansz\",\n", "\"dmeansz\",\n", "\"trans_depth\",\n", "\"res_bdy_len\",\n", "\"Sjit\",\n", "\"Djit\",\n", "\"Stime\",\n", "\"Ltime\",\n", "\"Sintpkt\",\n", "\"Dintpkt\",\n", "\"tcprtt\",\n", "\"synack\",\n", "\"ackdat\",\n", "\"is_sm_ips_ports\",\n", "\"ct_state_ttl\",\n", "\"ct_flw_http_mthd\",\n", "\"is_ftp_login\",\n", "\"ct_ftp_cmd\",\n", "\"ct_srv_src\",\n", "\"ct_srv_dst\",\n", "\"ct_dst_ltm\",\n", "\"ct_src_ ltm\",\n", "\"ct_src_dport_ltm\",\n", "\"ct_dst_sport_ltm\",\n", "\"ct_dst_src_ltm\",\n", "\"attack_cat\",\n", "\"Label\"\n", "]" ], "metadata": { "id": "IqHizbGsyBNd" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import pandas as pd\n", "import glob\n", "import os\n", "\n", "path = r'C:\\DRO\\DCL_rawdata_files' # use your path\n", "all_files = glob.glob(os.path.join(path , \"/content/drive/MyDrive/datasets/UNSW-NB15/*.csv\"))\n", "\n", "li = []\n", "\n", "for filename in all_files:\n", " df = pd.read_csv(filename, index_col=None, header=None, names=CSV_HEADER,low_memory=False)\n", " li.append(df)\n", "\n", "frame = pd.concat(li, axis=0, ignore_index=True)\n", "pd.set_option('display.max_columns', None)\n", "frame" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 461 }, "id": "4YYTcz4sx0GM", "outputId": "5292e7a1-af35-42d3-cdfc-f471a7778a4b" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " srcip sport dstip dsport proto state dur \\\n", "0 59.166.0.9 7045 149.171.126.7 25 tcp FIN 0.201886 \n", "1 59.166.0.9 9685 149.171.126.2 80 tcp FIN 5.864748 \n", "2 59.166.0.2 1421 149.171.126.4 53 udp CON 0.001391 \n", "3 59.166.0.2 21553 149.171.126.2 25 tcp FIN 0.053948 \n", "4 59.166.0.8 45212 149.171.126.4 53 udp CON 0.000953 \n", "... ... ... ... ... ... ... ... \n", "2540042 59.166.0.8 12520 149.171.126.6 31010 tcp FIN 0.020383 \n", "2540043 59.166.0.0 18895 149.171.126.9 80 tcp FIN 1.402957 \n", "2540044 59.166.0.0 30103 149.171.126.5 5190 tcp FIN 0.007108 \n", "2540045 59.166.0.6 30388 149.171.126.5 111 udp CON 0.004435 \n", "2540046 59.166.0.0 6055 149.171.126.5 54145 tcp FIN 0.072974 \n", "\n", " sbytes dbytes sttl dttl sloss dloss service Sload \\\n", "0 37552 3380 31 29 18 8 smtp 1.459438e+06 \n", "1 19410 1087890 31 29 2 370 http 2.640454e+04 \n", "2 146 178 31 29 0 0 dns 4.198418e+05 \n", "3 37812 3380 31 29 19 8 smtp 5.503374e+06 \n", "4 146 178 31 29 0 0 dns 6.128017e+05 \n", "... ... ... ... ... ... ... ... ... \n", "2540042 320 1874 31 29 1 2 - 1.047932e+05 \n", "2540043 19410 1087890 31 29 2 370 http 1.103783e+05 \n", "2540044 2158 2464 31 29 6 6 - 2.328644e+06 \n", "2540045 568 304 31 29 0 0 - 7.684329e+05 \n", "2540046 4238 60788 31 29 7 30 - 4.582454e+05 \n", "\n", " Dload Spkts Dpkts swin dwin stcpb dtcpb \\\n", "0 1.307669e+05 52 42 255 255 1422136554 3572668484 \n", "1 1.481983e+06 364 746 255 255 389619597 394688654 \n", "2 5.118620e+05 2 2 0 0 0 0 \n", "3 4.893601e+05 54 42 255 255 4047523379 1903327524 \n", "4 7.471144e+05 2 2 0 0 0 0 \n", "... ... ... ... ... ... ... ... \n", "2540042 6.436736e+05 6 8 255 255 3208686479 3225486168 \n", "2540043 6.195098e+06 364 746 255 255 283296697 2429736754 \n", "2540044 2.658413e+06 24 24 255 255 703293844 2848960529 \n", "2540045 4.112740e+05 4 4 0 0 0 0 \n", "2540046 6.571546e+06 72 72 255 255 1003293149 1003585034 \n", "\n", " smeansz dmeansz trans_depth res_bdy_len Sjit Djit \\\n", "0 722 80 0 0 456.043567 15.530109 \n", "1 53 1458 1 0 1031.366423 690.219581 \n", "2 73 89 0 0 0.000000 0.000000 \n", "3 700 80 0 0 65.909688 3.155258 \n", "4 73 89 0 0 0.000000 0.000000 \n", "... ... ... ... ... ... ... \n", "2540042 53 234 0 0 212.810729 3.079195 \n", "2540043 53 1458 1 3924 203.808900 114.173588 \n", "2540044 90 103 0 0 17.627831 0.432619 \n", "2540045 142 76 0 0 1.638604 1.390643 \n", "2540046 59 844 0 0 62.045310 61.899776 \n", "\n", " Stime Ltime Sintpkt Dintpkt tcprtt synack \\\n", "0 1424250009 1424250009 3.943843 4.912488 0.000590 0.000473 \n", "1 1424250003 1424250009 16.155447 7.871279 0.000771 0.000638 \n", "2 1424250009 1424250009 0.009000 0.002000 0.000000 0.000000 \n", "3 1424250009 1424250009 1.011547 1.302561 0.000674 0.000540 \n", "4 1424250009 1424250009 0.009000 0.004000 0.000000 0.000000 \n", "... ... ... ... ... ... ... \n", "2540042 1421955842 1421955842 4.007400 2.027429 0.006386 0.006189 \n", "2540043 1421955841 1421955842 3.864028 1.882421 0.000712 0.000550 \n", "2540044 1421955842 1421955842 0.274261 0.285478 0.000657 0.000532 \n", "2540045 1421955842 1421955842 1.165667 0.987333 0.000000 0.000000 \n", "2540046 1421955842 1421955842 1.022690 0.997042 0.002317 0.002173 \n", "\n", " ackdat is_sm_ips_ports ct_state_ttl ct_flw_http_mthd \\\n", "0 0.000117 0 0 NaN \n", "1 0.000133 0 0 1.0 \n", "2 0.000000 0 0 NaN \n", "3 0.000134 0 0 NaN \n", "4 0.000000 0 0 NaN \n", "... ... ... ... ... \n", "2540042 0.000197 0 0 0.0 \n", "2540043 0.000162 0 0 4.0 \n", "2540044 0.000125 0 0 0.0 \n", "2540045 0.000000 0 0 0.0 \n", "2540046 0.000144 0 0 0.0 \n", "\n", " is_ftp_login ct_ftp_cmd ct_srv_src ct_srv_dst ct_dst_ltm \\\n", "0 NaN 2 2 7 \n", "1 NaN 3 1 4 \n", "2 NaN 3 5 2 \n", "3 NaN 1 1 4 \n", "4 NaN 2 5 2 \n", "... ... ... ... ... ... \n", "2540042 0.0 0 8 20 7 \n", "2540043 0.0 0 1 1 2 \n", "2540044 0.0 0 13 13 6 \n", "2540045 0.0 0 10 13 6 \n", "2540046 0.0 0 13 13 6 \n", "\n", " ct_src_ ltm ct_src_dport_ltm ct_dst_sport_ltm ct_dst_src_ltm \\\n", "0 4 1 1 3 \n", "1 4 1 1 1 \n", "2 7 1 1 4 \n", "3 7 1 1 3 \n", "4 1 1 1 2 \n", "... ... ... ... ... \n", "2540042 5 1 1 4 \n", "2540043 7 2 2 2 \n", "2540044 7 2 1 2 \n", "2540045 5 1 1 3 \n", "2540046 7 1 1 2 \n", "\n", " attack_cat Label \n", "0 NaN 0 \n", "1 NaN 0 \n", "2 NaN 0 \n", "3 NaN 0 \n", "4 NaN 0 \n", "... ... ... \n", "2540042 NaN 0 \n", "2540043 NaN 0 \n", "2540044 NaN 0 \n", "2540045 NaN 0 \n", "2540046 NaN 0 \n", "\n", "[2540047 rows x 49 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
srcipsportdstipdsportprotostatedursbytesdbytessttldttlslossdlossserviceSloadDloadSpktsDpktsswindwinstcpbdtcpbsmeanszdmeansztrans_depthres_bdy_lenSjitDjitStimeLtimeSintpktDintpkttcprttsynackackdatis_sm_ips_portsct_state_ttlct_flw_http_mthdis_ftp_loginct_ftp_cmdct_srv_srcct_srv_dstct_dst_ltmct_src_ ltmct_src_dport_ltmct_dst_sport_ltmct_dst_src_ltmattack_catLabel
059.166.0.97045149.171.126.725tcpFIN0.2018863755233803129188smtp1.459438e+061.307669e+055242255255142213655435726684847228000456.04356715.530109142425000914242500093.9438434.9124880.0005900.0004730.00011700NaNNaN2274113NaN0
159.166.0.99685149.171.126.280tcpFIN5.86474819410108789031292370http2.640454e+041.481983e+06364746255255389619597394688654531458101031.366423690.2195811424250003142425000916.1554477.8712790.0007710.0006380.000133001.0NaN3144111NaN0
259.166.0.21421149.171.126.453udpCON0.001391146178312900dns4.198418e+055.118620e+052200007389000.0000000.000000142425000914242500090.0090000.0020000.0000000.0000000.00000000NaNNaN3527114NaN0
359.166.0.221553149.171.126.225tcpFIN0.0539483781233803129198smtp5.503374e+064.893601e+05544225525540475233791903327524700800065.9096883.155258142425000914242500091.0115471.3025610.0006740.0005400.00013400NaNNaN1147113NaN0
459.166.0.845212149.171.126.453udpCON0.000953146178312900dns6.128017e+057.471144e+052200007389000.0000000.000000142425000914242500090.0090000.0040000.0000000.0000000.00000000NaNNaN2521112NaN0
......................................................................................................................................................
254004259.166.0.812520149.171.126.631010tcpFIN0.0203833201874312912-1.047932e+056.436736e+0568255255320868647932254861685323400212.8107293.079195142195584214219558424.0074002.0274290.0063860.0061890.000197000.00.0082075114NaN0
254004359.166.0.018895149.171.126.980tcpFIN1.40295719410108789031292370http1.103783e+056.195098e+06364746255255283296697242973675453145813924203.808900114.173588142195584114219558423.8640281.8824210.0007120.0005500.000162004.00.001127222NaN0
254004459.166.0.030103149.171.126.55190tcpFIN0.00710821582464312966-2.328644e+062.658413e+0624242552557032938442848960529901030017.6278310.432619142195584214219558420.2742610.2854780.0006570.0005320.000125000.00.00131367212NaN0
254004559.166.0.630388149.171.126.5111udpCON0.004435568304312900-7.684329e+054.112740e+0544000014276001.6386041.390643142195584214219558421.1656670.9873330.0000000.0000000.000000000.00.00101365113NaN0
254004659.166.0.06055149.171.126.554145tcpFIN0.0729744238607883129730-4.582454e+056.571546e+06727225525510032931491003585034598440062.04531061.899776142195584214219558421.0226900.9970420.0023170.0021730.000144000.00.00131367112NaN0
\n", "

2540047 rows × 49 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 4 } ] }, { "cell_type": "code", "source": [ "frame.srcip = frame.srcip.astype('category').cat.codes\n", "frame.dstip = frame.dstip.astype('category').cat.codes\n", "frame.proto = frame.proto.astype('category').cat.codes\n", "frame.state = frame.state.astype('category').cat.codes\n", "frame.service = frame.service.astype('category').cat.codes\n", "frame.ct_flw_http_mthd = frame.ct_flw_http_mthd.astype('category').cat.codes\n", "frame.is_ftp_login = frame.is_ftp_login.astype('category').cat.codes\n", "frame.ct_ftp_cmd = frame.ct_ftp_cmd.astype('category').cat.codes\n", "frame['Label'] = frame['Label'].astype(str)\n", "frame['Label'] = frame['Label'].str.replace(\"1\", \"anomaly\")\n", "frame['Label'] = frame['Label'].str.replace(\"0\", \"normal\")\n", "frame = frame.drop('attack_cat', axis=1)\n", "frame" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 461 }, "id": "RqDbXr3nyRYg", "outputId": "91d7d17c-bfca-4f7a-981a-804c39f5844a" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " srcip sport dstip dsport proto state dur sbytes dbytes \\\n", "0 42 7045 25 25 114 5 0.201886 37552 3380 \n", "1 42 9685 20 80 114 5 5.864748 19410 1087890 \n", "2 35 1421 22 53 120 2 0.001391 146 178 \n", "3 35 21553 20 25 114 5 0.053948 37812 3380 \n", "4 41 45212 22 53 120 2 0.000953 146 178 \n", "... ... ... ... ... ... ... ... ... ... \n", "2540042 41 12520 24 31010 114 5 0.020383 320 1874 \n", "2540043 33 18895 27 80 114 5 1.402957 19410 1087890 \n", "2540044 33 30103 23 5190 114 5 0.007108 2158 2464 \n", "2540045 39 30388 23 111 120 2 0.004435 568 304 \n", "2540046 33 6055 23 54145 114 5 0.072974 4238 60788 \n", "\n", " sttl dttl sloss dloss service Sload Dload Spkts \\\n", "0 31 29 18 8 9 1.459438e+06 1.307669e+05 52 \n", "1 31 29 2 370 5 2.640454e+04 1.481983e+06 364 \n", "2 31 29 0 0 2 4.198418e+05 5.118620e+05 2 \n", "3 31 29 19 8 9 5.503374e+06 4.893601e+05 54 \n", "4 31 29 0 0 2 6.128017e+05 7.471144e+05 2 \n", "... ... ... ... ... ... ... ... ... \n", "2540042 31 29 1 2 0 1.047932e+05 6.436736e+05 6 \n", "2540043 31 29 2 370 5 1.103783e+05 6.195098e+06 364 \n", "2540044 31 29 6 6 0 2.328644e+06 2.658413e+06 24 \n", "2540045 31 29 0 0 0 7.684329e+05 4.112740e+05 4 \n", "2540046 31 29 7 30 0 4.582454e+05 6.571546e+06 72 \n", "\n", " Dpkts swin dwin stcpb dtcpb smeansz dmeansz \\\n", "0 42 255 255 1422136554 3572668484 722 80 \n", "1 746 255 255 389619597 394688654 53 1458 \n", "2 2 0 0 0 0 73 89 \n", "3 42 255 255 4047523379 1903327524 700 80 \n", "4 2 0 0 0 0 73 89 \n", "... ... ... ... ... ... ... ... \n", "2540042 8 255 255 3208686479 3225486168 53 234 \n", "2540043 746 255 255 283296697 2429736754 53 1458 \n", "2540044 24 255 255 703293844 2848960529 90 103 \n", "2540045 4 0 0 0 0 142 76 \n", "2540046 72 255 255 1003293149 1003585034 59 844 \n", "\n", " trans_depth res_bdy_len Sjit Djit Stime \\\n", "0 0 0 456.043567 15.530109 1424250009 \n", "1 1 0 1031.366423 690.219581 1424250003 \n", "2 0 0 0.000000 0.000000 1424250009 \n", "3 0 0 65.909688 3.155258 1424250009 \n", "4 0 0 0.000000 0.000000 1424250009 \n", "... ... ... ... ... ... \n", "2540042 0 0 212.810729 3.079195 1421955842 \n", "2540043 1 3924 203.808900 114.173588 1421955841 \n", "2540044 0 0 17.627831 0.432619 1421955842 \n", "2540045 0 0 1.638604 1.390643 1421955842 \n", "2540046 0 0 62.045310 61.899776 1421955842 \n", "\n", " Ltime Sintpkt Dintpkt tcprtt synack ackdat \\\n", "0 1424250009 3.943843 4.912488 0.000590 0.000473 0.000117 \n", "1 1424250009 16.155447 7.871279 0.000771 0.000638 0.000133 \n", "2 1424250009 0.009000 0.002000 0.000000 0.000000 0.000000 \n", "3 1424250009 1.011547 1.302561 0.000674 0.000540 0.000134 \n", "4 1424250009 0.009000 0.004000 0.000000 0.000000 0.000000 \n", "... ... ... ... ... ... ... \n", "2540042 1421955842 4.007400 2.027429 0.006386 0.006189 0.000197 \n", "2540043 1421955842 3.864028 1.882421 0.000712 0.000550 0.000162 \n", "2540044 1421955842 0.274261 0.285478 0.000657 0.000532 0.000125 \n", "2540045 1421955842 1.165667 0.987333 0.000000 0.000000 0.000000 \n", "2540046 1421955842 1.022690 0.997042 0.002317 0.002173 0.000144 \n", "\n", " is_sm_ips_ports ct_state_ttl ct_flw_http_mthd is_ftp_login \\\n", "0 0 0 -1 -1 \n", "1 0 0 1 -1 \n", "2 0 0 -1 -1 \n", "3 0 0 -1 -1 \n", "4 0 0 -1 -1 \n", "... ... ... ... ... \n", "2540042 0 0 0 0 \n", "2540043 0 0 4 0 \n", "2540044 0 0 0 0 \n", "2540045 0 0 0 0 \n", "2540046 0 0 0 0 \n", "\n", " ct_ftp_cmd ct_srv_src ct_srv_dst ct_dst_ltm ct_src_ ltm \\\n", "0 8 2 2 7 4 \n", "1 8 3 1 4 4 \n", "2 8 3 5 2 7 \n", "3 8 1 1 4 7 \n", "4 8 2 5 2 1 \n", "... ... ... ... ... ... \n", "2540042 0 8 20 7 5 \n", "2540043 0 1 1 2 7 \n", "2540044 0 13 13 6 7 \n", "2540045 0 10 13 6 5 \n", "2540046 0 13 13 6 7 \n", "\n", " ct_src_dport_ltm ct_dst_sport_ltm ct_dst_src_ltm Label \n", "0 1 1 3 normal \n", "1 1 1 1 normal \n", "2 1 1 4 normal \n", "3 1 1 3 normal \n", "4 1 1 2 normal \n", "... ... ... ... ... \n", "2540042 1 1 4 normal \n", "2540043 2 2 2 normal \n", "2540044 2 1 2 normal \n", "2540045 1 1 3 normal \n", "2540046 1 1 2 normal \n", "\n", "[2540047 rows x 48 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
srcipsportdstipdsportprotostatedursbytesdbytessttldttlslossdlossserviceSloadDloadSpktsDpktsswindwinstcpbdtcpbsmeanszdmeansztrans_depthres_bdy_lenSjitDjitStimeLtimeSintpktDintpkttcprttsynackackdatis_sm_ips_portsct_state_ttlct_flw_http_mthdis_ftp_loginct_ftp_cmdct_srv_srcct_srv_dstct_dst_ltmct_src_ ltmct_src_dport_ltmct_dst_sport_ltmct_dst_src_ltmLabel
0427045252511450.201886375523380312918891.459438e+061.307669e+055242255255142213655435726684847228000456.04356715.530109142425000914242500093.9438434.9124880.0005900.0004730.00011700-1-182274113normal
1429685208011455.8647481941010878903129237052.640454e+041.481983e+06364746255255389619597394688654531458101031.366423690.2195811424250003142425000916.1554477.8712790.0007710.0006380.000133001-183144111normal
2351421225312020.00139114617831290024.198418e+055.118620e+052200007389000.0000000.000000142425000914242500090.0090000.0020000.0000000.0000000.00000000-1-183527114normal
33521553202511450.053948378123380312919895.503374e+064.893601e+05544225525540475233791903327524700800065.9096883.155258142425000914242500091.0115471.3025610.0006740.0005400.00013400-1-181147113normal
44145212225312020.00095314617831290026.128017e+057.471144e+052200007389000.0000000.000000142425000914242500090.0090000.0040000.0000000.0000000.00000000-1-182521112normal
...................................................................................................................................................
25400424112520243101011450.020383320187431291201.047932e+056.436736e+0568255255320868647932254861685323400212.8107293.079195142195584214219558424.0074002.0274290.0063860.0061890.0001970000082075114normal
25400433318895278011451.4029571941010878903129237051.103783e+056.195098e+06364746255255283296697242973675453145813924203.808900114.173588142195584114219558423.8640281.8824210.0007120.0005500.000162004001127222normal
2540044333010323519011450.0071082158246431296602.328644e+062.658413e+0624242552557032938442848960529901030017.6278310.432619142195584214219558420.2742610.2854780.0006570.0005320.00012500000131367212normal
254004539303882311112020.00443556830431290007.684329e+054.112740e+0544000014276001.6386041.390643142195584214219558421.1656670.9873330.0000000.0000000.00000000000101365113normal
2540046336055235414511450.072974423860788312973004.582454e+056.571546e+06727225525510032931491003585034598440062.04531061.899776142195584214219558421.0226900.9970420.0023170.0021730.00014400000131367112normal
\n", "

2540047 rows × 48 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 5 } ] }, { "cell_type": "code", "source": [ "column_headers = list(frame.columns.values)\n", "column_headers" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "129f02d4-aa06-4207-9bb2-155ec6037ae3", "id": "7SRWhQp9x0GN" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['srcip',\n", " 'sport',\n", " 'dstip',\n", " 'dsport',\n", " 'proto',\n", " 'state',\n", " 'dur',\n", " 'sbytes',\n", " 'dbytes',\n", " 'sttl',\n", " 'dttl',\n", " 'sloss',\n", " 'dloss',\n", " 'service',\n", " 'Sload',\n", " 'Dload',\n", " 'Spkts',\n", " 'Dpkts',\n", " 'swin',\n", " 'dwin',\n", " 'stcpb',\n", " 'dtcpb',\n", " 'smeansz',\n", " 'dmeansz',\n", " 'trans_depth',\n", " 'res_bdy_len',\n", " 'Sjit',\n", " 'Djit',\n", " 'Stime',\n", " 'Ltime',\n", " 'Sintpkt',\n", " 'Dintpkt',\n", " 'tcprtt',\n", " 'synack',\n", " 'ackdat',\n", " 'is_sm_ips_ports',\n", " 'ct_state_ttl',\n", " 'ct_flw_http_mthd',\n", " 'is_ftp_login',\n", " 'ct_ftp_cmd',\n", " 'ct_srv_src',\n", " 'ct_srv_dst',\n", " 'ct_dst_ltm',\n", " 'ct_src_ ltm',\n", " 'ct_src_dport_ltm',\n", " 'ct_dst_sport_ltm',\n", " 'ct_dst_src_ltm',\n", " 'Label']" ] }, "metadata": {}, "execution_count": 6 } ] }, { "cell_type": "code", "source": [ "CSV_HEADER = column_headers\n", "df = frame\n", "def Remove_Outlier_Indices(df):\n", " Q1 = df.quantile(0.02)\n", " Q3 = df.quantile(0.98)\n", " IQR = Q3 - Q1\n", " #trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR)))\n", " trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)\n", " return trueList\n", "\n", "nonOutlierList = Remove_Outlier_Indices(df)\n", "new_data = df[nonOutlierList]\n", "\n", "df = new_data\n", "df = df.reset_index(drop=True)\n", "del new_data\n", "del nonOutlierList\n", "del li\n", "del frame\n", "import gc\n", "gc.collect()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "8c16535c-9431-40a2-d3e2-a73bc674d019", "id": "T5I2816Kx0GN" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":4: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " Q1 = df.quantile(0.02)\n", ":5: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " Q3 = df.quantile(0.98)\n", ":8: FutureWarning: Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`\n", " trueList = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "49" ] }, "metadata": {}, "execution_count": 7 } ] }, { "cell_type": "code", "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "X = df.drop(['Label'], axis=1)\n", "Y = df[\"Label\"].astype('category').cat.codes\n", "clf = RandomForestClassifier()\n", "clf.fit(X, Y)\n", "features = pd.Series(clf.feature_importances_, index=X.columns)\n", "features.sort_values(ascending=False, inplace=True)\n", "print(features.head(10))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "a8522517-7f2d-41f5-a92c-8308f255382c", "id": "r7pBpWdLx0GP" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "ct_state_ttl 0.204888\n", "sttl 0.152128\n", "srcip 0.092287\n", "sbytes 0.076845\n", "smeansz 0.073014\n", "dttl 0.051160\n", "dstip 0.041404\n", "dmeansz 0.034131\n", "Dload 0.030871\n", "Dpkts 0.027830\n", "dtype: float64\n" ] } ] }, { "cell_type": "code", "source": [ "fs = features.head(10).to_dict()\n", "fs" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "c0183c77-c508-4c47-801c-853fe74567be", "id": "dlYFTup_x0GP" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'ct_state_ttl': 0.20488763325717038,\n", " 'sttl': 0.1521284631007735,\n", " 'srcip': 0.09228731567941095,\n", " 'sbytes': 0.07684506148216279,\n", " 'smeansz': 0.07301405337046245,\n", " 'dttl': 0.05116042357030089,\n", " 'dstip': 0.04140361232353322,\n", " 'dmeansz': 0.03413087150829379,\n", " 'Dload': 0.030870500320368942,\n", " 'Dpkts': 0.02783016435570764}" ] }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [ "columns = []\n", "for k in fs:\n", " columns.append(str(k))\n", "columns" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "7ccaa0c4-5f38-43e8-9333-5411caedbfce", "id": "ZGxa18v3x0GQ" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['ct_state_ttl',\n", " 'sttl',\n", " 'srcip',\n", " 'sbytes',\n", " 'smeansz',\n", " 'dttl',\n", " 'dstip',\n", " 'dmeansz',\n", " 'Dload',\n", " 'Dpkts']" ] }, "metadata": {}, "execution_count": 11 } ] }, { "cell_type": "code", "source": [ "principalDf = pd.DataFrame(data = X\n", " , columns = columns)\n", "finalDf = pd.concat([principalDf, df[\"Label\"]], axis = 1)\n", "finalDf" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "outputId": "b8abcf74-2b37-4d21-ce8d-8a39d6377cda", "id": "y56kSk6wx0GQ" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ct_state_ttl sttl srcip sbytes smeansz dttl dstip dmeansz \\\n", "0 0 31 42 37552 722 29 25 80 \n", "1 0 31 42 19410 53 29 20 1458 \n", "2 0 31 35 146 73 29 22 89 \n", "3 0 31 35 37812 700 29 20 80 \n", "4 0 31 41 146 73 29 22 89 \n", "... ... ... ... ... ... ... ... ... \n", "2438669 0 31 41 320 53 29 24 234 \n", "2438670 0 31 33 19410 53 29 27 1458 \n", "2438671 0 31 33 2158 90 29 23 103 \n", "2438672 0 31 39 568 142 29 23 76 \n", "2438673 0 31 33 4238 59 29 23 844 \n", "\n", " Dload Dpkts Label \n", "0 1.307669e+05 42 normal \n", "1 1.481983e+06 746 normal \n", "2 5.118620e+05 2 normal \n", "3 4.893601e+05 42 normal \n", "4 7.471144e+05 2 normal \n", "... ... ... ... \n", "2438669 6.436736e+05 8 normal \n", "2438670 6.195098e+06 746 normal \n", "2438671 2.658413e+06 24 normal \n", "2438672 4.112740e+05 4 normal \n", "2438673 6.571546e+06 72 normal \n", "\n", "[2438674 rows x 11 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ct_state_ttlsttlsrcipsbytessmeanszdttldstipdmeanszDloadDpktsLabel
003142375527222925801.307669e+0542normal
1031421941053292014581.481983e+06746normal
203135146732922895.118620e+052normal
303135378127002920804.893601e+0542normal
403141146732922897.471144e+052normal
....................................
2438669031413205329242346.436736e+058normal
2438670031331941053292714586.195098e+06746normal
24386710313321589029231032.658413e+0624normal
2438672031395681422923764.112740e+054normal
24386730313342385929238446.571546e+0672normal
\n", "

2438674 rows × 11 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split\n", "train_data, test_data = train_test_split(finalDf, test_size=0.25)\n", "train_data_file = \"train_data.csv\"\n", "test_data_file = \"test_data.csv\"\n", "\n", "train_data.to_csv(train_data_file, index=False, header=False)\n", "test_data.to_csv(test_data_file, index=False, header=False)" ], "metadata": { "id": "g9hl51Nyx0GQ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "del finalDf\n", "del principalDf\n", "del train_data\n", "del test_data\n", "del clf\n", "del X\n", "del Y" ], "metadata": { "id": "wUSnhmoax0GR" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "CSV_HEADER = []\n", "for x in columns:\n", " CSV_HEADER.append(x)\n", "CSV_HEADER.append(\"Label\")\n", "\n", "# A list of the numerical feature names.\n", "NUMERIC_FEATURE_NAMES = columns\n", "# A dictionary of the categorical features and their vocabulary.\n", "CATEGORICAL_FEATURES_WITH_VOCABULARY = {\n", "}\n", "# A list of the columns to ignore from the dataset.\n", "IGNORE_COLUMN_NAMES = []\n", "# A list of the categorical feature names.\n", "CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())\n", "# A list of all the input features.\n", "FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES\n", "# A list of column default values for each feature.\n", "COLUMN_DEFAULTS = [\n", " [0.0] if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else [\"NA\"]\n", " for feature_name in CSV_HEADER\n", "]\n", "# The name of the target feature.\n", "TARGET_FEATURE_NAME = \"Label\"\n", "# A list of the labels of the target features.\n", "TARGET_LABELS = [\"normal\", \"anomaly\"]\n", "\n", "from tensorflow.keras.layers import StringLookup\n", "\n", "target_label_lookup = StringLookup(\n", " vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0\n", ")\n", "\n", "\n", "def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):\n", " dataset = tf.data.experimental.make_csv_dataset(\n", " csv_file_path,\n", " batch_size=batch_size,\n", " column_names=CSV_HEADER,\n", " column_defaults=COLUMN_DEFAULTS,\n", " label_name=TARGET_FEATURE_NAME,\n", " num_epochs=1,\n", " header=False,\n", " na_value=\"?\",\n", " shuffle=shuffle,\n", " ).map(lambda features, target: (features, target_label_lookup(target)))\n", " return dataset.cache()\n", "\n", "def create_model_inputs():\n", " inputs = {}\n", " for feature_name in FEATURE_NAMES:\n", " if feature_name in NUMERIC_FEATURE_NAMES:\n", " inputs[feature_name] = layers.Input(\n", " name=feature_name, shape=(), dtype=tf.float32\n", " )\n", " else:\n", " inputs[feature_name] = layers.Input(\n", " name=feature_name, shape=(), dtype=tf.string\n", " )\n", " return inputs\n", "\n", "def encode_inputs(inputs):\n", " encoded_features = []\n", " for feature_name in inputs:\n", " if feature_name in CATEGORICAL_FEATURE_NAMES:\n", " vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]\n", " #print(vocabulary)\n", " # Create a lookup to convert a string values to an integer indices.\n", " # Since we are not using a mask token, nor expecting any out of vocabulary\n", " # (oov) token, we set mask_token to None and num_oov_indices to 0.\n", " lookup = StringLookup(\n", " vocabulary=vocabulary, mask_token=None, num_oov_indices=0\n", " )\n", " # Convert the string input values into integer indices.\n", " value_index = lookup(inputs[feature_name])\n", " embedding_dims = int(math.sqrt(lookup.vocabulary_size()))\n", " # Create an embedding layer with the specified dimensions.\n", " embedding = layers.Embedding(\n", " input_dim=lookup.vocabulary_size(), output_dim=embedding_dims\n", " )\n", " # Convert the index values to embedding representations.\n", " encoded_feature = embedding(value_index)\n", " else:\n", " # Use the numerical features as-is.\n", " encoded_feature = inputs[feature_name]\n", " if inputs[feature_name].shape[-1] is None:\n", " encoded_feature = tf.expand_dims(encoded_feature, -1)\n", "\n", " encoded_features.append(encoded_feature)\n", "\n", " encoded_features = layers.concatenate(encoded_features)\n", " return encoded_features\n", "\n", "class NeuralDecisionTree(keras.Model):\n", " def __init__(self, depth, num_features, used_features_rate, num_classes):\n", " super().__init__()\n", " self.depth = depth\n", " self.num_leaves = 2 ** depth\n", " self.num_classes = num_classes\n", "\n", " # Create a mask for the randomly selected features.\n", " num_used_features = int(num_features * used_features_rate)\n", " one_hot = np.eye(num_features)\n", " sampled_feature_indicies = np.random.choice(\n", " np.arange(num_features), num_used_features, replace=False\n", " )\n", " self.used_features_mask = one_hot[sampled_feature_indicies]\n", "\n", " # Initialize the weights of the classes in leaves.\n", " self.pi = tf.Variable(\n", " initial_value=tf.random_normal_initializer()(\n", " shape=[self.num_leaves, self.num_classes]\n", " ),\n", " dtype=\"float32\",\n", " trainable=True,\n", " )\n", "\n", " # Initialize the stochastic routing layer.\n", " self.decision_fn = layers.Dense(\n", " units=self.num_leaves, activation=\"sigmoid\", name=\"decision\"\n", " )\n", "\n", " def call(self, features):\n", " batch_size = tf.shape(features)[0]\n", "\n", " # Apply the feature mask to the input features.\n", " features = tf.matmul(\n", " features, self.used_features_mask, transpose_b=True\n", " ) # [batch_size, num_used_features]\n", " # Compute the routing probabilities.\n", " decisions = tf.expand_dims(\n", " self.decision_fn(features), axis=2\n", " ) # [batch_size, num_leaves, 1]\n", " # Concatenate the routing probabilities with their complements.\n", " decisions = layers.concatenate(\n", " [decisions, 1 - decisions], axis=2\n", " ) # [batch_size, num_leaves, 2]\n", "\n", " mu = tf.ones([batch_size, 1, 1])\n", "\n", " begin_idx = 1\n", " end_idx = 2\n", " # Traverse the tree in breadth-first order.\n", " for level in range(self.depth):\n", " mu = tf.reshape(mu, [batch_size, -1, 1]) # [batch_size, 2 ** level, 1]\n", " mu = tf.tile(mu, (1, 1, 2)) # [batch_size, 2 ** level, 2]\n", " level_decisions = decisions[\n", " :, begin_idx:end_idx, :\n", " ] # [batch_size, 2 ** level, 2]\n", " mu = mu * level_decisions # [batch_size, 2**level, 2]\n", " begin_idx = end_idx\n", " end_idx = begin_idx + 2 ** (level + 1)\n", "\n", " mu = tf.reshape(mu, [batch_size, self.num_leaves]) # [batch_size, num_leaves]\n", " probabilities = keras.activations.softmax(self.pi) # [num_leaves, num_classes]\n", " outputs = tf.matmul(mu, probabilities) # [batch_size, num_classes]\n", " return outputs\n", "\n", "class NeuralDecisionForest(keras.Model):\n", " def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):\n", " super().__init__()\n", " self.ensemble = []\n", " # Initialize the ensemble by adding NeuralDecisionTree instances.\n", " # Each tree will have its own randomly selected input features to use.\n", " for _ in range(num_trees):\n", " self.ensemble.append(\n", " NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)\n", " )\n", "\n", " def call(self, inputs):\n", " # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.\n", " batch_size = tf.shape(inputs)[0]\n", " outputs = tf.zeros([batch_size, num_classes])\n", "\n", " # Aggregate the outputs of trees in the ensemble.\n", " for tree in self.ensemble:\n", " outputs += tree(inputs)\n", " # Divide the outputs by the ensemble size to get the average.\n", " outputs /= len(self.ensemble)\n", " return outputs\n", "learning_rate = 0.01\n", "batch_size = 128\n", "num_epochs = 10\n", "\n", "\n", "def run_experiment(model):\n", "\n", " # model.compile(\n", " # optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n", " # loss=keras.losses.SparseCategoricalCrossentropy(),\n", " # metrics=[keras.metrics.SparseCategoricalAccuracy()],\n", " # )\n", " model.compile(\n", " optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n", " loss=keras.losses.SparseCategoricalCrossentropy(),\n", " metrics=[metrics.SparseCategoricalAccuracy()],\n", " )\n", " print(\"Start training the model...\")\n", " train_dataset = get_dataset_from_csv(\n", " train_data_file, shuffle=True, batch_size=batch_size\n", " )\n", "\n", " model.fit(train_dataset, epochs=num_epochs)\n", " print(\"Model training finished\")\n", "\n", " print(\"Evaluating the model on the test data...\")\n", " test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)\n", "\n", " _, accuracy = model.evaluate(test_dataset)\n", " print(f\"Test accuracy: {round(accuracy * 100, 2)}%\")\n", " return model" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "icdTdlZHy2ki", "outputId": "8817a92e-a394-4017-d1c3-6e7924ac2240" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/numpy/core/numeric.py:2463: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", " return bool(asarray(a1 == a2).all())\n" ] } ] }, { "cell_type": "code", "source": [ "num_trees = 25\n", "depth = 5\n", "used_features_rate = 0.5\n", "num_classes = len(TARGET_LABELS)\n", "\n", "\n", "def create_forest_model():\n", " inputs = create_model_inputs()\n", " features = encode_inputs(inputs)\n", " features = layers.BatchNormalization()(features)\n", " num_features = features.shape[1]\n", "\n", " forest_model = NeuralDecisionForest(\n", " num_trees, depth, num_features, used_features_rate, num_classes\n", " )\n", "\n", " outputs = forest_model(features)\n", " model = keras.Model(inputs=inputs, outputs=outputs)\n", " return model\n", "\n", "\n", "forest_model = create_forest_model()\n", "\n", "finalModel = run_experiment(forest_model)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "49d868c0-6ce9-4386-9317-c1c467f33ded", "id": "Q16_BCDcx0GS" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Start training the model...\n", "Epoch 1/10\n", "14290/14290 [==============================] - 467s 31ms/step - loss: 0.0292 - sparse_categorical_accuracy: 0.9877\n", "Epoch 2/10\n", "14290/14290 [==============================] - 400s 28ms/step - loss: 0.0240 - sparse_categorical_accuracy: 0.9881\n", "Epoch 3/10\n", "14290/14290 [==============================] - 375s 26ms/step - loss: 0.0235 - sparse_categorical_accuracy: 0.9881\n", "Epoch 4/10\n", "14290/14290 [==============================] - 377s 26ms/step - loss: 0.0233 - sparse_categorical_accuracy: 0.9885\n", "Epoch 5/10\n", "14290/14290 [==============================] - 374s 26ms/step - loss: 0.0231 - sparse_categorical_accuracy: 0.9887\n", "Epoch 6/10\n", "14290/14290 [==============================] - 364s 25ms/step - loss: 0.0230 - sparse_categorical_accuracy: 0.9887\n", "Epoch 7/10\n", "14290/14290 [==============================] - 376s 26ms/step - loss: 0.0230 - sparse_categorical_accuracy: 0.9887\n", "Epoch 8/10\n", "14290/14290 [==============================] - 373s 26ms/step - loss: 0.0229 - sparse_categorical_accuracy: 0.9888\n", "Epoch 9/10\n", "14290/14290 [==============================] - 371s 26ms/step - loss: 0.0228 - sparse_categorical_accuracy: 0.9888\n", "Epoch 10/10\n", "14290/14290 [==============================] - 372s 26ms/step - loss: 0.0227 - sparse_categorical_accuracy: 0.9888\n", "Model training finished\n", "Evaluating the model on the test data...\n", "4764/4764 [==============================] - 63s 13ms/step - loss: 0.0219 - sparse_categorical_accuracy: 0.9888\n", "Test accuracy: 98.88%\n" ] } ] }, { "cell_type": "code", "source": [ "test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)\n", "colnames=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'target']\n", "data = pd.read_csv(\"test_data.csv\", names=colnames, header=None)\n", "data['target'].replace('normal', 0,inplace=True)\n", "data['target'].replace('anomaly', 1,inplace=True)\n", "y_test = data['target'].values # as a numpy array\n", "from sklearn.metrics import confusion_matrix\n", "y_prediction = finalModel.predict(test_dataset)\n", "y_prediction = np.argmax (y_prediction, axis = 1)\n", "result = confusion_matrix(y_test, y_prediction , normalize='pred')\n", "print(result)\n", "TP = result[0][0]\n", "FP = result[0][1]\n", "TN = result[1][1]\n", "FN = result[1][0]\n", "ACC = (TP+TN)/(TP+TN+FP+FN)\n", "PR = TP/(TP+FP) #precision\n", "TPR = TP/(TP+FN) #Recall or True positive rate\n", "FPR = FP/(FP+TN)\n", "F1Score = 2*(PR*TPR)/(PR+TPR)\n", "print(\"ACC: \" + str(ACC))\n", "print(\"PR: \" + str(PR))\n", "print(\"TPR: \" + str(TPR))\n", "print(\"FPR: \" + str(FPR))\n", "print(\"F1Score: \" + str(F1Score))\n", "import matplotlib.pyplot as plt\n", "import numpy\n", "from sklearn import metrics\n", "\n", "\n", "cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = result, display_labels = [True, False])\n", "\n", "cm_display.plot()\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 588 }, "outputId": "a6eac3c6-c3ee-42e8-e321-a4c157407e4c", "id": "23gZJAZax0GS" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "4764/4764 [==============================] - 64s 13ms/step\n", "[[9.99784982e-01 8.43356819e-02]\n", " [2.15017664e-04 9.15664318e-01]]\n", "ACC: 0.957724650201963\n", "PR: 0.9222082147198775\n", "TPR: 0.9997849823364876\n", "FPR: 0.08433568193256165\n", "F1Score: 0.9594309959851927\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] } ] }