{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "pycharm": { "name": "#%% Imporintg packages\n" } }, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.metrics import classification_report, confusion_matrix\n", "from sklearn.preprocessing import OrdinalEncoder\n", "import matplotlib.pyplot as plt\n", "from sklearn.metrics import *\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "pycharm": { "name": "#%% Reading befign dataset\n" } }, "outputs": [], "source": [ "df = pd.read_csv('D://OneDrive - Higher Education Commission//Collaborations//Daud Khan//Paper02//CICBellEXFDNS2021//Attack_heavy_Benign - Copy//Dataset-stateless_attack_heavyAll.csv', header=0)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "pycharm": { "name": "#%% Printing dataset summary\n" } }, "outputs": [ { "data": { "text/plain": " timestamp FQDN_count subdomain_length upper lower numeric \\\n0 52:31.2 25 8 0 10 9 \n1 52:31.7 25 8 0 10 9 \n2 52:31.9 15 0 11 0 3 \n3 52:32.1 24 7 0 10 8 \n4 52:32.5 24 7 0 10 8 \n... ... ... ... ... ... ... \n433359 53:18.0 26 9 0 10 10 \n433360 53:18.4 27 10 0 10 11 \n433361 53:18.8 27 10 0 10 11 \n433362 53:20.1 24 7 0 10 8 \n433363 53:20.5 24 7 0 10 8 \n\n entropy special labels labels_max labels_average longest_word \\\n0 2.556642 6 6 7 3.333333 2 \n1 2.556642 6 6 7 3.333333 2 \n2 3.625000 1 1 15 15.000000 C \n3 2.054029 6 6 7 3.166667 4 \n4 2.054029 6 6 7 3.166667 4 \n... ... ... ... ... ... ... \n433359 2.742338 6 6 7 3.500000 2 \n433360 2.767195 6 6 7 3.666667 2 \n433361 2.767195 6 6 7 3.666667 2 \n433362 2.054029 6 6 7 3.166667 4 \n433363 2.054029 6 6 7 3.166667 4 \n\n sld len subdomain Class \n0 192 12 1 1 \n1 192 12 1 1 \n2 DESKTOP-3JF04TC 16 0 1 \n3 224 11 1 1 \n4 224 11 1 1 \n... ... ... ... ... \n433359 192 13 1 0 \n433360 192 14 1 0 \n433361 192 14 1 0 \n433362 224 11 1 0 \n433363 224 11 1 0 \n\n[433364 rows x 16 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timestampFQDN_countsubdomain_lengthupperlowernumericentropyspeciallabelslabels_maxlabels_averagelongest_wordsldlensubdomainClass
052:31.225801092.5566426673.33333321921211
152:31.725801092.5566426673.33333321921211
252:31.915011033.625000111515.000000CDESKTOP-3JF04TC1601
352:32.124701082.0540296673.16666742241111
452:32.524701082.0540296673.16666742241111
...................................................
43335953:18.0269010102.7423386673.50000021921310
43336053:18.42710010112.7671956673.66666721921410
43336153:18.82710010112.7671956673.66666721921410
43336253:20.124701082.0540296673.16666742241110
43336353:20.524701082.0540296673.16666742241110
\n

433364 rows × 16 columns

\n
" }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "pycharm": { "name": "#%% Printing dataset header\n" } }, "outputs": [ { "data": { "text/plain": " timestamp FQDN_count subdomain_length upper lower numeric entropy \\\n0 52:31.2 25 8 0 10 9 2.556642 \n1 52:31.7 25 8 0 10 9 2.556642 \n2 52:31.9 15 0 11 0 3 3.625000 \n3 52:32.1 24 7 0 10 8 2.054029 \n4 52:32.5 24 7 0 10 8 2.054029 \n\n special labels labels_max labels_average longest_word sld \\\n0 6 6 7 3.333333 2 192 \n1 6 6 7 3.333333 2 192 \n2 1 1 15 15.000000 C DESKTOP-3JF04TC \n3 6 6 7 3.166667 4 224 \n4 6 6 7 3.166667 4 224 \n\n len subdomain Class \n0 12 1 1 \n1 12 1 1 \n2 16 0 1 \n3 11 1 1 \n4 11 1 1 ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timestampFQDN_countsubdomain_lengthupperlowernumericentropyspeciallabelslabels_maxlabels_averagelongest_wordsldlensubdomainClass
052:31.225801092.5566426673.33333321921211
152:31.725801092.5566426673.33333321921211
252:31.915011033.625000111515.000000CDESKTOP-3JF04TC1601
352:32.124701082.0540296673.16666742241111
452:32.524701082.0540296673.16666742241111
\n
" }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "pycharm": { "name": "#%% Printing dataset information\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 433364 entries, 0 to 433363\n", "Data columns (total 16 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 timestamp 433364 non-null object \n", " 1 FQDN_count 433364 non-null int64 \n", " 2 subdomain_length 433364 non-null int64 \n", " 3 upper 433364 non-null int64 \n", " 4 lower 433364 non-null int64 \n", " 5 numeric 433364 non-null int64 \n", " 6 entropy 433364 non-null float64\n", " 7 special 433364 non-null int64 \n", " 8 labels 433364 non-null int64 \n", " 9 labels_max 433364 non-null int64 \n", " 10 labels_average 433364 non-null float64\n", " 11 longest_word 433354 non-null object \n", " 12 sld 433364 non-null object \n", " 13 len 433364 non-null int64 \n", " 14 subdomain 433364 non-null int64 \n", " 15 Class 433364 non-null int64 \n", "dtypes: float64(2), int64(11), object(3)\n", "memory usage: 52.9+ MB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "pycharm": { "name": "#%% Selection of columns for encoding\n" } }, "outputs": [], "source": [ "cat_cols=['timestamp','longest_word','sld']" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "pycharm": { "name": "#%% Selection of encoder\n" } }, "outputs": [], "source": [ "encoder = OrdinalEncoder()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "pd.options.mode.chained_assignment = None" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "pycharm": { "name": "#%% Data ending function for IP addresses and dateTime columns\n" } }, "outputs": [], "source": [ "def encode(data):\n", " '''function to encode non-null data and replace it in the original data'''\n", " #retains only non-null values\n", " nonulls = np.array(data.dropna())\n", " #reshapes the data for encoding\n", " impute_reshape = nonulls.reshape(-1,1)\n", " #encode date\n", " impute_ordinal = encoder.fit_transform(impute_reshape)\n", " #Assign back encoded values to non-null values\n", " data.loc[data.notnull()] = np.squeeze(impute_ordinal)\n", " return data\n", "#create a for loop to iterate through each column in the data\n", "for columns in cat_cols:\n", " encode(df[columns])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "pycharm": { "name": "#%% Printing dataset header\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 433364 entries, 0 to 433363\n", "Data columns (total 16 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 timestamp 433364 non-null float64\n", " 1 FQDN_count 433364 non-null int64 \n", " 2 subdomain_length 433364 non-null int64 \n", " 3 upper 433364 non-null int64 \n", " 4 lower 433364 non-null int64 \n", " 5 numeric 433364 non-null int64 \n", " 6 entropy 433364 non-null float64\n", " 7 special 433364 non-null int64 \n", " 8 labels 433364 non-null int64 \n", " 9 labels_max 433364 non-null int64 \n", " 10 labels_average 433364 non-null float64\n", " 11 longest_word 433354 non-null object \n", " 12 sld 433364 non-null float64\n", " 13 len 433364 non-null int64 \n", " 14 subdomain 433364 non-null int64 \n", " 15 Class 433364 non-null int64 \n", "dtypes: float64(4), int64(11), object(1)\n", "memory usage: 52.9+ MB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "pycharm": { "name": "#%% Removinf NA values\n" } }, "outputs": [], "source": [ "df=df.dropna()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "pycharm": { "name": "#%% Selection of features from the dataset\n" } }, "outputs": [], "source": [ "X=df.loc[1:,:'subdomain']\n", "y=df.loc[1:,'Class']" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "pycharm": { "name": "#%% Splitting the dataset into train and test parts\n" } }, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)" ] }, { "cell_type": "code", "execution_count": 15, "outputs": [ { "data": { "text/plain": " timestamp FQDN_count subdomain_length upper lower numeric \\\n0 31504.0 25 8 0 10 9 \n1 31509.0 25 8 0 10 9 \n2 31511.0 15 0 11 0 3 \n3 31513.0 24 7 0 10 8 \n4 31517.0 24 7 0 10 8 \n... ... ... ... ... ... ... \n433359 31972.0 26 9 0 10 10 \n433360 31976.0 27 10 0 10 11 \n433361 31980.0 27 10 0 10 11 \n433362 31993.0 24 7 0 10 8 \n433363 31997.0 24 7 0 10 8 \n\n entropy special labels labels_max labels_average longest_word \\\n0 2.556642 6 6 7 3.333333 2 \n1 2.556642 6 6 7 3.333333 2 \n2 3.625000 1 1 15 15.000000 11 \n3 2.054029 6 6 7 3.166667 4 \n4 2.054029 6 6 7 3.166667 4 \n... ... ... ... ... ... ... \n433359 2.742338 6 6 7 3.500000 2 \n433360 2.767195 6 6 7 3.666667 2 \n433361 2.767195 6 6 7 3.666667 2 \n433362 2.054029 6 6 7 3.166667 4 \n433363 2.054029 6 6 7 3.166667 4 \n\n sld len subdomain Class \n0 110.0 12 1 1 \n1 110.0 12 1 1 \n2 467.0 16 0 1 \n3 178.0 11 1 1 \n4 178.0 11 1 1 \n... ... ... ... ... \n433359 110.0 13 1 0 \n433360 110.0 14 1 0 \n433361 110.0 14 1 0 \n433362 178.0 11 1 0 \n433363 178.0 11 1 0 \n\n[433354 rows x 16 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timestampFQDN_countsubdomain_lengthupperlowernumericentropyspeciallabelslabels_maxlabels_averagelongest_wordsldlensubdomainClass
031504.025801092.5566426673.3333332110.01211
131509.025801092.5566426673.3333332110.01211
231511.015011033.625000111515.00000011467.01601
331513.024701082.0540296673.1666674178.01111
431517.024701082.0540296673.1666674178.01111
...................................................
43335931972.0269010102.7423386673.5000002110.01310
43336031976.02710010112.7671956673.6666672110.01410
43336131980.02710010112.7671956673.6666672110.01410
43336231993.024701082.0540296673.1666674178.01110
43336331997.024701082.0540296673.1666674178.01110
\n

433354 rows × 16 columns

\n
" }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 16, "outputs": [], "source": [ "df.to_csv('Dataset-stateless_attack_heavyAllProcessed.csv')" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" } }, "nbformat": 4, "nbformat_minor": 1 }