From 002507e201adf1de2cee388d6dab7b6c7f122f43 Mon Sep 17 00:00:00 2001 From: mas1u19 <mas1u19@soton.ac.uk> Date: Wed, 31 May 2023 06:10:52 +0000 Subject: [PATCH] Upload New File --- assign_1.ipynb | 2514 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2514 insertions(+) create mode 100644 assign_1.ipynb diff --git a/assign_1.ipynb b/assign_1.ipynb new file mode 100644 index 0000000..6f8ee8c --- /dev/null +++ b/assign_1.ipynb @@ -0,0 +1,2514 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5354ae81", + "metadata": {}, + "source": [ + "# Coursework 2\n", + "\n", + "## Assignment 1\n", + "\n", + "We will use the `pandas` library for data processing, since the data is available in tabular form. We will first check the data for consistency and for missing or invalid entries. We will then look at the categorical data and map it to binary indicator (dummy) columns. Non-categorical data will be normalized.\n", + "\n", + "Since the target values are split between normal and anomalous samples, we can start with logistic regression to find a good fit. For training purposes we may further split the data into training and test sets. The test data that is provided will be used as the final validator of the accuracy of the ML model.\n", + "\n", + "First, let's do some basic analysis of the input data. (Please note that both the Training and Test data had missing headers, as described in the assignment. 
# %% Load the training data
import pandas as pd
import numpy as np

raw_data = pd.read_csv('TrainingDataBinary.csv')
raw_data.info()

# %% Dimensions of the raw table
raw_data.shape

# %% Number of rows that contain at least one missing value
# `axis` is passed by keyword: the positional form `any(1)` is deprecated
# (it triggered a FutureWarning) and is rejected by newer pandas releases.
len(raw_data[raw_data.isnull().any(axis=1)])

# %% Drop exact duplicate rows
# Reassign instead of mutating in place so the cell reads as a plain
# transformation of its input.
raw_data = raw_data.drop_duplicates(keep='first')
raw_data.info()

# %% Columns that contain at least one missing value
raw_data.columns[raw_data.isnull().any()].tolist()

# %% Class balance of the target column
raw_data['marker'].value_counts()

# %% Peek at a few rows
# A fixed seed keeps the displayed rows identical across re-runs.
raw_data.sample(n=5, random_state=42)
'R4-PA4:IH', 'R4-PM4:I', 'R4-PA5:IH', 'R4-PM5:I', 'R4-PA6:IH',\n", + " 'R4-PM6:I', 'R4-PA7:VH', 'R4-PM7:V', 'R4-PA8:VH', 'R4-PM8:V',\n", + " 'R4-PA9:VH', 'R4-PM9:V', 'R4-PA10:IH', 'R4-PM10:I', 'R4-PA11:IH',\n", + " 'R4-PM11:I', 'R4-PA12:IH', 'R4-PM12:I', 'R4:F', 'R4:DF', 'R4-PA:Z',\n", + " 'R4-PA:ZH', 'R4:S', 'control_panel_log1', 'control_panel_log2',\n", + " 'control_panel_log3', 'control_panel_log4', 'relay1_log',\n", + " 'relay2_log', 'relay3_log', 'relay4_log', 'snort_log1',\n", + " 'snort_log2', 'snort_log3', 'snort_log4', 'marker'], dtype=object)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "column_names = raw_data.columns.values\n", + "column_names" + ] + }, + { + "cell_type": "markdown", + "id": "131b20d5", + "metadata": {}, + "source": [ + "### Data analysis\n", + "\n", + "We understand that the first 116 columns hold current/voltage measurements, so these are continuous real-valued data. The next 12 columns are log switches (0/1), and the last column is the target, which we just observed to be split evenly between 0 and 1.\n", + "\n", + "Let's normalize the real-valued data, removing anomalous values, if any.\n", + "\n", + "We find the minimum, maximum, and number of unique values for each feature." 
# %% Per-column summary: minimum, maximum and number of distinct values
# Vectorised reductions over the whole frame; row i of `data_stat` describes
# the i-th column of `raw_data`.
minn = raw_data.min().tolist()
maxx = raw_data.max().tolist()
# dropna=False counts a missing value as a distinct value, which is what
# len(column.unique()) does.
uniq = raw_data.nunique(dropna=False).tolist()
idx = list(range(len(minn)))
data_stat = pd.DataFrame({"minn": minn, "maxx": maxx, "uniq": uniq}, index=idx)
data_stat.info()

# %% Features with the same value across the whole sample
data_stat[data_stat.uniq == 1]

# %% Candidate categorical features: fewer than 10 distinct values
data_stat[data_stat.uniq < 10]

# %% Distinct values of the four relay status columns
# Positions 28, 57, 86 and 115 are the low-cardinality, non-binary rows of the
# table above, i.e. 'R1:S', 'R2:S', 'R3:S' and 'R4:S'. The positions are bound
# to names once here instead of being repeated in one cell per column.
column_names_RS = column_names[[28, 57, 86, 115]].tolist()
{name: raw_data[name].unique() for name in column_names_RS}

# %% One-hot encode the status columns
# Work on a cleaned copy: infinities are treated as missing and rows with any
# missing value are dropped. `raw_data` itself is left untouched.
df = raw_data.replace([np.inf, -np.inf], np.nan).dropna()

# Cast to int first so float-typed status codes (e.g. 2058.0) produce the same
# dummy column names as integer-typed ones.
RS_frame = pd.get_dummies(
    df[column_names_RS].astype('int').astype('category'),
    dtype=int,
)
# A fixed seed keeps the displayed rows identical across re-runs.
RS_frame.sample(n=6, random_state=42)
"** R2-PM10:I:4924 (82%)\n", + "** R2-PA11:IH:2131 (35%)\n", + "** R2-PM11:I:1253 (20%)\n", + "** R2-PA12:IH:2162 (36%)\n", + "** R2-PM12:I:1255 (20%)\n", + "** R2:F:108 (1%)\n", + "** R2:DF:107 (1%)\n", + "** R2-PA:Z:6000 (100%)\n", + "** R2-PA:ZH:5886 (98%)\n", + "** R2:S:[0:99.83%,270336:0.17%]\n", + "** R3-PA1:VH:5534 (92%)\n", + "** R3-PM1:V:393 (6%)\n", + "** R3-PA2:VH:5536 (92%)\n", + "** R3-PM2:V:389 (6%)\n", + "** R3-PA3:VH:5539 (92%)\n", + "** R3-PM3:V:384 (6%)\n", + "** R3-PA4:IH:5602 (93%)\n", + "** R3-PM4:I:1601 (26%)\n", + "** R3-PA5:IH:5575 (92%)\n", + "** R3-PM5:I:1464 (24%)\n", + "** R3-PA6:IH:5614 (93%)\n", + "** R3-PM6:I:1571 (26%)\n", + "** R3-PA7:VH:5554 (92%)\n", + "** R3-PM7:V:392 (6%)\n", + "** R3-PA8:VH:50 (0%)\n", + "** R3-PM8:V:49 (0%)\n", + "** R3-PA9:VH:47 (0%)\n", + "** R3-PM9:V:46 (0%)\n", + "** R3-PA10:IH:5548 (92%)\n", + "** R3-PM10:I:1560 (26%)\n", + "** R3-PA11:IH:2063 (34%)\n", + "** R3-PM11:I:181 (3%)\n", + "** R3-PA12:IH:2053 (34%)\n", + "** R3-PM12:I:175 (2%)\n", + "** R3:F:92 (1%)\n", + "** R3:DF:111 (1%)\n", + "** R3-PA:Z:5999 (99%)\n", + "** R3-PA:ZH:5902 (98%)\n", + "** R3:S:[0:99.77%,270336:0.17%,2048:0.07%]\n", + "** R4-PA1:VH:5664 (94%)\n", + "** R4-PM1:V:1792 (29%)\n", + "** R4-PA2:VH:5668 (94%)\n", + "** R4-PM2:V:1781 (29%)\n", + "** R4-PA3:VH:5674 (94%)\n", + "** R4-PM3:V:1794 (29%)\n", + "** R4-PA4:IH:5699 (94%)\n", + "** R4-PM4:I:2866 (47%)\n", + "** R4-PA5:IH:5692 (94%)\n", + "** R4-PM5:I:2752 (45%)\n", + "** R4-PA6:IH:5693 (94%)\n", + "** R4-PM6:I:2859 (47%)\n", + "** R4-PA7:VH:5687 (94%)\n", + "** R4-PM7:V:1781 (29%)\n", + "** R4-PA8:VH:50 (0%)\n", + "** R4-PM8:V:48 (0%)\n", + "** R4-PA9:VH:45 (0%)\n", + "** R4-PM9:V:39 (0%)\n", + "** R4-PA10:IH:5695 (94%)\n", + "** R4-PM10:I:2826 (47%)\n", + "** R4-PA11:IH:2090 (34%)\n", + "** R4-PM11:I:696 (11%)\n", + "** R4-PA12:IH:2078 (34%)\n", + "** R4-PM12:I:726 (12%)\n", + "** R4:F:117 (1%)\n", + "** R4:DF:96 (1%)\n", + "** R4-PA:Z:5999 (99%)\n", + "** R4-PA:ZH:5891 
ENCODING = 'utf-8'


def expand_categories(values):
    """Describe the value distribution of a column as a compact string.

    Parameters
    ----------
    values : pandas.Series
        Column whose distinct values should be summarised.

    Returns
    -------
    str
        ``"[v1:p1%,v2:p2%,...]"`` where each ``p`` is the share of rows equal
        to ``v``, rounded to two decimals, in the order produced by
        ``Series.value_counts()`` (most frequent first). An empty series
        yields ``"[]"``.

    Notes
    -----
    Shares are relative to ``len(values)``. ``value_counts()`` skips missing
    values by default, so the shares of a column that contains missing values
    will not add up to 100.
    """
    counts = values.value_counts()
    total = float(len(values))
    # Pair each label with its count directly rather than looking the count
    # up again by label for every entry.
    shares = [
        "{}:{}%".format(label, round(100 * (count / total), 2))
        for label, count in zip(counts.index, counts.to_numpy())
    ]
    return "[{}]".format(",".join(shares))


def analyze(df, max_categories=10):
    """Print a one-line cardinality summary for every column of ``df``.

    A column with more than ``max_categories`` distinct values is reported as
    its number of distinct values plus that number as a truncated percentage
    of the row count. Any other column is reported with its full value
    distribution (see ``expand_categories``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    max_categories : int, optional
        Largest number of distinct values for which the full distribution is
        printed. Defaults to 10.

    Returns
    -------
    None
        The report is written to standard output.
    """
    print()
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in df.columns.values:
        unique_count = len(df[col].unique())
        if unique_count > max_categories:
            print("** {}:{} ({}%)".format(col, unique_count, int((unique_count / total) * 100)))
        else:
            print("** {}:{}".format(col, expand_categories(df[col])))
# %% Remove the raw status columns now that their dummy columns exist
column_names_updated = np.delete(column_names, [28, 57, 86, 115])
df = df.drop(columns=column_names_RS)

df.info()

# %% Standardise the continuous measurements
from sklearn.preprocessing import StandardScaler

# Everything except the trailing 13 columns (the 12 binary log columns and the
# target) is treated as a continuous feature.
unscaled_input = df.iloc[:, :-13]
scaler = StandardScaler()
scaled_input = scaler.fit(unscaled_input).transform(unscaled_input)
# Any NaN left after scaling is replaced by the constant 1.
nan_mask = np.isnan(scaled_input)
scaled_input[nan_mask] = 1
scaled_input.shape

# %% Wrap the scaled array in a labelled frame aligned with `df`
n_scaled = scaled_input.shape[1]
df1_scaled = pd.DataFrame(
    scaled_input,
    columns=column_names_updated[:n_scaled],
    index=df.index,
)
df1_scaled.info()
# %% Binary log flags: the 12 columns that sit just before the target column
df2 = df.iloc[:, slice(-13, -1)]
df2.info()

# %% One-hot encoded R*:S columns built earlier in the notebook
RS_frame.info()

# %% Design matrix: scaled measurements + log flags + R*:S dummies, side by side
X = pd.concat([df1_scaled, df2, RS_frame], axis="columns")
X.info()

# %% Target vector: the last name in column_names
y = df.loc[:, column_names[-1]]
# %%
y.shape

# %% Hold out 35% of the rows once; both candidate models are scored on this split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, test_size=0.35)

# %% Candidate 1: linear-kernel SVM
# Kept under its own name so it is not confused with the model used downstream.
svm_clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
svm_clf.score(X_test, y_test)

# %% Candidate 2: logistic regression
# `clf` is the name the evaluation and cross-validation cells below read.
clf = LogisticRegression(C=1e2, max_iter=5000).fit(X_train, y_train)
clf.score(X_test, y_test)
negatives: 984 \n", + "False positives: 67 \n", + "False negatives: 93 \n", + "True Positives: 956\n" + ] + }, + { + "data": { + "text/plain": [ + "<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f92f587fb80>" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfsAAAGwCAYAAACuFMx9AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/P9b71AAAACXBIWXMAAA9hAAAPYQGoP6dpAAA0cElEQVR4nO3de3QU9f3/8dcmITdyI2ASAiGAKBBBUFCMN7SmBKEK4q+WGjRSxFYDChQQKncEFC9QKIJXkH7BS1WoUKQiVi4SUIJQhBjkmggkqCGEBHPbnd8fyNotULPsJsvOPB/nzDnuzGdm39tyePN+fz4zYzMMwxAAADCtAF8HAAAA6hbJHgAAkyPZAwBgciR7AABMjmQPAIDJkewBADA5kj0AACYX5OsAPOFwOHTkyBFFRkbKZrP5OhwAgJsMw9DJkyeVmJiogIC6qz8rKipUVVXl8XWCg4MVGhrqhYjql18n+yNHjigpKcnXYQAAPFRQUKDmzZvXybUrKirUKjlChcfsHl8rISFBBw4c8LuE79fJPjIyUpJ0aFtLRUUwIwFzuuvyjr4OAagzNarWRq1y/n1eF6qqqlR4zK5DOS0VFXnhuaL0pEPJXQ6qqqqKZF+fzrTuoyICPPo/ELiYBdka+DoEoO78+MD2+piKjYi0KSLywr/HIf+dLvbrZA8AQG3ZDYfsHrwNxm44vBdMPSPZAwAswSFDDl14tvfkXF+j9w0AgMlR2QMALMEhhzxpxHt2tm+R7AEAlmA3DNmNC2/Fe3Kur9HGBwDA5KjsAQCWYOUFeiR7AIAlOGTIbtFkTxsfAACTo7IHAFgCbXwAAEyO1fgAAMC0qOwBAJbg+HHz5Hx/RbIHAFiC3cPV+J6c62skewCAJdgNefjWO+/FUt+YswcAwOSo7AEAlsCcPQAAJueQTXbZPDrfX9HGBwDA5KjsAQCW4DBOb56c769I9gAAS7B72Mb35Fxfo40PAIDJUdkDACzBypU9yR4AYAkOwyaH4cFqfA/O9TXa+AAAmByVPQDAEmjjAwBgcnYFyO5BQ9vuxVjqG8keAGAJhodz9gZz9gAA4GJFZQ8AsATm7AEAMDm7ESC74cGcvR8/Lpc2PgAAJkdlDwCwBIdscnhQ4zrkv6U9yR4AYAlWnrOnjQ8AgMlR2QMALMHzBXq08QEAuKidnrP34EU4tPEBAMDFisoeAGAJDg+fjc9qfAAALnLM2QMAYHIOBVj2Pnvm7AEAMDkqewCAJdgNm+wevKbWk3N9jWQPALAEu4cL9Oy08QEAwMWKyh4AYAkOI0AOD1bjO1iNDwDAxY02PgAAMC0qewCAJTjk2Yp6h/dCqXckewCAJXj+UB3/bYb7b+QAAKBWqOwBAJbg+bPx/bc+JtkDACzByu+zJ9kDACzBypW9/0YOAABqhcoeAGAJnj9Ux3/rY5I9AMASHIZNDk/us/fjt9757z9TAABArVDZAwAsweFhG9+fH6pDsgcAWILnb73z32Tvv5EDAIBaobIHAFiCXTbZPXgwjifn+hrJHgBgCbTxAQCAaVHZAwAswS7PWvF274VS70j2AABLsHIbn2QPALAEXoQDAABMi2QPALAE48f32V/oZrg532+32zV+/Hi1atVKYWFhuvTSSzV16lQZhvFTTI
ahCRMmqGnTpgoLC1NaWpq+/vprl+sUFxcrIyNDUVFRiomJ0aBBg1RWVuZWLCR7AIAlnGnje7K54+mnn9b8+fP1l7/8Rbm5uXr66ac1c+ZMzZ071zlm5syZmjNnjhYsWKAtW7aoYcOGSk9PV0VFhXNMRkaGdu3apTVr1mjlypVav369HnroIbdiYc4eAAA3lJaWunwOCQlRSEjIWeM2bdqkPn36qHfv3pKkli1b6o033tBnn30m6XRVP3v2bI0bN059+vSRJC1evFjx8fFavny5+vfvr9zcXK1evVqff/65unbtKkmaO3euevXqpWeffVaJiYm1ipnKHgBgCWdecevJJklJSUmKjo52bjNmzDjn911//fVau3at9uzZI0nasWOHNm7cqNtvv12SdODAARUWFiotLc15TnR0tLp166bs7GxJUnZ2tmJiYpyJXpLS0tIUEBCgLVu21Pq3U9kDACzB7uFb786cW1BQoKioKOf+c1X1kjRmzBiVlpaqXbt2CgwMlN1u17Rp05SRkSFJKiwslCTFx8e7nBcfH+88VlhYqLi4OJfjQUFBio2NdY6pDZI9AABuiIqKckn25/P2229ryZIlWrp0qa644gpt375dw4YNU2JiojIzM+sh0p+Q7AEAlvCfrfgLPd8do0aN0pgxY9S/f39JUseOHXXo0CHNmDFDmZmZSkhIkCQVFRWpadOmzvOKiorUuXNnSVJCQoKOHTvmct2amhoVFxc7z68N5uwBAJbgUIDHmztOnTqlgADXcwIDA+VwOCRJrVq1UkJCgtauXes8Xlpaqi1btig1NVWSlJqaqpKSEuXk5DjHfPzxx3I4HOrWrVutY6GyBwCgDtxxxx2aNm2aWrRooSuuuEJffPGFnn/+ef3ud7+TJNlsNg0bNkxPPvmkLrvsMrVq1Urjx49XYmKi+vbtK0lq3769evbsqcGDB2vBggWqrq7WkCFD1L9//1qvxJdI9gAAi7AbNtk9aOO7e+7cuXM1fvx4PfLIIzp27JgSExP1+9//XhMmTHCOGT16tMrLy/XQQw+ppKREN954o1avXq3Q0FDnmCVLlmjIkCG67bbbFBAQoLvvvltz5sxxKxab8Z+P8vEzpaWlio6O1vE9rRUVyYwEzCk9sbOvQwDqTI1RrU/0d504caJWi94uxJlc8fv1dyskosEFX6eyrFov3vxuncZaV6jsAQCWYHj41juDF+EAAICLFZU9AMAS7LLJ7ubLbP77fH9FsgcAWILDcP9e+f8+31/RxgcAwOSo7KFTZQF6fWZTbfogWiXfB+nSK37Qw1O/UdvOP0iSfigP0KvTmir7n9EqPR6khKQq9Rn0rX51//dnXcswpHEDWmvrv6I08dUDuv72E/X9c4BaaZxQrUFPHNE1t55USJhDRw6G6LnhSfr63+GSpH8e2XHO816e2lTvzI875zFc3BweLtDz5FxfI9lDs/6YpIN5oRo995Bi46v18buxGvObNnr5k6/UpGm1XpyUqO2fRmr03HzFJ1Vp27pIzR3bXI3jq5Wa7vqqx2UvXyKb/05rwSIiomv0/N+/1r83RWjcgNYq+T5QzVpXqexEoHNM/04pLudc84uTGv5cgTb+I7q+w4WXOGSTw4N5d0/O9bWL4p8p8+bNU8uWLRUaGqpu3bo53/WLulf5g00bV8XowXFH1fG6cjVrVaX7RhYqsWWlVi5uLEnavbWhfvnrYnW6vkwJSVXqNeB7tU75QXnbw12ute/LML374iUa8Xy+L34KUGv3ZB3Td0eC9dzwFsrbHq6ighBtWxepo4d+envZ8W8buGyp6Se049MIFeaf+w1nwMXM58n+rbfe0ogRIzRx4kRt27ZNnTp1Unp6+lkP/kfdsNttcthtCg5xuOwPCXVo12cRkqSUruXa/GG0vjvaQIYhbf80Qof3h6hL95PO8RWnbHoqK1lZ075RbFxNvf4GwF3X9SjVnh1heuLFg3rr37s078M83X7v2dNSZ8Q0qda1t5
Xqn2/G1mOU8LYzT9DzZPNXPk/2zz//vAYPHqyBAwcqJSVFCxYsUHh4uF577TVfh2YJ4REOte9SrqWzE/R9YZDsdmntu42Um9NQxUWnZ3keefKwWlxeoYwuV6h3cieNy2itrOnfqON15c7rvDipmVK6luv6nqXn+yrgotG0RZV+df/3OnIgRH+6t5VWvt5ED089rLRfF59z/C/vOa4fygK1cRUtfH92Zs7ek81f+XTOvqqqSjk5ORo7dqxzX0BAgNLS0pSdnX3W+MrKSlVWVjo/l5aSWLxh9NxDen5EC917dQcFBBpq0/GUbul73LlQ6e+vNdFXOeGavGi/4ppXaefmCM370+k5+6tvLlP2P6O0/dNIvfBhno9/CVA7tgDp63+HaeFTp18ruu/LcLVsV6He932vj/52dvWe3r9YHy+LUXWl//5lD2vzabL/7rvvZLfbFR8f77I/Pj5eX3311VnjZ8yYocmTJ9dXeJaR2LJKz763VxWnAlR+MkCN42s07ffJappcqcofbFr0VFNNePWguqWd/sdV65QK7d8VpncWxOnqm8u0/dNIHT0YrH7tOrpcd+rglurQrVzPvLvXFz8LOK/iY0E6tCfUZV/B1yG6sVfJWWM7XFumpDaVmv6H5HqKDnXFIQ/fZ+/HC/T8ajX+2LFjNWLECOfn0tJSJSUl+TAicwkNdyg03KGTJYHKWRelB8cdUU2NTTXVAQoIcH2aRECgIePHaf7fDCk6a77z979op99POqzretB9wcVn9+cNlXRppcu+Zq0rdexw8Flj039brD07wrR/d1h9hYc6Yni4Gt8g2V+YJk2aKDAwUEVFRS77i4qKlJCQcNb4kJAQhYSwEtbbtn4SKcOQki6t1OEDwXplajMltalQj998r6AG0pWpZXp5aqKCQw8rvnmV/p0doY/eidVDEw9LkmLjas65KC+uWbUSWlTV988BftZ7L12iWe9/rf5Di7R+RYzaXnVKvQYUa/ao5i7jwiPsuvmOE3ppclMfRQpvchgeVvZ+vEDPp8k+ODhYXbp00dq1a9W3b19JksPh0Nq1azVkyBBfhmYp5aWBWjijqb472kCRMXbd0KtEA8ccVdCPb4IcO/+gXpveVE8PaaGTJUGKa1alBx4/es6H6gD+YM+OcE0Z1EoDxx5VxvAiFRYEa8GERP1rWSOXcd37lEg2Q/9a3ujcFwL8hM/b+CNGjFBmZqa6du2qa6+9VrNnz1Z5ebkGDhzo69Aso/udJep+Z8l5j8fG1Wjk7AK3rvnPI9s9CwqoY1s+itKWj/73O8k/WNJYHyxpXE8Roa7xBD0f+s1vfqNvv/1WEyZMUGFhoTp37qzVq1eftWgPAABP0Mb3sSFDhtC2BwCgjlwUyR4AgLpm5Wfjk+wBAJZg5Ta+/642AAAAtUJlDwCwBCtX9iR7AIAlWDnZ08YHAMDkqOwBAJZg5cqeZA8AsARDnt0+Z/z8kIsWyR4AYAlWruyZswcAwOSo7AEAlmDlyp5kDwCwBCsne9r4AACYHJU9AMASrFzZk+wBAJZgGDYZHiRsT871Ndr4AACYHJU9AMASeJ89AAAmZ+U5e9r4AACYHJU9AMASrLxAj2QPALAEK7fxSfYAAEuwcmXPnD0AACZHZQ8AsATDwza+P1f2JHsAgCUYkgzDs/P9FW18AABMjsoeAGAJDtlk4wl6AACYF6vxAQCAaVHZAwAswWHYZOOhOgAAmJdheLga34+X49PGBwDA5KjsAQCWYOUFeiR7AIAlkOwBADA5Ky/QY84eAACTo7IHAFiClVfjk+wBAJZwOtl7MmfvxWDqGW18AABMjsoeAGAJrMYHAMDkDHn2Tno/7uLTxgcAwOyo7AEAlkAbHwAAs7NwH59kDwCwBg8re/lxZc+cPQAAJkdlDwCwBJ6gBwCAyVl5gR5tfAAATI7KHgBgDYbNs0V2flzZk+wBAJZg5Tl72vgAANSRw4cPa8CAAWrcuLHCwsLUsWNHbd261XncMA
xNmDBBTZs2VVhYmNLS0vT111+7XKO4uFgZGRmKiopSTEyMBg0apLKyMrfiINkDAKzB8MLmhuPHj+uGG25QgwYN9MEHH2j37t167rnn1KhRI+eYmTNnas6cOVqwYIG2bNmihg0bKj09XRUVFc4xGRkZ2rVrl9asWaOVK1dq/fr1euihh9yKhTY+AMAS6ns1/tNPP62kpCQtXLjQua9Vq1b/cT1Ds2fP1rhx49SnTx9J0uLFixUfH6/ly5erf//+ys3N1erVq/X555+ra9eukqS5c+eqV69eevbZZ5WYmFirWGqV7N9///1a/7g777yz1mMBAPA3paWlLp9DQkIUEhJy1rj3339f6enp+vWvf61169apWbNmeuSRRzR48GBJ0oEDB1RYWKi0tDTnOdHR0erWrZuys7PVv39/ZWdnKyYmxpnoJSktLU0BAQHasmWL7rrrrlrFXKtk37dv31pdzGazyW6312osAAD1zguL7JKSklw+T5w4UZMmTTpr3P79+zV//nyNGDFCf/rTn/T555/r0UcfVXBwsDIzM1VYWChJio+PdzkvPj7eeaywsFBxcXEux4OCghQbG+scUxu1SvYOh6PWFwQA4GLkrTZ+QUGBoqKinPvPVdVLp3Nn165dNX36dEnSVVddpS+//FILFixQZmbmBcdxITxaoPefCwgAALioeWmBXlRUlMt2vmTftGlTpaSkuOxr37698vPzJUkJCQmSpKKiIpcxRUVFzmMJCQk6duyYy/GamhoVFxc7x9SG28nebrdr6tSpatasmSIiIrR//35J0vjx4/Xqq6+6ezkAAEzphhtuUF5ensu+PXv2KDk5WdLpxXoJCQlau3at83hpaam2bNmi1NRUSVJqaqpKSkqUk5PjHPPxxx/L4XCoW7dutY7F7WQ/bdo0LVq0SDNnzlRwcLBzf4cOHfTKK6+4ezkAAOqJzQtb7Q0fPlybN2/W9OnTtXfvXi1dulQvvfSSsrKyTkdjs2nYsGF68skn9f7772vnzp26//77lZiY6Fwr1759e/Xs2VODBw/WZ599pk8//VRDhgxR//79a70SX7qAZL948WK99NJLysjIUGBgoHN/p06d9NVXX7l7OQAA6kc932d/zTXXaNmyZXrjjTfUoUMHTZ06VbNnz1ZGRoZzzOjRozV06FA99NBDuuaaa1RWVqbVq1crNDTUOWbJkiVq166dbrvtNvXq1Us33nijXnrpJbdicfs++8OHD6tNmzZn7Xc4HKqurnb3cgAAmNavfvUr/epXvzrvcZvNpilTpmjKlCnnHRMbG6ulS5d6FIfblX1KSoo2bNhw1v533nlHV111lUfBAABQZ+q5sr+YuF3ZT5gwQZmZmTp8+LAcDofee+895eXlafHixVq5cmVdxAgAgOcs/NY7tyv7Pn36aMWKFfroo4/UsGFDTZgwQbm5uVqxYoV++ctf1kWMAADAAxf0bPybbrpJa9as8XYsAADUGSu/4vaCX4SzdetW5ebmSjo9j9+lSxevBQUAgNd5Ou9upWT/zTff6Le//a0+/fRTxcTESJJKSkp0/fXX680331Tz5s29HSMAAPCA23P2Dz74oKqrq5Wbm6vi4mIVFxcrNzdXDodDDz74YF3ECACA584s0PNk81NuV/br1q3Tpk2b1LZtW+e+tm3bau7cubrpppu8GhwAAN5iM05vnpzvr9xO9klJSed8eI7dbnfr0X0AANQrC8/Zu93Gf+aZZzR06FBt3brVuW/r1q167LHH9Oyzz3o1OAAA4LlaVfaNGjWSzfbTXEV5ebm6deumoKDTp9fU1CgoKEi/+93vnA/vBwDgomLhh+rUKtnPnj27jsMAAKCOWbiNX6tkn5mZWddxAACAOnLBD9WRpIqKClVVVbnsi4qK8iggAADqhIUre7cX6JWXl2vIkCGKi4tTw4YN1ahRI5cNAICLkoXfeud2sh89erQ+/vhjzZ8/XyEhIXrllVc0efJkJSYmavHixXURIwAA8I
DbbfwVK1Zo8eLFuuWWWzRw4EDddNNNatOmjZKTk7VkyRJlZGTURZwAAHjGwqvx3a7si4uL1bp1a0mn5+eLi4slSTfeeKPWr1/v3egAAPCSM0/Q82TzV24n+9atW+vAgQOSpHbt2untt9+WdLriP/NiHAAAcPFwO9kPHDhQO3bskCSNGTNG8+bNU2hoqIYPH65Ro0Z5PUAAALzCwgv03J6zHz58uPO/09LS9NVXXyknJ0dt2rTRlVde6dXgAACA5zy6z16SkpOTlZyc7I1YAACoMzZ5+NY7r0VS/2qV7OfMmVPrCz766KMXHAwAAPC+WiX7WbNm1epiNpvNJ8n+7s7XKsgWXO/fC9SHNws+8nUIQJ05edKhVu3r6cssfOtdrZL9mdX3AAD4LR6XCwAAzMrjBXoAAPgFC1f2JHsAgCV4+hQ8Sz1BDwAA+BcqewCANVi4jX9Blf2GDRs0YMAApaam6vDhw5Kkv/71r9q4caNXgwMAwGss/Lhct5P9u+++q/T0dIWFhemLL75QZWWlJOnEiROaPn261wMEAACecTvZP/nkk1qwYIFefvllNWjQwLn/hhtu0LZt27waHAAA3mLlV9y6PWefl5enm2+++az90dHRKikp8UZMAAB4n4WfoOd2ZZ+QkKC9e/eetX/jxo1q3bq1V4ICAMDrmLOvvcGDB+uxxx7Tli1bZLPZdOTIES1ZskQjR47Uww8/XBcxAgAAD7jdxh8zZowcDoduu+02nTp1SjfffLNCQkI0cuRIDR06tC5iBADAY1Z+qI7byd5ms+mJJ57QqFGjtHfvXpWVlSklJUURERF1ER8AAN5h4fvsL/ihOsHBwUpJSfFmLAAAoA64nexvvfVW2WznX5H48ccfexQQAAB1wtPb56xU2Xfu3Nnlc3V1tbZv364vv/xSmZmZ3ooLAADvoo1fe7NmzTrn/kmTJqmsrMzjgAAAgHd57a13AwYM0GuvveatywEA4F0Wvs/ea2+9y87OVmhoqLcuBwCAV3HrnRv69evn8tkwDB09elRbt27V+PHjvRYYAADwDreTfXR0tMvngIAAtW3bVlOmTFGPHj28FhgAAPAOt5K93W7XwIED1bFjRzVq1KiuYgIAwPssvBrfrQV6gYGB6tGjB2+3AwD4HSu/4tbt1fgdOnTQ/v376yIWAABQB9xO9k8++aRGjhyplStX6ujRoyotLXXZAAC4aFnwtjvJjTn7KVOm6I9//KN69eolSbrzzjtdHptrGIZsNpvsdrv3owQAwFMWnrOvdbKfPHmy/vCHP+hf//pXXcYDAAC8rNbJ3jBO/5Ome/fudRYMAAB1hYfq1NL/etsdAAAXNdr4tXP55Zf/bMIvLi72KCAAAOBdbiX7yZMnn/UEPQAA/AFt/Frq37+/4uLi6ioWAADqjoXb+LW+z575egAA/JPbq/EBAPBLFq7sa53sHQ5HXcYBAECdYs4eAACzs3Bl7/az8QEAgH+hsgcAWIOFK3uSPQDAEqw8Z08bHwAAk6OyBwBYA218AADMjTY+AAAwLSp7AIA10MYHAMDkLJzsaeMDAFDHnnrqKdlsNg0bNsy5r6KiQllZWWrcuLEiIiJ09913q6ioyOW8/Px89e7dW+Hh4YqLi9OoUaNUU1Pj9veT7AEAlmDzwnYhPv/8c7344ou68sorXfYPHz5cK1as0N/+9jetW7dOR44cUb9+/ZzH7Xa7evfuraqqKm3atEmvv/66Fi1apAkTJrgdA8keAGANhhc2N5WVlSkjI0Mvv/yyGjVq5Nx/4sQJvfrqq3r++ef1i1/8Ql26dNHChQu1adMmbd68WZL04Ycfavfu3fq///s/de7cWbfffrumTp2qefPmqaqqyq04SPYAAEs4c+udJ5sklZaWumyVlZXn/c6srCz17t1baWlpLvtzcnJUXV3tsr9du3Zq0aKFsrOzJUnZ2dnq2LGj4uPjnWPS09NVWlqqXbt2ufXbSfYAALghKSlJ0dHRzm3GjBnnHP
fmm29q27Zt5zxeWFio4OBgxcTEuOyPj49XYWGhc8x/Jvozx88ccwer8QEA1uCl1fgFBQWKiopy7g4JCTlraEFBgR577DGtWbNGoaGhHnypd1DZAwCswwvz9VFRUS7buZJ9Tk6Ojh07pquvvlpBQUEKCgrSunXrNGfOHAUFBSk+Pl5VVVUqKSlxOa+oqEgJCQmSpISEhLNW55/5fGZMbZHsAQDwsttuu007d+7U9u3bnVvXrl2VkZHh/O8GDRpo7dq1znPy8vKUn5+v1NRUSVJqaqp27typY8eOOcesWbNGUVFRSklJcSse2vgAAEuoz2fjR0ZGqkOHDi77GjZsqMaNGzv3Dxo0SCNGjFBsbKyioqI0dOhQpaam6rrrrpMk9ejRQykpKbrvvvs0c+ZMFRYWaty4ccrKyjpnN+F/IdkDAKzhInuC3qxZsxQQEKC7775blZWVSk9P1wsvvOA8HhgYqJUrV+rhhx9WamqqGjZsqMzMTE2ZMsXt7yLZAwBQDz755BOXz6GhoZo3b57mzZt33nOSk5O1atUqj7+bZA8AsAQrv+KWZA8AsIaLrI1fn1iNDwCAyVHZAwAsgTY+AABmZ+E2PskeAGANFk72zNkDAGByVPYAAEtgzh4AALOjjQ8AAMyKyh4AYAk2w5DNuPDy3JNzfY1kDwCwBtr4AADArKjsAQCWwGp8AADMjjY+AAAwKyp7AIAl0MYHAMDsLNzGJ9kDACzBypU9c/YAAJgclT0AwBpo4wMAYH7+3Ir3BG18AABMjsoeAGANhnF68+R8P0WyBwBYAqvxAQCAaVHZAwCsgdX4AACYm81xevPkfH9FGx8AAJOjssc5hTW06/5h+UrtUayYxtXat7uhXpzaSnt2RkiSMh4tUPfe3+mSplWqrrZp75cRev35JOXtiPRx5MDZfigL0NvPttDnqxvrxHdBatmhXA9MOqhLO5dJkl4Y3kbr34lzOadT9+Ma+3+5Lvu2rW2kd2c3V35uuIJDDbXvdkIjX82rt98BD9HGB1w9Nn2fWl5+Ss+OvEzfH2ugX/T5TtMX79bve3bS90UhOnwgVC9MbqXCglAFhzp018CjmrYoV4Nuu0onihv4OnzAxYuj2uibPeHKmv21GsVXacOyS/TkvSl6bu12xTatkiR1uuW4Hn5ur/OcoGDXnu2WVbF6afSl6v94vq644YQcNTYV5IXX6++AZ1iN7yPr16/XHXfcocTERNlsNi1fvtyX4eBHwSF23Zj+vV59Ollffh6lo4fCtGROko4cClXve4skSZ+suETbN8WosCBU+V+H6+XpyWoYaVertqd8HD3gquqHAH32QWPd+6dDan9dqRJaVejXIwqU0LJCa/4a7xzXINihmLhq5xYRY3ces9dIr09spYxxh/TL+4qU2LpCzS//Qal3fO+Ln4QLdeY+e082P+XTZF9eXq5OnTpp3rx5vgwD/yUw6PRWXen6x6OqIkBXdD151vigBg7d/ptjKisN1P6vqHRwcbHbJYfdpgYhrpV6cKhDX30e5fy8e3O0Hup8jYZ3v0qvjG2tk8d/anwe2Bmh4sIQBdikMT2v1B+6dNWM+9qrgD/v8BM+bePffvvtuv3222s9vrKyUpWVlc7PpaWldRGW5f1QHqjd2yL02yHfKH9fmEq+a6Dud3yndled1NFDoc5x1956XGNm71FImEPFxxroicwUlR6nhY+LS1iEQ5d1KdV7f26uZm1OKeaSan369ybakxOphJYVkqTOtxzXtbd/r7ikShUdCtWbM1voqfvaa+rfdyogUDqWHyJJemdWku6bcECXNK/UypcSNeWeKzRr3ReKaFTjy5+IWqKN7ydmzJih6Oho55aUlOTrkEzr2ZGXyWYztGRTjt7fvVl97j+qdSubyOGwOcfs2BylrDuv1B/v6aCcDTEaO2ePomOrfRg1cG5Zs7+WDOmRa67RgEtTtfq1prqhz3eyBZz+2/v6Pt+ra4/jatH+lK7pWazRC3O1b0ekdm
VHS5Lzz33fod+oW69itb6y/PT8vk3a/I/GPvtdcJPhhc1P+dUCvbFjx2rEiBHOz6WlpST8OnI0P1Sj7+2gkDC7wiPsOv5tsMb8eY8KC0KcYyp/CNTRQ2E6ekj6anukXvnoC6Xfc0xvL2jmw8iBsyW0rNTEd3ap4lSAfjgZqEbx1Zr98OWKb1F5zvHxyZWKjK1W0cFQdbzxhBrFn17E1/yyn9akNAgxFNeiQt8dDjnnNYCLiV9V9iEhIYqKinLZULcqfwjU8W+DFRFVoy43lWjzR7HnHRsQYKhBsB8/dQKmFxruUKP4apWVBOrf62PUpUfxOcd9fzRYZceDFBN3Osm36liuBiEOHdkf5hxTU23Td9+EqEnzc/+DARefM218TzZ/5VeVPerP1TeVyGYz9M3+MCUmV2jQ44f0zf4wffjuJQoJs6v/I4e1ZW0jFR8LVlSjat0xoFCN46u04QNamrj47PgkRoYhJV76gwoPhmrJtJZKvPQH3XLPMVWUB+idWUnq1ut7RV9SraJDoVo6PVnxLSvUqXuJJCk80q60AYV657kkNW5aqUuaV2rFjx2s63p/58NfBrfw1jvAVcPIGg0cma8mCVU6WRKkjf+M1evPtZC9JkABAQ4ltf5BaXcdU3RsjUqPB2nPzgiN6t9B+V+zOhkXn1MnA/XGU8kqLgxWREyNrr39e/Ufna+gBoYcNTbl54Zr/TtxKi8NVKP4Kl15c4nuGVmgBiE//eWe8cQhBQQaemHYZaqqCFCbq8o07s1dLrfoARcrnyb7srIy7d3700MsDhw4oO3btys2NlYtWrTwYWTYsKqJNqxqcs5j1VUBejKrbT1HBFy41Du+P+898cFhDv1pSe45j/2noAaG7ht/SPeNP+Tt8FBPrLwa36fJfuvWrbr11ludn88svsvMzNSiRYt8FBUAwJR4XK5v3HLLLTL8eA4EAAB/wJw9AMASaOMDAGB2DuP05sn5fopkDwCwBgvP2fvVQ3UAAID7qOwBAJZgk4dz9l6LpP6R7AEA1mDhJ+jRxgcAwOSo7AEAlsCtdwAAmB2r8QEAgFlR2QMALMFmGLJ5sMjOk3N9jWQPALAGx4+bJ+f7Kdr4AACYHJU9AMASaOMDAGB2Fl6NT7IHAFgDT9ADAABmRWUPALAEnqAHAIDZ0cYHAABmRWUPALAEm+P05sn5/opkDwCwBtr4AADArKjsAQDWwEN1AAAwNys/Lpc2PgAAJkdlDwCwBhboAQBgcoZ+eqf9hWxu5voZM2bommuuUWRkpOLi4tS3b1/l5eW5jKmoqFBWVpYaN26siIgI3X333SoqKnIZk5+fr969eys8PFxxcXEaNWqUampq3IqFZA8AsIQzc/aebO5Yt26dsrKytHnzZq1Zs0bV1dXq0aOHysvLnWOGDx+uFStW6G9/+5vWrVunI0eOqF+/fs7jdrtdvXv3VlVVlTZt2qTXX39dixYt0oQJE9yKhTY+AAB1YPXq1S6fFy1apLi4OOXk5Ojmm2/WiRMn9Oqrr2rp0qX6xS9+IUlauHCh2rdvr82bN+u6667Thx9+qN27d+ujjz5SfHy8OnfurKlTp+rxxx/XpEmTFBwcXKtYqOwBANZg6Kd5+wvaTl+mtLTUZausrKzV1584cUKSFBsbK0nKyclRdXW10tLSnGPatWunFi1aKDs7W5KUnZ2tjh07Kj4+3jkmPT1dpaWl2rVrV61/OskeAGANHiX6nxb3JSUlKTo62rnNmDHjZ7/a4XBo2LBhuuGGG9ShQwdJUmFhoYKDgxUTE+MyNj4+XoWFhc4x/5nozxw/c6y2aOMDAOCGgoICRUVFOT+HhIT87DlZWVn68ssvtXHjxroM7bxI9gAAa3BIsnl4vqSoqCiXZP9zhgwZopUrV2r9+vVq3ry5c39CQoKqqqpUUlLiUt0XFRUpISHBOeazzz5zud6Z1fpnxtQGbXwAgCXU92p8wzA0ZMgQLVu2TB9//LFatWrlcrxLly5q0KCB1q
5d69yXl5en/Px8paamSpJSU1O1c+dOHTt2zDlmzZo1ioqKUkpKSq1jobIHAKAOZGVlaenSpfr73/+uyMhI5xx7dHS0wsLCFB0drUGDBmnEiBGKjY1VVFSUhg4dqtTUVF133XWSpB49eiglJUX33XefZs6cqcLCQo0bN05ZWVm1mj44g2QPALCGen6C3vz58yVJt9xyi8v+hQsX6oEHHpAkzZo1SwEBAbr77rtVWVmp9PR0vfDCC86xgYGBWrlypR5++GGlpqaqYcOGyszM1JQpU9yKhWQPALCGek72Ri3Gh4aGat68eZo3b955xyQnJ2vVqlVuffd/Y84eAACTo7IHAFiDhV+EQ7IHAFiDl26980ckewCAJVzI7XP/fb6/Ys4eAACTo7IHAFgDc/YAAJicw5BsHiRsh/8me9r4AACYHJU9AMAaaOMDAGB2HiZ7+W+yp40PAIDJUdkDAKyBNj4AACbnMORRK57V+AAA4GJFZQ8AsAbDcXrz5Hw/RbIHAFgDc/YAAJgcc/YAAMCsqOwBANZAGx8AAJMz5GGy91ok9Y42PgAAJkdlDwCwBtr4AACYnMMhyYN75R3+e589bXwAAEyOyh4AYA208QEAMDkLJ3va+AAAmByVPQDAGiz8uFySPQDAEgzDIcODN9d5cq6vkewBANZgGJ5V58zZAwCAixWVPQDAGgwP5+z9uLIn2QMArMHhkGwezLv78Zw9bXwAAEyOyh4AYA208QEAMDfD4ZDhQRvfn2+9o40PAIDJUdkDAKyBNj4AACbnMCSbNZM9bXwAAEyOyh4AYA2GIcmT++z9t7In2QMALMFwGDI8aOMbJHsAAC5yhkOeVfbcegcAAC5SVPYAAEugjQ8AgNlZuI3v18n+zL+yaoxqH0cC1J2TJ/33Lxjg55wsO/3nuz6q5hpVe/RMnRr5b67x62R/8uRJSdL6H971cSRA3WnV3tcRAHXv5MmTio6OrpNrBwcHKyEhQRsLV3l8rYSEBAUHB3shqvplM/x4EsLhcOjIkSOKjIyUzWbzdTiWUFpaqqSkJBUUFCgqKsrX4QBexZ/v+mcYhk6ePKnExEQFBNTdmvGKigpVVVV5fJ3g4GCFhoZ6IaL65deVfUBAgJo3b+7rMCwpKiqKvwxhWvz5rl91VdH/p9DQUL9M0t7CrXcAAJgcyR4AAJMj2cMtISEhmjhxokJCQnwdCuB1/PmGWfn1Aj0AAPDzqOwBADA5kj0AACZHsgcAwORI9gAAmBzJHrU2b948tWzZUqGhoerWrZs+++wzX4cEeMX69et1xx13KDExUTabTcuXL/d1SIBXkexRK2+99ZZGjBihiRMnatu2berUqZPS09N17NgxX4cGeKy8vFydOnXSvHnzfB0KUCe49Q610q1bN11zzTX6y1/+Iun0ewmSkpI0dOhQjRkzxsfRAd5js9m0bNky9e3b19ehAF5DZY+fVVVVpZycHKWlpTn3BQQEKC0tTdnZ2T6MDABQGyR7/KzvvvtOdrtd8fHxLvvj4+NVWFjoo6gAALVFsgcAwORI9vhZTZo0UWBgoIqKilz2FxUVKSEhwUdRAQBqi2SPnxUcHKwuXbpo7dq1zn0Oh0Nr165VamqqDyMDANRGkK8DgH8YMWKEMjMz1bVrV1177bWaPXu2ysvLNXDgQF+HBnisrKxMe/fudX4+cOCAtm/frtjYWLVo0cKHkQHewa13qLW//OUveuaZZ1RYWKjOnTtrzpw56tatm6/DAjz2ySef6NZbbz1rf2ZmphYtWlT/AQFeRrIHAMDkmLMHAMDkSPYAAJgcyR4AAJMj2QMAYHIkewAATI5kDwCAyZHsAQAwOZI9AAAmR7IHPPTAAw+ob9++zs+33HKLhg0bVu9xfPLJJ7LZbCopKTnvGJvNpuXLl9f6mpMmTVLnzp09iuvgwYOy2Wzavn27R9cBcOFI9jClBx54QDabTTabTcHBwWrTpo2mTJmimpqaOv/u9957T1OnTq3V2NokaA
DwFC/CgWn17NlTCxcuVGVlpVatWqWsrCw1aNBAY8eOPWtsVVWVgoODvfK9sbGxXrkOAHgLlT1MKyQkRAkJCUpOTtbDDz+stLQ0vf/++5J+ar1PmzZNiYmJatu2rSSpoKBA99xzj2JiYhQbG6s+ffro4MGDzmva7XaNGDFCMTExaty4sUaPHq3/fr3Ef7fxKysr9fjjjyspKUkhISFq06aNXn31VR08eND58pVGjRrJZrPpgQcekHT6FcIzZsxQq1atFBYWpk6dOumdd95x+Z5Vq1bp8ssvV1hYmG699VaXOGvr8ccf1+WXX67w8HC1bt1a48ePV3V19VnjXnzxRSUlJSk8PFz33HOPTpw44XL8lVdeUfv27RUaGqp27drphRdecDsWAHWHZA/LCAsLU1VVlfPz2rVrlZeXpzVr1mjlypWqrq5Wenq6IiMjtWHDBn366aeKiIhQz549nec999xzWrRokV577TVt3LhRxcXFWrZs2f/83vvvv19vvPGG5syZo9zcXL344ouKiIhQUlKS3n33XUlSXl6ejh49qj//+c+SpBkzZmjx4sVasGCBdu3apeHDh2vAgAFat26dpNP/KOnXr5/uuOMObd++XQ8++KDGjBnj9v8mkZGRWrRokXbv3q0///nPevnllzVr1iyXMXv37tXbb7+tFStWaPXq1friiy/0yCOPOI8vWbJEEyZM0LRp05Sbm6vp06dr/Pjxev31192OB0AdMQATyszMNPr06WMYhmE4HA5jzZo1RkhIiDFy5Ejn8fj4eKOystJ5zl//+lejbdu2hsPhcO6rrKw0wsLCjH/+85+GYRhG06ZNjZkzZzqPV1dXG82bN3d+l2EYRvfu3Y3HHnvMMAzDyMvLMyQZa9asOWec//rXvwxJxvHjx537KioqjPDwcGPTpk0uYwcNGmT89re/NQzDMMaOHWukpKS4HH/88cfPutZ/k2QsW7bsvMefeeYZo0uXLs7PEydONAIDA41vvvnGue+DDz4wAgICjKNHjxqGYRiXXnqpsXTpUpfrTJ061UhNTTUMwzAOHDhgSDK++OKL834vgLrFnD1Ma+XKlYqIiFB1dbUcDofuvfdeTZo0yXm8Y8eOLvP0O3bs0N69exUZGelynYqKCu3bt08nTpzQ0aNH1a1bN+exoKAgde3a9axW/hnbt29XYGCgunfvXuu49+7dq1OnTumXv/yly/6qqipdddVVkqTc3FyXOCQpNTW11t9xxltvvaU5c+Zo3759KisrU01NjaKiolzGtGjRQs2aNXP5HofDoby8PEVGRmrfvn0aNGiQBg8e7BxTU1Oj6Ohot+MBUDdI9jCtW2+9VfPnz1dwcLASExMVFOT6x71hw4Yun8vKytSlSxctWbLkrGtdcsklFxRDWFiY2+eUlZVJkv7xj3+4JFnp9DoEb8nOzlZGRoYmT56s9PR0RUdH680339Rzzz3ndqwvv/zyWf/4CAwM9FqsADxDsodpNWzYUG3atKn1+KuvvlpvvfWW4uLizqpuz2jatKm2bNmim2++WdLpCjYnJ0dXX331Ocd37NhRDodD69atU1pa2lnHz3QW7Ha7c19KSopCQkKUn59/3o5A+/btnYsNz9i8efPP/8j/sGnTJiUnJ+uJJ55w7jt06NBZ4/Lz83XkyBElJiY6vycgIEBt27ZVfHy8EhMTtX//fmVkZLj1/QDqDwv0gB9lZGSoSZMm6tOnjzZs2KADBw7ok08+0aOPPqpvvvlGkvTYY4/pqaee0vLly/XVV1/pkUce+Z/3yLds2VKZmZn63e9+p+XLlzuv+fbbb0uSkpOTZbPZtHLlSn377bcqKytTZGSkRo4cqeHDh+v111/Xvn37tG3bNs2dO9e56O0Pf/iDvv76a40aNUp5eXlaunSpFi1a5Nbvveyyy5Sfn68333xT+/bt05w5c8652DA0NFSZmZnasWOHNmzYoEcffVT33HOPEhISJEmTJ0/WjBkzNGfOHO3Zs0c7d+
# %% Held-out evaluation of the selected classifier
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 together with overall accuracy.
print(classification_report(y_test, y_pred))
print('-'*40)

# Flatten the confusion matrix and unpack its four cells for the text summary.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print('True negatives: ', tn, '\nFalse positives: ', fp, '\nFalse negatives: ', fn, '\nTrue Positives: ', tp)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

# %% [markdown]
# ## Training Result
#
# So, we have about 92% accuracy on the held-out split using logistic regression.
#
# Let's do some cross-validation to see result stability.
# %% 5-fold cross-validation of the selected classifier
from sklearn.model_selection import cross_val_score

# The folds are scored with macro-averaged F1, so the report names that metric
# (the original message labelled these numbers "accuracy").
scores = cross_val_score(clf, X, y, cv=5, scoring='f1_macro')
print("5 fold cross-validation macro-F1 [%0.2f%%] with std deviation [%0.2f%%]" % (100*scores.mean(), 100*scores.std()))

# %% [markdown]
# We see the result is stable under cross-validation. This was expected given that the target is evenly split between positive and negative samples.
#
# ## Test Result
#
# Let's predict the outcome for our test data using the chosen classifier.

# %% Load the unlabelled test set
test_raw_data = pd.read_csv('TestingDataBinary.csv')

# %%
test_raw_data.info()

# %% Names of test columns that contain missing values
test_raw_data.columns[test_raw_data.isnull().any()].tolist()
+ "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>R1-PA1:VH</th>\n", + " <th>R1-PM1:V</th>\n", + " <th>R1-PA2:VH</th>\n", + " <th>R1-PM2:V</th>\n", + " <th>R1-PA3:VH</th>\n", + " <th>R1-PM3:V</th>\n", + " <th>R1-PA4:IH</th>\n", + " <th>R1-PM4:I</th>\n", + " <th>R1-PA5:IH</th>\n", + " <th>R1-PM5:I</th>\n", + " <th>...</th>\n", + " <th>control_panel_log3</th>\n", + " <th>control_panel_log4</th>\n", + " <th>relay1_log</th>\n", + " <th>relay2_log</th>\n", + " <th>relay3_log</th>\n", + " <th>relay4_log</th>\n", + " <th>snort_log1</th>\n", + " <th>snort_log2</th>\n", + " <th>snort_log3</th>\n", + " <th>snort_log4</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>-130.794806</td>\n", + " <td>130330.8575</td>\n", + " <td>109.205756</td>\n", + " <td>129704.0257</td>\n", + " <td>-10.771607</td>\n", + " <td>130381.0040</td>\n", + " <td>-131.052637</td>\n", + " <td>442.39376</td>\n", + " <td>105.573203</td>\n", + " <td>460.33854</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>43</th>\n", + " <td>95.557901</td>\n", + " <td>130029.9782</td>\n", + " <td>-24.425191</td>\n", + " <td>129979.8317</td>\n", + " <td>-144.431201</td>\n", + " <td>130080.1248</td>\n", + " <td>95.122453</td>\n", + " <td>455.02835</td>\n", + " <td>-28.556217</td>\n", + " <td>473.15624</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " 
<td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>12.049302</td>\n", + " <td>131810.1804</td>\n", + " <td>-107.922330</td>\n", + " <td>131183.3486</td>\n", + " <td>132.066772</td>\n", + " <td>131860.3269</td>\n", + " <td>9.986654</td>\n", + " <td>370.79775</td>\n", + " <td>-109.899035</td>\n", + " <td>370.61464</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>64</th>\n", + " <td>115.393700</td>\n", + " <td>131785.1071</td>\n", + " <td>-4.577933</td>\n", + " <td>131760.0339</td>\n", + " <td>-124.583943</td>\n", + " <td>131885.4002</td>\n", + " <td>113.669097</td>\n", + " <td>357.98005</td>\n", + " <td>-6.566096</td>\n", + " <td>358.52938</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>71</th>\n", + " <td>-169.652803</td>\n", + " <td>132537.3052</td>\n", + " <td>70.376406</td>\n", + " <td>132487.1587</td>\n", + " <td>-49.635334</td>\n", + " <td>132612.5250</td>\n", + " <td>-165.361349</td>\n", + " <td>261.29797</td>\n", + " <td>71.740046</td>\n", + " <td>269.35481</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 128 columns</p>\n", + "</div>" + ], + "text/plain": [ + " R1-PA1:VH R1-PM1:V R1-PA2:VH R1-PM2:V R1-PA3:VH R1-PM3:V \\\n", + "23 -130.794806 130330.8575 109.205756 129704.0257 -10.771607 130381.0040 \n", + "43 
# %% Peek at a few test rows
# random_state pins the sample so the displayed rows are the same on every
# Restart & Run All (10 matches the seed used for the train/test split).
test_raw_data.sample(n=5, random_state=10)
# %% One-hot encode the R*:S columns of the test set
# Keep an untouched copy of the raw test frame before deriving features from it.
results_df = test_raw_data.copy()

# Cast to int and then to category so each distinct value gets its own dummy
# column, named "<column>_<value>".
test_RS_frame = pd.get_dummies(
    test_raw_data[column_names_RS].astype('int').astype('category'),
    dtype=int,
)
# Seeded so the preview rows do not change between runs.
test_RS_frame.sample(3, random_state=10)

# %% [markdown]
# ### Comment
# We notice that the test data does not have all the categorical values of the RS features that appear in the training set. For numerical analysis let's add the missing dummy columns by hand.
+ ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "2e3ae9b9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>R1:S_0</th>\n", + " <th>R1:S_2048</th>\n", + " <th>R1:S_2058</th>\n", + " <th>R1:S_270336</th>\n", + " <th>R2:S_0</th>\n", + " <th>R2:S_270336</th>\n", + " <th>R3:S_0</th>\n", + " <th>R3:S_2048</th>\n", + " <th>R3:S_270336</th>\n", + " <th>R4:S_0</th>\n", + " <th>R4:S_2058</th>\n", + " <th>R4:S_270336</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>79</th>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>92</th>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " R1:S_0 R1:S_2048 R1:S_2058 R1:S_270336 R2:S_0 R2:S_270336 R3:S_0 \\\n", + "79 1 0 0 0 1 0 1 \n", + "92 1 0 0 0 1 0 1 \n", + "\n", + " R3:S_2048 R3:S_270336 R4:S_0 R4:S_2058 R4:S_270336 \n", + "79 0 0 1 0 0 \n", + "92 0 0 1 0 0 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [
+ "# Align the test dummies with the training dummies: categories that never\n", + "# occur in the test set become all-zero columns, dummy columns not present\n", + "# in training are dropped, and the column order follows RS_frame.\n", + "test_RS_frame = test_RS_frame.reindex(columns=RS_frame.columns, fill_value=0)\n", + "test_RS_frame.sample(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "890ce249", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 100 entries, 0 to 99\n", + "Columns: 124 entries, R1-PA1:VH to snort_log4\n", + "dtypes: float64(112), int64(12)\n", + "memory usage: 97.0 KB\n" + ] + } + ], + "source": [ + "#let's drop RS columns\n", + "# errors='ignore' lets this cell be re-run after the columns are already gone\n", + "test_raw_data = test_raw_data.drop(columns=column_names_RS, errors='ignore')\n", + "test_raw_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "24da31bd", + "metadata": {}, + "outputs": [], + "source": [ + "#let's deal with features requiring normalization\n", + "# the last 12 columns are the log flags (see test_df2) and are left unscaled;\n", + "# replace() returns a new frame, so no slice of test_raw_data is mutated in place\n", + "unscaled_test_input = test_raw_data.iloc[:, :-12].replace([np.inf, -np.inf], np.nan)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "d17cd7e4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(100, 112)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scaled_test_input = scaler.transform(unscaled_test_input)\n", + "scaled_test_input.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "33845d19", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 100 entries, 0 to 99\n", + "Columns: 112 entries, R1-PA1:VH to R4-PA:ZH\n", + "dtypes: float64(112)\n", + "memory usage: 87.6 KB\n" + ] + } + ], + "source": [ + "# NOTE(review): NaNs left after scaling are filled with 1 - confirm this matches the training pipeline\n", + "test_df1_scaled = pd.DataFrame(\n", + "    scaled_test_input, index=unscaled_test_input.index,\n", + "    columns=column_names_updated[:scaled_test_input.shape[1]],\n", + ").fillna(1)\n", + "test_df1_scaled.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "75c977ec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 100 entries, 0 to 99\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 control_panel_log1 100 non-null int64\n", + " 1 control_panel_log2 100 non-null int64\n", + " 2 control_panel_log3 100 non-null int64\n", + " 3 control_panel_log4 100 non-null int64\n", + " 4 relay1_log 100 non-null int64\n", + " 5 relay2_log 100 non-null int64\n", + " 6 relay3_log 100 non-null int64\n", + " 7 relay4_log 100 non-null int64\n", + " 8 snort_log1 100 non-null int64\n", + " 9 snort_log2 100 non-null int64\n", + " 10 snort_log3 100 non-null int64\n", + " 11 snort_log4 100 non-null int64\n", + "dtypes: int64(12)\n", + "memory usage: 9.5 KB\n" + ] + } + ], + "source": [ + "test_df2 = test_raw_data.iloc[:,-12:]\n", + "test_df2.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "4f176a75", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 100 entries, 0 to 99\n", + "Columns: 136 entries, R1-PA1:VH to R4:S_270336\n", + "dtypes: float64(112), int64(24)\n", + "memory usage: 106.4 KB\n" + ] + } + ], + "source": [ + "test_X = pd.concat([test_df1_scaled, test_df2, test_RS_frame], axis=1)\n", + "test_X.info()" + ] + },
+ { + "cell_type": "code", + "execution_count": 42, + "id": "7aacce46", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0,\n", + " 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results = clf.predict(test_X)\n", + "results" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "c627bdcf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 100 entries, 0 to 99\n", + "Columns: 129 entries, R1-PA1:VH to Result\n", + "dtypes: float64(112), int64(17)\n", + "memory usage: 100.9 KB\n" + ] + } + ], + "source": [ + "results_df['Result'] = results  # predict() output is already 1-D (one label per test row); no transpose needed\n", + "results_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "0ea24283", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>R1-PA1:VH</th>\n", + " <th>R1-PM1:V</th>\n", + " <th>R1-PA2:VH</th>\n", + " <th>R1-PM2:V</th>\n", + " <th>R1-PA3:VH</th>\n", + " <th>R1-PM3:V</th>\n", + " <th>R1-PA4:IH</th>\n", + " <th>R1-PM4:I</th>\n", + " <th>R1-PA5:IH</th>\n", + " <th>R1-PM5:I</th>\n", + " <th>...</th>\n", + " <th>control_panel_log4</th>\n", + " <th>relay1_log</th>\n", + " <th>relay2_log</th>\n", + " <th>relay3_log</th>\n", + " <th>relay4_log</th>\n", + " <th>snort_log1</th>\n", + " <th>snort_log2</th>\n", + " <th>snort_log3</th>\n", + " <th>snort_log4</th>\n", + " <th>Result</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>40</th>\n", + " 
<td>97.448662</td>\n", + " <td>130656.8100</td>\n", + " <td>-22.528701</td>\n", + " <td>130631.7367</td>\n", + " <td>-142.540440</td>\n", + " <td>130732.0298</td>\n", + " <td>98.439879</td>\n", + " <td>444.59108</td>\n", + " <td>-26.911828</td>\n", + " <td>471.32514</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>41</th>\n", + " <td>96.961648</td>\n", + " <td>130330.8575</td>\n", + " <td>-23.021444</td>\n", + " <td>130305.7842</td>\n", + " <td>-143.010266</td>\n", + " <td>130406.0773</td>\n", + " <td>97.695034</td>\n", + " <td>446.42218</td>\n", + " <td>-27.198307</td>\n", + " <td>471.69136</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>63</th>\n", + " <td>115.416618</td>\n", + " <td>131785.1071</td>\n", + " <td>-4.555014</td>\n", + " <td>131760.0339</td>\n", + " <td>-124.566754</td>\n", + " <td>131885.4002</td>\n", + " <td>113.697745</td>\n", + " <td>358.34627</td>\n", + " <td>-6.531719</td>\n", + " <td>358.71249</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>39</th>\n", + " <td>99.012837</td>\n", + " <td>131960.6200</td>\n", + " <td>-20.958796</td>\n", + " <td>131910.4735</td>\n", + " <td>-140.970536</td>\n", + " <td>132035.8398</td>\n", + " <td>100.662955</td>\n", + " <td>436.53424</td>\n", + " <td>-25.238791</td>\n", + " <td>467.11361</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " 
<td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>11.946170</td>\n", + " <td>131810.1804</td>\n", + " <td>-108.031192</td>\n", + " <td>131183.3486</td>\n", + " <td>131.963639</td>\n", + " <td>131860.3269</td>\n", + " <td>9.860604</td>\n", + " <td>370.79775</td>\n", + " <td>-110.059463</td>\n", + " <td>370.79775</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 129 columns</p>\n", + "</div>" + ], + "text/plain": [ + " R1-PA1:VH R1-PM1:V R1-PA2:VH R1-PM2:V R1-PA3:VH R1-PM3:V \\\n", + "40 97.448662 130656.8100 -22.528701 130631.7367 -142.540440 130732.0298 \n", + "41 96.961648 130330.8575 -23.021444 130305.7842 -143.010266 130406.0773 \n", + "63 115.416618 131785.1071 -4.555014 131760.0339 -124.566754 131885.4002 \n", + "39 99.012837 131960.6200 -20.958796 131910.4735 -140.970536 132035.8398 \n", + "4 11.946170 131810.1804 -108.031192 131183.3486 131.963639 131860.3269 \n", + "\n", + " R1-PA4:IH R1-PM4:I R1-PA5:IH R1-PM5:I ... control_panel_log4 \\\n", + "40 98.439879 444.59108 -26.911828 471.32514 ... 0 \n", + "41 97.695034 446.42218 -27.198307 471.69136 ... 0 \n", + "63 113.697745 358.34627 -6.531719 358.71249 ... 0 \n", + "39 100.662955 436.53424 -25.238791 467.11361 ... 0 \n", + "4 9.860604 370.79775 -110.059463 370.79775 ... 
0 \n", + "\n", + " relay1_log relay2_log relay3_log relay4_log snort_log1 snort_log2 \\\n", + "40 0 0 0 0 0 0 \n", + "41 0 0 0 0 0 0 \n", + "63 0 0 0 0 0 0 \n", + "39 0 0 0 0 0 0 \n", + "4 0 0 0 0 0 0 \n", + "\n", + " snort_log3 snort_log4 Result \n", + "40 0 0 1 \n", + "41 0 0 1 \n", + "63 0 0 0 \n", + "39 0 0 0 \n", + "4 0 0 1 \n", + "\n", + "[5 rows x 129 columns]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results_df.sample(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "20c29600", + "metadata": {}, + "outputs": [], + "source": [ + "results_df.to_csv('TestingResultsBinary.csv', index=False)  # index=False: do not write the pandas row index as an extra unnamed column" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- GitLab