diff --git a/assign_1.ipynb b/assign_1.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..6f8ee8c173500d484f306f3c9eaedcea74a3f4be
--- /dev/null
+++ b/assign_1.ipynb
@@ -0,0 +1,2514 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "5354ae81",
+   "metadata": {},
+   "source": [
+    "# Coursework 2\n",
+    "\n",
+    "## Assignment 1\n",
+    "\n",
+    "We will use the `pandas` library for data processing, since the data is available in tabular form. We first check the data for consistency and for missing or invalid entries. We then look at categorical data and map it to binary indicator (dummy) columns. Non-categorical data will be normalized.\n",
+    "\n",
+    "Given that the target values are split between normal and anomalous samples, we can start with logistic regression to find a good fit. For training purposes we may further split the data into training and test subsets. The test data that is provided will be used as the final validator of the accuracy of the ML model.\n",
+    "\n",
+    "First, let's do some basic analysis of input data. (Please note that both Training and Test data had missing headers as described in the assignment. This has been manually corrected.)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "3997c6f9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 6000 entries, 0 to 5999\n",
+      "Columns: 129 entries, R1-PA1:VH to marker\n",
+      "dtypes: float64(113), int64(16)\n",
+      "memory usage: 5.9 MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "raw_data = pd.read_csv('TrainingDataBinary.csv')\n",
+    "raw_data.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a10544aa",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(6000, 129)"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw_data.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "65f549be",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_5052/39642624.py:1: FutureWarning: In a future version of pandas all arguments of DataFrame.any and Series.any will be keyword-only.\n",
+      "  len(raw_data[raw_data.isnull().any(1)])\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(raw_data[raw_data.isnull().any(axis=1)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "d592ca2b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Int64Index: 6000 entries, 0 to 5999\n",
+      "Columns: 129 entries, R1-PA1:VH to marker\n",
+      "dtypes: float64(113), int64(16)\n",
+      "memory usage: 6.0 MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Keep only the first occurrence of each fully duplicated row, then re-inspect the frame.\n",
+    "raw_data = raw_data.drop_duplicates(keep='first')\n",
+    "raw_data.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "283c569c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw_data.columns[raw_data.isnull().any()].tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "ed5a0f8a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    3000\n",
+       "1    3000\n",
+       "Name: marker, dtype: int64"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw_data['marker'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8d15070d",
+   "metadata": {},
+   "source": [
+    "### Comment\n",
+    "So, there are 6000 rows with no null values, and the target column *marker* takes the values 0 and 1. Further, the two classes are equally represented (3000 each), giving us a balanced sample."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "081e4ad7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>R1-PA1:VH</th>\n",
+       "      <th>R1-PM1:V</th>\n",
+       "      <th>R1-PA2:VH</th>\n",
+       "      <th>R1-PM2:V</th>\n",
+       "      <th>R1-PA3:VH</th>\n",
+       "      <th>R1-PM3:V</th>\n",
+       "      <th>R1-PA4:IH</th>\n",
+       "      <th>R1-PM4:I</th>\n",
+       "      <th>R1-PA5:IH</th>\n",
+       "      <th>R1-PM5:I</th>\n",
+       "      <th>...</th>\n",
+       "      <th>control_panel_log4</th>\n",
+       "      <th>relay1_log</th>\n",
+       "      <th>relay2_log</th>\n",
+       "      <th>relay3_log</th>\n",
+       "      <th>relay4_log</th>\n",
+       "      <th>snort_log1</th>\n",
+       "      <th>snort_log2</th>\n",
+       "      <th>snort_log3</th>\n",
+       "      <th>snort_log4</th>\n",
+       "      <th>marker</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1521</th>\n",
+       "      <td>96.228262</td>\n",
+       "      <td>132010.7666</td>\n",
+       "      <td>-23.754830</td>\n",
+       "      <td>131985.6933</td>\n",
+       "      <td>-143.743652</td>\n",
+       "      <td>132085.9864</td>\n",
+       "      <td>100.153023</td>\n",
+       "      <td>296.08887</td>\n",
+       "      <td>-23.657427</td>\n",
+       "      <td>311.28700</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2236</th>\n",
+       "      <td>-30.406870</td>\n",
+       "      <td>135395.6580</td>\n",
+       "      <td>-150.401421</td>\n",
+       "      <td>135345.5115</td>\n",
+       "      <td>89.599140</td>\n",
+       "      <td>135445.8045</td>\n",
+       "      <td>-26.161253</td>\n",
+       "      <td>517.65197</td>\n",
+       "      <td>-149.381556</td>\n",
+       "      <td>538.16029</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4081</th>\n",
+       "      <td>97.471580</td>\n",
+       "      <td>131835.2537</td>\n",
+       "      <td>-22.494323</td>\n",
+       "      <td>131785.1071</td>\n",
+       "      <td>-142.483145</td>\n",
+       "      <td>131885.4002</td>\n",
+       "      <td>98.084645</td>\n",
+       "      <td>349.37388</td>\n",
+       "      <td>-23.800667</td>\n",
+       "      <td>356.88139</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5096</th>\n",
+       "      <td>-71.957769</td>\n",
+       "      <td>131709.8873</td>\n",
+       "      <td>168.082899</td>\n",
+       "      <td>131684.8140</td>\n",
+       "      <td>48.071159</td>\n",
+       "      <td>131760.0339</td>\n",
+       "      <td>-74.874125</td>\n",
+       "      <td>401.01090</td>\n",
+       "      <td>165.252487</td>\n",
+       "      <td>402.65889</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5882</th>\n",
+       "      <td>-87.427630</td>\n",
+       "      <td>131459.1546</td>\n",
+       "      <td>152.607309</td>\n",
+       "      <td>131409.0081</td>\n",
+       "      <td>32.595569</td>\n",
+       "      <td>131509.3012</td>\n",
+       "      <td>-90.641923</td>\n",
+       "      <td>418.58946</td>\n",
+       "      <td>149.484689</td>\n",
+       "      <td>418.04013</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 129 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      R1-PA1:VH     R1-PM1:V   R1-PA2:VH     R1-PM2:V   R1-PA3:VH  \\\n",
+       "1521  96.228262  132010.7666  -23.754830  131985.6933 -143.743652   \n",
+       "2236 -30.406870  135395.6580 -150.401421  135345.5115   89.599140   \n",
+       "4081  97.471580  131835.2537  -22.494323  131785.1071 -142.483145   \n",
+       "5096 -71.957769  131709.8873  168.082899  131684.8140   48.071159   \n",
+       "5882 -87.427630  131459.1546  152.607309  131409.0081   32.595569   \n",
+       "\n",
+       "         R1-PM3:V   R1-PA4:IH   R1-PM4:I   R1-PA5:IH   R1-PM5:I  ...  \\\n",
+       "1521  132085.9864  100.153023  296.08887  -23.657427  311.28700  ...   \n",
+       "2236  135445.8045  -26.161253  517.65197 -149.381556  538.16029  ...   \n",
+       "4081  131885.4002   98.084645  349.37388  -23.800667  356.88139  ...   \n",
+       "5096  131760.0339  -74.874125  401.01090  165.252487  402.65889  ...   \n",
+       "5882  131509.3012  -90.641923  418.58946  149.484689  418.04013  ...   \n",
+       "\n",
+       "      control_panel_log4  relay1_log  relay2_log  relay3_log  relay4_log  \\\n",
+       "1521                   0           0           0           0           0   \n",
+       "2236                   0           0           0           0           0   \n",
+       "4081                   0           0           0           0           0   \n",
+       "5096                   0           0           0           0           0   \n",
+       "5882                   0           0           0           0           0   \n",
+       "\n",
+       "      snort_log1  snort_log2  snort_log3  snort_log4  marker  \n",
+       "1521           0           0           0           0       1  \n",
+       "2236           0           0           0           0       1  \n",
+       "4081           0           0           0           0       1  \n",
+       "5096           0           0           0           0       0  \n",
+       "5882           0           0           0           0       0  \n",
+       "\n",
+       "[5 rows x 129 columns]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw_data.sample(n=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "989201f2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['R1-PA1:VH', 'R1-PM1:V', 'R1-PA2:VH', 'R1-PM2:V', 'R1-PA3:VH',\n",
+       "       'R1-PM3:V', 'R1-PA4:IH', 'R1-PM4:I', 'R1-PA5:IH', 'R1-PM5:I',\n",
+       "       'R1-PA6:IH', 'R1-PM6:I', 'R1-PA7:VH', 'R1-PM7:V', 'R1-PA8:VH',\n",
+       "       'R1-PM8:V', 'R1-PA9:VH', 'R1-PM9:V', 'R1-PA10:IH', 'R1-PM10:I',\n",
+       "       'R1-PA11:IH', 'R1-PM11:I', 'R1-PA12:IH', 'R1-PM12:I', 'R1:F',\n",
+       "       'R1:DF', 'R1-PA:Z', 'R1-PA:ZH', 'R1:S', 'R2-PA1:VH', 'R2-PM1:V',\n",
+       "       'R2-PA2:VH', 'R2-PM2:V', 'R2-PA3:VH', 'R2-PM3:V', 'R2-PA4:IH',\n",
+       "       'R2-PM4:I', 'R2-PA5:IH', 'R2-PM5:I', 'R2-PA6:IH', 'R2-PM6:I',\n",
+       "       'R2-PA7:VH', 'R2-PM7:V', 'R2-PA8:VH', 'R2-PM8:V', 'R2-PA9:VH',\n",
+       "       'R2-PM9:V', 'R2-PA10:IH', 'R2-PM10:I', 'R2-PA11:IH', 'R2-PM11:I',\n",
+       "       'R2-PA12:IH', 'R2-PM12:I', 'R2:F', 'R2:DF', 'R2-PA:Z', 'R2-PA:ZH',\n",
+       "       'R2:S', 'R3-PA1:VH', 'R3-PM1:V', 'R3-PA2:VH', 'R3-PM2:V',\n",
+       "       'R3-PA3:VH', 'R3-PM3:V', 'R3-PA4:IH', 'R3-PM4:I', 'R3-PA5:IH',\n",
+       "       'R3-PM5:I', 'R3-PA6:IH', 'R3-PM6:I', 'R3-PA7:VH', 'R3-PM7:V',\n",
+       "       'R3-PA8:VH', 'R3-PM8:V', 'R3-PA9:VH', 'R3-PM9:V', 'R3-PA10:IH',\n",
+       "       'R3-PM10:I', 'R3-PA11:IH', 'R3-PM11:I', 'R3-PA12:IH', 'R3-PM12:I',\n",
+       "       'R3:F', 'R3:DF', 'R3-PA:Z', 'R3-PA:ZH', 'R3:S', 'R4-PA1:VH',\n",
+       "       'R4-PM1:V', 'R4-PA2:VH', 'R4-PM2:V', 'R4-PA3:VH', 'R4-PM3:V',\n",
+       "       'R4-PA4:IH', 'R4-PM4:I', 'R4-PA5:IH', 'R4-PM5:I', 'R4-PA6:IH',\n",
+       "       'R4-PM6:I', 'R4-PA7:VH', 'R4-PM7:V', 'R4-PA8:VH', 'R4-PM8:V',\n",
+       "       'R4-PA9:VH', 'R4-PM9:V', 'R4-PA10:IH', 'R4-PM10:I', 'R4-PA11:IH',\n",
+       "       'R4-PM11:I', 'R4-PA12:IH', 'R4-PM12:I', 'R4:F', 'R4:DF', 'R4-PA:Z',\n",
+       "       'R4-PA:ZH', 'R4:S', 'control_panel_log1', 'control_panel_log2',\n",
+       "       'control_panel_log3', 'control_panel_log4', 'relay1_log',\n",
+       "       'relay2_log', 'relay3_log', 'relay4_log', 'snort_log1',\n",
+       "       'snort_log2', 'snort_log3', 'snort_log4', 'marker'], dtype=object)"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "column_names = raw_data.columns.values\n",
+    "column_names"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "131b20d5",
+   "metadata": {},
+   "source": [
+    "### Data analysis\n",
+    "\n",
+    "We understand that the first 116 columns hold current/voltage data, so these are continuous real-valued data. The next 12 columns are switches (0/1), and the last column is the target, which we just observed to contain half 0s and half 1s.\n",
+    "\n",
+    "Let's normalize the real-valued data, removing anomalous values, if any.\n",
+    "\n",
+    "We find out minimum, maximum, and number of unique data points for each feature."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "80f7ee9a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Int64Index: 129 entries, 0 to 128\n",
+      "Data columns (total 3 columns):\n",
+      " #   Column  Non-Null Count  Dtype  \n",
+      "---  ------  --------------  -----  \n",
+      " 0   minn    129 non-null    float64\n",
+      " 1   maxx    129 non-null    float64\n",
+      " 2   uniq    129 non-null    int64  \n",
+      "dtypes: float64(2), int64(1)\n",
+      "memory usage: 4.0 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Per-feature minimum, maximum and distinct-value count.\n",
+    "# Rows are indexed by column position so they line up with column_names.\n",
+    "minn = [raw_data[name].min() for name in column_names]\n",
+    "maxx = [raw_data[name].max() for name in column_names]\n",
+    "uniq = [len(raw_data[name].unique()) for name in column_names]\n",
+    "idx = list(range(len(column_names)))\n",
+    "data_stat = pd.DataFrame(\n",
+    "    data={\"minn\": minn, \"maxx\": maxx, \"uniq\": uniq},\n",
+    "    index=idx,\n",
+    ")\n",
+    "data_stat.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "0d21a61e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>minn</th>\n",
+       "      <th>maxx</th>\n",
+       "      <th>uniq</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>116</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>117</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>118</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>119</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>126</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>127</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     minn  maxx  uniq\n",
+       "116   0.0   0.0     1\n",
+       "117   0.0   0.0     1\n",
+       "118   0.0   0.0     1\n",
+       "119   0.0   0.0     1\n",
+       "126   0.0   0.0     1\n",
+       "127   0.0   0.0     1"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#Feature with the same value across the sample\n",
+    "data_stat[data_stat.uniq == 1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "ad863ae4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>minn</th>\n",
+       "      <th>maxx</th>\n",
+       "      <th>uniq</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>270336.0</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>57</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>270336.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>86</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>270336.0</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>115</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>270336.0</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>116</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>117</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>118</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>119</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>120</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>121</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>122</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>123</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>124</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>125</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>126</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>127</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>128</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     minn      maxx  uniq\n",
+       "28    0.0  270336.0     4\n",
+       "57    0.0  270336.0     2\n",
+       "86    0.0  270336.0     3\n",
+       "115   0.0  270336.0     3\n",
+       "116   0.0       0.0     1\n",
+       "117   0.0       0.0     1\n",
+       "118   0.0       0.0     1\n",
+       "119   0.0       0.0     1\n",
+       "120   0.0       1.0     2\n",
+       "121   0.0       1.0     2\n",
+       "122   0.0       1.0     2\n",
+       "123   0.0       1.0     2\n",
+       "124   0.0       1.0     2\n",
+       "125   0.0       1.0     2\n",
+       "126   0.0       0.0     1\n",
+       "127   0.0       0.0     1\n",
+       "128   0.0       1.0     2"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#Looking for categorical features... looking for feature with no more than 9 unique values\n",
+    "data_stat[data_stat.uniq < 10]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2ff927af",
+   "metadata": {},
+   "source": [
+    "### Comment\n",
+    "Columns 116 onward are control values and were expected to be 0 or 1. Moreover, columns 116-119 and 126-127 contain only 0 and can therefore be dropped.\n",
+    "\n",
+    "It's also interesting to note that 4 other columns have categorical data. Let's explore these."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "71c92946",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([     0,   2058,   2048, 270336])"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw_data[column_names[28]].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "6f86ffd4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([     0, 270336])"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw_data[column_names[57]].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "2311b43d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([     0,   2048, 270336])"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw_data[column_names[86]].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "726a7bb9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([     0.,   2058., 270336.])"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw_data[column_names[115]].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "1cfdd4ba",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['R1:S', 'R2:S', 'R3:S', 'R4:S']"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "[column_names[28], column_names[57], column_names[86], column_names[115]]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b3d245bd",
+   "metadata": {},
+   "source": [
+    "### Comment\n",
+    "\n",
+    "So, the R#:S columns are categorical, taking up to 4 possible values: 0, 2048, 2058, and 270336. We can create binary dummy columns for them and drop the originals."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "9f105bf3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>R1:S_0</th>\n",
+       "      <th>R1:S_2048</th>\n",
+       "      <th>R1:S_2058</th>\n",
+       "      <th>R1:S_270336</th>\n",
+       "      <th>R2:S_0</th>\n",
+       "      <th>R2:S_270336</th>\n",
+       "      <th>R3:S_0</th>\n",
+       "      <th>R3:S_2048</th>\n",
+       "      <th>R3:S_270336</th>\n",
+       "      <th>R4:S_0</th>\n",
+       "      <th>R4:S_2058</th>\n",
+       "      <th>R4:S_270336</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>4472</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1837</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1054</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>460</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5263</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1395</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      R1:S_0  R1:S_2048  R1:S_2058  R1:S_270336  R2:S_0  R2:S_270336  R3:S_0  \\\n",
+       "4472       1          0          0            0       1            0       1   \n",
+       "1837       1          0          0            0       1            0       1   \n",
+       "1054       1          0          0            0       1            0       1   \n",
+       "460        1          0          0            0       1            0       1   \n",
+       "5263       1          0          0            0       1            0       1   \n",
+       "1395       1          0          0            0       1            0       1   \n",
+       "\n",
+       "      R3:S_2048  R3:S_270336  R4:S_0  R4:S_2058  R4:S_270336  \n",
+       "4472          0            0       1          0            0  \n",
+       "1837          0            0       1          0            0  \n",
+       "1054          0            0       1          0            0  \n",
+       "460           0            0       1          0            0  \n",
+       "5263          0            0       1          0            0  \n",
+       "1395          0            0       1          0            0  "
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Work on a cleaned copy: +/-inf become NaN, then every row containing a NaN is dropped\n",
+    "df = raw_data.replace([np.inf, -np.inf], np.nan).dropna()\n",
+    "\n",
+    "# The four Rn:S columns sit at positions 28, 57, 86 and 115 of column_names\n",
+    "column_names_RS = [column_names[i] for i in (28, 57, 86, 115)]\n",
+    "\n",
+    "# int -> category -> one-hot (presumably the int cast avoids dummy labels such as 'R4:S_0.0'\n",
+    "# for columns that were read as float - TODO confirm)\n",
+    "RS_frame = pd.get_dummies(df[column_names_RS].astype('int').astype('category'), dtype=int)\n",
+    "RS_frame.sample(n=6)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "54517212",
+   "metadata": {},
+   "source": [
+    "### Comment\n",
+    "\n",
+    "We have created dummy columns for the categorical values of the four `Rn:S` columns. The original `Rn:S` columns can now be dropped from the data frame.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "19de15f6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "6000 rows\n",
+      "** R1-PA1:VH:5530 (92%)\n",
+      "** R1-PM1:V:357 (5%)\n",
+      "** R1-PA2:VH:5550 (92%)\n",
+      "** R1-PM2:V:347 (5%)\n",
+      "** R1-PA3:VH:5515 (91%)\n",
+      "** R1-PM3:V:350 (5%)\n",
+      "** R1-PA4:IH:5616 (93%)\n",
+      "** R1-PM4:I:1587 (26%)\n",
+      "** R1-PA5:IH:5566 (92%)\n",
+      "** R1-PM5:I:1489 (24%)\n",
+      "** R1-PA6:IH:5569 (92%)\n",
+      "** R1-PM6:I:1570 (26%)\n",
+      "** R1-PA7:VH:5533 (92%)\n",
+      "** R1-PM7:V:360 (6%)\n",
+      "** R1-PA8:VH:48 (0%)\n",
+      "** R1-PM8:V:47 (0%)\n",
+      "** R1-PA9:VH:44 (0%)\n",
+      "** R1-PM9:V:40 (0%)\n",
+      "** R1-PA10:IH:5561 (92%)\n",
+      "** R1-PM10:I:1526 (25%)\n",
+      "** R1-PA11:IH:2152 (35%)\n",
+      "** R1-PM11:I:184 (3%)\n",
+      "** R1-PA12:IH:2185 (36%)\n",
+      "** R1-PM12:I:187 (3%)\n",
+      "** R1:F:103 (1%)\n",
+      "** R1:DF:101 (1%)\n",
+      "** R1-PA:Z:5999 (99%)\n",
+      "** R1-PA:ZH:5913 (98%)\n",
+      "** R1:S:[0:99.22%,2058:0.6%,270336:0.17%,2048:0.02%]\n",
+      "** R2-PA1:VH:5659 (94%)\n",
+      "** R2-PM1:V:4602 (76%)\n",
+      "** R2-PA2:VH:5661 (94%)\n",
+      "** R2-PM2:V:4577 (76%)\n",
+      "** R2-PA3:VH:5658 (94%)\n",
+      "** R2-PM3:V:4593 (76%)\n",
+      "** R2-PA4:IH:5690 (94%)\n",
+      "** R2-PM4:I:4968 (82%)\n",
+      "** R2-PA5:IH:5711 (95%)\n",
+      "** R2-PM5:I:4838 (80%)\n",
+      "** R2-PA6:IH:5743 (95%)\n",
+      "** R2-PM6:I:4957 (82%)\n",
+      "** R2-PA7:VH:5670 (94%)\n",
+      "** R2-PM7:V:4599 (76%)\n",
+      "** R2-PA8:VH:50 (0%)\n",
+      "** R2-PM8:V:50 (0%)\n",
+      "** R2-PA9:VH:47 (0%)\n",
+      "** R2-PM9:V:47 (0%)\n",
+      "** R2-PA10:IH:5695 (94%)\n",
+      "** R2-PM10:I:4924 (82%)\n",
+      "** R2-PA11:IH:2131 (35%)\n",
+      "** R2-PM11:I:1253 (20%)\n",
+      "** R2-PA12:IH:2162 (36%)\n",
+      "** R2-PM12:I:1255 (20%)\n",
+      "** R2:F:108 (1%)\n",
+      "** R2:DF:107 (1%)\n",
+      "** R2-PA:Z:6000 (100%)\n",
+      "** R2-PA:ZH:5886 (98%)\n",
+      "** R2:S:[0:99.83%,270336:0.17%]\n",
+      "** R3-PA1:VH:5534 (92%)\n",
+      "** R3-PM1:V:393 (6%)\n",
+      "** R3-PA2:VH:5536 (92%)\n",
+      "** R3-PM2:V:389 (6%)\n",
+      "** R3-PA3:VH:5539 (92%)\n",
+      "** R3-PM3:V:384 (6%)\n",
+      "** R3-PA4:IH:5602 (93%)\n",
+      "** R3-PM4:I:1601 (26%)\n",
+      "** R3-PA5:IH:5575 (92%)\n",
+      "** R3-PM5:I:1464 (24%)\n",
+      "** R3-PA6:IH:5614 (93%)\n",
+      "** R3-PM6:I:1571 (26%)\n",
+      "** R3-PA7:VH:5554 (92%)\n",
+      "** R3-PM7:V:392 (6%)\n",
+      "** R3-PA8:VH:50 (0%)\n",
+      "** R3-PM8:V:49 (0%)\n",
+      "** R3-PA9:VH:47 (0%)\n",
+      "** R3-PM9:V:46 (0%)\n",
+      "** R3-PA10:IH:5548 (92%)\n",
+      "** R3-PM10:I:1560 (26%)\n",
+      "** R3-PA11:IH:2063 (34%)\n",
+      "** R3-PM11:I:181 (3%)\n",
+      "** R3-PA12:IH:2053 (34%)\n",
+      "** R3-PM12:I:175 (2%)\n",
+      "** R3:F:92 (1%)\n",
+      "** R3:DF:111 (1%)\n",
+      "** R3-PA:Z:5999 (99%)\n",
+      "** R3-PA:ZH:5902 (98%)\n",
+      "** R3:S:[0:99.77%,270336:0.17%,2048:0.07%]\n",
+      "** R4-PA1:VH:5664 (94%)\n",
+      "** R4-PM1:V:1792 (29%)\n",
+      "** R4-PA2:VH:5668 (94%)\n",
+      "** R4-PM2:V:1781 (29%)\n",
+      "** R4-PA3:VH:5674 (94%)\n",
+      "** R4-PM3:V:1794 (29%)\n",
+      "** R4-PA4:IH:5699 (94%)\n",
+      "** R4-PM4:I:2866 (47%)\n",
+      "** R4-PA5:IH:5692 (94%)\n",
+      "** R4-PM5:I:2752 (45%)\n",
+      "** R4-PA6:IH:5693 (94%)\n",
+      "** R4-PM6:I:2859 (47%)\n",
+      "** R4-PA7:VH:5687 (94%)\n",
+      "** R4-PM7:V:1781 (29%)\n",
+      "** R4-PA8:VH:50 (0%)\n",
+      "** R4-PM8:V:48 (0%)\n",
+      "** R4-PA9:VH:45 (0%)\n",
+      "** R4-PM9:V:39 (0%)\n",
+      "** R4-PA10:IH:5695 (94%)\n",
+      "** R4-PM10:I:2826 (47%)\n",
+      "** R4-PA11:IH:2090 (34%)\n",
+      "** R4-PM11:I:696 (11%)\n",
+      "** R4-PA12:IH:2078 (34%)\n",
+      "** R4-PM12:I:726 (12%)\n",
+      "** R4:F:117 (1%)\n",
+      "** R4:DF:96 (1%)\n",
+      "** R4-PA:Z:5999 (99%)\n",
+      "** R4-PA:ZH:5891 (98%)\n",
+      "** R4:S:[0.0:99.82%,270336.0:0.17%,2058.0:0.02%]\n",
+      "** control_panel_log1:[0:100.0%]\n",
+      "** control_panel_log2:[0:100.0%]\n",
+      "** control_panel_log3:[0:100.0%]\n",
+      "** control_panel_log4:[0:100.0%]\n",
+      "** relay1_log:[0:99.87%,1:0.13%]\n",
+      "** relay2_log:[0:99.88%,1:0.12%]\n",
+      "** relay3_log:[0:99.95%,1:0.05%]\n",
+      "** relay4_log:[0:99.97%,1:0.03%]\n",
+      "** snort_log1:[0:99.98%,1:0.02%]\n",
+      "** snort_log2:[0:99.98%,1:0.02%]\n",
+      "** snort_log3:[0:100.0%]\n",
+      "** snort_log4:[0:100.0%]\n",
+      "** marker:[0:50.0%,1:50.0%]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from scipy.stats import zscore\n",
+    "from sklearn import metrics\n",
+    "\n",
+    "ENCODING = 'utf-8'\n",
+    "\n",
+    "\n",
+    "def expand_categories(values):\n",
+    "    \"\"\"Format the distinct values of a Series as '[value:pct%,...]'.\n",
+    "\n",
+    "    Values are listed in `value_counts()` order; each percentage is the value's\n",
+    "    count divided by `len(values)`, rounded to two decimals.\n",
+    "    \"\"\"\n",
+    "    counts = values.value_counts()\n",
+    "    total = float(len(values))\n",
+    "    parts = [\"{}:{}%\".format(v, round(100 * (counts[v] / total), 2)) for v in counts.index]\n",
+    "    return \"[{}]\".format(\",\".join(parts))\n",
+    "\n",
+    "\n",
+    "def analyze(df, max_categories=10):\n",
+    "    \"\"\"Print a one-line cardinality summary for every column of `df`.\n",
+    "\n",
+    "    Columns with more than `max_categories` distinct values are reported as\n",
+    "    'name:distinct_count (pct_of_rows%)'; every other column lists each value\n",
+    "    with its percentage share, as produced by `expand_categories`.\n",
+    "    \"\"\"\n",
+    "    print()\n",
+    "    total = float(len(df))\n",
+    "    print(\"{} rows\".format(int(total)))\n",
+    "    for col in df.columns.values:\n",
+    "        unique_count = len(df[col].unique())\n",
+    "        if unique_count > max_categories:\n",
+    "            # int() truncates, so the share is rounded down to a whole percent\n",
+    "            print(\"** {}:{} ({}%)\".format(col, unique_count, int((unique_count / total) * 100)))\n",
+    "        else:\n",
+    "            print(\"** {}:{}\".format(col, expand_categories(df[col])))\n",
+    "\n",
+    "\n",
+    "# Summarise every column of the cleaned training frame\n",
+    "analyze(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "4311c15b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Int64Index: 6000 entries, 0 to 5999\n",
+      "Columns: 125 entries, R1-PA1:VH to marker\n",
+      "dtypes: float64(112), int64(13)\n",
+      "memory usage: 5.8 MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "column_names_updated = np.delete(column_names, [28, 57, 86, 115])\n",
+    "df = df.drop(column_names_RS, axis=1)\n",
+    "\n",
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "f21a5139",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(6000, 112)"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.preprocessing import StandardScaler\n",
+    "\n",
+    "# Standardise every column except the trailing 13 (the 12 log flags and the marker)\n",
+    "unscaled_input = df.iloc[:, :-13]\n",
+    "# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split done in a later cell\n",
+    "scaler = StandardScaler()\n",
+    "scaler.fit(unscaled_input)\n",
+    "scaled_input = scaler.transform(unscaled_input)\n",
+    "\n",
+    "# Any NaN remaining after scaling is overwritten with 1\n",
+    "nan_mask = np.isnan(scaled_input)\n",
+    "scaled_input[nan_mask] = 1\n",
+    "scaled_input.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "91c3e8c2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Int64Index: 6000 entries, 0 to 5999\n",
+      "Columns: 112 entries, R1-PA1:VH to R4-PA:ZH\n",
+      "dtypes: float64(112)\n",
+      "memory usage: 5.2 MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df1_scaled = pd.DataFrame(scaled_input, index=df.index, columns=column_names_updated[:scaled_input.shape[1]])\n",
+    "df1_scaled.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "4a23f18c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Int64Index: 6000 entries, 0 to 5999\n",
+      "Data columns (total 12 columns):\n",
+      " #   Column              Non-Null Count  Dtype\n",
+      "---  ------              --------------  -----\n",
+      " 0   control_panel_log1  6000 non-null   int64\n",
+      " 1   control_panel_log2  6000 non-null   int64\n",
+      " 2   control_panel_log3  6000 non-null   int64\n",
+      " 3   control_panel_log4  6000 non-null   int64\n",
+      " 4   relay1_log          6000 non-null   int64\n",
+      " 5   relay2_log          6000 non-null   int64\n",
+      " 6   relay3_log          6000 non-null   int64\n",
+      " 7   relay4_log          6000 non-null   int64\n",
+      " 8   snort_log1          6000 non-null   int64\n",
+      " 9   snort_log2          6000 non-null   int64\n",
+      " 10  snort_log3          6000 non-null   int64\n",
+      " 11  snort_log4          6000 non-null   int64\n",
+      "dtypes: int64(12)\n",
+      "memory usage: 609.4 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df2 = df.iloc[:,-13:-1]\n",
+    "df2.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "ba3d7444",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Int64Index: 6000 entries, 0 to 5999\n",
+      "Data columns (total 12 columns):\n",
+      " #   Column       Non-Null Count  Dtype\n",
+      "---  ------       --------------  -----\n",
+      " 0   R1:S_0       6000 non-null   int64\n",
+      " 1   R1:S_2048    6000 non-null   int64\n",
+      " 2   R1:S_2058    6000 non-null   int64\n",
+      " 3   R1:S_270336  6000 non-null   int64\n",
+      " 4   R2:S_0       6000 non-null   int64\n",
+      " 5   R2:S_270336  6000 non-null   int64\n",
+      " 6   R3:S_0       6000 non-null   int64\n",
+      " 7   R3:S_2048    6000 non-null   int64\n",
+      " 8   R3:S_270336  6000 non-null   int64\n",
+      " 9   R4:S_0       6000 non-null   int64\n",
+      " 10  R4:S_2058    6000 non-null   int64\n",
+      " 11  R4:S_270336  6000 non-null   int64\n",
+      "dtypes: int64(12)\n",
+      "memory usage: 609.4 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "#RS dummies\n",
+    "RS_frame.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "2e67286d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Int64Index: 6000 entries, 0 to 5999\n",
+      "Columns: 136 entries, R1-PA1:VH to R4:S_270336\n",
+      "dtypes: float64(112), int64(24)\n",
+      "memory usage: 6.3 MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "X = pd.concat([df1_scaled, df2, RS_frame], axis=1)\n",
+    "X.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "7d4368da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y = df[column_names[-1]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "5ac91fca",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(6000,)"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "4335edba",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.9052380952380953"
+      ]
+     },
+     "execution_count": 46,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn import svm\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# Hold out 35% of the rows for evaluation; the seed is fixed so the split is repeatable\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=10)\n",
+    "\n",
+    "# Linear-kernel support vector classifier, scored on the held-out rows\n",
+    "clf = svm.SVC(kernel='linear', C=1)\n",
+    "clf.fit(X_train, y_train)\n",
+    "clf.score(X_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "aea2f41f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.9238095238095239"
+      ]
+     },
+     "execution_count": 48,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# 65/35 split with a fixed seed so the score is reproducible\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=10)\n",
+    "\n",
+    "# Logistic regression (C=1e2, up to 5000 iterations), scored on the held-out rows\n",
+    "clf = LogisticRegression(C=1e2, max_iter=5000)\n",
+    "clf.fit(X_train, y_train)\n",
+    "clf.score(X_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "76cd1a27",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.91      0.94      0.92      1051\n",
+      "           1       0.93      0.91      0.92      1049\n",
+      "\n",
+      "    accuracy                           0.92      2100\n",
+      "   macro avg       0.92      0.92      0.92      2100\n",
+      "weighted avg       0.92      0.92      0.92      2100\n",
+      "\n",
+      "----------------------------------------\n",
+      "True negatives:  984 \n",
+      "False positives:  67 \n",
+      "False negatives:  93 \n",
+      "True Positives:  956\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f92f587fb80>"
+      ]
+     },
+     "execution_count": 50,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 640x480 with 2 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from sklearn.metrics import ConfusionMatrixDisplay\n",
+    "from sklearn.metrics import classification_report\n",
+    "from sklearn.metrics import confusion_matrix\n",
+    "\n",
+    "y_pred = clf.predict(X_test)\n",
+    "\n",
+    "# Per-class precision / recall / F1 and overall accuracy\n",
+    "print(classification_report(y_test, y_pred))\n",
+    "print('-'*40)\n",
+    "\n",
+    "# Flatten the 2x2 confusion matrix into its four counts\n",
+    "conf_mat = confusion_matrix(y_test, y_pred)\n",
+    "tn, fp, fn, tp = conf_mat.ravel()\n",
+    "print('True negatives: ', tn, '\\nFalse positives: ', fp, '\\nFalse negatives: ', fn, '\\nTrue Positives: ', tp)\n",
+    "ConfusionMatrixDisplay.from_predictions(y_test, y_pred)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "30062966",
+   "metadata": {},
+   "source": [
+    "## Training Result\n",
+    "\n",
+    "So, we have about 92% accuracy on the held-out 35% validation split using Logistic Regression (the linear SVM reached about 91%).\n",
+    "\n",
+    "Let's do some cross-validation to see result stability."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "4791c3eb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "5 fold cross-validation accuracy [0.89] with std deviation [0.03]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.model_selection import cross_val_score\n",
+    "\n",
+    "# 5-fold cross-validation over the full X / y; the metric is macro-averaged F1\n",
+    "# (scoring='f1_macro'), so the message reports it as such rather than as accuracy\n",
+    "scores = cross_val_score(clf, X, y, cv=5, scoring='f1_macro')\n",
+    "print (\"5 fold cross-validation macro-F1 [%0.2f%%] with std deviation [%0.2f%%]\" % (100*scores.mean(), 100*scores.std()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a4cd28dc",
+   "metadata": {},
+   "source": [
+    "The result is stable under cross-validation. This was expected, given that the target data is evenly split between positive and negative samples.\n",
+    "\n",
+    "## Test Result\n",
+    "\n",
+    "Let's predict outcome for our test data using the chosen classifier."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "77fbdfde",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_raw_data = pd.read_csv('TestingDataBinary.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "b6aeecef",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 100 entries, 0 to 99\n",
+      "Columns: 128 entries, R1-PA1:VH to snort_log4\n",
+      "dtypes: float64(112), int64(16)\n",
+      "memory usage: 100.1 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_raw_data.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "dec22570",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[]"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_raw_data.columns[test_raw_data.isnull().any()].tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "80208613",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>R1-PA1:VH</th>\n",
+       "      <th>R1-PM1:V</th>\n",
+       "      <th>R1-PA2:VH</th>\n",
+       "      <th>R1-PM2:V</th>\n",
+       "      <th>R1-PA3:VH</th>\n",
+       "      <th>R1-PM3:V</th>\n",
+       "      <th>R1-PA4:IH</th>\n",
+       "      <th>R1-PM4:I</th>\n",
+       "      <th>R1-PA5:IH</th>\n",
+       "      <th>R1-PM5:I</th>\n",
+       "      <th>...</th>\n",
+       "      <th>control_panel_log3</th>\n",
+       "      <th>control_panel_log4</th>\n",
+       "      <th>relay1_log</th>\n",
+       "      <th>relay2_log</th>\n",
+       "      <th>relay3_log</th>\n",
+       "      <th>relay4_log</th>\n",
+       "      <th>snort_log1</th>\n",
+       "      <th>snort_log2</th>\n",
+       "      <th>snort_log3</th>\n",
+       "      <th>snort_log4</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>-130.794806</td>\n",
+       "      <td>130330.8575</td>\n",
+       "      <td>109.205756</td>\n",
+       "      <td>129704.0257</td>\n",
+       "      <td>-10.771607</td>\n",
+       "      <td>130381.0040</td>\n",
+       "      <td>-131.052637</td>\n",
+       "      <td>442.39376</td>\n",
+       "      <td>105.573203</td>\n",
+       "      <td>460.33854</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>43</th>\n",
+       "      <td>95.557901</td>\n",
+       "      <td>130029.9782</td>\n",
+       "      <td>-24.425191</td>\n",
+       "      <td>129979.8317</td>\n",
+       "      <td>-144.431201</td>\n",
+       "      <td>130080.1248</td>\n",
+       "      <td>95.122453</td>\n",
+       "      <td>455.02835</td>\n",
+       "      <td>-28.556217</td>\n",
+       "      <td>473.15624</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>12.049302</td>\n",
+       "      <td>131810.1804</td>\n",
+       "      <td>-107.922330</td>\n",
+       "      <td>131183.3486</td>\n",
+       "      <td>132.066772</td>\n",
+       "      <td>131860.3269</td>\n",
+       "      <td>9.986654</td>\n",
+       "      <td>370.79775</td>\n",
+       "      <td>-109.899035</td>\n",
+       "      <td>370.61464</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>64</th>\n",
+       "      <td>115.393700</td>\n",
+       "      <td>131785.1071</td>\n",
+       "      <td>-4.577933</td>\n",
+       "      <td>131760.0339</td>\n",
+       "      <td>-124.583943</td>\n",
+       "      <td>131885.4002</td>\n",
+       "      <td>113.669097</td>\n",
+       "      <td>357.98005</td>\n",
+       "      <td>-6.566096</td>\n",
+       "      <td>358.52938</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71</th>\n",
+       "      <td>-169.652803</td>\n",
+       "      <td>132537.3052</td>\n",
+       "      <td>70.376406</td>\n",
+       "      <td>132487.1587</td>\n",
+       "      <td>-49.635334</td>\n",
+       "      <td>132612.5250</td>\n",
+       "      <td>-165.361349</td>\n",
+       "      <td>261.29797</td>\n",
+       "      <td>71.740046</td>\n",
+       "      <td>269.35481</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 128 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     R1-PA1:VH     R1-PM1:V   R1-PA2:VH     R1-PM2:V   R1-PA3:VH     R1-PM3:V  \\\n",
+       "23 -130.794806  130330.8575  109.205756  129704.0257  -10.771607  130381.0040   \n",
+       "43   95.557901  130029.9782  -24.425191  129979.8317 -144.431201  130080.1248   \n",
+       "1    12.049302  131810.1804 -107.922330  131183.3486  132.066772  131860.3269   \n",
+       "64  115.393700  131785.1071   -4.577933  131760.0339 -124.583943  131885.4002   \n",
+       "71 -169.652803  132537.3052   70.376406  132487.1587  -49.635334  132612.5250   \n",
+       "\n",
+       "     R1-PA4:IH   R1-PM4:I   R1-PA5:IH   R1-PM5:I  ...  control_panel_log3  \\\n",
+       "23 -131.052637  442.39376  105.573203  460.33854  ...                   0   \n",
+       "43   95.122453  455.02835  -28.556217  473.15624  ...                   0   \n",
+       "1     9.986654  370.79775 -109.899035  370.61464  ...                   0   \n",
+       "64  113.669097  357.98005   -6.566096  358.52938  ...                   0   \n",
+       "71 -165.361349  261.29797   71.740046  269.35481  ...                   0   \n",
+       "\n",
+       "    control_panel_log4  relay1_log  relay2_log  relay3_log  relay4_log  \\\n",
+       "23                   0           0           0           0           0   \n",
+       "43                   0           0           0           0           0   \n",
+       "1                    0           0           0           0           0   \n",
+       "64                   0           0           0           0           0   \n",
+       "71                   0           0           0           0           0   \n",
+       "\n",
+       "    snort_log1  snort_log2  snort_log3  snort_log4  \n",
+       "23           0           0           0           0  \n",
+       "43           0           0           0           0  \n",
+       "1            0           0           0           0  \n",
+       "64           0           0           0           0  \n",
+       "71           0           0           0           0  \n",
+       "\n",
+       "[5 rows x 128 columns]"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_raw_data.sample(n=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "f774ee14",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>R1:S_0</th>\n",
+       "      <th>R1:S_2058</th>\n",
+       "      <th>R2:S_0</th>\n",
+       "      <th>R3:S_0</th>\n",
+       "      <th>R4:S_0</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>82</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    R1:S_0  R1:S_2058  R2:S_0  R3:S_0  R4:S_0\n",
+       "82       1          0       1       1       1\n",
+       "17       1          0       1       1       1\n",
+       "2        1          0       1       1       1"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Keep an untouched copy of the test frame before it is processed further\n",
+    "results_df = test_raw_data.copy()\n",
+    "\n",
+    "# Encode the Rn:S columns with the same steps used for training: int -> category -> one-hot\n",
+    "test_RS_frame = pd.get_dummies(\n",
+    "    test_raw_data[column_names_RS].astype('int').astype('category'),\n",
+    "    dtype=int,\n",
+    ")\n",
+    "test_RS_frame.sample(3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4be4f161",
+   "metadata": {},
+   "source": [
+    "### Comment\n",
+    "We notice that the test data does not contain all the categorical values of the RS features that appear in the training set. So that the test features match the training layout, let's add the missing dummy columns (filled with zeros) explicitly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "2e3ae9b9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>R1:S_0</th>\n",
+       "      <th>R1:S_2048</th>\n",
+       "      <th>R1:S_2058</th>\n",
+       "      <th>R1:S_270336</th>\n",
+       "      <th>R2:S_0</th>\n",
+       "      <th>R2:S_270336</th>\n",
+       "      <th>R3:S_0</th>\n",
+       "      <th>R3:S_2048</th>\n",
+       "      <th>R3:S_270336</th>\n",
+       "      <th>R4:S_0</th>\n",
+       "      <th>R4:S_2058</th>\n",
+       "      <th>R4:S_270336</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>79</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>92</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    R1:S_0  R1:S_2048  R1:S_2058  R1:S_270336  R2:S_0  R2:S_270336  R3:S_0  \\\n",
+       "79       1          0          0            0       1            0       1   \n",
+       "92       1          0          0            0       1            0       1   \n",
+       "\n",
+       "    R3:S_2048  R3:S_270336  R4:S_0  R4:S_2058  R4:S_270336  \n",
+       "79          0            0       1          0            0  \n",
+       "92          0            0       1          0            0  "
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Align the one-hot status columns of the test set with the training layout:\n",
+    "# codes that never occur in the test data become all-zero columns, codes seen\n",
+    "# only in the test data are dropped, and the column order follows RS_frame.\n",
+    "train_RS_columns = RS_frame.columns\n",
+    "missing_cols = [col for col in train_RS_columns if col not in test_RS_frame.columns]\n",
+    "\n",
+    "# reindex returns a new frame instead of inserting columns one at a time\n",
+    "test_RS_frame = test_RS_frame.reindex(columns=train_RS_columns, fill_value=0)\n",
+    "test_RS_frame.sample(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "890ce249",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 100 entries, 0 to 99\n",
+      "Columns: 124 entries, R1-PA1:VH to snort_log4\n",
+      "dtypes: float64(112), int64(12)\n",
+      "memory usage: 97.0 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Remove the raw relay-status (RS) columns from the test features;\n",
+    "# presumably they are already one-hot encoded in test_RS_frame (verify upstream).\n",
+    "test_raw_data = test_raw_data.drop(columns=column_names_RS)\n",
+    "test_raw_data.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "24da31bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Features that require normalisation: every column except the last 12\n",
+    "# (the control_panel/relay/snort log columns, which are taken separately).\n",
+    "# Infinite readings are turned into NaN without mutating test_raw_data;\n",
+    "# calling replace(..., inplace=True) on an iloc slice can raise\n",
+    "# SettingWithCopyWarning and write through to the parent frame.\n",
+    "unscaled_test_input = test_raw_data.iloc[:, :-12].replace([np.inf, -np.inf], np.nan)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "d17cd7e4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(100, 112)"
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "scaled_test_input = scaler.transform(unscaled_test_input)\n",
+    "scaled_test_input.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "33845d19",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 100 entries, 0 to 99\n",
+      "Columns: 112 entries, R1-PA1:VH to R4-PA:ZH\n",
+      "dtypes: float64(112)\n",
+      "memory usage: 87.6 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Wrap the scaled array in a DataFrame carrying the feature names.\n",
+    "# fillna returns a new frame, so scaled_test_input is not modified through\n",
+    "# the DataFrame that wraps it (an in-place replace may write into that array).\n",
+    "# NOTE(review): NaNs become 1 in scaled units - confirm this matches the\n",
+    "# imputation applied to the training features.\n",
+    "n_scaled_features = scaled_test_input.shape[1]\n",
+    "test_df1_scaled = pd.DataFrame(\n",
+    "    scaled_test_input,\n",
+    "    index=unscaled_test_input.index,\n",
+    "    columns=column_names_updated[:n_scaled_features],\n",
+    ").fillna(1)\n",
+    "test_df1_scaled.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "75c977ec",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 100 entries, 0 to 99\n",
+      "Data columns (total 12 columns):\n",
+      " #   Column              Non-Null Count  Dtype\n",
+      "---  ------              --------------  -----\n",
+      " 0   control_panel_log1  100 non-null    int64\n",
+      " 1   control_panel_log2  100 non-null    int64\n",
+      " 2   control_panel_log3  100 non-null    int64\n",
+      " 3   control_panel_log4  100 non-null    int64\n",
+      " 4   relay1_log          100 non-null    int64\n",
+      " 5   relay2_log          100 non-null    int64\n",
+      " 6   relay3_log          100 non-null    int64\n",
+      " 7   relay4_log          100 non-null    int64\n",
+      " 8   snort_log1          100 non-null    int64\n",
+      " 9   snort_log2          100 non-null    int64\n",
+      " 10  snort_log3          100 non-null    int64\n",
+      " 11  snort_log4          100 non-null    int64\n",
+      "dtypes: int64(12)\n",
+      "memory usage: 9.5 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_df2 = test_raw_data.iloc[:,-12:]\n",
+    "test_df2.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "4f176a75",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 100 entries, 0 to 99\n",
+      "Columns: 136 entries, R1-PA1:VH to R4:S_270336\n",
+      "dtypes: float64(112), int64(24)\n",
+      "memory usage: 106.4 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_X = pd.concat([test_df1_scaled, test_df2, test_RS_frame], axis=1)\n",
+    "test_X.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "7aacce46",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+       "       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])"
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "results = clf.predict(test_X)\n",
+    "results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "c627bdcf",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 100 entries, 0 to 99\n",
+      "Columns: 129 entries, R1-PA1:VH to Result\n",
+      "dtypes: float64(112), int64(17)\n",
+      "memory usage: 100.9 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Attach the predictions as a new 'Result' column. predict returned a 1-D array,\n",
+    "# for which np.transpose is a no-op, so the array is assigned directly.\n",
+    "results_df['Result'] = results\n",
+    "results_df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "0ea24283",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>R1-PA1:VH</th>\n",
+       "      <th>R1-PM1:V</th>\n",
+       "      <th>R1-PA2:VH</th>\n",
+       "      <th>R1-PM2:V</th>\n",
+       "      <th>R1-PA3:VH</th>\n",
+       "      <th>R1-PM3:V</th>\n",
+       "      <th>R1-PA4:IH</th>\n",
+       "      <th>R1-PM4:I</th>\n",
+       "      <th>R1-PA5:IH</th>\n",
+       "      <th>R1-PM5:I</th>\n",
+       "      <th>...</th>\n",
+       "      <th>control_panel_log4</th>\n",
+       "      <th>relay1_log</th>\n",
+       "      <th>relay2_log</th>\n",
+       "      <th>relay3_log</th>\n",
+       "      <th>relay4_log</th>\n",
+       "      <th>snort_log1</th>\n",
+       "      <th>snort_log2</th>\n",
+       "      <th>snort_log3</th>\n",
+       "      <th>snort_log4</th>\n",
+       "      <th>Result</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>40</th>\n",
+       "      <td>97.448662</td>\n",
+       "      <td>130656.8100</td>\n",
+       "      <td>-22.528701</td>\n",
+       "      <td>130631.7367</td>\n",
+       "      <td>-142.540440</td>\n",
+       "      <td>130732.0298</td>\n",
+       "      <td>98.439879</td>\n",
+       "      <td>444.59108</td>\n",
+       "      <td>-26.911828</td>\n",
+       "      <td>471.32514</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>41</th>\n",
+       "      <td>96.961648</td>\n",
+       "      <td>130330.8575</td>\n",
+       "      <td>-23.021444</td>\n",
+       "      <td>130305.7842</td>\n",
+       "      <td>-143.010266</td>\n",
+       "      <td>130406.0773</td>\n",
+       "      <td>97.695034</td>\n",
+       "      <td>446.42218</td>\n",
+       "      <td>-27.198307</td>\n",
+       "      <td>471.69136</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>63</th>\n",
+       "      <td>115.416618</td>\n",
+       "      <td>131785.1071</td>\n",
+       "      <td>-4.555014</td>\n",
+       "      <td>131760.0339</td>\n",
+       "      <td>-124.566754</td>\n",
+       "      <td>131885.4002</td>\n",
+       "      <td>113.697745</td>\n",
+       "      <td>358.34627</td>\n",
+       "      <td>-6.531719</td>\n",
+       "      <td>358.71249</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>39</th>\n",
+       "      <td>99.012837</td>\n",
+       "      <td>131960.6200</td>\n",
+       "      <td>-20.958796</td>\n",
+       "      <td>131910.4735</td>\n",
+       "      <td>-140.970536</td>\n",
+       "      <td>132035.8398</td>\n",
+       "      <td>100.662955</td>\n",
+       "      <td>436.53424</td>\n",
+       "      <td>-25.238791</td>\n",
+       "      <td>467.11361</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>11.946170</td>\n",
+       "      <td>131810.1804</td>\n",
+       "      <td>-108.031192</td>\n",
+       "      <td>131183.3486</td>\n",
+       "      <td>131.963639</td>\n",
+       "      <td>131860.3269</td>\n",
+       "      <td>9.860604</td>\n",
+       "      <td>370.79775</td>\n",
+       "      <td>-110.059463</td>\n",
+       "      <td>370.79775</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 129 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     R1-PA1:VH     R1-PM1:V   R1-PA2:VH     R1-PM2:V   R1-PA3:VH     R1-PM3:V  \\\n",
+       "40   97.448662  130656.8100  -22.528701  130631.7367 -142.540440  130732.0298   \n",
+       "41   96.961648  130330.8575  -23.021444  130305.7842 -143.010266  130406.0773   \n",
+       "63  115.416618  131785.1071   -4.555014  131760.0339 -124.566754  131885.4002   \n",
+       "39   99.012837  131960.6200  -20.958796  131910.4735 -140.970536  132035.8398   \n",
+       "4    11.946170  131810.1804 -108.031192  131183.3486  131.963639  131860.3269   \n",
+       "\n",
+       "     R1-PA4:IH   R1-PM4:I   R1-PA5:IH   R1-PM5:I  ...  control_panel_log4  \\\n",
+       "40   98.439879  444.59108  -26.911828  471.32514  ...                   0   \n",
+       "41   97.695034  446.42218  -27.198307  471.69136  ...                   0   \n",
+       "63  113.697745  358.34627   -6.531719  358.71249  ...                   0   \n",
+       "39  100.662955  436.53424  -25.238791  467.11361  ...                   0   \n",
+       "4     9.860604  370.79775 -110.059463  370.79775  ...                   0   \n",
+       "\n",
+       "    relay1_log  relay2_log  relay3_log  relay4_log  snort_log1  snort_log2  \\\n",
+       "40           0           0           0           0           0           0   \n",
+       "41           0           0           0           0           0           0   \n",
+       "63           0           0           0           0           0           0   \n",
+       "39           0           0           0           0           0           0   \n",
+       "4            0           0           0           0           0           0   \n",
+       "\n",
+       "    snort_log3  snort_log4  Result  \n",
+       "40           0           0       1  \n",
+       "41           0           0       1  \n",
+       "63           0           0       0  \n",
+       "39           0           0       0  \n",
+       "4            0           0       1  \n",
+       "\n",
+       "[5 rows x 129 columns]"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "results_df.sample(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "20c29600",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# index=False keeps the RangeIndex out of the file; by default it is written as\n",
+    "# an extra unnamed first column ahead of the features and the 'Result' label.\n",
+    "results_df.to_csv('TestingResultsBinary.csv', index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}