Commit fcdb506e authored by christosPro123

Finished report, code comments, QoL formatting changes

parent 0f0ce335
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
```
%% Cell type:code id: tags:
``` python
# Read the training CSV file into a pandas DataFrame
train_df = pd.read_csv('TrainingDataBinary.csv')
```
%% Cell type:code id: tags:
``` python
print(train_df.info())
```
%% Output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Columns: 129 entries, 1 to 129
dtypes: float64(112), int64(17)
memory usage: 5.9 MB
None
%% Cell type:code id: tags:
``` python
# Plot a histogram of the target column '129' to check the class balance
plt.hist(train_df['129'])
```
%% Output
(array([3000.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
        3000.]),
 array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
 <BarContainer object of 10 artists>)
%% Cell type:code id: tags:
``` python
scaler = StandardScaler()
# Separate the features from the target variable
X = train_df.drop('129', axis=1)
y = train_df['129']
# Replace infinite values with 0 so that imputation and scaling do not fail
X.replace([np.inf, -np.inf], 0, inplace=True)
# Create a SimpleImputer to replace NaN values with the mean of the corresponding column
imputer = SimpleImputer(strategy='mean')
# Impute the missing values in the features data
X_imputed = imputer.fit_transform(X)
# Fit the scaler to the features data and transform the data
X_scaled = scaler.fit_transform(X_imputed)
# The transformed data is a numpy array; it can be converted back to a DataFrame if needed:
# X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
```
%% Cell type:code id: tags:
``` python
# Reduce the feature space to 100 principal components with PCA
pca = PCA(n_components=100)
X_pca = pca.fit_transform(X_scaled)
```
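%% Cell type:markdown id: tags:
As a sanity check on the choice of `n_components=100`, the retained variance can be read off the fitted `pca` object (a small sketch, assuming the cell above has run):
%% Cell type:code id: tags:
``` python
# Fraction of the total variance retained by the 100 components
print("Explained variance retained:", pca.explained_variance_ratio_.sum())
```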
%% Cell type:code id: tags:
``` python
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
# Train the model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
# Evaluate the model on the testing set
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)
```
%% Output
Accuracy: 0.895
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.94      0.90       588
           1       0.93      0.86      0.89       612

    accuracy                           0.90      1200
   macro avg       0.90      0.90      0.89      1200
weighted avg       0.90      0.90      0.89      1200

c:\Users\user\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
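%% Cell type:markdown id: tags:
The ConvergenceWarning above suggests raising `max_iter`; a minimal follow-up sketch (assuming the split from the previous cell, and leaving `y_pred` untouched):
%% Cell type:code id: tags:
``` python
# Refit with a larger iteration budget, as the warning recommends
log_reg_1k = LogisticRegression(max_iter=1000)
log_reg_1k.fit(X_train, y_train)
print("Accuracy (max_iter=1000):", accuracy_score(y_test, log_reg_1k.predict(X_test)))
```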
%% Cell type:code id: tags:
``` python
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)
```
%% Output
Confusion Matrix:
 [[550  38]
 [ 88 524]]
%% Cell type:markdown id: tags:
[[True Negatives (TN), False Positives (FP)],
 [False Negatives (FN), True Positives (TP)]]
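%% Cell type:markdown id: tags:
As a quick cross-check, the headline metrics follow directly from those four counts (a small sketch using the values printed above):
%% Cell type:code id: tags:
``` python
# Recompute the headline metrics from the confusion matrix above
tn, fp, fn, tp = 550, 38, 88, 524
print("Accuracy:", (tn + tp) / (tn + fp + fn + tp))   # (550 + 524) / 1200 = 0.895
print("Precision (class 1):", tp / (tp + fp))         # 524 / 562 ~ 0.93
print("Recall (class 1):", tp / (tp + fn))            # 524 / 612 ~ 0.86
```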
%% Cell type:markdown id: tags:
Fine-tuning
%% Cell type:code id: tags:
``` python
param_dist = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': np.logspace(-4, 4, 10),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 500, 1000],
}
# Create the RandomizedSearchCV object with the logistic regression model, hyperparameters, and cross-validation
log_reg = LogisticRegression()
random_search = RandomizedSearchCV(log_reg, param_dist, n_iter=100, cv=3, n_jobs=-1, verbose=1, random_state=42)
# Fit the random search to the training data
random_search.fit(X_train, y_train)
# Check the best hyperparameters found
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)
# Use the best estimator for predictions and evaluation
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)
```
%% Output
Fitting 3 folds for each of 100 candidates, totalling 300 fits
c:\Users\user\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py:378: FitFailedWarning:
120 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed: ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got elasticnet penalty.
15 fits failed: ValueError: penalty='none' is not supported for the liblinear solver
18 fits failed: ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
6 fits failed: TypeError: unsupported operand type(s) for -: 'int' and 'NoneType' (raised at alpha = (1.0 / C) * (1 - l1_ratio) in _logistic_regression_path)
21 fits failed: ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.
12 fits failed: ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.
9 fits failed: ValueError: Solver sag supports only 'l2' or 'none' penalties, got elasticnet penalty.
6 fits failed: ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.
9 fits failed: ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
  warnings.warn(some_fits_failed_message, FitFailedWarning)
c:\Users\user\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_search.py:952: UserWarning: One or more of the test scores are non-finite: [       nan 0.87666667 0.92083333        nan        nan 0.87416667
        nan 0.87666667 0.864375   0.87645833 0.78208333 0.87854167
 0.72333333 0.87854167 0.85645833        nan        nan        nan
 0.85083333 0.72333333 0.5025     0.92020833 0.78208333 0.918125
 0.86458333 0.87666667        nan 0.9225     0.90375           nan
 0.78208333        nan 0.5025            nan        nan        nan
        nan 0.78208333        nan 0.78208333 0.85645833 0.628125
 0.918125          nan 0.49916667 0.85875           nan 0.49916667
        nan        nan 0.87791667 0.86520833        nan 0.9225
        nan 0.918125   0.865625   0.84166667        nan 0.9225
 0.90375    0.918125   0.87375    0.918125   0.864375          nan
        nan 0.87666667        nan 0.90375    0.85625    0.62895833
        nan        nan 0.85625           nan        nan 0.87854167
 0.85645833        nan 0.87791667 0.90395833 0.87854167        nan
        nan 0.87375    0.78208333 0.87666667        nan        nan
 0.78208333 0.90270833        nan        nan 0.85625           nan
 0.86583333        nan        nan        nan]
  warnings.warn(
c:\Users\user\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py:1173: FutureWarning: `penalty='none'` has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.
  warnings.warn(
c:\Users\user\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py:1181: UserWarning: Setting penalty=None will ignore the C and l1_ratio parameters
  warnings.warn(
Best Parameters: {'solver': 'newton-cg', 'penalty': 'none', 'max_iter': 100, 'C': 0.005994842503189409}
Best Score: 0.9225
Accuracy: 0.9325
Confusion Matrix:
 [[557  31]
 [ 50 562]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.95      0.93       588
           1       0.95      0.92      0.93       612

    accuracy                           0.93      1200
   macro avg       0.93      0.93      0.93      1200
weighted avg       0.93      0.93      0.93      1200

c:\Users\user\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:210: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations.
  warnings.warn(
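%% Cell type:markdown id: tags:
Most of the failed fits above come from solver/penalty pairs that scikit-learn rejects outright. One possible refinement (a sketch, not what was run here) is to constrain the search space to valid combinations so that no candidate fails:
%% Cell type:code id: tags:
``` python
# A list of parameter dicts restricts sampling to solver/penalty pairs that
# scikit-learn supports, avoiding the ValueErrors reported above; l1_ratio is
# supplied for elasticnet, which also avoids the TypeError.
param_dist_valid = [
    {'solver': ['newton-cg', 'lbfgs', 'sag'], 'penalty': ['l2'],
     'C': np.logspace(-4, 4, 10), 'max_iter': [100, 500, 1000]},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'],
     'C': np.logspace(-4, 4, 10), 'max_iter': [100, 500, 1000]},
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.25, 0.5, 0.75],
     'C': np.logspace(-4, 4, 10), 'max_iter': [100, 500, 1000]},
]
random_search_valid = RandomizedSearchCV(LogisticRegression(), param_dist_valid,
                                         n_iter=100, cv=3, n_jobs=-1, random_state=42)
```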
%% Cell type:markdown id: tags:
Predict
%% Cell type:code id: tags:
``` python
test_data = pd.read_csv('TestingDataBinary.csv')
```
%% Cell type:code id: tags:
``` python
# Preprocess the test data with the transformers fitted on the training data
X_new = test_data
X_new.replace([np.inf, -np.inf], 0, inplace=True)
# Impute the missing values in the features data
X_imputed_new = imputer.transform(X_new)
# Scale the features data
X_scaled_new = scaler.transform(X_imputed_new)
# Apply the PCA transformation
X_pca_new = pca.transform(X_scaled_new)
```
%% Cell type:code id: tags:
``` python
# Use the best estimator for predictions on the new data
y_pred_new = best_model.predict(X_pca_new)
# Save the predictions to a new column in the DataFrame
test_data['predicted_marker'] = y_pred_new
# Save the updated DataFrame to a new CSV file
test_data.to_csv('TestingResultsBinary.csv', index=False)
```
%% Cell type:markdown id: tags:
Multi-class classification
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler
```
%% Cell type:code id: tags:
``` python
# Read the multi-class training CSV file into a pandas DataFrame
train_df = pd.read_csv('TrainingDataMulti.csv')
```
%% Cell type:code id: tags:
``` python
print(train_df.info())
```
%% Output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Columns: 129 entries, 1 to 129
dtypes: float64(112), int64(17)
memory usage: 5.9 MB
None
%% Cell type:code id: tags:
``` python
print(train_df)
```
%% Output
               1            2          3            4           5  \
0      70.399324  127673.0908 -49.572308  127648.0176 -169.578319
1      73.688102  130280.7109 -46.300719  130255.6377 -166.278082
2      73.733939  130305.7842 -46.254883  130280.7109 -166.232245
3      74.083443  130581.5902 -45.899649  130556.5169 -165.882741
4      74.553268  131083.0556 -45.424094  131057.9823 -165.424375
...          ...          ...        ...          ...         ...
5995  116.889120  131860.3269  -3.076783  131810.1804 -123.094253
5996  116.849013  131810.1804  -3.116890  131760.0339 -123.128630
5997  116.384917  131734.9606  -3.586716  131684.8140 -123.586996
5998  111.125164  130506.3704  -8.846468  130456.2238 -128.858208
5999  110.878793  130481.2971  -9.092840  130456.2238 -129.104580

                6           7          8          9         10  ...  120  121  \
0     127723.2374   65.689611  605.91099 -57.003571  626.78553  ...    0    0
1     130355.9307   71.831719  483.59351 -50.947407  500.98896  ...    0    0
2     130381.0040   71.808800  483.59351 -50.913030  500.98896  ...    0    0
3     130656.8100   72.152575  482.86107 -50.437475  499.15786  ...    0    0
4     131158.2754   72.118198  484.50906 -50.013486  497.69298  ...    0    0
...           ...         ...        ...        ...        ...  ...  ...  ...
5995  131910.4735  114.780635  376.10794  -5.254023  374.82617  ...    0    0
5996  131885.4002  114.769176  376.29105  -5.322778  374.82617  ...    0    0
5997  131785.1071  114.299351  376.47416  -5.849899  374.82617  ...    0    0
5998  130556.5169  106.667553  478.83265 -13.464508  477.73399  ...    0    0
5999  130556.5169  106.392533  478.83265 -13.750987  477.91710  ...    0    0

      122  123  124  125  126  127  128  129
0       0    0    0    0    0    0    0    0
1       0    0    0    0    0    0    0    0
2       0    0    0    0    0    0    0    0
3       0    0    0    0    0    0    0    0
4       0    0    0    0    0    0    0    0
...   ...  ...  ...  ...  ...  ...  ...  ...
5995    0    0    0    0    0    0    0    0
5996    0    0    0    0    0    0    0    0
5997    0    0    0    0    0    0    0    0
5998    0    0    0    0    0    0    0    0
5999    0    0    0    0    0    0    0    0

[6000 rows x 129 columns]
%% Cell type:code id: tags:
``` python
# Plot a histogram of the class distribution
plt.hist(train_df['129'])
```
%% Output
(array([3000.,    0.,    0.,    0.,    0., 1500.,    0.,    0.,    0.,
        1500.]),
 array([0. , 0.2, 0.4, 0.6, 0.8, 1. , 1.2, 1.4, 1.6, 1.8, 2. ]),
 <BarContainer object of 10 artists>)
%% Cell type:code id: tags:
``` python
# Split the features from the target classification to fit the Random Forest classifier
X = train_df.drop('129', axis=1)
y = train_df['129']
```
%% Cell type:code id: tags:
``` python
rfc = RandomForestClassifier()
rfc.fit(X, y)
```
%% Output
RandomForestClassifier()
%% Cell type:code id: tags:
``` python
# Extract the importance of each feature from the model into a separate DataFrame
importances = rfc.feature_importances_
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
```
%% Cell type:code id: tags:
``` python
# Sort the features DataFrame in descending order (highest-importance feature at the top)
feature_importances = feature_importances.sort_values('Importance', ascending=False)
```
%% Cell type:code id: tags:
``` python
print(feature_importances.head())
```
%% Output
    Feature  Importance
90       91    0.050203
100     101    0.024118
50       51    0.022545
81       82    0.021537
3         4    0.020457
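%% Cell type:markdown id: tags:
A quick visual check of the ranking (a sketch, assuming `feature_importances` from the cells above):
%% Cell type:code id: tags:
``` python
# Bar chart of the 20 most important features
top = feature_importances.head(20)
plt.figure(figsize=(10, 4))
plt.bar(top['Feature'].astype(str), top['Importance'])
plt.xlabel('Feature column')
plt.ylabel('Importance')
plt.title('Top 20 Random Forest feature importances')
plt.show()
```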
%% Cell type:code id: tags:
``` python
# Extract the top N features into a list (N was varied by hand while testing; 40 proved a good balance against overfitting)
n_features = feature_importances.head(40)['Feature'].tolist()
```
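%% Cell type:markdown id: tags:
The comment above mentions varying N by hand; a small loop like the following (a sketch, not what was run here) automates that comparison:
%% Cell type:code id: tags:
``` python
# Compare held-out accuracy for several candidate feature counts
for n in [10, 20, 40, 80, 128]:
    cols = feature_importances.head(n)['Feature'].tolist()
    Xn_train, Xn_test, yn_train, yn_test = train_test_split(
        train_df[cols], train_df['129'], test_size=0.2, random_state=42)
    model = RandomForestClassifier(random_state=42).fit(Xn_train, yn_train)
    print(n, accuracy_score(yn_test, model.predict(Xn_test)))
```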
%% Cell type:markdown id: tags:
Feature Columns: 91, 82, 101, 53, and 115
%% Cell type:code id: tags:
``` python
# Re-split the data based on those 40 features
X = train_df[n_features]
y = train_df['129']
```
%% Cell type:code id: tags:
``` python
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
```
%% Cell type:code id: tags:
``` python
# Train the classifier again on the new split of the data
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
```
%% Output
RandomForestClassifier()
%% Cell type:code id: tags:
``` python
# Make first predictions
y_pred = rfc.predict(X_test)
```
%% Cell type:code id: tags:
``` python
# Compute the accuracy on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
```
%% Cell type:code id: tags:
``` python
# Generate the classification report
classification_rep = classification_report(y_test, y_pred)
# Print the classification report
print(classification_rep)
```
%% Output
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       602
           1       0.90      0.90      0.90       277
           2       0.91      0.91      0.91       321

    accuracy                           0.94      1200
   macro avg       0.93      0.93      0.93      1200
weighted avg       0.95      0.94      0.95      1200
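%% Cell type:markdown id: tags:
Per-class errors can also be inspected with a confusion matrix (a one-line check using the predictions above):
%% Cell type:code id: tags:
``` python
# Rows are the true classes 0-2, columns are the predicted classes
print(confusion_matrix(y_test, y_pred))
```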
%% Cell type:code id: tags:
``` python
# Calculate the learning curve
train_sizes, train_scores, test_scores = learning_curve(
    rfc, X, y, train_sizes=np.linspace(0.1, 1.0, 10), cv=5, scoring='accuracy')
# Calculate the mean and standard deviation of the training and testing scores
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
# Plot the learning curve
plt.figure()
plt.title('Learning Curve')
plt.xlabel('Training Examples')
plt.ylabel('Accuracy')
plt.grid()
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color='r')
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color='g')
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training Accuracy')
plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Testing Accuracy')
plt.legend(loc='best')
plt.show()
```
%% Output
%% Cell type:markdown id: tags:
Load Testing Data and Classify
%% Cell type:code id: tags:
``` python
test_df = pd.read_csv('TestingDataMulti.csv')
print(test_df.info())
```
%% Output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 128 entries, 1 to 128
dtypes: float64(104), int64(24)
memory usage: 100.1 KB
None
%% Cell type:code id: tags:
``` python
# Select those same N features from the test data before classifying
multi_test = test_df[n_features]
predicted_category = rfc.predict(multi_test)
print("Predicted Category: ", predicted_category)
```
%% Output
Predicted Category:  [2 2 2 2 2 2 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 2 2 2 2 2 2 2 2 0 0 0 1 1 1 1
 1 1 1 1 2 1 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
%% Cell type:code id: tags:
``` python
# Calculate the distribution of the predicted classifications
category_counts = pd.Series(predicted_category).value_counts(normalize=True) * 100
print('Category Percentages:')
print(category_counts)
```
%% Output
Category Percentages:
1    39.0
2    31.0
0    30.0
dtype: float64
%% Cell type:code id: tags:
``` python
# Append the predictions to the DataFrame as a new column and save it as a new CSV
test_df["Prediction Markers"] = predicted_category
test_df.to_csv("TestingResultsMulti.csv", index=False)
```