Skip to content
Snippets Groups Projects
Commit d37603ad authored by nc2g20's avatar nc2g20 :sweat_drops:
Browse files

Part B code

parent 8ebbc412
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Part B: multi-class classification with a random forest.
# Trains on TrainingDataMulti.csv (features in all columns but the last,
# integer class label in the last column), reports cross-validation and
# hold-out metrics, then labels TestingDataMulti.csv.

# Load the training data; the file has no header row.
data = pd.read_csv('TrainingDataMulti.csv', header=None)

# Separate the features from the labels.
X = data.iloc[:, :-1]  # all rows, every column except the last
y = data.iloc[:, -1]   # all rows, last column (the class label)

# Check the class distribution (the printed output shows 3000/1500/1500,
# i.e. the classes are imbalanced).
print("Class Distribution:\n", y.value_counts())

# Split the data into training and validation sets.
# NOTE(review): with an imbalanced label, stratify=y would keep class
# proportions consistent across the split — left unchanged here to
# preserve the recorded results; confirm before adopting.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a model and fit it to the training data.
model = RandomForestClassifier(n_estimators=100, random_state=10)
model.fit(X_train, y_train)

# Perform 7-fold cross-validation on the training set.
# (cross_val_score clones the estimator, so the fit above is unaffected.)
cv_scores = cross_val_score(model, X_train, y_train, cv=7)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# Evaluate the model: error = 1 - accuracy on each split.
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)
train_error = 1 - accuracy_score(y_train, train_preds)
val_error = 1 - accuracy_score(y_val, val_preds)
print(classification_report(y_val, val_preds))
print(f"Training error: {train_error * 100}%")
print(f"Validation error: {val_error * 100}%")

# Error analysis: which validation rows were misclassified.
errors = X_val[y_val != val_preds]
print(f"\nError Analysis:\nNumber of errors in validation set: {len(errors)}")
print("Indices of validation errors:", errors.index)

# Load the (unlabelled) testing data and predict its labels.
test_data = pd.read_csv('TestingDataMulti.csv', header=None)
test_preds = model.predict(test_data)
print(test_preds)

# Save the testing data with predicted labels appended as the final column.
# header=False keeps the output header-free, matching the input format
# (the 'Predicted Label' column name is therefore never written out).
test_data['Predicted Label'] = test_preds
test_data.to_csv('TestingResultsMulti.csv', index=False, header=False)
```
%% Output
Class Distribution:
0 3000
2 1500
1 1500
Name: 128, dtype: int64
Cross-validation scores: [0.96501458 0.94606414 0.95626822 0.9548105 0.95043732 0.95620438
0.94744526]
Mean cross-validation score: 0.9537491981747265
precision recall f1-score support
0 0.99 0.99 0.99 602
1 0.91 0.91 0.91 277
2 0.93 0.93 0.93 321
accuracy 0.96 1200
macro avg 0.94 0.94 0.94 1200
weighted avg 0.96 0.96 0.96 1200
Training error: 0.0%
Validation error: 4.416666666666669%
Error Analysis:
Number of errors in validation set: 53
Indices of validation errors: Int64Index([2338, 228, 506, 1580, 3185, 1477, 1027, 2344, 4096, 2417, 3066,
5454, 3049, 706, 1498, 2410, 4095, 2899, 2373, 2127, 4920, 2244,
3268, 303, 2209, 1221, 1513, 1609, 453, 1817, 1918, 1543, 4441,
1095, 1978, 4473, 1002, 2836, 2197, 293, 2370, 471, 227, 3184,
3838, 4440, 2364, 3839, 4998, 2432, 2874, 452, 3970],
dtype='int64')
[2 2 2 2 2 2 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 2 2 2 2 2 2 2 2 0 2 0 1 1 1 1
1 2 1 1 1 1 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment.