Part A code

8ebbc412 · nc2g20 · 8ebbc412
Commit 8ebbc412 authored 2 years ago by nc2g20
--- a/parta.ipynb
+++ b/parta.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Class Distribution:\n",
+      " 0    3000\n",
+      "1    3000\n",
+      "Name: 128, dtype: int64\n",
+      "Cross-validation scores: [0.984375   0.98541667 0.98020833 0.98229167 0.98020833]\n",
+      "Mean cross-validation score: 0.9825000000000002\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.99      0.99      0.99       588\n",
+      "           1       0.99      0.99      0.99       612\n",
+      "\n",
+      "    accuracy                           0.99      1200\n",
+      "   macro avg       0.99      0.99      0.99      1200\n",
+      "weighted avg       0.99      0.99      0.99      1200\n",
+      "\n",
+      "Training error: 0.0%\n",
+      "Validation error: 1.0833333333333361%\n",
+      "\n",
+      "Error Analysis:\n",
+      "Number of errors in validation set: 13\n",
+      "Indices of validation errors: Int64Index([5459, 812, 3320, 5566, 5454, 3971, 5113, 4003, 1370, 751, 1507,\n",
+      "            5460, 2272],\n",
+      "           dtype='int64')\n",
+      "[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
+      " 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1\n",
+      " 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split, cross_val_score\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
+    "\n",
+    "# Load the training data using pandas\n",
+    "data = pd.read_csv('TrainingDataBinary.csv', header=None)\n",
+    "\n",
+    "# Separate the features from the labels\n",
+    "X = data.iloc[:, :-1]  # all rows, all columns except the last\n",
+    "y = data.iloc[:, -1]  # all rows, last column\n",
+    "\n",
+    "# Check the class distribution\n",
+    "print(\"Class Distribution:\\n\", y.value_counts())\n",
+    "\n",
+    "# Split the data into training and validation sets\n",
+    "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)\n",
+    "\n",
+    "# Create a model and fit it to the training data\n",
+    "model = RandomForestClassifier(n_estimators=100, random_state=15)\n",
+    "model.fit(X_train, y_train)\n",
+    "\n",
+    "# Perform k-fold cross-validation on the training set\n",
+    "cv_scores = cross_val_score(model, X_train, y_train, cv=5)  # 5-fold cross-validation\n",
+    "print(\"Cross-validation scores:\", cv_scores)\n",
+    "print(\"Mean cross-validation score:\", cv_scores.mean())\n",
+    "\n",
+    "# Evaluate the model\n",
+    "train_preds = model.predict(X_train)\n",
+    "val_preds = model.predict(X_val)\n",
+    "\n",
+    "train_error = 1 - accuracy_score(y_train, train_preds)\n",
+    "val_error = 1 - accuracy_score(y_val, val_preds)\n",
+    "\n",
+    "print(classification_report(y_val, val_preds))\n",
+    "\n",
+    "print(f\"Training error: {train_error * 100}%\")\n",
+    "print(f\"Validation error: {val_error * 100}%\")\n",
+    "\n",
+    "# Perform error analysis on validation data\n",
+    "errors = X_val[y_val != val_preds]\n",
+    "print(f\"\\nError Analysis:\\nNumber of errors in validation set: {len(errors)}\")\n",
+    "print(\"Indices of validation errors:\", errors.index)\n",
+    "\n",
+    "# Load the testing data\n",
+    "test_data = pd.read_csv('TestingDataBinary.csv', header=None)\n",
+    "\n",
+    "# Predict the labels for the testing data\n",
+    "test_preds = model.predict(test_data)\n",
+    "\n",
+    "# Print the predicted labels for clarity\n",
+    "print(test_preds)\n",
+    "\n",
+    "# Save the testing data with predicted labels as per specification\n",
+    "test_data['Predicted Label'] = test_preds\n",
+    "test_data.to_csv('TestingResultsBinary.csv', index=False, header=False)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.0"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
+%% Cell type:code id: tags:
+
+``` python
+import pandas as pd
+from sklearn.model_selection import train_test_split, cross_val_score
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
+
+# Load the training data using pandas
+data = pd.read_csv('TrainingDataBinary.csv', header=None)
+
+# Separate the features from the labels
+X = data.iloc[:, :-1]  # all rows, all columns except the last
+y = data.iloc[:, -1]  # all rows, last column
+
+# Check the class distribution
+print("Class Distribution:\n", y.value_counts())
+
+# Split the data into training and validation sets
+X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Create a model and fit it to the training data
+model = RandomForestClassifier(n_estimators=100, random_state=15)
+model.fit(X_train, y_train)
+
+# Perform k-fold cross-validation on the training set
+cv_scores = cross_val_score(model, X_train, y_train, cv=5)  # 5-fold cross-validation
+print("Cross-validation scores:", cv_scores)
+print("Mean cross-validation score:", cv_scores.mean())
+
+# Evaluate the model
+train_preds = model.predict(X_train)
+val_preds = model.predict(X_val)
+
+train_error = 1 - accuracy_score(y_train, train_preds)
+val_error = 1 - accuracy_score(y_val, val_preds)
+
+print(classification_report(y_val, val_preds))
+
+print(f"Training error: {train_error * 100}%")
+print(f"Validation error: {val_error * 100}%")
+
+# Perform error analysis on validation data
+errors = X_val[y_val != val_preds]
+print(f"\nError Analysis:\nNumber of errors in validation set: {len(errors)}")
+print("Indices of validation errors:", errors.index)
+
+# Load the testing data
+test_data = pd.read_csv('TestingDataBinary.csv', header=None)
+
+# Predict the labels for the testing data
+test_preds = model.predict(test_data)
+
+# Print the predicted labels for clarity
+print(test_preds)
+
+# Save the testing data with predicted labels as per specification
+test_data['Predicted Label'] = test_preds
+test_data.to_csv('TestingResultsBinary.csv', index=False, header=False)
+```
+
+%% Output
+
+    Class Distribution:
+     0    3000
+    1    3000
+    Name: 128, dtype: int64
+    Cross-validation scores: [0.984375   0.98541667 0.98020833 0.98229167 0.98020833]
+    Mean cross-validation score: 0.9825000000000002
+                  precision    recall  f1-score   support
+    
+               0       0.99      0.99      0.99       588
+               1       0.99      0.99      0.99       612
+    
+        accuracy                           0.99      1200
+       macro avg       0.99      0.99      0.99      1200
+    weighted avg       0.99      0.99      0.99      1200
+    
+    Training error: 0.0%
+    Validation error: 1.0833333333333361%
+    
+    Error Analysis:
+    Number of errors in validation set: 13
+    Indices of validation errors: Int64Index([5459, 812, 3320, 5566, 5454, 3971, 5113, 4003, 1370, 751, 1507,
+                5460, 2272],
+               dtype='int64')
+    [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+     1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
+     1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]