# Multi-class classification with a random forest.
#
# Pipeline: load the labelled training CSV, hold out a validation split,
# cross-validate and fit a RandomForestClassifier, report training/validation
# error, then label the unlabelled testing CSV and write the result to disk.

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ---------------------------------------------------------------------------
# Configuration: every value a reader might want to tune lives here.
# ---------------------------------------------------------------------------
TRAIN_CSV = 'TrainingDataMulti.csv'
TEST_CSV = 'TestingDataMulti.csv'
RESULTS_CSV = 'TestingResultsMulti.csv'

VAL_FRACTION = 0.2   # share of the labelled rows held out for validation
SPLIT_SEED = 42      # seed for the train/validation split
MODEL_SEED = 10      # seed for the random forest
N_TREES = 100        # number of trees in the forest
N_FOLDS = 7          # number of cross-validation folds

# ---------------------------------------------------------------------------
# Load the training data
# ---------------------------------------------------------------------------
# header=None: the first row of the file is read as data, not as column names,
# so the columns are labelled 0..n-1.
data = pd.read_csv(TRAIN_CSV, header=None)

# Separate the features from the labels
X = data.iloc[:, :-1]  # all rows, all columns except the last
y = data.iloc[:, -1]   # all rows, last column

# Check the class distribution
print("Class Distribution:\n", y.value_counts())

# ---------------------------------------------------------------------------
# Split the data into training and validation sets
# ---------------------------------------------------------------------------
# stratify=y keeps the class proportions the same in both partitions. A
# previously recorded run reported an imbalanced 3000 / 1500 / 1500 label
# distribution, and an unstratified split skewed the validation class counts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
    stratify=y,
)
assert len(X_train) + len(X_val) == len(X), "split lost or duplicated rows"

# ---------------------------------------------------------------------------
# Create a model and fit it to the training data
# ---------------------------------------------------------------------------
model = RandomForestClassifier(n_estimators=N_TREES, random_state=MODEL_SEED)
model.fit(X_train, y_train)

# Perform k-fold cross-validation on the training set.
# cross_val_score fits fresh clones of the estimator on each fold, so the
# fit above neither influences nor is influenced by these scores.
cv_scores = cross_val_score(model, X_train, y_train, cv=N_FOLDS)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# ---------------------------------------------------------------------------
# Evaluate the model
# ---------------------------------------------------------------------------
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)

# Error rate = 1 - accuracy, reported below as a percentage.
train_error = 1 - accuracy_score(y_train, train_preds)
val_error = 1 - accuracy_score(y_val, val_preds)

print(classification_report(y_val, val_preds))

print(f"Training error: {train_error * 100}%")
print(f"Validation error: {val_error * 100}%")

# Perform error analysis on validation data: keep the validation rows whose
# predicted label differs from the true label. The index of `errors` is the
# row position in the original training file.
errors = X_val[y_val != val_preds]
print(f"\nError Analysis:\nNumber of errors in validation set: {len(errors)}")
print("Indices of validation errors:", errors.index)

# ---------------------------------------------------------------------------
# Label the testing data
# ---------------------------------------------------------------------------
test_data = pd.read_csv(TEST_CSV, header=None)

# Fail early with a readable message if the testing file does not have the
# same number of feature columns the model was trained on.
if test_data.shape[1] != X.shape[1]:
    raise ValueError(
        f"{TEST_CSV} has {test_data.shape[1]} columns but the model was "
        f"trained on {X.shape[1]} features"
    )

# Predict the labels for the testing data
test_preds = model.predict(test_data)

# Print the predicted labels for clarity
print(test_preds)

# Save the testing data with the predicted label appended as the last column.
# A new frame is built instead of mutating `test_data`, so `test_data` stays a
# pure feature matrix and can be passed to `model.predict` again.
test_results = test_data.assign(**{'Predicted Label': test_preds})
test_results.to_csv(RESULTS_CSV, index=False, header=False)