diff --git a/COMP3217.docx b/COMP3217.docx
index e4bd96f44bf0065b1cc577a2cd42134a8fa508e1..96233f4717916487a884a0500943b51456c29c54 100644
Binary files a/COMP3217.docx and b/COMP3217.docx differ
diff --git a/part1.ipynb b/part1.ipynb
index 4d6c92eebc90ec3d5a30f371405689f5c4a699bc..5e41726999a68751d75079df9f62d0966ed865a7 100644
--- a/part1.ipynb
+++ b/part1.ipynb
@@ -507,7 +507,7 @@
     "test_data['predicted_marker'] = y_pred_new\n",
     "\n",
     "# Save the updated DataFrame to a new CSV file\n",
-    "test_data.to_csv('TestingDataBinary_with_predictions.csv', index=False)"
+    "test_data.to_csv('TestingResultsBinary.csv', index=False)"
    ]
   }
 ],
diff --git a/part2.ipynb b/part2.ipynb
index eb26b8eae966a41d79b873ac90a13900a44599d1..238267893f7aa260ed29da004db8976355d9823f 100644
--- a/part2.ipynb
+++ b/part2.ipynb
@@ -153,6 +153,7 @@
     }
    ],
    "source": [
+    "#Plot a histogram of the class label distribution\n",
     "plt.hist(train_df['129'])"
    ]
   },
@@ -162,6 +163,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "#Split the features from the target label to fit the Random Forest classifier\n",
     "X = train_df.drop('129', axis=1)\n",
     "y = train_df['129']"
    ]
@@ -196,6 +198,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "#Extract the importance of each feature from the model into a separate dataframe\n",
     "importances = rfc.feature_importances_\n",
     "feature_importances = pd.DataFrame({'Feature':X.columns, 'Importance': importances})"
    ]
@@ -206,7 +209,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "feature_importances = feature_importances.sort_values('Importance', ascending=False)\n"
+    "#Sort the features dataframe in descending order (highest-importance feature at the top)\n",
+    "feature_importances = feature_importances.sort_values('Importance', ascending=False)"
    ]
   },
   {
@@ -237,23 +241,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "#Extract the top N features from the dataframe into a list (N was changed several times during testing; 40 is ideal to avoid overfitting)\n",
     "n_features= feature_importances.head(40)['Feature'].tolist()"
    ]
   },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Feature Columns: 91, 82, 101, 53, and 115"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 90,
    "metadata": {},
    "outputs": [],
    "source": [
+    "#Re-split the data based on those 40 features\n",
     "X = train_df[n_features]\n",
     "y = train_df['129']"
    ]
@@ -264,6 +262,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "#Split into training and testing data\n",
     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
    ]
   },
@@ -287,6 +286,7 @@
     }
    ],
    "source": [
+    "#Retrain the classifier on the new split of the data\n",
     "rfc = RandomForestClassifier()\n",
     "rfc.fit(X_train,y_train)"
    ]
@@ -297,6 +297,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "#Make predictions on the held-out test split\n",
     "y_pred = rfc.predict(X_test)"
    ]
   },
@@ -306,6 +307,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "#Calculate accuracy\n",
     "accuracy = accuracy_score(y_test,y_pred)"
    ]
   },
@@ -335,7 +337,6 @@
     "# Generate the classification report\n",
     "classification_rep = classification_report(y_test, y_pred)\n",
     "\n",
-    "# Print the classification report\n",
     "print(classification_rep)"
    ]
   },
@@ -429,6 +430,7 @@
     }
    ],
    "source": [
+    "#Select those same top-N feature columns from the data to classify\n",
     "multi_test = test_df[n_features]\n",
     "predicted_category = rfc.predict(multi_test)\n",
     "print(\"Predicted Category: \", predicted_category)"
@@ -452,6 +454,7 @@
     }
    ],
    "source": [
+    "#Calculate the distribution of the predicted classifications\n",
     "category_counts = pd.Series(predicted_category).value_counts(normalize=True) * 100\n",
     "print('Category Percentages:')\n",
     "print(category_counts)"
@@ -463,6 +466,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "#Append the predictions to the dataframe as a new column and save as a new CSV\n",
     "test_df[\"Prediction Markers\"] = predicted_category\n",
     "test_df.to_csv(\"TestingResultsMulti.csv\",index=False)"
    ]