Skip to content
Snippets Groups Projects
Commit 8ebbc412 authored by nc2g20's avatar nc2g20 :sweat_drops:
Browse files

Part A code

parents
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# ---------------------------------------------------------------------------
# 1. Training data
# ---------------------------------------------------------------------------
# The CSV is read without a header row, so pandas assigns integer column labels.
data = pd.read_csv('TrainingDataBinary.csv', header=None)

# The final column holds the label; every column before it is a feature.
n_features = data.shape[1] - 1
X = data.iloc[:, :n_features]
y = data.iloc[:, n_features]

# Report how many samples fall into each class.
class_counts = y.value_counts()
print("Class Distribution:\n", class_counts)

# ---------------------------------------------------------------------------
# 2. Hold-out split and model fitting
# ---------------------------------------------------------------------------
# 80 % of the rows are used for training, 20 % are held out for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the random forest and fit it in one step (fit() returns the estimator).
model = RandomForestClassifier(n_estimators=100, random_state=15).fit(
    X_train, y_train
)

# ---------------------------------------------------------------------------
# 3. 5-fold cross-validation on the training portion
# ---------------------------------------------------------------------------
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# ---------------------------------------------------------------------------
# 4. Training / validation error
# ---------------------------------------------------------------------------
train_preds, val_preds = (model.predict(part) for part in (X_train, X_val))

# Error rate is the complement of accuracy.
train_accuracy = accuracy_score(y_train, train_preds)
val_accuracy = accuracy_score(y_val, val_preds)
train_error = 1 - train_accuracy
val_error = 1 - val_accuracy

print(classification_report(y_val, val_preds))
print(f"Training error: {train_error * 100}%")
print(f"Validation error: {val_error * 100}%")

# ---------------------------------------------------------------------------
# 5. Error analysis: which validation rows were misclassified?
# ---------------------------------------------------------------------------
is_misclassified = y_val != val_preds
errors = X_val[is_misclassified]
print(f"\nError Analysis:\nNumber of errors in validation set: {len(errors)}")
print("Indices of validation errors:", errors.index)

# ---------------------------------------------------------------------------
# 6. Label the test set and write the results file
# ---------------------------------------------------------------------------
test_data = pd.read_csv('TestingDataBinary.csv', header=None)

# Predict before the label column is appended, so the model sees features only.
test_preds = model.predict(test_data)
print(test_preds)

# Append the predictions as the last column and save without index or header.
test_data['Predicted Label'] = test_preds
test_data.to_csv('TestingResultsBinary.csv', index=False, header=False)
```
%% Output
Class Distribution:
0 3000
1 3000
Name: 128, dtype: int64
Cross-validation scores: [0.984375 0.98541667 0.98020833 0.98229167 0.98020833]
Mean cross-validation score: 0.9825000000000002
precision recall f1-score support
0 0.99 0.99 0.99 588
1 0.99 0.99 0.99 612
accuracy 0.99 1200
macro avg 0.99 0.99 0.99 1200
weighted avg 0.99 0.99 0.99 1200
Training error: 0.0%
Validation error: 1.0833333333333361%
Error Analysis:
Number of errors in validation set: 13
Indices of validation errors: Int64Index([5459, 812, 3320, 5566, 5454, 3971, 5113, 4003, 1370, 751, 1507,
5460, 2272],
dtype='int64')
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment