Skip to content
Snippets Groups Projects
Commit d37603ad authored by nc2g20's avatar nc2g20 :sweat_drops:
Browse files

Part B code

parent 8ebbc412
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Part B: multi-class classification with a random forest.
# Trains on TrainingDataMulti.csv (features in all columns but the last,
# integer class label in the last column), reports cross-validation and
# hold-out metrics, then labels TestingDataMulti.csv.

# Load the training data; the file has no header row.
data = pd.read_csv('TrainingDataMulti.csv', header=None)

# Separate the features from the labels.
X = data.iloc[:, :-1]  # all rows, every column except the last
y = data.iloc[:, -1]   # all rows, last column (the class label)

# Check the class distribution (the printed output shows 3000/1500/1500,
# i.e. the classes are imbalanced).
print("Class Distribution:\n", y.value_counts())

# Split the data into training and validation sets.
# NOTE(review): with an imbalanced label, stratify=y would keep class
# proportions consistent across the split — left unchanged here to
# preserve the recorded results; confirm before adopting.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a model and fit it to the training data.
model = RandomForestClassifier(n_estimators=100, random_state=10)
model.fit(X_train, y_train)

# Perform 7-fold cross-validation on the training set.
# (cross_val_score clones the estimator, so the fit above is unaffected.)
cv_scores = cross_val_score(model, X_train, y_train, cv=7)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# Evaluate the model: error = 1 - accuracy on each split.
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)
train_error = 1 - accuracy_score(y_train, train_preds)
val_error = 1 - accuracy_score(y_val, val_preds)
print(classification_report(y_val, val_preds))
print(f"Training error: {train_error * 100}%")
print(f"Validation error: {val_error * 100}%")

# Error analysis: which validation rows were misclassified.
errors = X_val[y_val != val_preds]
print(f"\nError Analysis:\nNumber of errors in validation set: {len(errors)}")
print("Indices of validation errors:", errors.index)

# Load the (unlabelled) testing data and predict its labels.
test_data = pd.read_csv('TestingDataMulti.csv', header=None)
test_preds = model.predict(test_data)
print(test_preds)

# Save the testing data with predicted labels appended as the final column.
# header=False keeps the output header-free, matching the input format
# (the 'Predicted Label' column name is therefore never written out).
test_data['Predicted Label'] = test_preds
test_data.to_csv('TestingResultsMulti.csv', index=False, header=False)
```
%% Output
Class Distribution:
0 3000
2 1500
1 1500
Name: 128, dtype: int64
Cross-validation scores: [0.96501458 0.94606414 0.95626822 0.9548105 0.95043732 0.95620438
0.94744526]
Mean cross-validation score: 0.9537491981747265
precision recall f1-score support
0 0.99 0.99 0.99 602
1 0.91 0.91 0.91 277
2 0.93 0.93 0.93 321
accuracy 0.96 1200
macro avg 0.94 0.94 0.94 1200
weighted avg 0.96 0.96 0.96 1200
Training error: 0.0%
Validation error: 4.416666666666669%
Error Analysis:
Number of errors in validation set: 53
Indices of validation errors: Int64Index([2338, 228, 506, 1580, 3185, 1477, 1027, 2344, 4096, 2417, 3066,
5454, 3049, 706, 1498, 2410, 4095, 2899, 2373, 2127, 4920, 2244,
3268, 303, 2209, 1221, 1513, 1609, 453, 1817, 1918, 1543, 4441,
1095, 1978, 4473, 1002, 2836, 2197, 293, 2370, 471, 227, 3184,
3838, 4440, 2364, 3839, 4998, 2432, 2874, 452, 3970],
dtype='int64')
[2 2 2 2 2 2 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 2 2 2 2 2 2 2 2 0 2 0 1 1 1 1
1 2 1 1 1 1 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment.