Skip to content
Snippets Groups Projects
Commit 1b9ab599 authored by Jaralus's avatar Jaralus
Browse files

update Part_A.py and Part_B.py

parent bd5c92cf
No related branches found
No related tags found
No related merge requests found
...@@ -69,13 +69,21 @@ def main(): ...@@ -69,13 +69,21 @@ def main():
# Evaluate the performance of the trained classifiers # Evaluate the performance of the trained classifiers
for classifier_name, classifier in classifiers: for classifier_name, classifier in classifiers:
print(f"Training {classifier_name}...") print(f"Training {classifier_name}...")
# Predict the labels for the training data
train_predicted_labels = classifier.predict(train_features_scaled) train_predicted_labels = classifier.predict(train_features_scaled)
# Compare predicted labels to actual labels
train_accuracy = accuracy_score(train_labels, train_predicted_labels) train_accuracy = accuracy_score(train_labels, train_predicted_labels)
print(f"Training Accuracy:", train_accuracy) print(f"Training Accuracy:", train_accuracy)
# Perform cross validation to obtain a more accurate accuracy score
cv_accuracy = cross_validation(classifier, train_features_scaled, train_labels, "accuracy") cv_accuracy = cross_validation(classifier, train_features_scaled, train_labels, "accuracy")
print(f"Cross-Validated Accuracy: {cv_accuracy}") print(f"Cross-Validated Accuracy: {cv_accuracy}")
cv_f1 = cross_validation(classifier, train_features_scaled, train_labels, "f1") cv_f1 = cross_validation(classifier, train_features_scaled, train_labels, "f1")
print(f"Cross-Validated F1-Score: {cv_f1}") print(f"Cross-Validated F1-Score: {cv_f1}")
# Average both accuracy and f1 score together to find an average cross validation score
cv_average = ((cv_accuracy + cv_f1) / 2) cv_average = ((cv_accuracy + cv_f1) / 2)
print(f"Average Score: {cv_average} \n") print(f"Average Score: {cv_average} \n")
......
...@@ -6,7 +6,6 @@ from skopt.space import Real, Categorical, Integer ...@@ -6,7 +6,6 @@ from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import cross_val_score from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier from sklearn.neural_network import MLPClassifier
...@@ -15,72 +14,103 @@ from sklearn.ensemble import RandomForestClassifier ...@@ -15,72 +14,103 @@ from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors import KNeighborsClassifier
def cross_validation(classifier, features, labels, scoring_metrics):
    """Return the mean 5-fold cross-validation score of *classifier*.

    Parameters
    ----------
    classifier : sklearn estimator
        The classifier to evaluate.
    features : array-like of shape (n_samples, n_features)
        Training feature matrix.
    labels : array-like of shape (n_samples,)
        Target labels.
    scoring_metrics : str
        An sklearn scoring string such as "accuracy" or "f1_weighted".

    Returns
    -------
    float
        The mean of the five per-fold scores.
    """
    # n_jobs=-1 runs the five folds in parallel on all CPU cores;
    # results are identical to the serial run, just faster.
    scores = cross_val_score(classifier, features, labels, cv=5,
                             scoring=scoring_metrics, n_jobs=-1)
    return scores.mean()
def main():
    """Train candidate classifiers on TrainingDataMulti.csv, pick the best
    one by cross-validated score, and write predictions for
    TestingDataMulti.csv to TestingResultsMulti.csv.
    """
    best_classifier = None
    best_classifier_name = ""
    best_classifier_average = 0

    # Read and shuffle the training data (the CSV has no header row)
    train_data = pd.read_csv("TrainingDataMulti.csv", header=None)
    train_data_shuffled = shuffle(train_data)

    # Split the training data into features and labels
    train_features = train_data_shuffled.iloc[:, :-1]  # everything but the last column
    train_labels = train_data_shuffled.iloc[:, -1]     # the last column

    # Standardize the training features (zero mean, unit variance)
    scaler = StandardScaler()
    train_features_scaled = scaler.fit_transform(train_features)

    '''
    # The following commented-out code was used to find optimal hyperparameters.
    parameters = {
        "solver": ["lbfgs", "sgd", "adam"],
        "activation": ["relu", "logistic", "tanh"],
        "learning_rate": ["constant", "invscaling", "adaptive"]
    }

    # Perform Bayesian optimization to find the best hyperparameters
    search = BayesSearchCV(MLPClassifier(max_iter=10000), parameters, n_iter=50,
                           n_jobs=-1, cv=5,
                           scoring="accuracy").fit(train_features_scaled, train_labels)
    print(f"Best Score: {search.best_score_}")
    print(f"Best Hyperparameters: {search.best_params_}")
    '''

    # Train the candidate classifiers; alternatives that scored worse during
    # experimentation are kept here commented out for reference.
    classifiers = [
        #("Logistical Regression", LogisticRegression(max_iter=10000, solver="newton-cg", C=9.088000000000001).fit(train_features_scaled, train_labels)),
        ("Multi-layer Perceptron", MLPClassifier(max_iter=10000, solver="adam", activation="tanh", learning_rate="adaptive").fit(train_features_scaled, train_labels)),
        #("C-Support Vector", SVC(C=9.59, kernel="linear").fit(train_features_scaled, train_labels))
    ]

    # Evaluate the performance of the trained classifiers
    for classifier_name, classifier in classifiers:
        print(f"Training {classifier_name}...")

        # Training-set accuracy: optimistic, used only as a sanity check
        train_predicted_labels = classifier.predict(train_features_scaled)
        train_accuracy = accuracy_score(train_labels, train_predicted_labels)
        print("Training Accuracy:", train_accuracy)

        # Cross-validation gives a less biased estimate than training accuracy
        cv_accuracy = cross_validation(classifier, train_features_scaled, train_labels, "accuracy")
        print(f"Cross-Validated Accuracy: {cv_accuracy}")
        cv_f1 = cross_validation(classifier, train_features_scaled, train_labels, "f1_weighted")
        print(f"Cross-Validated F1-Score: {cv_f1}")

        # The selection criterion is the average of accuracy and weighted F1
        cv_average = (cv_accuracy + cv_f1) / 2
        print(f"Average Score: {cv_average} \n")

        # Keep whichever classifier has the best average score so far
        if best_classifier_average < cv_average:
            best_classifier_name = classifier_name
            best_classifier = classifier
            best_classifier_average = cv_average

    print(f"The best classifier is {best_classifier_name}.\n")

    # Read the test data and scale it with the scaler fitted on the
    # training data (never re-fit on test data)
    test_features = pd.read_csv("TestingDataMulti.csv", header=None)
    test_features_scaled = scaler.transform(test_features)

    # Predict the labels for the test data using the best classifier
    test_predicted_labels = best_classifier.predict(test_features_scaled)

    # Save the test features together with their predicted labels
    test_results = test_features.copy()
    test_results["Predicted Labels"] = test_predicted_labels
    test_results.to_csv("TestingResultsMulti.csv", header=False, index=False)
    print("Test data was trained and new labels have been predicted.")
# Run the training/prediction pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment