# PartA_Xinyi_Zhang.py

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, f1_score

# Train a Random Forest on TrainingDataBinary.csv, report validation metrics
# (accuracy, confusion matrix, F1), then label TestingDataBinary.csv and write
# the labelled rows to TestingResultsBinary.csv.

# One seed shared by the split and the forest so repeated runs give the same
# metrics and the same predictions
RANDOM_STATE = 42

# Load the dataset
# NOTE(review): read_csv treats the first row as a header by default; this
# assumes both CSV files carry a header row with matching feature names - confirm
data = pd.read_csv('TrainingDataBinary.csv')

# Separate features (X) and labels (y)
X = data.iloc[:, :-1]  # All columns except the last one
y = data.iloc[:, -1]   # Last column (marker)

# Split the data into training and validation sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Create a Random Forest classifier; seeded so that bootstrap sampling and
# feature selection are reproducible
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=50,
    min_samples_split=2,
    random_state=RANDOM_STATE,
)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_test_pred = rf_classifier.predict(X_test)

# Evaluate the accuracy of the model on the validation set
accuracy = accuracy_score(y_test, y_test_pred)
print("Validation Accuracy:", accuracy)

# Calculate the confusion matrix. Passing the classes seen during training
# keeps the matrix shape fixed even if the validation split happens to miss
# a class, so the row/column labelling below always fits.
cm = confusion_matrix(y_test, y_test_pred, labels=rf_classifier.classes_)

# Convert the confusion matrix to a DataFrame for better visualization
cm_df = pd.DataFrame(
    cm,
    index=['True Label 0', 'True Label 1'],
    columns=['Predicted Label 0', 'Predicted Label 1'],
)

# Create a heatmap of the confusion matrix
# (plt.show() blocks until the window is closed when an interactive backend is used)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# Compute F1 score
f1 = f1_score(y_test, y_test_pred)
print("F1 Score:", f1)

# Load the testing data
test_data = pd.read_csv('TestingDataBinary.csv')

# Make predictions on the testing data; kept in its own variable so the
# validation predictions above are not overwritten
test_predictions = rf_classifier.predict(test_data)

# Add the predicted labels as a new column named "marker"
test_data['marker'] = test_predictions

# Save the updated testing data to a new CSV file in the working directory
test_data.to_csv('TestingResultsBinary.csv', index=False)

# Print the predicted labels for the testing data
print("Predicted labels for testing data:")
print(test_predictions)