Skip to content
Snippets Groups Projects
Commit 0bd01afc authored by yl1r22's avatar yl1r22
Browse files

Upload New File

parent e89d6685
Branches
No related tags found
No related merge requests found
%% Cell type:code id:6064e0b1 tags:
``` python
#Import scikit-learn dataset library
#from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import svm, metrics
from sklearn.ensemble import RandomForestClassifier
from joblib import dump
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Read the training dataset; the CSV has no header row.
# A raw string keeps the Windows backslashes literal: '\A' and '\T' in a
# normal string are invalid escape sequences that newer Python versions
# warn about.
df = pd.read_csv(r'H:\AI classification\TrainingDataBinary.csv', header=None)
# Print the first row of the CSV as a quick sanity check
print(df.head(1))
# The first 128 columns are features
df_feature = df.iloc[:, :128]
# The 129th column holds the labels
df_label = df.iloc[:, 128]
# Split the dataset into 80% training and 20% test.
# A fixed random_state makes the split repeatable, so metrics computed on
# the held-out part can be reproduced from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df_feature, df_label, test_size=0.2, random_state=42
)
```
%% Output
0 1 2 3 4 5 \
0 70.399324 127673.0908 -49.572308 127648.0176 -169.578319 127723.2374
6 7 8 9 ... 119 120 121 122 123 \
0 65.689611 605.91099 -57.003571 626.78553 ... 0 0 0 0 0
124 125 126 127 128
0 0 0 0 0 0
[1 rows x 129 columns]
%% Cell type:code id:cdc65331 tags:
``` python
# Create a random forest classifier with 100 trees, each split considering
# up to 78 features. Seeding the estimator makes the fitted (and later
# saved) model and its scores reproducible between runs.
clf1 = RandomForestClassifier(n_estimators=100, max_features=78, random_state=42)
# Train the model using the training set
clf1.fit(X_train, y_train)
# Predict the response for the held-out test set
y_pred1 = clf1.predict(X_test)
# Fraction of held-out samples whose predicted label matches the true label
print("Accuracy:",metrics.accuracy_score(y_test, y_pred1))
```
%% Output
Accuracy: 0.9741666666666666
%% Cell type:code id:12a28ae3 tags:
``` python
# Evaluate the classifier with 5-fold cross-validation on the training split
scores1 = cross_val_score(clf1, X_train, y_train, cv=5)
# Report the per-fold scores, then their average
print("Scores", scores1)
mean_score = np.mean(scores1)
print("Mean Scores", mean_score)
```
%% Output
Scores [0.971875 0.97291667 0.96666667 0.978125 0.97916667]
Mean Scores 0.9737500000000001
%% Cell type:code id:16ad1a95 tags:
``` python
# Persist the trained classifier to disk with joblib.
# The dump call is kept as the last expression so the notebook still echoes
# the list of written file names.
model_path = 'H:/AI classification/RFC_part1.pkl'
dump(clf1, model_path)
```
%% Output
['H:/AI classification/RFC_part1.pkl']
%% Cell type:code id:06c58b04 tags:
``` python
# Load the testing dataset; the CSV has no header row.
# A raw string keeps the Windows backslashes literal: '\A' and '\T' in a
# normal string are invalid escape sequences that newer Python versions
# warn about.
test_data = pd.read_csv(r'H:\AI classification\TestingDataBinary.csv', header=None)
# Predict labels for the testing dataset with the trained clf1 model
predictions = clf1.predict(test_data)
# Convert predictions into a one-column DataFrame. The column is labelled
# with the next free integer (instead of the default 0) so that `result`
# has no duplicate column label, and it shares test_data's index so rows
# line up during concatenation.
predictions_df = pd.DataFrame(
    predictions, index=test_data.index, columns=[test_data.shape[1]]
)
# Append the predictions as the last column of the testing data
result = pd.concat([test_data, predictions_df], axis=1)
# Write the combined table to CSV without index or header row
result.to_csv('H:/AI classification/test_pre1.csv', index = False, header = False)
```
%% Cell type:code id:c089a3e7 tags:
``` python
# Print the predicted labels for the testing dataset
print(predictions)
```
%% Output
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
%% Cell type:code id:47105ba9 tags:
``` python
# Build the confusion matrix for the held-out test split
# (true labels first, predicted labels second)
conf_mat = confusion_matrix(y_test, y_pred1)
# Print the heading and the matrix on separate lines
print("Confusion Matrix:", conf_mat, sep="\n")
```
%% Output
Confusion Matrix:
[[591 14]
[ 17 578]]
%% Cell type:code id:e71eff12 tags:
``` python
# Compute the F1 score of the held-out predictions against the true labels
f1 = f1_score(y_true=y_test, y_pred=y_pred1)
print("F1 Score:", f1)
```
%% Output
F1 Score: 0.9738837405223252
%% Cell type:code id:57ae5f66 tags:
``` python
# Plot the confusion matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(10, 7))
# annot=True with fmt='d' writes each cell's count as an integer
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='YlGnBu', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()
```
%% Output
%% Cell type:code id:fef4fa06 tags:
``` python
# ROC curve plotting
# Class probabilities for the held-out split; column 1 is kept as a 1D array
# of scores for the positive class
y_score = clf1.predict_proba(X_test)
y_score_positive = y_score[:, 1]
# y_test holds the true labels, y_score_positive the predicted scores
fpr, tpr, _ = roc_curve(y_test, y_score_positive)
roc_auc = auc(fpr, tpr)
roc_label = 'ROC curve (area = %0.2f)' % roc_auc
# Draw the curve on its own figure
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label=roc_label)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()
```
%% Output
%% Cell type:code id:e4a85cd9 tags:
``` python
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment