Commit fcdb506e authored by christosPro123

Finished report, code comments, QoL formatting changes

parent 0f0ce335
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
```
%% Cell type:code id: tags:
``` python
# Read the training CSV file into a pandas DataFrame
train_df = pd.read_csv('TrainingDataBinary.csv')
```
%% Cell type:code id: tags:
``` python
print(train_df.info())
```
%% Output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Columns: 129 entries, 1 to 129
dtypes: float64(112), int64(17)
memory usage: 5.9 MB
None
%% Cell type:code id: tags:
``` python
# Plot a histogram of the target column '129' to check the class balance
plt.hist(train_df['129'])
```
%% Output
(array([3000.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
        3000.]),
 array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
 <BarContainer object of 10 artists>)
%% Cell type:code id: tags:
``` python
scaler = StandardScaler()
# Separate the features from the target variable
X = train_df.drop('129', axis=1)
y = train_df['129']
# Replace infinite values with 0 so that imputation and scaling do not fail
X.replace([np.inf, -np.inf], 0, inplace=True)
# Create a SimpleImputer to replace NaN values with the mean of the corresponding column
imputer = SimpleImputer(strategy='mean')
# Impute the missing values in the features data
X_imputed = imputer.fit_transform(X)
# Fit the scaler to the features data and transform the data
X_scaled = scaler.fit_transform(X_imputed)
# The transformed data is a numpy array; it can be converted back to a DataFrame if needed:
# X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
```
%% Cell type:code id: tags:
``` python
# Reduce the feature space to 100 principal components with PCA
pca = PCA(n_components=100)
X_pca = pca.fit_transform(X_scaled)
```
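%% Cell type:markdown id: tags:
As a sanity check on the choice of `n_components=100`, the retained variance can be read off the fitted `pca` object (a small sketch, assuming the cell above has run):
%% Cell type:code id: tags:
``` python
# Fraction of the total variance retained by the 100 components
print("Explained variance retained:", pca.explained_variance_ratio_.sum())
```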
%% Cell type:code id: tags:
``` python
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
# Train the model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
# Evaluate the model on the testing set
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)
```
%% Output
Accuracy: 0.895
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.94      0.90       588
           1       0.93      0.86      0.89       612

    accuracy                           0.90      1200
   macro avg       0.90      0.90      0.89      1200
weighted avg       0.90      0.90      0.89      1200

c:\Users\user\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
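%% Cell type:markdown id: tags:
The ConvergenceWarning above suggests raising `max_iter`; a minimal follow-up sketch (assuming the split from the previous cell, and leaving `y_pred` untouched):
%% Cell type:code id: tags:
``` python
# Refit with a larger iteration budget, as the warning recommends
log_reg_1k = LogisticRegression(max_iter=1000)
log_reg_1k.fit(X_train, y_train)
print("Accuracy (max_iter=1000):", accuracy_score(y_test, log_reg_1k.predict(X_test)))
```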
%% Cell type:code id: tags:
``` python
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)
```
%% Output
Confusion Matrix:
 [[550  38]
 [ 88 524]]
%% Cell type:markdown id: tags:
[[True Negatives (TN), False Positives (FP)],
 [False Negatives (FN), True Positives (TP)]]
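%% Cell type:markdown id: tags:
As a quick cross-check, the headline metrics follow directly from those four counts (a small sketch using the values printed above):
%% Cell type:code id: tags:
``` python
# Recompute the headline metrics from the confusion matrix above
tn, fp, fn, tp = 550, 38, 88, 524
print("Accuracy:", (tn + tp) / (tn + fp + fn + tp))   # (550 + 524) / 1200 = 0.895
print("Precision (class 1):", tp / (tp + fp))         # 524 / 562 ~ 0.93
print("Recall (class 1):", tp / (tp + fn))            # 524 / 612 ~ 0.86
```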
%% Cell type:markdown id: tags:
Fine-tuning
%% Cell type:code id: tags:
``` python
param_dist = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': np.logspace(-4, 4, 10),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 500, 1000],
}
# Create the RandomizedSearchCV object with the logistic regression model, hyperparameters, and cross-validation
log_reg = LogisticRegression()
random_search = RandomizedSearchCV(log_reg, param_dist, n_iter=100, cv=3, n_jobs=-1, verbose=1, random_state=42)
# Fit the random search to the training data
random_search.fit(X_train, y_train)
# Check the best hyperparameters found
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)
# Use the best estimator for predictions and evaluation
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)
```
%% Output
Fitting 3 folds for each of 100 candidates, totalling 300 fits
c:\Users\user\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py:378: FitFailedWarning:
120 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed: ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got elasticnet penalty.
15 fits failed: ValueError: penalty='none' is not supported for the liblinear solver
18 fits failed: ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
6 fits failed: TypeError: unsupported operand type(s) for -: 'int' and 'NoneType' (raised at alpha = (1.0 / C) * (1 - l1_ratio) in _logistic_regression_path)
21 fits failed: ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.
12 fits failed: ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.
9 fits failed: ValueError: Solver sag supports only 'l2' or 'none' penalties, got elasticnet penalty.
6 fits failed: ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.
9 fits failed: ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
  warnings.warn(some_fits_failed_message, FitFailedWarning)
c:\Users\user\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_search.py:952: UserWarning: One or more of the test scores are non-finite: [       nan 0.87666667 0.92083333        nan        nan 0.87416667
        nan 0.87666667 0.864375   0.87645833 0.78208333 0.87854167
 0.72333333 0.87854167 0.85645833        nan        nan        nan
 0.85083333 0.72333333 0.5025     0.92020833 0.78208333 0.918125
 0.86458333 0.87666667        nan 0.9225     0.90375           nan
 0.78208333        nan 0.5025            nan        nan        nan
        nan 0.78208333        nan 0.78208333 0.85645833 0.628125
 0.918125          nan 0.49916667 0.85875           nan 0.49916667
        nan        nan 0.87791667 0.86520833        nan 0.9225
        nan 0.918125   0.865625   0.84166667        nan 0.9225
 0.90375    0.918125   0.87375    0.918125   0.864375          nan
        nan 0.87666667        nan 0.90375    0.85625    0.62895833
        nan        nan 0.85625           nan        nan 0.87854167
 0.85645833        nan 0.87791667 0.90395833 0.87854167        nan
        nan 0.87375    0.78208333 0.87666667        nan        nan
 0.78208333 0.90270833        nan        nan 0.85625           nan
 0.86583333        nan        nan        nan]
  warnings.warn(
c:\Users\user\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py:1173: FutureWarning: `penalty='none'` has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.
  warnings.warn(
c:\Users\user\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py:1181: UserWarning: Setting penalty=None will ignore the C and l1_ratio parameters
  warnings.warn(
Best Parameters: {'solver': 'newton-cg', 'penalty': 'none', 'max_iter': 100, 'C': 0.005994842503189409}
Best Score: 0.9225
Accuracy: 0.9325
Confusion Matrix:
 [[557  31]
 [ 50 562]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.95      0.93       588
           1       0.95      0.92      0.93       612

    accuracy                           0.93      1200
   macro avg       0.93      0.93      0.93      1200
weighted avg       0.93      0.93      0.93      1200

c:\Users\user\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:210: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations.
  warnings.warn(
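%% Cell type:markdown id: tags:
Most of the failed fits above come from solver/penalty pairs that scikit-learn rejects outright. One possible refinement (a sketch, not what was run here) is to constrain the search space to valid combinations so that no candidate fails:
%% Cell type:code id: tags:
``` python
# A list of parameter dicts restricts sampling to solver/penalty pairs that
# scikit-learn supports, avoiding the ValueErrors reported above; l1_ratio is
# supplied for elasticnet, which also avoids the TypeError.
param_dist_valid = [
    {'solver': ['newton-cg', 'lbfgs', 'sag'], 'penalty': ['l2'],
     'C': np.logspace(-4, 4, 10), 'max_iter': [100, 500, 1000]},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'],
     'C': np.logspace(-4, 4, 10), 'max_iter': [100, 500, 1000]},
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.25, 0.5, 0.75],
     'C': np.logspace(-4, 4, 10), 'max_iter': [100, 500, 1000]},
]
random_search_valid = RandomizedSearchCV(LogisticRegression(), param_dist_valid,
                                         n_iter=100, cv=3, n_jobs=-1, random_state=42)
```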
%% Cell type:markdown id: tags:
Predict
%% Cell type:code id: tags:
``` python
test_data = pd.read_csv('TestingDataBinary.csv')
```
%% Cell type:code id: tags:
``` python
# Preprocess the test data with the transformers fitted on the training data
X_new = test_data
X_new.replace([np.inf, -np.inf], 0, inplace=True)
# Impute the missing values in the features data
X_imputed_new = imputer.transform(X_new)
# Scale the features data
X_scaled_new = scaler.transform(X_imputed_new)
# Apply the PCA transformation
X_pca_new = pca.transform(X_scaled_new)
```
%% Cell type:code id: tags:
``` python
# Use the best estimator for predictions on the new data
y_pred_new = best_model.predict(X_pca_new)
# Save the predictions to a new column in the DataFrame
test_data['predicted_marker'] = y_pred_new
# Save the updated DataFrame to a new CSV file
test_data.to_csv('TestingResultsBinary.csv', index=False)
```
%% Cell type:markdown id: tags:
Multi-class classification
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler
```
%% Cell type:code id: tags:
``` python
# Read the multi-class training CSV file into a pandas DataFrame
train_df = pd.read_csv('TrainingDataMulti.csv')
```
%% Cell type:code id: tags:
``` python
print(train_df.info())
```
%% Output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Columns: 129 entries, 1 to 129
dtypes: float64(112), int64(17)
memory usage: 5.9 MB
None
%% Cell type:code id: tags:
``` python
print(train_df)
```
%% Output
               1            2          3            4           5  \
0      70.399324  127673.0908 -49.572308  127648.0176 -169.578319
1      73.688102  130280.7109 -46.300719  130255.6377 -166.278082
2      73.733939  130305.7842 -46.254883  130280.7109 -166.232245
3      74.083443  130581.5902 -45.899649  130556.5169 -165.882741
4      74.553268  131083.0556 -45.424094  131057.9823 -165.424375
...          ...          ...        ...          ...         ...
5995  116.889120  131860.3269  -3.076783  131810.1804 -123.094253
5996  116.849013  131810.1804  -3.116890  131760.0339 -123.128630
5997  116.384917  131734.9606  -3.586716  131684.8140 -123.586996
5998  111.125164  130506.3704  -8.846468  130456.2238 -128.858208
5999  110.878793  130481.2971  -9.092840  130456.2238 -129.104580

                6           7          8          9         10  ...  120  121  \
0     127723.2374   65.689611  605.91099 -57.003571  626.78553  ...    0    0
1     130355.9307   71.831719  483.59351 -50.947407  500.98896  ...    0    0
2     130381.0040   71.808800  483.59351 -50.913030  500.98896  ...    0    0
3     130656.8100   72.152575  482.86107 -50.437475  499.15786  ...    0    0
4     131158.2754   72.118198  484.50906 -50.013486  497.69298  ...    0    0
...           ...         ...        ...        ...        ...  ...  ...  ...
5995  131910.4735  114.780635  376.10794  -5.254023  374.82617  ...    0    0
5996  131885.4002  114.769176  376.29105  -5.322778  374.82617  ...    0    0
5997  131785.1071  114.299351  376.47416  -5.849899  374.82617  ...    0    0
5998  130556.5169  106.667553  478.83265 -13.464508  477.73399  ...    0    0
5999  130556.5169  106.392533  478.83265 -13.750987  477.91710  ...    0    0

      122  123  124  125  126  127  128  129
0       0    0    0    0    0    0    0    0
1       0    0    0    0    0    0    0    0
2       0    0    0    0    0    0    0    0
3       0    0    0    0    0    0    0    0
4       0    0    0    0    0    0    0    0
...   ...  ...  ...  ...  ...  ...  ...  ...
5995    0    0    0    0    0    0    0    0
5996    0    0    0    0    0    0    0    0
5997    0    0    0    0    0    0    0    0
5998    0    0    0    0    0    0    0    0
5999    0    0    0    0    0    0    0    0

[6000 rows x 129 columns]
%% Cell type:code id: tags:
``` python
# Plot a histogram of the class distribution
plt.hist(train_df['129'])
```
%% Output
(array([3000.,    0.,    0.,    0.,    0., 1500.,    0.,    0.,    0.,
        1500.]),
 array([0. , 0.2, 0.4, 0.6, 0.8, 1. , 1.2, 1.4, 1.6, 1.8, 2. ]),
 <BarContainer object of 10 artists>)
%% Cell type:code id: tags:
``` python
# Split the features from the target classification to fit the Random Forest classifier
X = train_df.drop('129', axis=1)
y = train_df['129']
```
%% Cell type:code id: tags:
``` python
rfc = RandomForestClassifier()
rfc.fit(X, y)
```
%% Output
RandomForestClassifier()
%% Cell type:code id: tags:
``` python
# Extract the importance of each feature from the model into a separate DataFrame
importances = rfc.feature_importances_
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
```
%% Cell type:code id: tags:
``` python
# Sort the features DataFrame in descending order (highest-importance feature at the top)
feature_importances = feature_importances.sort_values('Importance', ascending=False)
```
%% Cell type:code id: tags:
``` python
print(feature_importances.head())
```
%% Output
    Feature  Importance
90       91    0.050203
100     101    0.024118
50       51    0.022545
81       82    0.021537
3         4    0.020457
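%% Cell type:markdown id: tags:
A quick visual check of the ranking (a sketch, assuming `feature_importances` from the cells above):
%% Cell type:code id: tags:
``` python
# Bar chart of the 20 most important features
top = feature_importances.head(20)
plt.figure(figsize=(10, 4))
plt.bar(top['Feature'].astype(str), top['Importance'])
plt.xlabel('Feature column')
plt.ylabel('Importance')
plt.title('Top 20 Random Forest feature importances')
plt.show()
```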
%% Cell type:code id: tags:
``` python
# Extract the top N features into a list (N was varied by hand while testing; 40 proved a good balance against overfitting)
n_features = feature_importances.head(40)['Feature'].tolist()
```
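%% Cell type:markdown id: tags:
The comment above mentions varying N by hand; a small loop like the following (a sketch, not what was run here) automates that comparison:
%% Cell type:code id: tags:
``` python
# Compare held-out accuracy for several candidate feature counts
for n in [10, 20, 40, 80, 128]:
    cols = feature_importances.head(n)['Feature'].tolist()
    Xn_train, Xn_test, yn_train, yn_test = train_test_split(
        train_df[cols], train_df['129'], test_size=0.2, random_state=42)
    model = RandomForestClassifier(random_state=42).fit(Xn_train, yn_train)
    print(n, accuracy_score(yn_test, model.predict(Xn_test)))
```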
%% Cell type:markdown id: tags:
Feature Columns: 91, 82, 101, 53, and 115
%% Cell type:code id: tags:
``` python
# Re-split the data based on those 40 features
X = train_df[n_features]
y = train_df['129']
```
%% Cell type:code id: tags:
``` python
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
```
%% Cell type:code id: tags:
``` python
# Train the classifier again on the new split of the data
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
```
%% Output
RandomForestClassifier()
%% Cell type:code id: tags:
``` python
# Make first predictions
y_pred = rfc.predict(X_test)
```
%% Cell type:code id: tags:
``` python
# Compute the accuracy on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
```
%% Cell type:code id: tags:
``` python
# Generate the classification report
classification_rep = classification_report(y_test, y_pred)
# Print the classification report
print(classification_rep)
```
%% Output
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       602
           1       0.90      0.90      0.90       277
           2       0.91      0.91      0.91       321

    accuracy                           0.94      1200
   macro avg       0.93      0.93      0.93      1200
weighted avg       0.95      0.94      0.95      1200
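%% Cell type:markdown id: tags:
Per-class errors can also be inspected with a confusion matrix (a one-line check using the predictions above):
%% Cell type:code id: tags:
``` python
# Rows are the true classes 0-2, columns are the predicted classes
print(confusion_matrix(y_test, y_pred))
```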
%% Cell type:code id: tags:
``` python
# Calculate the learning curve
train_sizes, train_scores, test_scores = learning_curve(
    rfc, X, y, train_sizes=np.linspace(0.1, 1.0, 10), cv=5, scoring='accuracy')
# Calculate the mean and standard deviation of the training and testing scores
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
# Plot the learning curve
plt.figure()
plt.title('Learning Curve')
plt.xlabel('Training Examples')
plt.ylabel('Accuracy')
plt.grid()
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color='r')
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color='g')
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training Accuracy')
plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Testing Accuracy')
plt.legend(loc='best')
plt.show()
```
%% Output
%% Cell type:markdown id: tags:
Load Testing Data and Classify
%% Cell type:code id: tags:
``` python
test_df = pd.read_csv('TestingDataMulti.csv')
print(test_df.info())
```
%% Output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 128 entries, 1 to 128
dtypes: float64(104), int64(24)
memory usage: 100.1 KB
None
%% Cell type:code id: tags:
``` python
# Select those same N features from the test data before classifying
multi_test = test_df[n_features]
predicted_category = rfc.predict(multi_test)
print("Predicted Category: ", predicted_category)
```
%% Output
Predicted Category:  [2 2 2 2 2 2 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 2 2 2 2 2 2 2 2 0 0 0 1 1 1 1
 1 1 1 1 2 1 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
%% Cell type:code id: tags:
``` python
# Calculate the distribution of the predicted classifications
category_counts = pd.Series(predicted_category).value_counts(normalize=True) * 100
print('Category Percentages:')
print(category_counts)
```
%% Output
Category Percentages:
1    39.0
2    31.0
0    30.0
dtype: float64
%% Cell type:code id: tags:
``` python
# Append the predictions to the DataFrame as a new column and save it as a new CSV
test_df["Prediction Markers"] = predicted_category
test_df.to_csv("TestingResultsMulti.csv", index=False)
```