"""Binary classification of the TrainingDataBinary data set with a linear SVM.

Running this module:

1. loads ``data/TrainingDataBinary.csv`` (128 feature columns followed by one
   integer label column; the first row of the file is skipped),
2. holds out 30% of it, fits a linear-kernel ``svm.SVC`` on the remaining 70%
   and prints the accuracy on the held-out part,
3. loads ``data/TestingDataBinary.csv`` (128 feature columns, no label) and
   prints the predicted label of every row.

Both paths are relative to the current working directory.
"""
# Provenance: this is the content of task1.py as added by a patch that also
# deleted main.py.

import csv

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn import svm, metrics
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import Bunch

# Number of feature columns (the label column is not counted).
_N_FEATURES = 128

_TARGET_NAMES = ['Negative', 'Positive']


def _feature_names():
    """Return the 128 feature names: 29 x 4 "R{n}-PA{m}" names, then 12 relay names."""
    return [
        *["R{}-PA{}".format(x + 1, y + 1) for x in range(29) for y in range(4)],
        *["Control, Snort, Relay #{}".format(x + 1) for x in range(12)],
    ]


def _read_rows(path):
    """Return the rows of the CSV file at *path* as lists of strings.

    The first row is skipped and blank lines are ignored.

    Raises:
        ValueError: if the file contains no data rows.
    """
    # newline="" is what the csv module asks for so that it can handle line
    # endings (including those embedded in quoted fields) itself.
    with open(path, newline="") as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the first row
        rows = [row for row in reader if row]
    if not rows:
        raise ValueError("{}: no data rows found".format(path))
    return rows


def _as_feature_matrix(cells, path):
    """Convert a list of per-row string lists to an (n_rows, 128) float64 array.

    Raises:
        ValueError: if a cell is not numeric or the rows do not all have
            exactly ``_N_FEATURES`` columns.
    """
    matrix = np.asarray(cells, dtype=np.float64)
    if matrix.ndim != 2 or matrix.shape[1] != _N_FEATURES:
        raise ValueError(
            "{}: expected {} feature columns per row, got array of shape {}".format(
                path, _N_FEATURES, matrix.shape
            )
        )
    return matrix


def load_dataset():
    """Load the labelled training data from ``data/TrainingDataBinary.csv``.

    The arrays are sized from the rows actually present in the file, so no
    uninitialised rows can leak into the result.

    Returns:
        Bunch with ``data`` (float64, one row per sample, 128 columns),
        ``target`` (int64 labels taken from the last column),
        ``feature_names`` and ``target_names``.

    Raises:
        ValueError: if the file has no data rows, a row has the wrong number
            of columns, or a cell cannot be parsed.
    """
    path = 'data/TrainingDataBinary.csv'
    rows = _read_rows(path)

    data = _as_feature_matrix([row[:-1] for row in rows], path)
    # Labels are parsed as integers directly from their text, not via float.
    target = np.asarray([row[-1] for row in rows], dtype=np.int64)

    return Bunch(
        data=data,
        target=target,
        feature_names=_feature_names(),
        target_names=list(_TARGET_NAMES),
    )


def load_test_data():
    """Load the unlabelled data from ``data/TestingDataBinary.csv``.

    Returns:
        Bunch with ``data`` (float64, one row per sample, 128 columns),
        ``feature_names`` and ``target_names``. There is no ``target``.

    Raises:
        ValueError: if the file has no data rows, a row has the wrong number
            of columns, or a cell cannot be parsed.
    """
    path = 'data/TestingDataBinary.csv'
    data = _as_feature_matrix(_read_rows(path), path)

    return Bunch(
        data=data,
        feature_names=_feature_names(),
        target_names=list(_TARGET_NAMES),
    )


dataset = load_dataset()

# print the names of the features
print("Features: ", dataset.feature_names)

# print the names of the two class labels
print("Labels: ", dataset.target_names)

# print data (feature) shape
print(dataset.data.shape)


# Split dataset into training set and test set: 70% training and 30% test.
# NOTE(review): no random_state is passed, so the split (and therefore the
# reported accuracy) differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.3
)


# Create a svm Classifier
clf = svm.SVC(kernel='linear')  # Linear Kernel

# Train the model using the training sets
clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

print("{} elements tested, {} incorrect".format(
    min(len(y_test), len(y_pred)),
    int(np.count_nonzero(y_test != y_pred)),
))

# predict for test data
test_dataset = load_test_data()
test_results = clf.predict(test_dataset.data)

print("Predicted {} values from test data: {}".format(
    len(test_results), ", ".join(str(t) for t in test_results)
))