Commit 263e8859 authored by Fanis Baikas

First commit. Added kNN classifier class compatible with PyTorch GPU acceleration. An MPI execution environment is required to run knn_random_subsets.py
classifiers/k_nearest_neighbors.py:

import numpy as np
import torch


class kNearestNeighbors(object):
    """ A k-Nearest-Neighbors classifier with L2 distance """

    def __init__(self):
        pass

    def train(self, X, y, device):
        """
        Train the classifier. For k-Nearest-Neighbors, training involves just
        storing the training data.
        :param X: A torch Tensor of shape [num_train, D] containing num_train training examples,
                  each of dimension D.
        :param y: A torch Tensor of shape [num_train] containing the training labels,
                  where y[i] is the label for X[i].
        :param device: The device name used by torch for acceleration (cpu, cuda, mps, etc.)
        """
        self.X_train = X
        self.y_train = y
        self.device = device

    def predict(self, X, k=1):
        """
        Predict labels for the test data using this classifier.
        :param X: A torch Tensor of shape [num_test, D] containing num_test test examples,
                  each of dimension D.
        :param k: The number of nearest neighbors that vote for the predicted labels.
        :return y: A torch Tensor of shape [num_test] containing predicted labels for the
                   test data, where y[i] is the predicted label for the test point X[i].
        """
        dists = self.compute_distances(X)
        return self.predict_labels(dists, k=k)

    def compute_distances(self, X):
        """
        Compute the L2 distance matrix between each test point in X and each training point
        in self.X_train. The matrix is computed using only matrix operations, without any loops.
        :param X: A torch Tensor of shape [num_test, D] containing num_test test examples,
                  each of dimension D.
        :return dists: A torch Tensor of shape [num_test, num_train] where dists[i, j] is the
                       (squared) L2 distance between the ith test point and the jth training point.
        """
        # The squared distance between a test vector and a training vector is
        # ||x_test - x_train||^2 (the square root is skipped for efficiency, since it does not
        # change the neighbor ranking). Expanding the square gives
        # x_test.x_test + x_train.x_train - 2 * x_test.x_train.
        # Because we are dealing with multiple test and train examples, we can create two
        # matrices of shape (num_test, num_train) containing all the dot products needed:
        # X_dot holds the self dot products of the test examples, copied across num_train columns;
        # X_train_dot holds the self dot products of the training examples, copied across num_test rows.
        X_dot = torch.mul(X, X).sum(axis=1).reshape((X.shape[0], 1)) * torch.ones(size=(1, self.X_train.shape[0])).to(self.device)
        X_train_dot = torch.mul(self.X_train, self.X_train).sum(axis=1) * torch.ones(size=(X.shape[0], 1)).to(self.device)
        dists = X_dot + X_train_dot - 2 * torch.mm(X, self.X_train.T)
        return dists
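
    # A quick sanity check for the vectorized computation above (a sketch, not
    # part of the class API): torch.cdist returns true L2 distances, so its
    # square should match dists up to floating-point error:
    #   ref = torch.cdist(X, self.X_train) ** 2
    #   assert torch.allclose(dists, ref, rtol=1e-3)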

    def predict_labels(self, dists, k=1):
        """
        Given a matrix of distances between test and training examples,
        predict a label for each test point.
        :param dists: A torch Tensor of shape [num_test, num_train] where dists[i, j] gives the
                      distance between the ith test and the jth training example.
        :param k: The number of nearest neighbors that vote for the predicted labels.
        :return y: A torch Tensor of shape [num_test] containing predicted labels for the test data,
                   where y[i] is the predicted label for the test point X[i].
        """
        num_test = dists.shape[0]
        y_pred_np = np.zeros(num_test, dtype=np.int64)
        # Convert to numpy arrays (the voting below runs on the CPU)
        dists_np = dists.cpu().numpy()
        y_train_np = self.y_train.cpu().numpy()
        for i in range(num_test):
            # Sort the indices of the ith row of the dists matrix by increasing distance
            closest_y = np.argsort(dists_np[i, :])
            # Keep only the first k indices
            closest_y = closest_y[0:k]
            # Recover the labels of the k training examples closest to the test example
            kNN_labels = y_train_np[closest_y]
            # Find the most common label (in case of a tie, select the smallest label)
            kNN_labels_counts = np.bincount(kNN_labels)
            y_pred_np[i] = np.argmax(kNN_labels_counts)
        # Move the predictions to the same device as the training labels so that
        # comparisons against labels stored on the GPU do not fail
        y_pred = torch.from_numpy(y_pred_np).to(self.device)
        return y_pred
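
A minimal usage sketch for the classifier above (hypothetical tensors; any float data of matching dimensions works):

    import torch
    from classifiers.k_nearest_neighbors import kNearestNeighbors

    device = "cuda" if torch.cuda.is_available() else "cpu"
    X_train = torch.randn(500, 784).to(device)          # 500 training examples of dimension 784
    y_train = torch.randint(0, 10, (500,)).to(device)   # integer class labels in [0, 10)
    X_test = torch.randn(20, 784).to(device)

    clf = kNearestNeighbors()
    clf.train(X_train, y_train, device)
    y_pred = clf.predict(X_test, k=3)                   # Tensor of 20 predicted labels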

knn_random_subsets.py:

from mpi4py import MPI
import argparse
import sys
import os
import numpy as np
import torch
from torchvision import datasets
from torchvision.transforms import ToTensor
from classifiers.k_nearest_neighbors import kNearestNeighbors


def positive_int(s: str) -> int:
    try:
        v = int(s)
    except ValueError:
        raise argparse.ArgumentTypeError(f'Expected integer, got {s!r}')
    if v <= 0:
        raise argparse.ArgumentTypeError(f'Expected positive integer, got {v}')
    return v


def get_random_balanced_subset_indices(dataset, subset_size):
    idxs = []
    dataset_size = len(dataset)
    for class_num, _ in enumerate(dataset.classes):
        # Get all the indices corresponding to the class number
        class_idxs = torch.nonzero(dataset.targets == class_num)
        # Remove unnecessary dimensions and convert to a numpy array
        class_idxs = torch.squeeze(class_idxs).numpy()
        # Select a number of examples from this class proportional to the share
        # of the complete dataset that the class occupies
        class_subset_size = int((class_idxs.size / dataset_size) * subset_size)
        # Get a random sample of the indices, without replacement
        class_idxs = np.random.choice(class_idxs, class_subset_size, replace=False)
        idxs += list(class_idxs)
    return idxs
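
# A sketch of how the helper above behaves on FashionMNIST, where each of the
# 10 classes holds exactly 1/10 of the 60000 training examples, so a balanced
# subset draws subset_size // 10 indices per class:
#   idxs = get_random_balanced_subset_indices(train_set, subset_size=100)
#   len(idxs)  # -> 100, ten random indices from each class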


def ACC(classifier, X_train, y, X_test, y_test, device):
    classifier.train(X_train, y, device)
    y_pred = classifier.predict(X_test, k=3)
    num_correct = torch.sum(y_test == y_pred)
    # Return the accuracy as a plain Python float so it can be stored in the numpy tables
    acc = (num_correct / y_test.shape[0]).item()
    return acc


# Get the MPI communicator object
comm = MPI.COMM_WORLD

# Create send and receive buffers for the tables that will hold the test accuracies and subset indices
acc_table_send_buffer = []
acc_table_rec_buffer = []
idxs_table_send_buffer = []
idxs_table_rec_buffer = []

if comm.rank == 0:
    # Create an argument parser to set the subset_size and num_of_repetitions
    parser = argparse.ArgumentParser(
        description='Script for evaluating the accuracy of a kNN classifier on multiple random subsets of the FashionMNIST training set')
    parser.add_argument('subset_size', type=positive_int,
                        help='The size of the random subsets generated by the program')
    parser.add_argument('reps', type=positive_int,
                        help='The total number of random subsets the program will generate for testing the accuracy of the kNN classifier')
    args = vars(parser.parse_args())
    subset_size = args['subset_size']
    num_of_repetitions = args['reps']

    print('Program started.')
    print('Number of available processor cores:', comm.size)
    print('Running...')

    # Redirect stdout of the first process to write output to a file
    orig_stdout = sys.stdout
    os.makedirs('log', exist_ok=True)
    f = open('log/output_log_subset_size=' + str(subset_size) + '_num_of_repetitions=' + str(num_of_repetitions) + '.txt', 'w')
    sys.stdout = f

    # Raise an exception if the number of repetitions is not evenly divisible by the number of cores
    if num_of_repetitions % comm.size != 0:
        raise Exception('The specified number of repetitions should be a multiple of the number of available cores.')

    # To split the computational load equally among the processors, the total number of
    # repetitions is divided by the number of available processor cores
    repetitions_per_processor = int(num_of_repetitions / comm.size)

    # Check if GPU acceleration is available
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using device: {device}")
    print('subset_size:', subset_size)
    print('num_of_repetitions:', num_of_repetitions, '\n')

    # Obtain the FashionMNIST dataset
    train_set = datasets.FashionMNIST(root='datasets/', download=True, transform=ToTensor(), train=True)
    test_set = datasets.FashionMNIST(root='datasets/', download=True, transform=ToTensor(), train=False)

    X_train = train_set.data.type(torch.float32)
    y_train = train_set.targets.to(device)
    X_test = test_set.data.type(torch.float32)
    y_test = test_set.targets.to(device)

    print('X_train shape:', X_train.shape, X_train.dtype)
    print('y_train shape:', y_train.shape, y_train.dtype)
    print('X_test shape:', X_test.shape, X_test.dtype)
    print('y_test shape:', y_test.shape, y_test.dtype, '\n')

    # Flatten the 28x28 images into vectors of dimension 784
    X_train = torch.reshape(X_train, (X_train.shape[0], -1)).to(device)
    X_test = torch.reshape(X_test, (X_test.shape[0], -1)).to(device)

    # Create the tables that will hold the obtained accuracies and subset indices
    acc_table = np.zeros((comm.size, repetitions_per_processor), dtype=float)
    idxs_table = np.zeros((comm.size, repetitions_per_processor, subset_size), dtype=np.uint16)
    print('acc_table shape:', acc_table.shape)
    print('idxs_table shape:', idxs_table.shape, '\n')

    acc_table_send_buffer = acc_table
    idxs_table_send_buffer = idxs_table

if comm.rank != 0:
    subset_size = None
    num_of_repetitions = None
    repetitions_per_processor = None
    device = None
    train_set = None
    X_train = None
    y_train = None
    X_test = None
    y_test = None

# Scatter the acc_table and idxs_table arrays to the available cores
acc_table_local = comm.scatter(acc_table_send_buffer, root=0)
idxs_table_local = comm.scatter(idxs_table_send_buffer, root=0)
# Broadcast the rest of the data
subset_size = comm.bcast(subset_size, root=0)
num_of_repetitions = comm.bcast(num_of_repetitions, root=0)
repetitions_per_processor = comm.bcast(repetitions_per_processor, root=0)
device = comm.bcast(device, root=0)
train_set = comm.bcast(train_set, root=0)
X_train = comm.bcast(X_train, root=0)
y_train = comm.bcast(y_train, root=0)
X_test = comm.bcast(X_test, root=0)
y_test = comm.bcast(y_test, root=0)
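
# A minimal standalone sketch of the scatter/bcast pattern used above
# (hypothetical file sketch.py, launched with e.g. `mpiexec -n 4 python sketch.py`):
#   from mpi4py import MPI
#   comm = MPI.COMM_WORLD
#   rows = [[r] * 3 for r in range(comm.size)] if comm.rank == 0 else None
#   row = comm.scatter(rows, root=0)  # rank r receives [r, r, r]
#   msg = comm.bcast('params' if comm.rank == 0 else None, root=0)  # all ranks receive 'params'
# Here, scattering the (comm.size, repetitions_per_processor) tables hands each
# rank one row to fill in; gather later reassembles the rows on rank 0.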

# Seed each rank differently (but deterministically) for reproducible results
torch.manual_seed(comm.rank)
np.random.seed(comm.rank)

# Evaluate the accuracy multiple times
for rep in range(repetitions_per_processor):
    # Get a new balanced random subset
    idxs = get_random_balanced_subset_indices(train_set, subset_size=subset_size)
    S = X_train[idxs]
    S_y = y_train[idxs].to(device)
    classifier = kNearestNeighbors()
    acc = ACC(classifier, S, S_y, X_test, y_test, device)
    acc_table_local[rep] = acc
    idxs_table_local[rep] = idxs
    # For debugging purposes; doesn't work with the file log because of multiprocessing
    # print("Rank {}, Rep {}, Acc = {}".format(comm.rank, rep, acc))
# Gather the local arrays to the receiver buffers
acc_table_rec_buffer = comm.gather(acc_table_local, root=0)
idxs_table_rec_buffer = comm.gather(idxs_table_local, root=0)

if comm.rank == 0:
    print('Gathered acc_table on rank 0')
    acc_table = np.array(acc_table_rec_buffer).reshape(num_of_repetitions)
    # print(acc_table, acc_table.shape, '\n')
    print('Gathered idxs_table on rank 0')
    idxs_table = np.array(idxs_table_rec_buffer).reshape((num_of_repetitions, subset_size))
    # print(idxs_table, idxs_table.shape, '\n')

    # Assign a value to each example: the average accuracy obtained over all the random
    # subsets in which it was included. Store the total number of times the example was
    # included in the second column of the table.
    example_value_table = np.zeros((len(train_set), 2), dtype=float)
    # Iterate over indices directly; enumerating the dataset would needlessly decode every image
    for example_idx in range(len(train_set)):
        idxs_table_rows, _ = np.where(idxs_table == example_idx)
        if len(idxs_table_rows) != 0:
            example_value_table[example_idx][0] = np.sum(acc_table[idxs_table_rows]) / len(idxs_table_rows)
            example_value_table[example_idx][1] = len(idxs_table_rows)
            print('Example idx %d, count = %d, value = %0.4f' % (example_idx, example_value_table[example_idx][1], example_value_table[example_idx][0]))
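
    # A vectorized alternative to the np.where loop above (a sketch; equivalent
    # because sampling without replacement means an example appears at most once
    # per subset):
    #   counts = np.zeros(len(train_set))
    #   sums = np.zeros(len(train_set))
    #   np.add.at(counts, idxs_table.ravel(), 1.0)
    #   np.add.at(sums, idxs_table.ravel(), np.repeat(acc_table, subset_size))
    #   mask = counts > 0
    #   example_value_table[mask, 0] = sums[mask] / counts[mask]
    #   example_value_table[:, 1] = counts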

    # Get the index of the example with the maximum value
    max_value_example_idx = np.argmax(example_value_table[:, 0])

    # Get the index of the example with the minimum non-zero value:
    # find the indices of the examples with non-zero value first
    non_zero_value_example_idxs = np.nonzero(example_value_table[:, 0])
    min_value_example_idx_in_non_zero_array = np.argmin(example_value_table[non_zero_value_example_idxs, 0])
    min_value_example_idx = non_zero_value_example_idxs[0][min_value_example_idx_in_non_zero_array]

    sorted_example_value_idxs = np.argsort(example_value_table[:, 0])[::-1]
    print('\nSorted examples in descending value order:')
    for i in range(len(train_set)):
        print('Example idx %d, count = %d, value = %0.4f' % (sorted_example_value_idxs[i], example_value_table[sorted_example_value_idxs[i]][1], example_value_table[sorted_example_value_idxs[i]][0]))

    print('\n' + 'max value: %0.4f, example idx %d, count = %d' % (example_value_table[max_value_example_idx, 0], max_value_example_idx, example_value_table[max_value_example_idx, 1]))
    print('min value: %0.4f, example idx %d, count = %d' % (example_value_table[min_value_example_idx, 0], min_value_example_idx, example_value_table[min_value_example_idx, 1]))

    # Save the result tables to disk
    path = 'numpy_arrays/subset_size=' + str(subset_size) + '_num_of_repetitions=' + str(num_of_repetitions)
    if not os.path.exists(path):
        # makedirs also creates the parent 'numpy_arrays/' directory if needed
        os.makedirs(path)
    np.save(path + '/acc_table_subset_size=' + str(subset_size) + '_num_of_repetitions=' + str(num_of_repetitions), acc_table)
    np.save(path + '/idxs_table_subset_size=' + str(subset_size) + '_num_of_repetitions=' + str(num_of_repetitions), idxs_table)
    np.save(path + '/example_value_table_subset_size=' + str(subset_size) + '_num_of_repetitions=' + str(num_of_repetitions), example_value_table)

    # Restore stdout and close the log file
    sys.stdout = orig_stdout
    f.close()
    print('Done!')
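
Since every rank participates in the collective operations, the script must be
launched under MPI. A hypothetical invocation with four processes, 1000-example
subsets, and 100 repetitions (reps must be a multiple of the process count):

    mpiexec -n 4 python knn_random_subsets.py 1000 100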