Commit 263e8859 authored by Fanis Baikas

First commit. Added kNN classifier class compatible with PyTorch GPU acceleration. An MPI execution environment is required to run knn_random_subsets.py
classifiers/k_nearest_neighbors.py:

import numpy as np
import torch


class kNearestNeighbors(object):
    """ A k-Nearest-Neighbors classifier with L2 distance """

    def __init__(self):
        pass

    def train(self, X, y, device):
        """
        Train the classifier. For k-Nearest-Neighbors, training involves just
        storing the training data.
        :param X: A torch Tensor of shape [num_train, D] containing num_train training examples,
                  each of dimension D.
        :param y: A torch Tensor of shape [num_train] containing the training labels,
                  where y[i] is the label for X[i].
        :param device: The device name used by torch for acceleration (cpu, cuda, mps, etc.)
        """
        self.X_train = X
        self.y_train = y
        self.device = device

    def predict(self, X, k=1):
        """
        Predict labels for the test data using this classifier.
        :param X: A torch Tensor of shape [num_test, D] containing num_test test examples,
                  each of dimension D.
        :param k: The number of nearest neighbors that vote for the predicted labels.
        :return y: A torch Tensor of shape [num_test] containing predicted labels for the
                   test data, where y[i] is the predicted label for the test point X[i].
        """
        dists = self.compute_distances(X)
        return self.predict_labels(dists, k=k)

    def compute_distances(self, X):
        """
        Compute the L2 distance matrix between each test point in X and each training point
        in self.X_train. The matrix is computed using only matrix operations, without any loops.
        :param X: A torch Tensor of shape [num_test, D] containing num_test test examples,
                  each of dimension D.
        :return dists: A torch Tensor of shape [num_test, num_train] where dists[i, j] is the
                       (squared) L2 distance between the ith test point and the jth training point.
        """
        # The squared distance between a test vector and a training vector is
        # ||x_test - x_train||^2 (the square root is skipped for efficiency, since it does not
        # change the neighbor ranking). Expanding the square gives
        # x_test.x_test + x_train.x_train - 2 * x_test.x_train.
        # Because we are dealing with multiple test and train examples, we can create two
        # matrices of shape (num_test, num_train) containing all the dot products needed:
        # X_dot holds the self dot products of the test examples, copied across num_train columns;
        # X_train_dot holds the self dot products of the training examples, copied across num_test rows.
        X_dot = torch.mul(X, X).sum(axis=1).reshape((X.shape[0], 1)) * torch.ones(size=(1, self.X_train.shape[0])).to(self.device)
        X_train_dot = torch.mul(self.X_train, self.X_train).sum(axis=1) * torch.ones(size=(X.shape[0], 1)).to(self.device)
        dists = X_dot + X_train_dot - 2 * torch.mm(X, self.X_train.T)
        return dists
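
    # A quick sanity check for the vectorized computation above (a sketch, not
    # part of the class API): torch.cdist returns true L2 distances, so its
    # square should match dists up to floating-point error:
    #   ref = torch.cdist(X, self.X_train) ** 2
    #   assert torch.allclose(dists, ref, rtol=1e-3)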

    def predict_labels(self, dists, k=1):
        """
        Given a matrix of distances between test and training examples,
        predict a label for each test point.
        :param dists: A torch Tensor of shape [num_test, num_train] where dists[i, j] gives the
                      distance between the ith test and the jth training example.
        :param k: The number of nearest neighbors that vote for the predicted labels.
        :return y: A torch Tensor of shape [num_test] containing predicted labels for the test data,
                   where y[i] is the predicted label for the test point X[i].
        """
        num_test = dists.shape[0]
        y_pred_np = np.zeros(num_test, dtype=np.int64)
        # Convert to numpy arrays (the voting below runs on the CPU)
        dists_np = dists.cpu().numpy()
        y_train_np = self.y_train.cpu().numpy()
        for i in range(num_test):
            # Sort the indices of the ith row of the dists matrix by increasing distance
            closest_y = np.argsort(dists_np[i, :])
            # Keep only the first k indices
            closest_y = closest_y[0:k]
            # Recover the labels of the k training examples closest to the test example
            kNN_labels = y_train_np[closest_y]
            # Find the most common label (in case of a tie, select the smallest label)
            kNN_labels_counts = np.bincount(kNN_labels)
            y_pred_np[i] = np.argmax(kNN_labels_counts)
        # Move the predictions to the same device as the training labels so that
        # comparisons against labels stored on the GPU do not fail
        y_pred = torch.from_numpy(y_pred_np).to(self.device)
        return y_pred
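
A minimal usage sketch for the classifier above (hypothetical tensors; any float data of matching dimensions works):

    import torch
    from classifiers.k_nearest_neighbors import kNearestNeighbors

    device = "cuda" if torch.cuda.is_available() else "cpu"
    X_train = torch.randn(500, 784).to(device)          # 500 training examples of dimension 784
    y_train = torch.randint(0, 10, (500,)).to(device)   # integer class labels in [0, 10)
    X_test = torch.randn(20, 784).to(device)

    clf = kNearestNeighbors()
    clf.train(X_train, y_train, device)
    y_pred = clf.predict(X_test, k=3)                   # Tensor of 20 predicted labels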

knn_random_subsets.py:

from mpi4py import MPI
import argparse
import sys
import os
import numpy as np
import torch
from torchvision import datasets
from torchvision.transforms import ToTensor
from classifiers.k_nearest_neighbors import kNearestNeighbors


def positive_int(s: str) -> int:
    try:
        v = int(s)
    except ValueError:
        raise argparse.ArgumentTypeError(f'Expected integer, got {s!r}')
    if v <= 0:
        raise argparse.ArgumentTypeError(f'Expected positive integer, got {v}')
    return v


def get_random_balanced_subset_indices(dataset, subset_size):
    idxs = []
    dataset_size = len(dataset)
    for class_num, _ in enumerate(dataset.classes):
        # Get all the indices corresponding to the class number
        class_idxs = torch.nonzero(dataset.targets == class_num)
        # Remove unnecessary dimensions and convert to a numpy array
        class_idxs = torch.squeeze(class_idxs).numpy()
        # Select a number of examples from this class proportional to the share
        # of the complete dataset that the class occupies
        class_subset_size = int((class_idxs.size / dataset_size) * subset_size)
        # Get a random sample of the indices, without replacement
        class_idxs = np.random.choice(class_idxs, class_subset_size, replace=False)
        idxs += list(class_idxs)
    return idxs
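
# A sketch of how the helper above behaves on FashionMNIST, where each of the
# 10 classes holds exactly 1/10 of the 60000 training examples, so a balanced
# subset draws subset_size // 10 indices per class:
#   idxs = get_random_balanced_subset_indices(train_set, subset_size=100)
#   len(idxs)  # -> 100, ten random indices from each class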


def ACC(classifier, X_train, y, X_test, y_test, device):
    classifier.train(X_train, y, device)
    y_pred = classifier.predict(X_test, k=3)
    num_correct = torch.sum(y_test == y_pred)
    # Return the accuracy as a plain Python float so it can be stored in the numpy tables
    acc = (num_correct / y_test.shape[0]).item()
    return acc


# Get the MPI communicator object
comm = MPI.COMM_WORLD

# Create send and receive buffers for the tables that will hold the test accuracies and subset indices
acc_table_send_buffer = []
acc_table_rec_buffer = []
idxs_table_send_buffer = []
idxs_table_rec_buffer = []

if comm.rank == 0:
    # Create an argument parser to set the subset_size and num_of_repetitions
    parser = argparse.ArgumentParser(
        description='Script for evaluating the accuracy of a kNN classifier on multiple random subsets of the FashionMNIST training set')
    parser.add_argument('subset_size', type=positive_int,
                        help='The size of the random subsets generated by the program')
    parser.add_argument('reps', type=positive_int,
                        help='The total number of random subsets the program will generate for testing the accuracy of the kNN classifier')
    args = vars(parser.parse_args())
    subset_size = args['subset_size']
    num_of_repetitions = args['reps']

    print('Program started.')
    print('Number of available processor cores:', comm.size)
    print('Running...')

    # Redirect stdout of the first process to write output to a file
    orig_stdout = sys.stdout
    os.makedirs('log', exist_ok=True)
    f = open('log/output_log_subset_size=' + str(subset_size) + '_num_of_repetitions=' + str(num_of_repetitions) + '.txt', 'w')
    sys.stdout = f

    # Raise an exception if the number of repetitions is not evenly divisible by the number of cores
    if num_of_repetitions % comm.size != 0:
        raise Exception('The specified number of repetitions should be a multiple of the number of available cores.')

    # To split the computational load equally among the processors, the total number of
    # repetitions is divided by the number of available processor cores
    repetitions_per_processor = int(num_of_repetitions / comm.size)

    # Check if GPU acceleration is available
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using device: {device}")
    print('subset_size:', subset_size)
    print('num_of_repetitions:', num_of_repetitions, '\n')

    # Obtain the FashionMNIST dataset
    train_set = datasets.FashionMNIST(root='datasets/', download=True, transform=ToTensor(), train=True)
    test_set = datasets.FashionMNIST(root='datasets/', download=True, transform=ToTensor(), train=False)

    X_train = train_set.data.type(torch.float32)
    y_train = train_set.targets.to(device)
    X_test = test_set.data.type(torch.float32)
    y_test = test_set.targets.to(device)

    print('X_train shape:', X_train.shape, X_train.dtype)
    print('y_train shape:', y_train.shape, y_train.dtype)
    print('X_test shape:', X_test.shape, X_test.dtype)
    print('y_test shape:', y_test.shape, y_test.dtype, '\n')

    # Flatten the 28x28 images into vectors of dimension 784
    X_train = torch.reshape(X_train, (X_train.shape[0], -1)).to(device)
    X_test = torch.reshape(X_test, (X_test.shape[0], -1)).to(device)

    # Create the tables that will hold the obtained accuracies and subset indices
    acc_table = np.zeros((comm.size, repetitions_per_processor), dtype=float)
    idxs_table = np.zeros((comm.size, repetitions_per_processor, subset_size), dtype=np.uint16)
    print('acc_table shape:', acc_table.shape)
    print('idxs_table shape:', idxs_table.shape, '\n')

    acc_table_send_buffer = acc_table
    idxs_table_send_buffer = idxs_table

if comm.rank != 0:
    subset_size = None
    num_of_repetitions = None
    repetitions_per_processor = None
    device = None
    train_set = None
    X_train = None
    y_train = None
    X_test = None
    y_test = None

# Scatter the acc_table and idxs_table arrays to the available cores
acc_table_local = comm.scatter(acc_table_send_buffer, root=0)
idxs_table_local = comm.scatter(idxs_table_send_buffer, root=0)
# Broadcast the rest of the data
subset_size = comm.bcast(subset_size, root=0)
num_of_repetitions = comm.bcast(num_of_repetitions, root=0)
repetitions_per_processor = comm.bcast(repetitions_per_processor, root=0)
device = comm.bcast(device, root=0)
train_set = comm.bcast(train_set, root=0)
X_train = comm.bcast(X_train, root=0)
y_train = comm.bcast(y_train, root=0)
X_test = comm.bcast(X_test, root=0)
y_test = comm.bcast(y_test, root=0)
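
# A minimal standalone sketch of the scatter/bcast pattern used above
# (hypothetical file sketch.py, launched with e.g. `mpiexec -n 4 python sketch.py`):
#   from mpi4py import MPI
#   comm = MPI.COMM_WORLD
#   rows = [[r] * 3 for r in range(comm.size)] if comm.rank == 0 else None
#   row = comm.scatter(rows, root=0)  # rank r receives [r, r, r]
#   msg = comm.bcast('params' if comm.rank == 0 else None, root=0)  # all ranks receive 'params'
# Here, scattering the (comm.size, repetitions_per_processor) tables hands each
# rank one row to fill in; gather later reassembles the rows on rank 0.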

# Seed each rank differently (but deterministically) for reproducible results
torch.manual_seed(comm.rank)
np.random.seed(comm.rank)

# Evaluate the accuracy multiple times
for rep in range(repetitions_per_processor):
    # Get a new balanced random subset
    idxs = get_random_balanced_subset_indices(train_set, subset_size=subset_size)
    S = X_train[idxs]
    S_y = y_train[idxs].to(device)
    classifier = kNearestNeighbors()
    acc = ACC(classifier, S, S_y, X_test, y_test, device)
    acc_table_local[rep] = acc
    idxs_table_local[rep] = idxs
    # For debugging purposes; doesn't work with the file log because of multiprocessing
    # print("Rank {}, Rep {}, Acc = {}".format(comm.rank, rep, acc))
# Gather the local arrays to the receiver buffers
acc_table_rec_buffer = comm.gather(acc_table_local, root=0)
idxs_table_rec_buffer = comm.gather(idxs_table_local, root=0)

if comm.rank == 0:
    print('Gathered acc_table on rank 0')
    acc_table = np.array(acc_table_rec_buffer).reshape(num_of_repetitions)
    # print(acc_table, acc_table.shape, '\n')
    print('Gathered idxs_table on rank 0')
    idxs_table = np.array(idxs_table_rec_buffer).reshape((num_of_repetitions, subset_size))
    # print(idxs_table, idxs_table.shape, '\n')

    # Assign a value to each example: the average accuracy obtained over all the random
    # subsets in which it was included. Store the total number of times the example was
    # included in the second column of the table.
    example_value_table = np.zeros((len(train_set), 2), dtype=float)
    # Iterate over indices directly; enumerating the dataset would needlessly decode every image
    for example_idx in range(len(train_set)):
        idxs_table_rows, _ = np.where(idxs_table == example_idx)
        if len(idxs_table_rows) != 0:
            example_value_table[example_idx][0] = np.sum(acc_table[idxs_table_rows]) / len(idxs_table_rows)
            example_value_table[example_idx][1] = len(idxs_table_rows)
            print('Example idx %d, count = %d, value = %0.4f' % (example_idx, example_value_table[example_idx][1], example_value_table[example_idx][0]))
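
    # A vectorized alternative to the np.where loop above (a sketch; equivalent
    # because sampling without replacement means an example appears at most once
    # per subset):
    #   counts = np.zeros(len(train_set))
    #   sums = np.zeros(len(train_set))
    #   np.add.at(counts, idxs_table.ravel(), 1.0)
    #   np.add.at(sums, idxs_table.ravel(), np.repeat(acc_table, subset_size))
    #   mask = counts > 0
    #   example_value_table[mask, 0] = sums[mask] / counts[mask]
    #   example_value_table[:, 1] = counts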

    # Get the index of the example with the maximum value
    max_value_example_idx = np.argmax(example_value_table[:, 0])

    # Get the index of the example with the minimum non-zero value:
    # find the indices of the examples with non-zero value first
    non_zero_value_example_idxs = np.nonzero(example_value_table[:, 0])
    min_value_example_idx_in_non_zero_array = np.argmin(example_value_table[non_zero_value_example_idxs, 0])
    min_value_example_idx = non_zero_value_example_idxs[0][min_value_example_idx_in_non_zero_array]

    sorted_example_value_idxs = np.argsort(example_value_table[:, 0])[::-1]
    print('\nSorted examples in descending value order:')
    for i in range(len(train_set)):
        print('Example idx %d, count = %d, value = %0.4f' % (sorted_example_value_idxs[i], example_value_table[sorted_example_value_idxs[i]][1], example_value_table[sorted_example_value_idxs[i]][0]))

    print('\n' + 'max value: %0.4f, example idx %d, count = %d' % (example_value_table[max_value_example_idx, 0], max_value_example_idx, example_value_table[max_value_example_idx, 1]))
    print('min value: %0.4f, example idx %d, count = %d' % (example_value_table[min_value_example_idx, 0], min_value_example_idx, example_value_table[min_value_example_idx, 1]))

    # Save the result tables to disk
    path = 'numpy_arrays/subset_size=' + str(subset_size) + '_num_of_repetitions=' + str(num_of_repetitions)
    if not os.path.exists(path):
        # makedirs also creates the parent 'numpy_arrays/' directory if needed
        os.makedirs(path)
    np.save(path + '/acc_table_subset_size=' + str(subset_size) + '_num_of_repetitions=' + str(num_of_repetitions), acc_table)
    np.save(path + '/idxs_table_subset_size=' + str(subset_size) + '_num_of_repetitions=' + str(num_of_repetitions), idxs_table)
    np.save(path + '/example_value_table_subset_size=' + str(subset_size) + '_num_of_repetitions=' + str(num_of_repetitions), example_value_table)

    # Restore stdout and close the log file
    sys.stdout = orig_stdout
    f.close()
    print('Done!')
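
Since every rank participates in the collective operations, the script must be
launched under MPI. A hypothetical invocation with four processes, 1000-example
subsets, and 100 repetitions (reps must be a multiple of the process count):

    mpiexec -n 4 python knn_random_subsets.py 1000 100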