# knn_random_subsets.py

from mpi4py import MPI
import argparse
import sys
import os

import numpy as np
import torch
from torchvision import datasets
from torchvision.transforms import ToTensor
from classifiers.k_nearest_neighbors import kNearestNeighbors
def positive_int(s: str) -> int:
    """Parse a command-line argument as a strictly positive integer.

    Intended to be passed as the ``type=`` callable of an ``argparse``
    argument.

    Args:
        s: Raw argument string.

    Returns:
        The parsed integer, which is always greater than zero.

    Raises:
        argparse.ArgumentTypeError: If ``s`` cannot be parsed by ``int`` or
            the parsed value is zero or negative.
    """
    try:
        v = int(s)
    except ValueError as err:
        # Chain the original error so the root cause stays visible in tracebacks.
        raise argparse.ArgumentTypeError(f'Expected integer, got {s!r}') from err

    if v <= 0:
        raise argparse.ArgumentTypeError(f'Expected positive integer, got {v}')

    return v
def get_random_balanced_subset_indices(dataset, subset_size):
    """Draw a random subset of example indices that preserves class proportions.

    Every class contributes a share of ``subset_size`` proportional to the
    share it occupies in the complete dataset. Shares are computed with exact
    integer arithmetic and the largest-remainder method, so the result holds
    exactly ``subset_size`` indices (assuming every label in
    ``dataset.targets`` lies in ``range(len(dataset.classes))``).

    Sampling uses NumPy's global random state (``np.random``), so results are
    reproducible after ``np.random.seed``.

    Args:
        dataset: Object exposing ``classes`` (one entry per class),
            ``targets`` (per-example integer labels, tensor or sequence) and
            ``__len__``.
        subset_size: Total number of indices to draw.

    Returns:
        A list of distinct integer indices, grouped by class in class order.

    Raises:
        ValueError: If ``subset_size`` is negative or larger than the dataset.
    """
    dataset_size = len(dataset)
    if subset_size < 0 or subset_size > dataset_size:
        raise ValueError(
            f'subset_size must be between 0 and {dataset_size}, got {subset_size}')
    if subset_size == 0:
        return []

    targets = torch.as_tensor(dataset.targets)

    # Collect the indices belonging to each class
    per_class_idxs = []
    for class_num, _ in enumerate(dataset.classes):
        # flatten() always yields a 1-D array. squeeze() would collapse a
        # single-member class to a 0-d array, which np.random.choice treats as
        # an integer population size instead of a list of candidate indices.
        class_idxs = torch.nonzero(targets == class_num).flatten().cpu().numpy()
        per_class_idxs.append(class_idxs)

    # Proportional quota per class. Integer floor division avoids the float
    # round-off of (size / dataset_size) * subset_size landing just below an
    # integer and being truncated one too low.
    quotas = [(c.size * subset_size) // dataset_size for c in per_class_idxs]
    remainders = [(c.size * subset_size) % dataset_size for c in per_class_idxs]

    # Hand the slots lost to flooring to the classes with the largest
    # remainders (ties resolved in class order) so the total is subset_size.
    leftover = subset_size - sum(quotas)
    by_remainder = sorted(range(len(quotas)), key=lambda i: -remainders[i])
    for i in by_remainder[:leftover]:
        quotas[i] += 1

    idxs = []
    for class_idxs, quota in zip(per_class_idxs, quotas):
        # Get a random sample of the class indices, without repetition
        idxs += np.random.choice(class_idxs, quota, replace=False).tolist()

    return idxs
def ACC(classifier, X_train, y, X_test, y_test, device, k=3):
    """Train ``classifier`` and return its accuracy on a test set.

    Args:
        classifier: Object providing ``train(X, y, device)`` and
            ``predict(X, k=...)``.
        X_train: Training examples forwarded to ``classifier.train``.
        y: Training labels forwarded to ``classifier.train``.
        X_test: Test examples forwarded to ``classifier.predict``.
        y_test: Ground-truth test labels; compared element-wise with the
            predictions, so it is expected to be a tensor of the same shape
            as what ``predict`` returns.
        device: Device argument forwarded to ``classifier.train``.
        k: Number of neighbours forwarded to ``classifier.predict``.
            Defaults to 3, the value that was previously hard-coded.

    Returns:
        The fraction of correct predictions: the result of dividing
        ``torch.sum`` of the matches by ``y_test.shape[0]`` (a zero-dimensional
        tensor, not a Python float).
    """
    classifier.train(X_train, y, device)
    y_pred = classifier.predict(X_test, k=k)
    num_correct = torch.sum(y_test == y_pred)
    return num_correct / y_test.shape[0]

# Handle to the communicator that spans every launched MPI process
comm = MPI.COMM_WORLD

# Placeholder send/receive buffers for the accuracy table and the subset-index table.
# Rank 0 later swaps the send buffers for the real arrays.
acc_table_send_buffer, acc_table_rec_buffer = [], []
idxs_table_send_buffer, idxs_table_rec_buffer = [], []

if comm.rank == 0:
    # Only the root rank parses the command line, loads the data and allocates
    # the result tables; everything the other ranks need is sent to them later.

    # Create an argument parser to set the subset_size and num_of_repetitions
    parser = argparse.ArgumentParser(
        'Script for evaluating the accuracy of kNN classifier on multiple random subsets of the FashionMNIST training set')
    parser.add_argument('subset_size', type=positive_int,
                        help='The size of the random subsets generated by the program')
    parser.add_argument('reps', type=positive_int,
                        help='The total number of random subsets the program will generate for testing the accuracy of the kNN classifier')
    try:
        args = vars(parser.parse_args())
    except SystemExit as exit_request:
        # argparse exits on invalid arguments and on --help. The other ranks
        # go straight to the collective calls further down, so abort the whole
        # job instead of letting only this rank leave.
        sys.stdout.flush()
        sys.stderr.flush()
        comm.Abort(exit_request.code if isinstance(exit_request.code, int) else 1)
        raise  # fallback in case Abort returns

    subset_size = args['subset_size']
    num_of_repetitions = args['reps']

    # The number of repetitions must be fully divisible by the number of cores.
    # Check this before touching stdout or the log file, and abort every rank:
    # raising on rank 0 alone can leave the others blocked in a collective.
    if num_of_repetitions % comm.size != 0:
        error_message = 'The specified number of iterations should be a multiple of the number of available cores.'
        print(error_message, file=sys.stderr, flush=True)
        comm.Abort(1)
        raise Exception(error_message)  # fallback in case Abort returns

    print('Program started.')
    print('Number of available processor cores:', comm.size)
    print('Running...')

    # Redirect stdout of the first process to write output to file.
    # orig_stdout and f are restored/closed at the very end of the script.
    orig_stdout = sys.stdout

    # Create log directory if it doesn't exist
    os.makedirs('log', exist_ok=True)

    f = open(f'log/output_log_subset_size={subset_size}_num_of_repetitions={num_of_repetitions}.txt', 'w')
    sys.stdout = f

    # To split the computational load equally among the processors, the total number of repetitions is divided by the number
    # of available processor cores (exact, thanks to the divisibility check above)
    repetitions_per_processor = num_of_repetitions // comm.size

    # Use GPU acceleration when it is available
    device = "cuda" if torch.cuda.is_available() else "cpu"

    print(f"Using device: {device}")
    print('subset_size:', subset_size)
    print('num_of_repetitions:', num_of_repetitions, '\n')

    # Obtain the FashionMNIST dataset
    train_set = datasets.FashionMNIST(root='datasets/', download=True, transform=ToTensor(), train=True)
    test_set = datasets.FashionMNIST(root='datasets/', download=True, transform=ToTensor(), train=False)

    X_train = train_set.data.type(torch.float32)
    y_train = train_set.targets.to(device)

    X_test = test_set.data.type(torch.float32)
    y_test = test_set.targets.to(device)

    print('X_train shape:', X_train.shape, X_train.dtype)
    print('y_train shape:', y_train.shape, y_train.dtype)
    print('X_test shape:', X_test.shape, X_test.dtype)
    print('y_test shape:', y_test.shape, y_test.dtype, '\n')

    # Flatten every example into a single feature vector and move it to the device
    X_train = torch.reshape(X_train, (X_train.shape[0], -1)).to(device)
    X_test = torch.reshape(X_test, (X_test.shape[0], -1)).to(device)

    # Create the tables that will hold the obtained accuracies and subset indices:
    # one row per rank, one entry per repetition handled by that rank.
    # NOTE(review): uint16 caps stored indices at 65535 - confirm the training set is not larger.
    acc_table = np.zeros((comm.size, repetitions_per_processor), dtype=float)
    idxs_table = np.zeros((comm.size, repetitions_per_processor, subset_size), dtype=np.uint16)

    print('acc_table shape:', acc_table.shape)
    print('idxs_table shape:', idxs_table.shape, '\n')

    acc_table_send_buffer = acc_table
    idxs_table_send_buffer = idxs_table

if comm.rank != 0:
    # Non-root ranks only need these names to exist so they can be passed to
    # the broadcast calls below, which overwrite them with rank 0's values.
    subset_size = num_of_repetitions = repetitions_per_processor = None
    device = train_set = None
    X_train = y_train = None
    X_test = y_test = None

# Scatter the acc_table and idxs_table arrays to the available cores.
# On rank 0 both tables have comm.size entries along their first axis, so each
# rank (root included) is expected to receive one slice of each table.
# On the other ranks the send buffers are still the empty placeholder lists
# (presumably ignored by mpi4py on non-root ranks - see the mpi4py docs).
# NOTE: every rank must execute these collective calls in this exact order.
acc_table_local = comm.scatter(acc_table_send_buffer, root=0)
idxs_table_local = comm.scatter(idxs_table_send_buffer, root=0)

# Broadcast the rest of the data: rank 0 supplies the values, the other ranks
# pass their None placeholders and get rank 0's objects back.
subset_size = comm.bcast(subset_size, root=0)
num_of_repetitions = comm.bcast(num_of_repetitions, root=0)
repetitions_per_processor = comm.bcast(repetitions_per_processor, root=0)
device = comm.bcast(device, root=0)
# The whole training-set object is sent; it is passed to
# get_random_balanced_subset_indices in the evaluation loop below.
train_set = comm.bcast(train_set, root=0)
X_train = comm.bcast(X_train, root=0)
y_train = comm.bcast(y_train, root=0)
X_test = comm.bcast(X_test, root=0)
y_test = comm.bcast(y_test, root=0)

# Set random seeds for reproducible results; seeding with the rank gives every
# process its own random stream, so the ranks draw different subsets
torch.manual_seed(comm.rank)
np.random.seed(comm.rank)

# Evaluate accuracy multiple times
for rep in range(repetitions_per_processor):
    # Get a new balanced random subset
    idxs = get_random_balanced_subset_indices(train_set, subset_size=subset_size)
    S = X_train[idxs]
    S_y = y_train[idxs].to(device)

    # A fresh classifier per repetition, trained only on the current subset
    classifier = kNearestNeighbors()
    acc = ACC(classifier, S, S_y, X_test, y_test, device)

    # ACC returns a torch tensor that may live on the GPU. Convert it to a
    # host-side Python float before storing it in the NumPy table, because
    # NumPy cannot read device tensors directly.
    acc_table_local[rep] = float(acc)
    idxs_table_local[rep] = idxs

    # For debugging purposes, doesn't work with file log because of multiprocessing
    # print("Rank {}, Rep {}, Acc = {}".format(comm.rank, rep, acc))

# Gather the local arrays to the receiver buffers; only rank 0 uses the
# gathered results afterwards
acc_table_rec_buffer = comm.gather(acc_table_local, root=0)
idxs_table_rec_buffer = comm.gather(idxs_table_local, root=0)

if comm.rank == 0:
    # Rank 0 merges the gathered results, scores every training example,
    # writes the report to the log and saves the tables. The try/finally
    # guarantees stdout is restored and the log is closed even on failure.
    try:
        print('Gathered acc_table on rank 0')
        acc_table = np.array(acc_table_rec_buffer).reshape(num_of_repetitions)

        print('Gathered idxs_table on rank 0')
        idxs_table = np.array(idxs_table_rec_buffer).reshape((num_of_repetitions, subset_size))

        # Assign a value to each example as the average accuracy obtained from all the random subsets in which it was included
        # Store the total number of times the example was included in the second column of the table
        num_examples = len(train_set)
        example_value_table = np.zeros((num_examples, 2), dtype=float)

        # One pass over the index table instead of one full table scan per
        # example: bincount yields, for every example index, how often it was
        # drawn and (via weights) the summed accuracy of the subsets holding it.
        # Row r of idxs_table belongs to acc_table[r], hence the repeat.
        flat_idxs = idxs_table.ravel().astype(np.intp)
        flat_accs = np.repeat(acc_table, idxs_table.shape[1])
        counts = np.bincount(flat_idxs, minlength=num_examples)[:num_examples]
        acc_sums = np.bincount(flat_idxs, weights=flat_accs, minlength=num_examples)[:num_examples]

        # Examples that were never drawn keep value 0 and count 0
        was_drawn = counts > 0
        example_value_table[was_drawn, 0] = acc_sums[was_drawn] / counts[was_drawn]
        example_value_table[:, 1] = counts

        # Iterate over plain indices; iterating the dataset itself would load
        # and transform every image for no reason
        for example_idx in range(num_examples):
            print('Example idx %d, count = %d, value = %0.4f' % (example_idx, example_value_table[example_idx][1], example_value_table[example_idx][0]))

        # Get the index of the example with the maximum value
        max_value_example_idx = np.argmax(example_value_table[:, 0])

        # Get the index of the example with the minimum non-zero value
        # Find the indices of the non-zero value examples first
        non_zero_value_example_idxs = np.nonzero(example_value_table[:, 0])[0]
        min_value_example_idx_in_non_zero_array = np.argmin(example_value_table[non_zero_value_example_idxs, 0])
        min_value_example_idx = non_zero_value_example_idxs[min_value_example_idx_in_non_zero_array]

        sorted_example_value_idxs = np.argsort(example_value_table[:, 0])[::-1]
        print('\nSorted examples in descending value order:')
        for sorted_idx in sorted_example_value_idxs:
            print('Example idx %d, count = %d, value = %0.4f' % (sorted_idx, example_value_table[sorted_idx][1], example_value_table[sorted_idx][0]))

        print('\n' + 'max value: %0.4f, example idx %d, count = %d' % (example_value_table[max_value_example_idx, 0], max_value_example_idx, example_value_table[max_value_example_idx, 1]))
        print('min value: %0.4f, example idx %d, count = %d' % (example_value_table[min_value_example_idx, 0], min_value_example_idx, example_value_table[min_value_example_idx, 1]))

        # Save the three tables in a directory named after the run parameters
        run_tag = f'subset_size={subset_size}_num_of_repetitions={num_of_repetitions}'
        path = 'numpy_arrays/' + run_tag + '/'
        os.makedirs(path, exist_ok=True)

        np.save(f'{path}acc_table_{run_tag}.npy', acc_table)
        np.save(f'{path}idxs_table_{run_tag}.npy', idxs_table)
        np.save(f'{path}example_value_table_{run_tag}.npy', example_value_table)
    finally:
        # Undo the stdout redirection set up at start-up and close the log
        sys.stdout = orig_stdout
        f.close()

    print('Done!')