Skip to content
Snippets Groups Projects
Select Git revision
  • 0aaa27ebc15171f555caf9cccd6164dfca2e5970
  • master default protected
2 results

knn_random_subsets.py

Blame
  • knn_random_subsets.py 9.55 KiB
    from mpi4py import MPI
    import argparse
    import sys
    import os
    
    import numpy as np
    import torch
    from torchvision import datasets
    from torchvision.transforms import ToTensor
    from classifiers.k_nearest_neighbors import kNearestNeighbors
    def positive_int(s: str) -> int:
        """Convert a command-line token to a strictly positive integer.

        Intended to be used as an ``argparse`` ``type=`` callable.

        Args:
            s: The raw argument string.

        Returns:
            The parsed integer, guaranteed to be greater than zero.

        Raises:
            argparse.ArgumentTypeError: If ``s`` is not an integer literal, or if
                the parsed value is zero or negative.
        """
        try:
            parsed = int(s)
        except ValueError:
            message = f'Expected integer, got {s!r}'
            raise argparse.ArgumentTypeError(message)

        if parsed > 0:
            return parsed

        raise argparse.ArgumentTypeError(f'Expected positive integer, got {parsed}')
    def get_random_balanced_subset_indices(dataset, subset_size):
        """Draw a random, class-stratified sample of example indices from ``dataset``.

        Every class contributes a number of examples proportional to the share of
        the dataset it occupies. Shares that are not whole numbers are rounded
        down, and the examples still missing are handed out one per class in order
        of decreasing fractional share (ties go to the lower class number), so the
        result holds exactly ``subset_size`` indices whenever every label in
        ``dataset.targets`` is the position of an entry of ``dataset.classes``.

        Args:
            dataset: Object providing ``classes`` (one entry per class),
                ``targets`` (a 1-D CPU torch tensor of integer labels that is
                compared against the class position) and ``__len__``.
            subset_size: Total number of indices to draw.

        Returns:
            A list of NumPy integer indices, grouped by class in the order of
            ``dataset.classes``. Sampling uses the global ``np.random`` state.

        Raises:
            ValueError: If ``subset_size`` is larger than the dataset.
        """
        dataset_size = len(dataset)
        if subset_size > dataset_size:
            raise ValueError(
                f'Cannot draw {subset_size} examples without replacement from a dataset of size {dataset_size}')

        # Collect, per class, the indices of all its examples. torch.nonzero returns an (N, 1) tensor for 1-D
        # targets; taking column 0 keeps the result 1-D even when N == 1 (squeezing would give a 0-d array, which
        # np.random.choice interprets as a population size instead of a population).
        per_class_idxs = []
        for class_num, _ in enumerate(dataset.classes):
            class_mask = dataset.targets == class_num
            per_class_idxs.append(torch.nonzero(class_mask)[:, 0].numpy())

        # Proportional quota of each class, computed in exact integer arithmetic
        numerators = [int(class_idxs.size) * subset_size for class_idxs in per_class_idxs]
        quotas = [numerator // dataset_size for numerator in numerators]

        # Rounding down can leave the sample short; top it up using the largest-remainder rule
        shortfall = subset_size - sum(quotas)
        by_remainder = sorted(range(len(quotas)), key=lambda c: -(numerators[c] % dataset_size))
        for class_num in by_remainder:
            if shortfall <= 0:
                break
            if quotas[class_num] < per_class_idxs[class_num].size:
                quotas[class_num] += 1
                shortfall -= 1

        # Get a random sample of the indices of every class, in class order
        idxs = []
        for class_idxs, quota in zip(per_class_idxs, quotas):
            idxs.extend(np.random.choice(class_idxs, quota, replace=False))

        return idxs
    def ACC(classifier, X_train, y, X_test, y_test, device, k=3):
        """Fit ``classifier`` on a training set and measure its accuracy on a test set.

        Args:
            classifier: Object exposing ``train(X, y, device)`` and
                ``predict(X, k=...)``.
            X_train: Training examples handed to ``classifier.train``.
            y: Training labels handed to ``classifier.train``.
            X_test: Test examples handed to ``classifier.predict``.
            y_test: Torch tensor with the true test labels; compared element-wise
                with the predictions.
            device: Device argument forwarded to ``classifier.train``.
            k: Number of neighbours forwarded to ``classifier.predict``. Defaults
                to 3, the value that used to be hard-coded.

        Returns:
            The fraction of test labels that match the predictions, as the torch
            tensor produced by dividing the number of matches by
            ``y_test.shape[0]``.
        """
        classifier.train(X_train, y, device)
        y_pred = classifier.predict(X_test, k=k)
        num_correct = torch.sum(y_test == y_pred)
        return num_correct / y_test.shape[0]
    
    # Get the MPI communicator object
    comm = MPI.COMM_WORLD

    # Create send and receive buffers for the tables that will hold the test accuracies and subset indices.
    # Rank 0 replaces the send buffers with the real tables below; on every other rank they stay empty lists.
    acc_table_send_buffer = []
    acc_table_rec_buffer = []

    idxs_table_send_buffer = []
    idxs_table_rec_buffer = []

    if comm.rank == 0:
        # Keep a handle on the real stdout and on the (not yet opened) log file so that the error handler at the
        # end of this branch can always undo the redirection
        orig_stdout = sys.stdout
        f = None

        try:
            # Create an argument parser to set the subset_size and num_of_repetitions
            parser = argparse.ArgumentParser(
                description='Script for evaluating the accuracy of kNN classifier on multiple random subsets of the FashionMNIST training set')
            parser.add_argument('subset_size', type=positive_int,
                                help='The size of the random subsets generated by the program')
            parser.add_argument('reps', type=positive_int,
                                help='The total number of random subsets the program will generate for testing the accuracy of the kNN classifier')
            args = vars(parser.parse_args())

            subset_size = args['subset_size']
            num_of_repetitions = args['reps']

            # Reject a number of repetitions that is not fully divisible by the number of cores. This is checked
            # before stdout is redirected so that no empty log file is left behind.
            if num_of_repetitions % comm.size != 0:
                raise ValueError('The specified number of iterations should be a multiple of the number of available cores.')

            print('Program started.')
            print('Number of available processor cores:', comm.size)
            print('Running...')

            # Create log directory if it doesn't exist
            os.makedirs('log', exist_ok=True)

            # Redirect stdout of the first process to write output to file
            f = open('log/output_log_subset_size=' + str(subset_size) + '_num_of_repetitions=' + str(num_of_repetitions) + '.txt', 'w')
            sys.stdout = f

            # To split the computational load equally among the processors, the total number of repetitions is
            # divided by the number of available processor cores
            repetitions_per_processor = num_of_repetitions // comm.size

            # Check if GPU acceleration is available
            if torch.cuda.is_available():
                device = "cuda"
            else:
                device = "cpu"

            print(f"Using device: {device}")
            print('subset_size:', subset_size)
            print('num_of_repetitions:', num_of_repetitions, '\n')

            # Obtain the FashionMNIST dataset
            train_set = datasets.FashionMNIST(root='datasets/', download=True, transform=ToTensor(), train=True)
            test_set = datasets.FashionMNIST(root='datasets/', download=True, transform=ToTensor(), train=False)

            X_train = train_set.data.type(torch.float32)
            y_train = train_set.targets.to(device)

            X_test = test_set.data.type(torch.float32)
            y_test = test_set.targets.to(device)

            print('X_train shape:', X_train.shape, X_train.dtype)
            print('y_train shape:', y_train.shape, y_train.dtype)
            print('X_test shape:', X_test.shape, X_test.dtype)
            print('y_test shape:', y_test.shape, y_test.dtype, '\n')

            # Flatten every image into a single feature vector and move the data to the selected device
            X_train = torch.reshape(X_train, (X_train.shape[0], -1)).to(device)
            X_test = torch.reshape(X_test, (X_test.shape[0], -1)).to(device)

            # Create the tables that will hold the obtained accuracies and subset indices, one row per process.
            # The index dtype is the smallest one able to hold the largest training index, so indices can never
            # wrap around silently (this is still uint16 for the 60000 FashionMNIST training examples).
            idx_dtype = np.min_scalar_type(len(train_set) - 1)
            acc_table = np.zeros((comm.size, repetitions_per_processor), dtype=float)
            idxs_table = np.zeros((comm.size, repetitions_per_processor, subset_size), dtype=idx_dtype)

            print('acc_table shape:', acc_table.shape)
            print('idxs_table shape:', idxs_table.shape, '\n')

            acc_table_send_buffer = acc_table
            idxs_table_send_buffer = idxs_table
        except BaseException as exc:
            # Undo the stdout redirection so that the diagnostics reach the terminal
            sys.stdout = orig_stdout
            if f is not None:
                f.close()

            if isinstance(exc, SystemExit):
                # argparse has already printed its own message (usage error or --help)
                exit_code = exc.code if isinstance(exc.code, int) else 1
            else:
                sys.excepthook(*sys.exc_info())
                exit_code = 1

            sys.stdout.flush()
            sys.stderr.flush()

            # The other ranks go straight to the collective calls that follow and would wait there forever if
            # rank 0 simply exited, so the whole MPI job is terminated instead
            comm.Abort(exit_code)
    else:
        # Placeholders on the non-root ranks; the real values arrive through the broadcasts that follow
        subset_size = None
        num_of_repetitions = None
        repetitions_per_processor = None
        device = None
        train_set = None
        X_train = None
        y_train = None
        X_test = None
        y_test = None
    
    # Distribute the two tables: each rank receives the entry of rank 0's send buffer that matches its own rank
    # (the accuracy table first, then the index table)
    acc_table_local, idxs_table_local = [
        comm.scatter(send_buffer, root=0)
        for send_buffer in (acc_table_send_buffer, idxs_table_send_buffer)
    ]

    # Replicate rank 0's settings and data on every rank. The values are broadcast one at a time, in exactly the
    # order in which they are listed here.
    (subset_size, num_of_repetitions, repetitions_per_processor, device,
     train_set, X_train, y_train, X_test, y_test) = [
        comm.bcast(value, root=0)
        for value in (subset_size, num_of_repetitions, repetitions_per_processor, device,
                      train_set, X_train, y_train, X_test, y_test)
    ]
    
    # Set random seeds for reproducible results; using the rank as seed gives every process its own random stream
    torch.manual_seed(comm.rank)
    np.random.seed(comm.rank)

    # Evaluate accuracy multiple times
    for rep in range(repetitions_per_processor):
        # Get a new balanced random subset
        idxs = get_random_balanced_subset_indices(train_set, subset_size=subset_size)
        S = X_train[idxs]
        S_y = y_train[idxs].to(device)

        classifier = kNearestNeighbors()
        acc = ACC(classifier, S, S_y, X_test, y_test, device)

        # ACC returns a 0-d torch tensor that lives on `device`. Convert it to a plain Python float before storing
        # it, since NumPy may be unable to convert a tensor held on a CUDA device implicitly.
        acc_table_local[rep] = float(acc)
        idxs_table_local[rep] = idxs

        # For debugging purposes, doesn't work with file log because of multiprocessing
        # print("Rank {}, Rep {}, Acc = {}".format(comm.rank, rep, acc))

    # Gather the local arrays to the receiver buffers (only rank 0 receives the gathered lists)
    acc_table_rec_buffer = comm.gather(acc_table_local, root=0)
    idxs_table_rec_buffer = comm.gather(idxs_table_local, root=0)
    
    if comm.rank == 0:
        # Everything printed inside the try block goes to the log file that stdout was redirected to earlier; the
        # finally clause restores the real stdout and closes the log even if the analysis fails
        try:
            print('Gathered acc_table on rank 0')
            acc_table = np.array(acc_table_rec_buffer).reshape(num_of_repetitions)

            print('Gathered idxs_table on rank 0')
            idxs_table = np.array(idxs_table_rec_buffer).reshape((num_of_repetitions, subset_size))

            # Assign a value to each example as the average accuracy obtained from all the random subsets in which
            # it was included. Store the total number of times the example was included in the second column of
            # the table.
            num_examples = len(train_set)
            example_value_table = np.zeros((num_examples, 2), dtype=float)

            # A single pass over the index table: count how often every example occurs and add up the accuracy of
            # the subsets (rows) it occurs in. Row r of idxs_table belongs to acc_table[r], hence the repeat.
            flat_idxs = idxs_table.ravel().astype(np.intp)
            inclusion_counts = np.bincount(flat_idxs, minlength=num_examples)[:num_examples]
            acc_sums = np.bincount(flat_idxs, weights=np.repeat(acc_table, subset_size),
                                   minlength=num_examples)[:num_examples]

            # Examples that were never drawn keep value 0 and count 0
            included = inclusion_counts > 0
            example_value_table[included, 0] = acc_sums[included] / inclusion_counts[included]
            example_value_table[:, 1] = inclusion_counts

            for example_idx in range(num_examples):
                print('Example idx %d, count = %d, value = %0.4f' % (example_idx, example_value_table[example_idx][1], example_value_table[example_idx][0]))

            example_values = example_value_table[:, 0]

            # Get the index of the example with the maximum value
            max_value_example_idx = np.argmax(example_values)

            # Get the index of the example with the minimum non-zero value
            # Find the indices of the non-zero value examples first
            non_zero_value_example_idxs = np.flatnonzero(example_values)
            min_value_example_idx = non_zero_value_example_idxs[np.argmin(example_values[non_zero_value_example_idxs])]

            sorted_example_value_idxs = np.argsort(example_values)[::-1]
            print('\nSorted examples in descending value order:')
            for example_idx in sorted_example_value_idxs:
                print('Example idx %d, count = %d, value = %0.4f' % (example_idx, example_value_table[example_idx][1], example_value_table[example_idx][0]))

            print('\n' + 'max value: %0.4f, example idx %d, count = %d' % (example_value_table[max_value_example_idx, 0], max_value_example_idx, example_value_table[max_value_example_idx, 1]))
            print('min value: %0.4f, example idx %d, count = %d' % (example_value_table[min_value_example_idx, 0], min_value_example_idx, example_value_table[min_value_example_idx, 1]))

            # Save the three tables in a directory named after the run parameters
            run_tag = f'subset_size={subset_size}_num_of_repetitions={num_of_repetitions}'
            path = os.path.join('numpy_arrays', run_tag)
            os.makedirs(path, exist_ok=True)

            np.save(os.path.join(path, 'acc_table_' + run_tag + '.npy'), acc_table)
            np.save(os.path.join(path, 'idxs_table_' + run_tag + '.npy'), idxs_table)
            np.save(os.path.join(path, 'example_value_table_' + run_tag + '.npy'), example_value_table)
        finally:
            sys.stdout = orig_stdout
            f.close()

        print('Done!')