# knn_random_subsets.py
# Evaluate the accuracy of a kNN classifier on multiple random subsets of the
# FashionMNIST training set, splitting the repetitions across MPI processes.
from mpi4py import MPI
import argparse
import sys
import os
import numpy as np
import torch
from torchvision import datasets
from torchvision.transforms import ToTensor
from classifiers.k_nearest_neighbors import kNearestNeighbors
def positive_int(s: str) -> int:
    """Parse ``s`` as a strictly positive integer (argparse ``type=`` callback).

    Args:
        s: The raw command-line token.

    Returns:
        The parsed integer, guaranteed to be greater than zero.

    Raises:
        argparse.ArgumentTypeError: If ``s`` is not an integer literal or the
            parsed value is zero or negative.
    """
    try:
        v = int(s)
    except ValueError:
        # The ValueError adds nothing to the user-facing message, so drop it
        # from the exception context.
        raise argparse.ArgumentTypeError(f'Expected integer, got {s!r}') from None
    if v <= 0:
        raise argparse.ArgumentTypeError(f'Expected positive integer, got {v}')
    return v
def get_random_balanced_subset_indices(dataset, subset_size):
    """Draw a random, class-stratified sample of example indices from ``dataset``.

    Every class contributes a number of indices proportional to the share of
    the dataset it occupies, rounded down to a whole number. Sampling uses the
    global NumPy random state (``np.random``) and is done without replacement.

    Args:
        dataset: Object exposing ``classes`` (one entry per class), ``targets``
            (a tensor of integer class labels that can be compared with ``==``
            and converted with ``.numpy()``) and ``__len__``.
        subset_size: Requested total number of indices.

    Returns:
        A list of NumPy integer indices, grouped class by class. Because each
        class quota is rounded down, the list can be shorter than
        ``subset_size`` when the quotas are not whole numbers.

    Raises:
        ValueError: Propagated from ``np.random.choice`` when a class quota
            exceeds the number of examples available in that class.
    """
    idxs = []
    dataset_size = len(dataset)
    for class_num in range(len(dataset.classes)):
        # Indices of all examples of this class. ``flatten`` always yields a
        # 1-D result; ``squeeze`` would collapse a single-example class to a
        # 0-d array, which ``np.random.choice`` interprets as an integer
        # population size rather than as the index itself.
        class_idxs = torch.nonzero(dataset.targets == class_num).flatten().numpy()
        # Proportional quota, computed with integer arithmetic so that an
        # exactly-integral quota is never truncated by float rounding.
        class_subset_size = class_idxs.size * subset_size // dataset_size
        # Get a random sample of the indices
        chosen = np.random.choice(class_idxs, class_subset_size, replace=False)
        idxs.extend(chosen)
    return idxs
def ACC(classifier, X_train, y, X_test, y_test, device, k=3):
    """Fit ``classifier`` on the training data and return its test accuracy.

    Args:
        classifier: Object providing ``train(X, y, device)`` and
            ``predict(X, k=...)``.
        X_train: Training examples handed to ``classifier.train``.
        y: Training labels handed to ``classifier.train``.
        X_test: Test examples handed to ``classifier.predict``.
        y_test: Ground-truth test labels; compared element-wise with the
            predictions, and ``y_test.shape[0]`` is used as the denominator.
        device: Device argument forwarded unchanged to ``classifier.train``.
        k: Value forwarded to ``classifier.predict`` as ``k``. Defaults to 3.

    Returns:
        The number of matching predictions divided by ``y_test.shape[0]``,
        as produced by dividing the ``torch.sum`` result by that count.
    """
    classifier.train(X_train, y, device)
    y_pred = classifier.predict(X_test, k=k)
    num_correct = torch.sum(y_test == y_pred)
    return num_correct / y_test.shape[0]
# Get the MPI communicator object
comm = MPI.COMM_WORLD

# Send and receive buffers for the tables that will hold the test accuracies and subset indices.
# They stay empty lists on every rank except where they are reassigned below.
acc_table_send_buffer = []
acc_table_rec_buffer = []
idxs_table_send_buffer = []
idxs_table_rec_buffer = []

if comm.rank == 0:
    # Only rank 0 parses the command line; the values are broadcast to the other ranks later
    parser = argparse.ArgumentParser(
        'Script for evaluating the accuracy of kNN classifier on multiple random subsets of the FashionMNIST training set')
    parser.add_argument('subset_size', type=positive_int,
                        help='The size of the random subsets generated by the program')
    parser.add_argument('reps', type=positive_int,
                        help='The total number of random subsets the program will generate for testing the accuracy of the kNN classifier')
    args = vars(parser.parse_args())
    subset_size = args['subset_size']
    num_of_repetitions = args['reps']

    print('Program started.')
    print('Number of available processor cores:', comm.size)
    print('Running...')

    # The repetitions must split evenly across the cores. Check this before stdout is redirected, and abort
    # the whole MPI job: raising on rank 0 alone would leave the other ranks blocked in the collective calls
    # that follow.
    if num_of_repetitions % comm.size != 0:
        print('The specified number of iterations should be a multiple of the number of available cores.',
              file=sys.stderr, flush=True)
        comm.Abort(1)

    # Redirect stdout of the first process to write output to file
    orig_stdout = sys.stdout
    # Create log directory if it doesn't exist
    os.makedirs('log', exist_ok=True)
    f = open(f'log/output_log_subset_size={subset_size}_num_of_repetitions={num_of_repetitions}.txt', 'w')
    sys.stdout = f

    # To split the computational load equally among the processors, the total number of repetitions is divided
    # by the number of available processor cores
    repetitions_per_processor = num_of_repetitions // comm.size

    # Check if GPU acceleration is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    print('subset_size:', subset_size)
    print('num_of_repetitions:', num_of_repetitions, '\n')

    # Obtain the FashionMNIST dataset
    train_set = datasets.FashionMNIST(root='datasets/', download=True, transform=ToTensor(), train=True)
    test_set = datasets.FashionMNIST(root='datasets/', download=True, transform=ToTensor(), train=False)
    X_train = train_set.data.type(torch.float32)
    y_train = train_set.targets.to(device)
    X_test = test_set.data.type(torch.float32)
    y_test = test_set.targets.to(device)
    print('X_train shape:', X_train.shape, X_train.dtype)
    print('y_train shape:', y_train.shape, y_train.dtype)
    print('X_test shape:', X_test.shape, X_test.dtype)
    print('y_test shape:', y_test.shape, y_test.dtype, '\n')

    # Flatten every example into a single feature vector and move the data to the selected device
    X_train = torch.reshape(X_train, (X_train.shape[0], -1)).to(device)
    X_test = torch.reshape(X_test, (X_test.shape[0], -1)).to(device)

    # Create the tables that will hold the obtained accuracies and subset indices, one row per rank.
    # NOTE(review): uint16 can only represent indices below 65536 - confirm the training set is small enough.
    acc_table = np.zeros((comm.size, repetitions_per_processor), dtype=float)
    idxs_table = np.zeros((comm.size, repetitions_per_processor, subset_size), dtype=np.uint16)
    print('acc_table shape:', acc_table.shape)
    print('idxs_table shape:', idxs_table.shape, '\n')
    acc_table_send_buffer = acc_table
    idxs_table_send_buffer = idxs_table
if comm.rank != 0:
    # Ranks other than 0 have not computed any of these values; bind each name to None so it exists
    # when it is passed to the broadcasts below.
    subset_size = num_of_repetitions = repetitions_per_processor = None
    device = None
    train_set = None
    X_train = y_train = None
    X_test = y_test = None
# Scatter the acc_table and idxs_table arrays to the available cores
# On rank 0 the send buffers are the tables whose first axis has length comm.size; on the other ranks
# they are still the empty lists created at the top of the script.
# NOTE: these are collective calls, so every rank has to reach them in this same order.
acc_table_local = comm.scatter(acc_table_send_buffer, root=0)
idxs_table_local = comm.scatter(idxs_table_send_buffer, root=0)
# Broadcast the rest of the data
# Each name holds rank 0's value on rank 0 and the None placeholder elsewhere; it is rebound to the
# value returned by bcast.
subset_size = comm.bcast(subset_size, root=0)
num_of_repetitions = comm.bcast(num_of_repetitions, root=0)
repetitions_per_processor = comm.bcast(repetitions_per_processor, root=0)
# NOTE(review): the device string chosen on rank 0 is reused on every rank - confirm all ranks
# have the same hardware available.
device = comm.bcast(device, root=0)
train_set = comm.bcast(train_set, root=0)
X_train = comm.bcast(X_train, root=0)
y_train = comm.bcast(y_train, root=0)
X_test = comm.bcast(X_test, root=0)
y_test = comm.bcast(y_test, root=0)
# Set random seeds for reproducible results; seeding with the rank gives every process its own sequence
torch.manual_seed(comm.rank)
np.random.seed(comm.rank)

# Evaluate accuracy multiple times
for rep in range(repetitions_per_processor):
    # Get a new balanced random subset
    idxs = get_random_balanced_subset_indices(train_set, subset_size=subset_size)
    S = X_train[idxs]
    S_y = y_train[idxs].to(device)

    # A fresh classifier is trained on every subset
    classifier = kNearestNeighbors()
    acc = ACC(classifier, S, S_y, X_test, y_test, device)

    # ACC returns the result of a tensor division; extract the plain scalar so it can be stored in the
    # NumPy table even when the tensor does not live in host memory.
    acc_table_local[rep] = float(acc)
    # The row has subset_size slots, so this assignment requires idxs to contain exactly subset_size entries
    idxs_table_local[rep] = idxs
    # For debugging purposes, doesn't work with file log because of multiprocessing
    # print("Rank {}, Rep {}, Acc = {}".format(comm.rank, rep, acc))
# Gather the local arrays to the receiver buffers (collective calls: every rank takes part)
acc_table_rec_buffer = comm.gather(acc_table_local, root=0)
idxs_table_rec_buffer = comm.gather(idxs_table_local, root=0)

if comm.rank == 0:
    # Everything printed here goes to the log file opened during setup. The finally clause guarantees that
    # stdout is restored and the log is closed even if the post-processing fails.
    try:
        print('Gathered acc_table on rank 0')
        acc_table = np.array(acc_table_rec_buffer).reshape(num_of_repetitions)
        # print(acc_table, acc_table.shape, '\n')
        print('Gathered idxs_table on rank 0')
        idxs_table = np.array(idxs_table_rec_buffer).reshape((num_of_repetitions, subset_size))
        # print(idxs_table, idxs_table.shape, '\n')

        # Assign a value to each example as the average accuracy obtained from all the random subsets in which
        # it was included. Store the total number of times the example was included in the second column.
        num_examples = len(train_set)
        example_value_table = np.zeros((num_examples, 2), dtype=float)
        # Only the index is needed, so iterate over the index range instead of loading every example
        for example_idx in range(num_examples):
            # Rows (subsets) of idxs_table that contain this example
            idxs_table_rows, _ = np.where(idxs_table == example_idx)
            if len(idxs_table_rows) != 0:
                example_value_table[example_idx][0] = np.sum(acc_table[idxs_table_rows]) / len(idxs_table_rows)
                example_value_table[example_idx][1] = len(idxs_table_rows)
            print('Example idx %d, count = %d, value = %0.4f' % (example_idx, example_value_table[example_idx][1], example_value_table[example_idx][0]))

        example_values = example_value_table[:, 0]
        # Get the index of the example with the maximum value
        max_value_example_idx = np.argmax(example_values)
        # Get the index of the example with the minimum non-zero value
        # Find the indices of the non-zero value examples first (a value of zero also marks examples
        # that were never selected)
        non_zero_value_example_idxs = np.nonzero(example_values)
        min_value_example_idx_in_non_zero_array = np.argmin(example_values[non_zero_value_example_idxs])
        min_value_example_idx = non_zero_value_example_idxs[0][min_value_example_idx_in_non_zero_array]

        sorted_example_value_idxs = np.argsort(example_values)[::-1]
        print('\nSorted examples in descending value order:')
        for sorted_idx in sorted_example_value_idxs:
            print('Example idx %d, count = %d, value = %0.4f' % (sorted_idx, example_value_table[sorted_idx][1], example_value_table[sorted_idx][0]))
        print('\n' + 'max value: %0.4f, example idx %d, count = %d' % (example_value_table[max_value_example_idx, 0], max_value_example_idx, example_value_table[max_value_example_idx, 1]))
        print('min value: %0.4f, example idx %d, count = % d' % (example_value_table[min_value_example_idx, 0], min_value_example_idx, example_value_table[min_value_example_idx, 1]))

        # Save the three tables under a directory named after the run configuration
        run_tag = 'subset_size=' + str(subset_size) + '_num_of_repetitions=' + str(num_of_repetitions)
        path = 'numpy_arrays/' + run_tag + '/'
        os.makedirs(path, exist_ok=True)
        np.save(path + 'acc_table_' + run_tag + '.npy', acc_table)
        np.save(path + 'idxs_table_' + run_tag + '.npy', idxs_table)
        np.save(path + 'example_value_table_' + run_tag + '.npy', example_value_table)
    finally:
        sys.stdout = orig_stdout
        f.close()
    print('Done!')