csci5521/assignments/hwk02/KNN.m
2023-10-21 23:32:00 -05:00

54 lines
No EOL
2.3 KiB
Matlab

% KNN  Test error of a k-nearest-neighbors classifier.
%
%   test_err = KNN(k, training_data, test_data, training_labels, test_labels)
%
%   Each row of test_data is classified by a majority vote among the labels
%   of its k nearest training rows (Euclidean distance). When two or more
%   classes share the highest vote count, the tie is broken in favor of the
%   tied class whose training samples are, on average, closest to the test
%   point (mean distance over ALL training samples of that class).
%
%   Inputs:
%     k               - number of neighbors (positive integer); values larger
%                       than the number of training rows are clamped to it.
%     training_data   - one training sample per row.
%     test_data       - one test sample per row, same number of columns as
%                       training_data.
%     training_labels - numeric label for each training row.
%     test_labels     - numeric label for each test row.
%
%   Output:
%     test_err        - fraction of test rows whose predicted label differs
%                       from test_labels (NaN when test_data has no rows).
function [test_err] = KNN(k, training_data, test_data, training_labels, test_labels)
    if ~isscalar(k) || k < 1 || k ~= floor(k)
        error('KNN:invalidK', 'k must be a positive integer.');
    end

    % Force column orientation so indexing and the final comparison are
    % element-wise regardless of how the caller shaped the label vectors.
    training_labels = training_labels(:);
    test_labels = test_labels(:);

    n = size(test_data, 1);                 % number of test points
    k = min(k, size(training_data, 1));     % cannot use more neighbors than exist
    preds = zeros(n, 1);                    % predicted label per test point

    % pairwise_distance(i, t) = Euclidean distance from training row i to test row t
    pairwise_distance = pdist2(training_data, test_data);

    unique_classes = unique(training_labels);
    num_classes = numel(unique_classes);

    for t = 1:n
        distances = pairwise_distance(:, t);

        % Labels of the k nearest training points.
        [~, order] = sort(distances, 'ascend');
        neighbor_labels = training_labels(order(1:k));

        % Vote count per class, indexed by position in unique_classes so that
        % arbitrary (zero, negative, non-contiguous) label values are handled.
        [~, class_idx] = ismember(neighbor_labels, unique_classes);
        votes = accumarray(class_idx, 1, [num_classes, 1]);

        % Classes holding the highest vote count.
        top_classes = find(votes == max(votes));

        if numel(top_classes) > 1
            % Tie: choose the tied class with the smallest mean distance from
            % this test point to all of that class's training samples.
            mean_dist = zeros(numel(top_classes), 1);
            for j = 1:numel(top_classes)
                in_class = (training_labels == unique_classes(top_classes(j)));
                mean_dist(j) = mean(distances(in_class));
            end
            [~, closest] = min(mean_dist);
            preds(t) = unique_classes(top_classes(closest));
        else
            preds(t) = unique_classes(top_classes);
        end
    end

    test_err = sum(preds ~= test_labels) / n; % error rate
end % Function end