csci5521/assignments/hwk02/KNN.m

% KNN  Test error of a k-nearest-neighbours classifier.
%
%   test_err = KNN(k, training_data, test_data, training_labels, test_labels)
%
%   Each row of test_data is classified by majority vote among the labels of
%   its k nearest rows of training_data (Euclidean distance, via pdist2).
%   When several classes share the highest vote count, the tie is broken in
%   favour of the tied class whose training points are, on average, closest
%   to the test point; if that is also tied, the smallest label wins.
%
%   Inputs
%     k                number of neighbours; values larger than the number of
%                      training rows are clamped to that number
%     training_data    one training sample per row
%     test_data        one test sample per row (same number of columns)
%     training_labels  numeric label for each training row
%     test_labels      numeric label for each test row
%
%   Output
%     test_err         fraction of test rows whose predicted label differs
%                      from test_labels (NaN when test_data has no rows)
%
%   Requires pdist2 (Statistics and Machine Learning Toolbox).
function [test_err] = KNN(k, training_data, test_data, training_labels, test_labels)

  % Work with column vectors so indexing and the final comparison are
  % element-wise regardless of the orientation the labels were passed in.
  training_labels = training_labels(:);
  test_labels = test_labels(:);

  num_train = size(training_data, 1);
  n = size(test_data, 1); % number of test points

  if numel(training_labels) ~= num_train
    error('KNN:sizeMismatch', ...
          'training_labels must have one entry per row of training_data.');
  end
  if numel(test_labels) ~= n
    error('KNN:sizeMismatch', ...
          'test_labels must have one entry per row of test_data.');
  end
  if num_train == 0
    error('KNN:emptyTraining', 'training_data must contain at least one row.');
  end
  if ~isscalar(k) || k < 1
    error('KNN:invalidK', 'k must be a scalar greater than or equal to 1.');
  end
  k = min(floor(k), num_train); % cannot use more neighbours than exist

  preds = zeros(n, 1); % predicted label for each test point

  % pairwise_distance(i, t) is the Euclidean distance between training row i
  % and test row t.
  pairwise_distance = pdist2(training_data, test_data);

  % Map every training label to its position in unique_classes so that votes
  % can be counted by class index; this works for any numeric label values
  % (zero, negative, sparse, non-integer), not only 1..C.
  [unique_classes, ~, train_class_idx] = unique(training_labels);
  train_class_idx = train_class_idx(:);
  num_classes = numel(unique_classes);

  for t = 1:n
    distances = pairwise_distance(:, t);

    % indexes of the k closest training points
    [~, order] = sort(distances, 'ascend');
    neighbour_class_idx = train_class_idx(order(1:k));

    % votes(c) = how many of the k neighbours belong to unique_classes(c)
    votes = accumarray(neighbour_class_idx, 1, [num_classes, 1]);
    top_classes = find(votes == max(votes));

    if numel(top_classes) > 1
      % Tie: compare the mean distance from the test point to all training
      % points of each tied class. min returns the first minimum and
      % top_classes is ascending, so a remaining tie goes to the smallest label.
      mean_dist = arrayfun(@(c) mean(distances(train_class_idx == c)), top_classes);
      [~, best] = min(mean_dist);
      preds(t) = unique_classes(top_classes(best));
    else
      preds(t) = unique_classes(top_classes);
    end
  end

  test_err = sum(preds ~= test_labels) / n; % error rate

end % Function end
initial hwk2 2023-10-12 23:51:06 +00:00			`% implements KNN, returns the test error for the k-nearest neighbors`
			`% algorithms when using a specified number of neighbors (k) for`
			`% classification using a majority rules with tie-breaking.`
			`function [test_err] = KNN(k, training_data, test_data, training_labels, test_labels)`

			`n = length(test_data(:,1)); % get number of rows in test data`
			`preds = zeros(length(test_labels),1); % predict labels for each test point`

updates 2023-10-22 04:32:00 +00:00			`% compute pairwise euclidean distance between the test data and the training data`
			`pairwise_distance = pdist2(training_data, test_data);`

			`unique_classes = unique(training_labels);`
initial hwk2 2023-10-12 23:51:06 +00:00
			`% for each data point (row) in the test data`
			`for t = 1:n`
			`% TODO: compute k-nearest neighbors for data point`
updates 2023-10-22 04:32:00 +00:00			`distances = pairwise_distance(:,t);`
			`[~, smallest_indexes] = sort(distances, 'ascend');`
			`smallest_k_indexes = smallest_indexes(1:k);`
initial hwk2 2023-10-12 23:51:06 +00:00
updates 2023-10-22 04:32:00 +00:00			`distances_by_class = zeros(max(unique_classes), 2);`
			`for i = 1:length(unique_classes)`
			`class = unique_classes(i);`
			`this_class_distances = distances(training_labels == class,:);`
			`distances_by_class(i,1) = class;`
			`distances_by_class(i,2) = mean(this_class_distances);`
			`end`
			`distances_by_class_table = array2table(distances_by_class);`
initial hwk2 2023-10-12 23:51:06 +00:00
			`% TODO: classify test point using majority rule. Include tie-breaking`
			`% using whichever class is closer by distance. Fill in preds with the`
			`% predicted label.`
updates 2023-10-22 04:32:00 +00:00			`smallest_k_labels = training_labels(smallest_k_indexes);`

			`labels_by_count = tabulate(smallest_k_labels);`
			`labels_by_count_sorted = sortrows(labels_by_count, 2);`
			`most_frequent_label = labels_by_count_sorted(1,:);`
			`most_frequent_label_count = most_frequent_label(2);`
			`labels_that_have_most_frequent_count = labels_by_count_sorted(labels_by_count_sorted(:,2) == most_frequent_label_count,1);`
			`if length(labels_that_have_most_frequent_count) > 1`
			`common_indexes = find(ismember(distances_by_class, labels_that_have_most_frequent_count));`
			`common_distances = distances_by_class(common_indexes,:);`
			`sorted_distances = sortrows(common_distances,2);`
			`preds(t) = sorted_distances(1,1);`
			`else`
			`winning_label = mode(smallest_k_labels);`
			`preds(t) = winning_label;`
			`end`
initial hwk2 2023-10-12 23:51:06 +00:00
			`end`

			`test_err = sum(preds ~= test_labels)/n; % error rate`

			`end % Function end`