K-最近邻程序总是报告相同的 class 值
K-Nearest Neighbors program always reports same class value
我编写了一个简短的 KNN 算法实现,用于根据鸢尾花数据集中的一小部分数据判断样本点 {5.2,3.1} 的类别(class),但程序报告的类别始终是 1(Virginica)。我看不出代码中的问题出在哪里。有人能帮我弄清楚问题出在哪里、以及为什么会这样吗?
#include <iostream>
#include <math.h>
#include <string>
// Class encoding: Setosa = 0, Virginica = 1, Verscicolor = 2.
// Intended row layout: [0] and [1] = data point, [2] = class, [3] = distance.
// NOTE: each row is declared with only 3 elements, so the valid column
// indices are 0..2 and there is no storage for the [3] = distance column
// described above.
double train_data[15][3] = {
{5.3,3.7,0},{5.1,3.8,0},{7.2,3.0,1},
{5.4,3.4,0},{5.1,3.3,0},{5.4,3.9,0},
{7.4,2.8,1},{6.1,2.8,2},{7.3,2.9,1},
{6.0,2.7,2},{5.8,2.8,1},{6.3,2.3,2},
{5.1,2.5,2},{6.3,2.5,2},{5.5,2.4,2}
};
// Straight-line (Euclidean) distance between the 2-D points
// (attr1, attr2) and (sAttr1, sAttr2).
double Distance(double attr1, double attr2, double sAttr1, double sAttr2)
{
    const double firstDelta = attr1 - sAttr1;
    const double secondDelta = attr2 - sAttr2;
    const double squaredSum = pow(firstDelta, 2.0) + pow(secondDelta, 2.0);
    return sqrt(squaredSum);
}
// Returns the index (0, 1 or 2) of the largest of the three per-class tallies
// that `classes` points to. On ties the lowest index wins, because a later
// entry replaces the current best only when it is strictly greater.
int findMaxIndex(float *classes)
{
    int maxIndex = 0;
    // Entry 0 is the initial candidate, so the scan can start at 1.
    for (int i = 1; i < 3; i++) {
        if (classes[i] > classes[maxIndex])
            maxIndex = i;
    }
    // Hand back the winner of the scan rather than a constant 0.
    return maxIndex;
}
// Classifies the sample point {5.2, 3.1} against train_data with a
// K-nearest-neighbour style vote and prints the resulting label.
int main(){
// Store each row's distance to the sample point in column 3 of that row.
// NOTE: train_data rows are declared with 3 elements, so column index 3 lies
// past the end of the row.
for(int i = 0; i < 15; i++){
train_data[i][3] = Distance(train_data[i][0],train_data[i][1],5.2,3.1);
}
// Exchange sort on the value in column 3. Rows are swapped whenever the
// earlier row holds the smaller value, so the largest distances end up first
// (descending order).
for(int i = 0; i < 15; i++){
for (int j = i+1; j < 15; j++){
if (train_data[i][3] < train_data[j][3]){
// Swap rows i and j column by column (k runs over columns 0..3).
for(int k = 0; k < 4; k++){
double temp = train_data[i][k];
train_data[i][k] = train_data[j][k];
train_data[j][k] = temp;
}
}
}
}
// Tally the classes of the first K rows after sorting.
int K = 5;
// One counter per class; allocated with new[] and never released with
// delete[] in this function.
float *classes = new float[3];
for (int i =0; i < 3; i++){
classes[i] = 0;
}
for (int i = 0 ; i < K; i++)
{
// The class stored in column 2 is 0, 1 or 2, so (class - 1) is -1 for class 0.
classes[(int)train_data[i][2]-1]++;
}
// The index returned by findMaxIndex is shifted by +1 before it is printed.
int predictedLabel = findMaxIndex(classes)+1;
std::cout << "Predicted class for point {5.2,3.1} is: " << predictedLabel << std::endl;
return 0;
}
如果启用编译器警告,您会看到:
test.cpp|33 col 32| warning: array subscript 3 is above array bounds of ‘double [3]’ [-Warray-bounds]
|| 33 | train_data[i][3] = Distance(train_data[i][0],train_data[i][1],5.2,3.1);
数组索引从 0 开始,因此 `double [3]` 的有效下标只有 0、1、2,下标 3 已经越界。
随后,您还用 (class - 1) 作为 classes 数组的下标。
糟糕:class 的取值本来就是从 0 开始的,所以当 class 为 0 时,下标会变成 -1。
可以考虑从根源上避免这一整类错误:
// Training samples stored as records with named fields.
struct {
    double x;              // first attribute of the sample
    double y;              // second attribute of the sample
    int _class{0};         // class label (0, 1 or 2 in the table below)
    double distance{0.0};  // distance to the query point; 0 until computed
} train_data[] = {
    {5.3, 3.7, 0},
    {5.1, 3.8, 0},
    {7.2, 3.0, 1},
    {5.4, 3.4, 0},
    {5.1, 3.3, 0},
    {5.4, 3.9, 0},
    {7.4, 2.8, 1},
    {6.1, 2.8, 2},
    {7.3, 2.9, 1},
    {6.0, 2.7, 2},
    {5.8, 2.8, 1},
    {6.3, 2.3, 2},
    {5.1, 2.5, 2},
    {6.3, 2.5, 2},
    {5.5, 2.4, 2},
};
// Compute every sample's distance to the query point {5.2, 3.1} and store it
// in the sample's own `distance` field.
for (auto& sample : train_data) {
    sample.distance = Distance(sample.x, sample.y, 5.2, 3.1);
}
另请注意,对于这样的结构体,std::swap(node1, node2)
可以直接交换两个节点,无需逐列手动交换。或者让节点类型可排序(为其定义 operator<),这样就可以直接对其排序。
更地道的 C++ 写法
下面是我的版本:
#include <algorithm>
#include <array>
#include <cmath>
#include <iostream>
#include <string>
#include <vector>
// Iris classes. The enumerator values are used directly as indices into the
// per-class tally array, and NCLASSES is the number of classes rather than a
// class of its own.
enum Class {
    Setosa = 0,
    Virginica = 1,
    Verscicolor = 2,
    NCLASSES = 3
};
// One training sample: its two attributes, its class label and its distance
// to the current query point.
struct node {
    double x;
    double y;
    Class _class = Setosa;
    double distance = 0;

    // Ordering used when sorting samples: *this comes before `other` when its
    // distance is the larger one, so sorting with this operator puts the
    // greatest distances first.
    // NOTE(review): for picking nearest neighbours the opposite comparison
    // (distance < other.distance) is probably what is wanted - confirm.
    bool operator<(node const& other) const {
        return other.distance < distance;
    }
};
// Training set: {x, y, class} per sample; `distance` keeps its default of 0
// until a prediction fills it in.
std::vector<node> train_data{
    {5.3, 3.7, Setosa},
    {5.1, 3.8, Setosa},
    {7.2, 3.0, Virginica},
    {5.4, 3.4, Setosa},
    {5.1, 3.3, Setosa},
    {5.4, 3.9, Setosa},
    {7.4, 2.8, Virginica},
    {6.1, 2.8, Verscicolor},
    {7.3, 2.9, Virginica},
    {6.0, 2.7, Verscicolor},
    {5.8, 2.8, Virginica},
    {6.3, 2.3, Verscicolor},
    {5.1, 2.5, Verscicolor},
    {6.3, 2.5, Verscicolor},
    {5.5, 2.4, Verscicolor},
};
// Euclidean distance between the points (x1, y1) and (x2, y2).
// std::hypot computes sqrt(dx*dx + dy*dy) without undue overflow or underflow
// in the intermediate squares, which squaring via pow() does not guarantee.
double Distance(double x1, double y1, double x2, double y2) {
    return std::hypot(x1 - x2, y1 - y2);
}
// Predicts the class of the query point (x, y) by majority vote among the
// first K entries of train_data after sorting.
//
// Side effects: overwrites every sample's `distance` and reorders train_data.
// A K larger than the number of training samples is clamped to that number.
// Ties between classes resolve to the class with the lowest enumerator value,
// because std::max_element returns the first maximum it finds.
Class predict(double x, double y, unsigned K) {
    for (auto& sample : train_data) {
        sample.distance = Distance(sample.x, sample.y, x, y);
    }
    // The resulting order is whatever node::operator< defines.
    std::sort(train_data.begin(), train_data.end());

    // Never read past the end of the training set.
    const std::size_t votes = std::min<std::size_t>(K, train_data.size());

    // Tally buckets, one per class, all starting at zero.
    std::array<unsigned, NCLASSES> classes{};
    for (std::size_t i = 0; i < votes; ++i) {
        ++classes[train_data[i]._class];
    }

    const auto winner = std::max_element(classes.begin(), classes.end());
    return static_cast<Class>(winner - classes.begin());
}
int main()
{
std::cout << "Predicted class for point {5.2,3.1} is: "
<< predict(5.2, 3.1, 5) << "\n";
}
输出:
Predicted class for point {5.2,3.1} is: 1
我怀疑您的排序顺序弄反了?将排序顺序反转后,输出为:
Predicted class for point {5.2,3.1} is: 0
我编写了一个简短的 KNN 算法实现,用于根据鸢尾花数据集中的一小部分数据判断样本点 {5.2,3.1} 的类别(class),但程序报告的类别始终是 1(Virginica)。我看不出代码中的问题出在哪里。有人能帮我弄清楚问题出在哪里、以及为什么会这样吗?
#include <iostream>
#include <math.h>
#include <string>
// Class encoding: Setosa = 0, Virginica = 1, Verscicolor = 2.
// Intended row layout: [0] and [1] = data point, [2] = class, [3] = distance.
// NOTE: each row is declared with only 3 elements, so the valid column
// indices are 0..2 and there is no storage for the [3] = distance column
// described above.
double train_data[15][3] = {
{5.3,3.7,0},{5.1,3.8,0},{7.2,3.0,1},
{5.4,3.4,0},{5.1,3.3,0},{5.4,3.9,0},
{7.4,2.8,1},{6.1,2.8,2},{7.3,2.9,1},
{6.0,2.7,2},{5.8,2.8,1},{6.3,2.3,2},
{5.1,2.5,2},{6.3,2.5,2},{5.5,2.4,2}
};
// Straight-line (Euclidean) distance between the 2-D points
// (attr1, attr2) and (sAttr1, sAttr2).
double Distance(double attr1, double attr2, double sAttr1, double sAttr2)
{
    const double firstDelta = attr1 - sAttr1;
    const double secondDelta = attr2 - sAttr2;
    const double squaredSum = pow(firstDelta, 2.0) + pow(secondDelta, 2.0);
    return sqrt(squaredSum);
}
// Returns the index (0, 1 or 2) of the largest of the three per-class tallies
// that `classes` points to. On ties the lowest index wins, because a later
// entry replaces the current best only when it is strictly greater.
int findMaxIndex(float *classes)
{
    int maxIndex = 0;
    // Entry 0 is the initial candidate, so the scan can start at 1.
    for (int i = 1; i < 3; i++) {
        if (classes[i] > classes[maxIndex])
            maxIndex = i;
    }
    // Hand back the winner of the scan rather than a constant 0.
    return maxIndex;
}
// Classifies the sample point {5.2, 3.1} against train_data with a
// K-nearest-neighbour style vote and prints the resulting label.
int main(){
// Store each row's distance to the sample point in column 3 of that row.
// NOTE: train_data rows are declared with 3 elements, so column index 3 lies
// past the end of the row.
for(int i = 0; i < 15; i++){
train_data[i][3] = Distance(train_data[i][0],train_data[i][1],5.2,3.1);
}
// Exchange sort on the value in column 3. Rows are swapped whenever the
// earlier row holds the smaller value, so the largest distances end up first
// (descending order).
for(int i = 0; i < 15; i++){
for (int j = i+1; j < 15; j++){
if (train_data[i][3] < train_data[j][3]){
// Swap rows i and j column by column (k runs over columns 0..3).
for(int k = 0; k < 4; k++){
double temp = train_data[i][k];
train_data[i][k] = train_data[j][k];
train_data[j][k] = temp;
}
}
}
}
// Tally the classes of the first K rows after sorting.
int K = 5;
// One counter per class; allocated with new[] and never released with
// delete[] in this function.
float *classes = new float[3];
for (int i =0; i < 3; i++){
classes[i] = 0;
}
for (int i = 0 ; i < K; i++)
{
// The class stored in column 2 is 0, 1 or 2, so (class - 1) is -1 for class 0.
classes[(int)train_data[i][2]-1]++;
}
// The index returned by findMaxIndex is shifted by +1 before it is printed.
int predictedLabel = findMaxIndex(classes)+1;
std::cout << "Predicted class for point {5.2,3.1} is: " << predictedLabel << std::endl;
return 0;
}
如果启用编译器警告,您会看到:
test.cpp|33 col 32| warning: array subscript 3 is above array bounds of ‘double [3]’ [-Warray-bounds]
|| 33 | train_data[i][3] = Distance(train_data[i][0],train_data[i][1],5.2,3.1);
数组索引从 0 开始,因此 `double [3]` 的有效下标只有 0、1、2,下标 3 已经越界。
随后,您还用 (class - 1) 作为 classes 数组的下标。
糟糕:class 的取值本来就是从 0 开始的,所以当 class 为 0 时,下标会变成 -1。
可以考虑从根源上避免这一整类错误:
// Training samples stored as records with named fields.
struct {
    double x;              // first attribute of the sample
    double y;              // second attribute of the sample
    int _class{0};         // class label (0, 1 or 2 in the table below)
    double distance{0.0};  // distance to the query point; 0 until computed
} train_data[] = {
    {5.3, 3.7, 0},
    {5.1, 3.8, 0},
    {7.2, 3.0, 1},
    {5.4, 3.4, 0},
    {5.1, 3.3, 0},
    {5.4, 3.9, 0},
    {7.4, 2.8, 1},
    {6.1, 2.8, 2},
    {7.3, 2.9, 1},
    {6.0, 2.7, 2},
    {5.8, 2.8, 1},
    {6.3, 2.3, 2},
    {5.1, 2.5, 2},
    {6.3, 2.5, 2},
    {5.5, 2.4, 2},
};
// Compute every sample's distance to the query point {5.2, 3.1} and store it
// in the sample's own `distance` field.
for (auto& sample : train_data) {
    sample.distance = Distance(sample.x, sample.y, 5.2, 3.1);
}
另请注意,对于这样的结构体,std::swap(node1, node2)
可以直接交换两个节点,无需逐列手动交换。或者让节点类型可排序(为其定义 operator<),这样就可以直接对其排序。
更地道的 C++ 写法
下面是我的版本:
#include <array>
#include <iostream>
#include <cmath>
#include <string>
#include <vector>
// Iris classes. The enumerator values are used directly as indices into the
// per-class tally array, and NCLASSES is the number of classes rather than a
// class of its own.
enum Class {
    Setosa = 0,
    Virginica = 1,
    Verscicolor = 2,
    NCLASSES = 3
};
// One training sample: its two attributes, its class label and its distance
// to the current query point.
struct node {
    double x;
    double y;
    Class _class = Setosa;
    double distance = 0;

    // Ordering used when sorting samples: *this comes before `other` when its
    // distance is the larger one, so sorting with this operator puts the
    // greatest distances first.
    // NOTE(review): for picking nearest neighbours the opposite comparison
    // (distance < other.distance) is probably what is wanted - confirm.
    bool operator<(node const& other) const {
        return other.distance < distance;
    }
};
// Training set: {x, y, class} per sample; `distance` keeps its default of 0
// until a prediction fills it in.
std::vector<node> train_data{
    {5.3, 3.7, Setosa},
    {5.1, 3.8, Setosa},
    {7.2, 3.0, Virginica},
    {5.4, 3.4, Setosa},
    {5.1, 3.3, Setosa},
    {5.4, 3.9, Setosa},
    {7.4, 2.8, Virginica},
    {6.1, 2.8, Verscicolor},
    {7.3, 2.9, Virginica},
    {6.0, 2.7, Verscicolor},
    {5.8, 2.8, Virginica},
    {6.3, 2.3, Verscicolor},
    {5.1, 2.5, Verscicolor},
    {6.3, 2.5, Verscicolor},
    {5.5, 2.4, Verscicolor},
};
// Euclidean distance between the points (x1, y1) and (x2, y2).
// std::hypot computes sqrt(dx*dx + dy*dy) without undue overflow or underflow
// in the intermediate squares, which squaring via pow() does not guarantee.
double Distance(double x1, double y1, double x2, double y2) {
    return std::hypot(x1 - x2, y1 - y2);
}
// Predicts the class of the query point (x, y) by majority vote among the
// first K entries of train_data after sorting.
//
// Side effects: overwrites every sample's `distance` and reorders train_data.
// A K larger than the number of training samples is clamped to that number.
// Ties between classes resolve to the class with the lowest enumerator value,
// because std::max_element returns the first maximum it finds.
Class predict(double x, double y, unsigned K) {
    for (auto& sample : train_data) {
        sample.distance = Distance(sample.x, sample.y, x, y);
    }
    // The resulting order is whatever node::operator< defines.
    std::sort(train_data.begin(), train_data.end());

    // Never read past the end of the training set.
    const std::size_t votes = std::min<std::size_t>(K, train_data.size());

    // Tally buckets, one per class, all starting at zero.
    std::array<unsigned, NCLASSES> classes{};
    for (std::size_t i = 0; i < votes; ++i) {
        ++classes[train_data[i]._class];
    }

    const auto winner = std::max_element(classes.begin(), classes.end());
    return static_cast<Class>(winner - classes.begin());
}
int main()
{
std::cout << "Predicted class for point {5.2,3.1} is: "
<< predict(5.2, 3.1, 5) << "\n";
}
输出:
Predicted class for point {5.2,3.1} is: 1
我怀疑您的排序顺序弄反了?将排序顺序反转后,输出为:
Predicted class for point {5.2,3.1} is: 0