R kknn 包和加权 k-最近邻计算
R kknn package and weighted k-nearest neighbors calculations
我正在尝试手动计算 R kknn 包输出的距离和权重。当数据未缩放时,我能够正确算出欧氏距离和逆(inv)权重,如下所示:
欧氏距离
sqrt((6-8)^2 + (4-5)^2) = 2.236068
sqrt((6-3)^2 + (4-7)^2) = 4.242641
sqrt((6-7)^2 + (4-3)^2) = 1.414214
逆权重
1 / (2.236068 / 4.242641) = 1.897368
1 / (1.414214 / 4.242641) = 3.000000.
我没有看到矩形权重是如何计算的,因为我得到:
1/2 * 1 = 0.50
1/2 * 1 = 0.50
kknn 包给出了 1 和 1。
最后,在数据经过缩放(scale = TRUE)的情况下,我始终无法手动算出与 kknn 一致的距离和权重。非常感谢任何帮助,因为我正在尝试了解 kknn 包的工作原理。
library(kknn)
# Toy data: three labelled training rows and a single hold-out row.
training <- data.frame(
  class = c(1, 0, 1),
  height = c(8, 3, 7),
  weight = c(5, 7, 3)
)
training
holdouts <- data.frame(class = 1, height = 6, weight = 4)
holdouts

# Unscaled predictors, rectangular kernel: show the D and W components.
rectangular_no_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "rectangular", k = 2, scale = FALSE
)
rectangular_no_scale$D
rectangular_no_scale$W

# Unscaled predictors, inverse ("inv") kernel.
inversion_no_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "inv", k = 2, scale = FALSE
)
inversion_no_scale$D
inversion_no_scale$W

# Scaled predictors, rectangular kernel.
rectangular_with_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "rectangular", k = 2, scale = TRUE
)
rectangular_with_scale$D
rectangular_with_scale$W

# Scaled predictors, inverse ("inv") kernel.
inversion_with_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "inv", k = 2, scale = TRUE
)
inversion_with_scale$D
inversion_with_scale$W
kknn 的源代码(在 R 控制台中输入 kknn 后按回车即可查看)有助于理解这些计算:
library(kknn)
# Toy data: three labelled training rows and one hold-out row to classify.
training <- data.frame(
  class = c(1, 0, 1),
  height = c(8, 3, 7),
  weight = c(5, 7, 3)
)
training
#> class height weight
#> 1 1 8 5
#> 2 0 3 7
#> 3 1 7 3
holdouts <- data.frame(class = 1, height = 6, weight = 4)
holdouts
#> class height weight
#> 1 1 6 4

# Unscaled data ----

# Euclidean distance from the hold-out row to every training row, sorted
# ascending so that d[1:2] are the distances to the k = 2 nearest neighbours.
d <- sqrt(
  (training$height - holdouts$height)^2 +
    (training$weight - holdouts$weight)^2
)
d <- sort(d)
d
#> [1] 1.414214 2.236068 4.242641

rectangular_no_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "rectangular", k = 2, scale = FALSE
)
rectangular_no_scale[["D"]]
#> [1] 1.414214 2.236068
d[1:2]
#> [1] 1.414214 2.236068
rectangular_no_scale[["W"]]
#> [,1] [,2]
#> [1,] 1 1
#
# kknn source code:
# if (kernel == "rectangular")
# W <- matrix(1, nrow = p, ncol = k)
# This is why you get 1, 1: every neighbour gets the same weight and the
# weights are not normalised.

inversion_no_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "inv", k = 2, scale = FALSE
)
inversion_no_scale[["D"]]
#> [1] 1.414214 2.236068
d[1:2]
#> [1] 1.414214 2.236068
inversion_no_scale[["W"]]
#> [,1] [,2]
#> [1,] 3 1.897367
#
# kknn source code:
# W <- D/maxdist
# if (kernel == "inv")
# W <- 1/W
# NOTE(review): max(d) stands in for maxdist here. That works because d holds
# only k + 1 = 3 distances; presumably maxdist is the (k + 1)-th nearest
# distance in general -- confirm against the kknn source.
max(d) / d[1:2]
#> [1] 3.000000 1.897367

# Scaled data ----

rectangular_with_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "rectangular", k = 2, scale = TRUE
)

# Reproduce scale = TRUE by hand: divide each predictor by the standard
# deviation of its training column (no centring). The hold-out row is divided
# by the same training standard deviations.
height_sd <- sd(training$height)
weight_sd <- sd(training$weight)
training_scaled <- training
training_scaled$height <- training$height / height_sd
training_scaled$weight <- training$weight / weight_sd
holdouts_scaled <- holdouts
holdouts_scaled$height <- holdouts$height / height_sd
holdouts_scaled$weight <- holdouts$weight / weight_sd

rectangular_with_scale[["D"]]
#> [1] 0.6267832 0.9063270
d_scaled <- sqrt(
  (training_scaled$height - holdouts_scaled$height)^2 +
    (training_scaled$weight - holdouts_scaled$weight)^2
)
# Sort the scaled distances themselves (not the unscaled `d`) so the values
# are directly comparable with the D component returned by kknn.
d_scaled <- sort(d_scaled)
d_scaled
#> [1] 0.6267832 0.9063270 1.8803495
rectangular_with_scale[["W"]]
#> [,1] [,2]
#> [1,] 1 1
# Same as before: 1, 1

inversion_with_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "inv", k = 2, scale = TRUE
)
inversion_with_scale[["D"]]
#> [1] 0.6267832 0.9063270
d_scaled[1:2]
#> [1] 0.6267832 0.9063270
inversion_with_scale[["W"]]
#> [,1] [,2]
#> [1,] 3 2.074692
max(d_scaled) / d_scaled[1:2]
#> [1] 3.000000 2.074692
总而言之,rectangular 核对所有邻居使用相同的权重,而寻找 k 个最近邻并不需要对权重做归一化,这就是权重被直接设为 1 的原因。
缩放只是先将每一列除以其标准差(不做中心化),然后再进行同样的计算。
我正在尝试手动计算 R kknn 包输出的距离和权重。当数据未缩放时,我能够正确算出欧氏距离和逆(inv)权重,如下所示:
欧氏距离
sqrt((6-8)^2 + (4-5)^2) = 2.236068
sqrt((6-3)^2 + (4-7)^2) = 4.242641
sqrt((6-7)^2 + (4-3)^2) = 1.414214
逆权重
1 / (2.236068 / 4.242641) = 1.897368
1 / (1.414214 / 4.242641) = 3.000000.
我没有看到矩形权重是如何计算的,因为我得到:
1/2 * 1 = 0.50
1/2 * 1 = 0.50
kknn 包给出了 1 和 1。
最后,在数据经过缩放(scale = TRUE)的情况下,我始终无法手动算出与 kknn 一致的距离和权重。非常感谢任何帮助,因为我正在尝试了解 kknn 包的工作原理。
library(kknn)
# Toy data: three labelled training rows and a single hold-out row.
training <- data.frame(
  class = c(1, 0, 1),
  height = c(8, 3, 7),
  weight = c(5, 7, 3)
)
training
holdouts <- data.frame(class = 1, height = 6, weight = 4)
holdouts

# Unscaled predictors, rectangular kernel: show the D and W components.
rectangular_no_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "rectangular", k = 2, scale = FALSE
)
rectangular_no_scale$D
rectangular_no_scale$W

# Unscaled predictors, inverse ("inv") kernel.
inversion_no_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "inv", k = 2, scale = FALSE
)
inversion_no_scale$D
inversion_no_scale$W

# Scaled predictors, rectangular kernel.
rectangular_with_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "rectangular", k = 2, scale = TRUE
)
rectangular_with_scale$D
rectangular_with_scale$W

# Scaled predictors, inverse ("inv") kernel.
inversion_with_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "inv", k = 2, scale = TRUE
)
inversion_with_scale$D
inversion_with_scale$W
kknn 的源代码(在 R 控制台中输入 kknn 后按回车即可查看)有助于理解这些计算:
library(kknn)
# Toy data: three labelled training rows and one hold-out row to classify.
training <- data.frame(
  class = c(1, 0, 1),
  height = c(8, 3, 7),
  weight = c(5, 7, 3)
)
training
#> class height weight
#> 1 1 8 5
#> 2 0 3 7
#> 3 1 7 3
holdouts <- data.frame(class = 1, height = 6, weight = 4)
holdouts
#> class height weight
#> 1 1 6 4

# Unscaled data ----

# Euclidean distance from the hold-out row to every training row, sorted
# ascending so that d[1:2] are the distances to the k = 2 nearest neighbours.
d <- sqrt(
  (training$height - holdouts$height)^2 +
    (training$weight - holdouts$weight)^2
)
d <- sort(d)
d
#> [1] 1.414214 2.236068 4.242641

rectangular_no_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "rectangular", k = 2, scale = FALSE
)
rectangular_no_scale[["D"]]
#> [1] 1.414214 2.236068
d[1:2]
#> [1] 1.414214 2.236068
rectangular_no_scale[["W"]]
#> [,1] [,2]
#> [1,] 1 1
#
# kknn source code:
# if (kernel == "rectangular")
# W <- matrix(1, nrow = p, ncol = k)
# This is why you get 1, 1: every neighbour gets the same weight and the
# weights are not normalised.

inversion_no_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "inv", k = 2, scale = FALSE
)
inversion_no_scale[["D"]]
#> [1] 1.414214 2.236068
d[1:2]
#> [1] 1.414214 2.236068
inversion_no_scale[["W"]]
#> [,1] [,2]
#> [1,] 3 1.897367
#
# kknn source code:
# W <- D/maxdist
# if (kernel == "inv")
# W <- 1/W
# NOTE(review): max(d) stands in for maxdist here. That works because d holds
# only k + 1 = 3 distances; presumably maxdist is the (k + 1)-th nearest
# distance in general -- confirm against the kknn source.
max(d) / d[1:2]
#> [1] 3.000000 1.897367

# Scaled data ----

rectangular_with_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "rectangular", k = 2, scale = TRUE
)

# Reproduce scale = TRUE by hand: divide each predictor by the standard
# deviation of its training column (no centring). The hold-out row is divided
# by the same training standard deviations.
height_sd <- sd(training$height)
weight_sd <- sd(training$weight)
training_scaled <- training
training_scaled$height <- training$height / height_sd
training_scaled$weight <- training$weight / weight_sd
holdouts_scaled <- holdouts
holdouts_scaled$height <- holdouts$height / height_sd
holdouts_scaled$weight <- holdouts$weight / weight_sd

rectangular_with_scale[["D"]]
#> [1] 0.6267832 0.9063270
d_scaled <- sqrt(
  (training_scaled$height - holdouts_scaled$height)^2 +
    (training_scaled$weight - holdouts_scaled$weight)^2
)
# Sort the scaled distances themselves (not the unscaled `d`) so the values
# are directly comparable with the D component returned by kknn.
d_scaled <- sort(d_scaled)
d_scaled
#> [1] 0.6267832 0.9063270 1.8803495
rectangular_with_scale[["W"]]
#> [,1] [,2]
#> [1,] 1 1
# Same as before: 1, 1

inversion_with_scale <- kknn(
  class ~ ., training, holdouts,
  distance = 2, kernel = "inv", k = 2, scale = TRUE
)
inversion_with_scale[["D"]]
#> [1] 0.6267832 0.9063270
d_scaled[1:2]
#> [1] 0.6267832 0.9063270
inversion_with_scale[["W"]]
#> [,1] [,2]
#> [1,] 3 2.074692
max(d_scaled) / d_scaled[1:2]
#> [1] 3.000000 2.074692
总而言之,rectangular 核对所有邻居使用相同的权重,而寻找 k 个最近邻并不需要对权重做归一化,这就是权重被直接设为 1 的原因。
缩放只是先将每一列除以其标准差(不做中心化),然后再进行同样的计算。