在 R 中绘制决策边界

Drawing decision boundaries in R

我通过 knn 函数得到了一组预测的 class 标签。我有一个包含基本数值型训练数据的数据框,以及另一个存放测试数据的数据框。我该如何为 knn 函数的返回值绘制决策边界?我必须在一台受限(锁定)的机器上复现我的结果,所以请尽量少用第三方库。

我只有两个 class 标签,"orange" 和 "blue"。它们是用训练数据绘制在一个简单的二维图上的。同样,我只想围绕 knn 函数的结果画一个边界。

代码:

library(class)

n <- 100

# Training points: n integer-valued coordinates, each drawn with
# round(runif(n, 1, n)) under a fixed seed so the data are reproducible.
set.seed(1)
x <- round(runif(n, 1, n))
set.seed(2)
y <- round(runif(n, 1, n))
train.df <- data.frame(x, y)

# Test points.
# NOTE(review): these reuse seeds 1 and 2 with the same runif() calls, so
# x.test / y.test come out identical to x / y. Confirm this is intended; the
# seeds are left untouched here so the published results stay reproducible.
set.seed(1)
x.test <- round(runif(n, 1, n))
set.seed(2)
y.test <- round(runif(n, 1, n))
test.df <- data.frame(x.test, y.test)

# `classes` holds the training labels ("orange" / "blue") and is built by
# earlier code that is not part of this snippet.
k <- knn(train.df, test.df, classes, k = 25)

# knn() returns a factor. A factor passed as `col` is drawn using its integer
# level codes (palette colours 1 and 2), not the colours the labels name, so
# convert the predictions to character before plotting.
plot(test.df, col = as.character(k))

classes 只是 class 标签的向量,这些标签是根据较早的代码确定的。

如果需要,下面是我的完整代码:

library(class)

n <- 100
set.seed(1)
x <- round(runif(n, 1, n))
set.seed(2)
y <- round(runif(n, 1, n))

# ============================================================
# Bayes Classifier + Decision Boundary Code
# ============================================================

# Model: P(C = orange | X = x, Y = y)
#   x < 50 and y < 50 -> percentChance = 95 (mostly orange)
#   otherwise         -> percentChance = 10 (mostly blue)
# The Bayes decision rule assigns each point to whichever class is more
# probable, so the decision boundary is the edge of the x < 50 & y < 50 square.
in_orange_region <- x < 50 & y < 50

# "colours" is the Bayes decision grouping (the most probable class for each
# point), not the grouping that is actually sampled and plotted.
colours <- ifelse(in_orange_region, "orange", "blue")

# Per-point threshold (in percent) used when sampling the class label.
percentChance <- ifelse(in_orange_region, 95, 10)

# Sample the plotted label: one rounded uniform draw on [1, 100] per point,
# "blue" when the draw exceeds the threshold and "orange" otherwise.
# runif(n, ...) consumes the random stream in the same order as n successive
# runif(1, ...) calls, so the labels match the element-by-element loop version.
classes <- ifelse(round(runif(n, 1, 100)) > percentChance, "blue", "orange")

# Bayes decision boundary as a polyline over x = 0, 1, ..., 100.
boundary.x <- seq(0, 100, by = 1)

# The rule is "orange when x < 50 and y < 50", so the boundary sits at y = 50
# while x < 50 and drops away from x = 50 onwards. The drop is keyed on the
# x value itself (not on the vector index, which is offset by one because
# boundary.x starts at 0), so x = 49 stays on the orange side.
# -10 is used just for the sake of visual consistency (it pushes the line off
# the bottom of the plot); the real value is 0.
boundary.y <- ifelse(boundary.x < 50, 50, -10)

df <- data.frame(boundary.x, boundary.y)

# Scatter the simulated points, coloured by their sampled class label.
plot(x, y, col = classes)
# Overlay the boundary polyline (first column = x, second column = y of df)
# as a thick dashed red line.
lines(df[[1]], df[[2]], type = "l", lty = 2, lwd = 2, col = "red")

# ============================================================
# K-Nearest neighbour code
# ============================================================

#library(class)

#n <- 100

#set.seed(1)
#x <- round(runif(n, 1, n))
#set.seed(2)
#y <- round(runif(n, 1, n))
# Training set: the simulated x / y coordinates created above.
train.df <- data.frame(x, y)

# Test set.
# NOTE(review): seeds 1 and 2 are the same seeds used for x and y, so these
# test coordinates are identical to the training coordinates. Confirm this is
# intended; the seeds are left untouched so the results stay reproducible.
set.seed(1)
x.test <- round(runif(n, 1, n))
set.seed(2)
y.test <- round(runif(n, 1, n))
test.df <- data.frame(x.test, y.test)

# Classify every test point by majority vote among its 25 nearest training
# points, using the sampled "orange" / "blue" labels in `classes`.
k <- knn(train.df, test.df, classes, k = 25)

# knn() returns a factor. A factor passed as `col` is drawn using its integer
# level codes (palette colours 1 and 2), not the colours the labels name, so
# convert the predictions to character before plotting.
plot(test.df, col = as.character(k))

在网格上计算 class 概率预测值,然后在 P=0.5(或任何你想要的截断点)处绘制等高线。Venables 和 Ripley 的经典教科书 MASS,以及 Hastie、Tibshirani 和 Friedman 的 The Elements of Statistical Learning,用的也是这种方法。

# Class labels from distance to the origin: strictly outside the circle of
# radius 60 is "blue", on or inside it is "orange".
label_by_radius <- function(px, py) {
    ifelse(px^2 + py^2 > 60^2, "blue", "orange")
}
classes <- label_by_radius(x, y)
classes.test <- label_by_radius(x.test, y.test)

# Classify every integer grid point of the 100 x 100 plotting area.
grid <- expand.grid(x = 1:100, y = 1:100)
# prob = TRUE is the important part: it attaches a "prob" attribute to the
# returned predictions.
classes.grid <- knn(train.df, grid, classes, k = 25, prob = TRUE)

# The "prob" attribute refers to the predicted (winning) class, so flip it to
# 1 - prob wherever the prediction is not "blue"; prob.grid then holds the
# "blue" probability at every grid point.
prob.grid <- attr(classes.grid, "prob")
predicted_not_blue <- classes.grid != "blue"
prob.grid[predicted_not_blue] <- 1 - prob.grid[predicted_not_blue]

# Plot the boundary: the 0.5 contour of the "blue" probability surface.
# expand.grid() varies x fastest, so a column-wise fill into 100 rows puts
# x along the rows and y along the columns, as contour() expects.
contour(x = 1:100, y = 1:100, z = matrix(prob.grid, nrow = 100),
        levels = 0.5, col = "grey", drawlabels = FALSE, lwd = 2)
# Add the points of the test dataset, coloured by their true label.
points(test.df, col = classes.test)

另请参阅 CrossValidated 上基本相同的问题(the same question on CrossValidated)。