按颜色识别点

Question

我正在学习这里的教程：https://www.rpubs.com/loveb/som。本教程介绍如何在虹膜数据上使用 Kohonen 网络（也称为 SOM，一种机器学习算法）。

我运行教程中的这段代码：

library(kohonen) #fitting SOMs
library(ggplot2) #plots
library(GGally) #plots
library(RColorBrewer) #colors, using predefined palettes

iris_complete <-iris[complete.cases(iris),] 
iris_unique <- unique(iris_complete) # Remove duplicates

#scale data
iris.sc = scale(iris_unique[, 1:4]) #Levels/Factors cannot be scaled... But used in predictive SOM:s using xyf. Later.

#build grid
iris.grid = somgrid(xdim = 10, ydim=10, topo="hexagonal", toroidal = TRUE)

set.seed(33) #for reproducability
iris.som <- som(iris.sc, grid=iris.grid, rlen=700, alpha=c(0.05,0.01), keep.data = TRUE)

#plot 1
plot(iris.som, type="count")

#plot2
var <- 1 #define the variable to plot
plot(iris.som, type = "property", property = getCodes(iris.som)[,var], main=colnames(getCodes(iris.som))[var], palette.name=terrain.colors)

以上代码在虹膜数据上拟合了一个 Kohonen 网络。数据集中的每个观察值都分配给下图中的每个“彩色圆圈”（也称为“神经元”）。

我的问题：在这些图中，您如何确定哪些观察值被分配给了哪些圆圈？假设我想知道哪些观察属于下面用黑色三角形勾勒出的圆圈：

这可以吗？现在，我正在尝试使用 iris.som$classif 以某种方式追踪哪些点在哪个圆圈中。有更好的方法吗？

更新：@Jonny Phelps 向我展示了如何识别三角形内的观察结果（请参阅下面的答案）。但我仍然不确定是否可以识别不规则形状。例如。

在之前的 post (Labelling Points on a Plot (R Language)) 中，一位用户向我展示了如何为网格上的每个圆分配任意数字：

根据上图，您如何使用“som$classif”语句找出哪些观测值位于圆圈 92、91、82、81、72 和 71 中？

谢谢

Answer 1

据我所知，使用 iris.som$unit.classif 和 iris.som$grid 是在绘图网格中隔离圆圈的方法。我假设分类器值与 iris.som$grid 的行索引匹配，因此这将需要更多验证。如果这对您的问题有帮助，请告诉我:)

findTriangle <- function(top_row, top_column, side_length, iris.som,
                         reverse=FALSE){
  
  # top_row: row index of the top most triangle value
  # top_column: column index...
  # side_length: how many rows does the triangle occupy?
  # iris.som: the som object
  # reverse: set to TRUE to flip the triangle
  
  # make the grid
  grid_pts <- as.data.frame(iris.som$grid$pts)
  grid_pts$column <-  rep(1:iris.som$grid$xdim, by=iris.som$grid$ydim)
  grid_pts$row <- rep(1:iris.som$grid$ydim, each=iris.som$grid$xdim)
  grid_pts$classif <- 1:nrow(grid_pts)
  
  # starting point - top most point of the triangle
  # use reverse for triangles the other way around
  grid_pts$triangle <- FALSE
  grid_pts[grid_pts$column == top_column & grid_pts$row == top_row, ][["triangle"]] <- TRUE
  
  # loop through the remaining rows and fill out the triangle
  value_row <- top_row
  value_start_column <- grid_pts[grid_pts$triangle == TRUE,]$x
  value_end_column <- grid_pts[grid_pts$triangle == TRUE,]$x
  if(reverse){
    row_move <- -1
  }else{
    row_move <- 1
  }
  
  # update triangle
  for(row in 1:(side_length-1)){
    value_row <- value_row + row_move
    value_start_column <- value_start_column - 0.5
    value_end_column <- value_end_column + 0.5
    grid_pts[grid_pts$row == value_row & 
               grid_pts$x >= value_start_column & 
               grid_pts$x <= value_end_column, ]$triangle <- TRUE
  }

  # visualise
  pl <- ggplot(grid_pts, aes(x=x, y=rev(row), col=as.factor(triangle))) + 
    geom_point(size=7) + 
    scale_color_manual(values=c("grey", "indianred")) + 
    theme_void()
  print(pl)
  
  return(grid_pts)
}

# take the grid and pick out the triangle
top_row <- 2
top_column <- 6
side_length <- 4
reverse <- FALSE # set to TRUE to flip the triangle ie go from the bottom
grid_pts <- findTriangle(top_row, top_column, side_length, iris.som, reverse)

# now add the classifier and merge to get the co-ordinates
iris.sc2 <- as.data.frame(iris.sc)
iris.sc2$classif <- iris.som$unit.classif
iris.sc2 <- merge(iris.sc2, grid_pts, by=c("classif"), all.x=TRUE)

# filter to the points in the triangle
iris.sc2[iris.sc2$triangle==TRUE,]

输出数据：

   classif Sepal.Length Sepal.Width Petal.Length Petal.Width   x        y column row triangle
21      16  -1.01537328   0.5506423   -1.3287735  -1.3042249 6.0 1.732051      6   2     TRUE
22      16  -1.01537328   0.3214643   -1.4419091  -1.3042249 6.0 1.732051      6   2     TRUE
39      25  -0.89501479   1.0089981   -1.3287735  -1.3042249 5.5 2.598076      5   3     TRUE
40      25  -0.77465630   1.0089981   -1.2722057  -1.3042249 5.5 2.598076      5   3     TRUE
41      25  -0.77465630   0.7798202   -1.3287735  -1.3042249 5.5 2.598076      5   3     TRUE
42      25  -1.01537328   0.7798202   -1.2722057  -1.3042249 5.5 2.598076      5   3     TRUE
43      25  -0.89501479   0.7798202   -1.2722057  -1.3042249 5.5 2.598076      5   3     TRUE
44      26  -0.89501479   0.5506423   -1.1590702  -0.9108454 6.5 2.598076      6   3     TRUE
45      26  -1.01537328   0.7798202   -1.2156380  -1.0419719 6.5 2.598076      6   3     TRUE
58      36  -0.53393933   0.7798202   -1.2722057  -1.0419719 6.0 3.464102      6   4     TRUE
59      36  -0.41358084   1.0089981   -1.3853413  -1.3042249 6.0 3.464102      6   4     TRUE
60      36  -0.53393933   0.7798202   -1.1590702  -1.3042249 6.0 3.464102      6   4     TRUE
61      37  -1.01537328   1.0089981   -1.2156380  -0.7797188 7.0 3.464102      7   4     TRUE
62      37  -1.01537328   1.0089981   -1.3853413  -1.1730984 7.0 3.464102      7   4     TRUE
63      37  -0.89501479   1.0089981   -1.3287735  -1.1730984 7.0 3.464102      7   4     TRUE
74      44   0.06785311   0.3214643    0.5945312   0.7937995 4.5 4.330127      4   5     TRUE
75      46  -0.65429782   1.4673539   -1.2722057  -1.3042249 6.5 4.330127      6   5     TRUE
76      46  -0.53393933   1.4673539   -1.2722057  -1.3042249 6.5 4.330127      6   5     TRUE
77      47  -0.89501479   1.6965319   -1.0459346  -1.0419719 7.5 4.330127      7   5     TRUE
78      47  -0.89501479   1.6965319   -1.2156380  -1.3042249 7.5 4.330127      7   5     TRUE
79      47  -0.89501479   1.4673539   -1.2722057  -1.0419719 7.5 4.330127      7   5     TRUE
80      47  -0.89501479   1.6965319   -1.2722057  -1.1730984 7.5 4.330127      7   5     TRUE

网格上的验证绘图：

Answer 2

我在我的 post 中详细说明了示例，但是，不是在鸢尾花数据集上，但我想这没问题：R, SOM, Kohonen Package, Outlier Detection 并且还添加了您可能需要的代码片段。他们显示

如何生成数据、添加离群值并在图上描绘它们
如何训练 SOM
如何进行聚类
如何使用层次聚类将聚类边界添加到 SOM 图
最后，我添加了 SOM 预测的集群，以将它们与我生成数据的真实集群进行比较

我认为这回答了您的问题。将 SOM 的性能与 t-SNE 进行比较也很好。我只使用 SOM 作为对我生成的数据和真实葡萄酒数据集的实验。如果你有两个以上的变量，准备热图也很好。祝您分析一切顺利！

Answer 3

编辑：现在有了 Shiny App！

plotly 解决方案也是可能的，您可以将鼠标悬停在单个神经元上以显示相关的虹膜行名（此处称为 id）。根据您的 iris.som 数据和 Jonny Phelps 的网格方法，您可以将行号作为连接字符串分配给各个神经元，并在鼠标悬停时显示这些：

library(ggplot2)
library(plotly)
ga <- data.frame(g=iris.som$unit.classif, 
                 sample=seq_len(dim(iris.som$data[[1]])[1]))
grid_pts <- as.data.frame(iris.som$grid$pts)
grid_pts$column <- rep(1:iris.som$grid$xdim, by=iris.som$grid$ydim)
grid_pts$row <- rep(1:iris.som$grid$ydim, each=iris.som$grid$xdim)
grid_pts$classif <- 1:nrow(grid_pts)
grid_pts$id <- sapply(seq_along(grid_pts$classif), 
                      function(x) paste(ga$sample[ga$g==x], collapse=", "))
grid_pts$count <- sapply(seq_along(grid_pts$classif), 
                         function(x) length(ga$sample[ga$g==x]))
grid_pts$count <- factor(grid_pts$count, levels=0:max(grid_pts$count))
p1 <- ggplot(grid_pts, aes(x=x, y=y, colour=count, row=row, column=column, id=id)) +
    geom_point(size=8) +
    scale_colour_manual(values=c("grey50", heat.colors(length(unique(grid_pts$count))))) +
    theme_void() +
    theme(plot.margin=unit(c(1,rep(.3, 3)),"cm"))
ggplotly(p1)

这是一个完整的 Shiny 应用程序，它允许套索选择并显示 table 和数据：

invisible(suppressPackageStartupMessages(
    lapply(c("shiny","dplyr","ggplot2", "plotly", "kohonen", "GGally", "DT"),
           require, character.only=TRUE)))

iris_complete <- iris[complete.cases(iris),] 
iris_unique <- unique(iris_complete) # Remove duplicates

#scale data
iris.sc = scale(iris_unique[, 1:4]) #Levels/Factors cannot be scaled... But used in predictive SOM:s using xyf. Later.

#build grid
iris.grid = somgrid(xdim = 10, ydim=10, topo="hexagonal", toroidal = TRUE)

set.seed(33) #for reproducability
iris.som <- som(iris.sc, grid=iris.grid, rlen=700, alpha=c(0.05,0.01), keep.data = TRUE)

ga <- data.frame(g=iris.som$unit.classif, 
                 sample=seq_len(dim(iris.som$data[[1]])[1]))
grid_pts <- as.data.frame(iris.som$grid$pts)
grid_pts$column <- rep(1:iris.som$grid$xdim, by=iris.som$grid$ydim)
grid_pts$row <- rep(1:iris.som$grid$ydim, each=iris.som$grid$xdim)
grid_pts$classif <- 1:nrow(grid_pts)
grid_pts$id <- sapply(seq_along(grid_pts$classif), 
                      function(x) paste(ga$sample[ga$g==x], collapse=", "))
grid_pts$count <- sapply(seq_along(grid_pts$classif), 
                         function(x) length(ga$sample[ga$g==x]))
grid_pts$count <- factor(grid_pts$count, levels=0:max(grid_pts$count))

# Shiny app, adapted from https://gist.github.com/dgrapov/128e3be71965bf00495768e47f0428b9

ui <- fluidPage(
    fluidRow(
        column(12, plotlyOutput("plot", height = "600px")),
        column(12, DT::dataTableOutput('data_table'))
    )
)


server <- function(input, output){
    
    output$plot <- renderPlotly({
        req(data()) 
        p <- ggplot(data = data()$data, 
            aes(x=x, y=y, classif=classif, colour=count, row=row, column=column, id=id)) +
            geom_point(size=8) +
            scale_colour_manual(
                values=c("grey50", heat.colors(length(unique(grid_pts$count))))
            ) +
            theme_void() +
            theme(plot.margin=unit(c(1, rep(.3, 3)), "cm"))
        
        obj <- data()$sel
        if(nrow(obj) != 0) {
            p <- p + geom_point(data=obj, mapping=aes(x=x, y=y, classif=classif, 
                    count=count, row=row, column=column, id=id), color="blue", 
                    size=5, inherit.aes=FALSE)
        }
        ggplotly(p, source="p1") %>% layout(dragmode = "lasso")
    })
   
    selected <- reactive({
        event_data("plotly_selected", source = "p1")
    })

    output$data_table <- DT::renderDataTable(
        data()$sel, filter='top', options=list(  
            pageLength=5, autoWidth=TRUE
        )
    )
    
    data <- reactive({
        tmp <- grid_pts 
        sel <- tryCatch(filter(grid_pts, paste(x, y, sep="_") %in% 
                paste(selected()$x, selected()$y, sep="_")),
            error=function(e){NULL})
        list(data=tmp, sel=sel)
    })
}  

shinyApp(ui,server)

按颜色识别点

Identifying points by color

r

data-visualization

cluster-analysis

machine-learning

data-manipulation