具有单个输入变量的 kmeans 聚类图

kmeans clustering plot with a single input variable

我有一些数据是这样的;

   id_row year_row      value
1 1031296     2012 0.13908350
2 1031296     2013 0.11825776
3 1031296     2014 0.03925923
4 1031296     2015 0.07821547
5 1031296     2016 0.04694897
6 1031296     2017 0.07790232

我可以先按年份过滤数据,然后运行 kmeans:

# Restrict the data to the rows whose year_row equals the string "2010".
kmdata <- filter(results, year_row == "2010")

# Cluster the single `value` column into four groups and show the fit.
km <- kmeans(
  x = as.vector(kmdata$value),
  centers = 4,
  iter.max = 10,
  nstart = 1
)
km

但是我想对每一年分别运行 kmeans,并查看每个 id_row 所属的簇如何随时间变化。

由于数据不是矩阵,因此在尝试绘制模型时出现错误。

library(cluster)

# NOTE: `kmdata$value` is a bare numeric vector rather than a matrix or data
# frame, so this call stops with "x is not a data matrix".
clusplot(
  kmdata$value,
  km$cluster,
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0
)

Error in is.list(s.x.2d) : x is not a data matrix

我使用的方法可行吗?我在网上找到了一些 kmeans 示例,发现许多示例使用多个输入变量(如下所示),而我只有一个输入变量:余弦相似度。

##         Murder Assault UrbanPop     Rape
## Alabama 1.2426   0.783   -0.521 -0.00342
## Alaska  0.5079   1.107   -1.212  2.48420
## Arizona 0.0716   1.479    0.999  1.04288

数据:

structure(list(id_row = c("1031296", "1031296", "1031296", "1031296", 
"1031296", "1031296", "1031296", "1031296", "1130310", "1130310", 
"1130310", "1130310", "1130310", "1130310", "1130310", "1130310", 
"1130310", "1130310", "1130310", "1130310", "1130310", "1130310", 
"1130310", "1137411", "1137411", "1336920", "1336920", "1336920", 
"1336920", "1336920", "1336920", "1336920", "1336920", "1336920", 
"1336920", "1336920", "1336920", "1336920", "1336920", "1336920", 
"1336920", "1336920", "1336920", "1336920", "1413329", "1413329", 
"1413329", "1413329", "1413329", "1413329", "1413329", "1413329", 
"1413329", "1413329", "1413329", "1413329", "1413329", "1413329", 
"1413329", "1413329", "1413329", "1413329", "1413329", "16732", 
"16732", "16732", "16732", "16732", "16732", "16732", "16732", 
"16732", "16732", "16732", "16732", "16732", "16732", "16732", 
"21344", "21344", "21344", "21344", "21344", "21344", "21344", 
"21344", "21344", "21344", "21344", "21344", "21344", "21344", 
"21344", "29989", "29989", "29989", "29989", "313616", "313616", 
"46989", "46989", "46989", "46989", "46989", "46989", "46989", 
"46989", "46989", "5513", "5513", "5513", "5513", "5513", "5513", 
"5513", "5513", "5513", "5513", "5513", "5513", "5513", "5513", 
"5513", "5513", "716823", "716823", "716823", "716823", "716823", 
"716823", "716823", "716823", "716823", "716823", "789073", "789073", 
"789073", "789073", "789073", "789073", "789073", "789073", "789073", 
"789073", "789073", "789073", "789073", "797468", "797468", "797468", 
"797468", "797468", "797468", "797468", "797468", "797468", "797468", 
"797468", "797468", "797468", "797468", "797468", "797468", "80661", 
"80661", "80661", "80661", "80661", "80661", "80661", "80661", 
"80661", "80661", "80661", "80661", "80661", "80661", "80661", 
"80661", "866787", "866787", "866787", "866787", "866787", "866787", 
"866787", "866787", "866787", "866787", "866787", "866787", "866787", 
"866787", "866787", "866787", "866787", "882184", "882184", "882184", 
"882184", "91142", "91142", "91142", "91142", "91142", "91142", 
"91142", "91142", "91142", "91142", "91142", "91142", "91142", 
"91142", "91142", "91142", "91142", "912595", "95521", "95521", 
"95521", "95521", "95521", "95521", "95521", "95521", "95521", 
"95521", "95521", "95521"), year_row = c("2012", "2013", "2014", 
"2015", "2016", "2017", "2018", "2019", "2004", "2005", "2006", 
"2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", 
"2015", "2016", "2017", "2018", "2003", "2004", "2001", "2002", 
"2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", 
"2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", 
"2019", "2003", "2003", "2004", "2004", "2005", "2006", "2007", 
"2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", 
"2016", "2017", "2018", "2019", "2002", "2003", "2004", "2005", 
"2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", 
"2016", "2017", "2018", "2005", "2006", "2007", "2008", "2009", 
"2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", 
"2018", "2019", "2005", "2006", "2007", "2008", "2010", "2011", 
"2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", 
"2019", "2003", "2004", "2005", "2006", "2007", "2008", "2009", 
"2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", 
"2018", "2001", "2002", "2003", "2004", "2005", "2005", "2006", 
"2006", "2007", "2008", "2005", "2005", "2006", "2006", "2007", 
"2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", 
"2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", 
"2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", 
"2004", "2005", "2006", "2009", "2010", "2011", "2012", "2013", 
"2014", "2015", "2016", "2016", "2017", "2017", "2018", "2019", 
"2006", "2006", "2007", "2007", "2008", "2008", "2009", "2010", 
"2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", 
"2019", "2016", "2017", "2018", "2019", "2003", "2004", "2005", 
"2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013", 
"2014", "2015", "2016", "2017", "2018", "2019", "2018", "2006", 
"2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", 
"2017", "2018", "2019"), value = c(0.139083502412409, 0.11825775641964, 
0.0392592265955874, 0.0782154662932015, 0.0469489736719239, 0.0779023179300866, 
0.0228012955999517, 0.0854168153956153, 0.999737539238827, 0.0443179732423611, 
0.0390309184765143, 0.0922585629702825, 0.0403666403458272, 0.0382194133579655, 
0.042698343847385, 0.0685255449505098, 0.0675200147346398, 0.0187881296791695, 
0.0429479468414007, 0.079743052611441, 0.0320744404500168, 0.0144941429460794, 
0.119160368459038, 0.0925697035527265, 0.083984708174856, 0.996283500380756, 
0.107778943258269, 0.173435313229931, 0.0900909715473757, 0.0197546332298797, 
0.144120296067433, 0.158299486589792, 0.186295755413315, 0.101668114945428, 
0.0539410318683912, 0.0436257634521463, 0.0469995547968916, 0.0297825730932798, 
0.0378571859484953, 0.0409750669985696, 0.0835845366556822, 0.0461210474287448, 
0.0327580476668409, 0.177115131073337, 0.159254253746574, 0.165016169958592, 
0.217868629318303, 0.218151233840694, 0.0295314037649514, 0.350667808112922, 
0.04872107872219, 0.0428538370791108, 0.0702414653935244, 0.0509909654321864, 
0.021307630695821, 0.0487040360447408, 0.041478962700618, 0.0899399982611924, 
0.0596779333637508, 0.0594380923275606, 0.0260485423561843, 0.0227124484448211, 
0.0283345344486783, 0, 0, 0.987417394803821, 0.977452829626341, 
0.0935080361786257, 0.0399062483581079, 0.0597891120112862, 0.315545198466048, 
0.163328528827512, 0.0874148150892009, 0.0510720020721022, 0.0667940605980389, 
0.169532406681824, 0.0910555503799401, 0.0279487917930926, 0.10928052636183, 
0.123476844322464, 0.103160715130179, 0.103249999036791, 0.0745839591361995, 
0.0631175647480072, 0.184211621364709, 0.0215167736361518, 0.0245822231545278, 
0.0989784724113916, 0.0229286224340945, 0.0226191481684307, 0.0233422198272636, 
0.0273923715753037, 0.0252371778483782, 0.995932814180916, 0.173246569547786, 
0.0803668586813332, 0.117020596135848, 0, 0, 0.166465264703167, 
0.121736420297069, 0.222592282376611, 0.112875298902015, 0.239757945494177, 
0.06973597297872, 0.0830930852483126, 0.0805690109704797, 0.0616970606582679, 
0.949058915832725, 0.772825147232639, 0.275521756883282, 0.104905821737462, 
0.190089446388639, 0.104877738913191, 0.0451743677658758, 0.107005078500435, 
0.501394828959975, 0.469521731740851, 0.52003539194839, 0.467749776421354, 
0.354695678996227, 0.122712271145558, 0.416883650557191, 0.19336131647959, 
0.0617013322716825, 0.164405233667766, 0.231328666854185, 0.13516176196116, 
0.244769963995398, 0.245233564251184, 0.0202645676328879, 0.0203938119548491, 
0.0440061980952809, 0.119647769350871, 0.788760048600453, 0.52096301163371, 
0.894490022586396, 0, 0.915841803524472, 0.18031433341574, 0.203234762827244, 
0.228630682218131, 0.0912296950189682, 0.136106113682158, 0.164573356080639, 
0.0745781930106895, 0.150260763176162, 0.158653568728859, 0.0783486847140882, 
0.0869476996735634, 0.0324141335754994, 0.0898424570938522, 0.0363991230061337, 
0.032310166107677, 0.0209754067589013, 0.265484318305701, 0.113478924043708, 
0.0186602705559273, 0.0255246104570098, 0.056393297717265, 0.0857604028464242, 
0.0124478249166918, 0.00637473097535723, 0.207577271505867, 0.337100773405183, 
0.0646190164032464, 0.0917033805466042, 0.196505785433459, 0.331131037406129, 
0.210704702017685, 0.0637807753855683, 0.0539481325014424, 0.0989683802933529, 
0.524316699544961, 0.507211406678685, 0.0528130064031331, 0.0492601567601492, 
0.0952275608333137, 0.231443497541783, 0.0923624848840547, 0.0512562995607162, 
0.0899452189237439, 0.0899452189237439, 0.196385666544902, 0.196385666544902, 
0.0860496103484817, 0.0828699425192967, 0.0782477404202879, 0.0604891402552598, 
0.0620081387111392, 0.0581289157948599, 0.139040164810116, 0.121876448051833, 
0.0469641320576142, 0.0584450497367173, 0.0683450569694576, 0.107780652102444, 
0.0343457213273257, 0.318083029206905, 0.057398518201345, 0.134372218626067, 
0.159580001800562, 0.089498808618003, 0.0802305351945032, 0.121212589768212, 
0.0941452821751688, 0.146898998896027, 0.0785225299750667, 0.0507434601283108, 
0.0850646939602678, 0.121330800725537, 0.0186249957267043, 0.0693968500893254, 
0.0183033849029344, 0.0375008562807299, 0.0310986292138113, 0.0225677736567973, 
0.059073285118026, 0.892838347294089, 0.0311951595296633, 0.026834748568959, 
0.0472249488059499, 0.125624455369426, 0.0861728208246999, 0.0702399536446421, 
0.0265279690855791, 0.083416879130688, 0.0463856364022548, 0.131546576568187, 
0.058743275128742)), row.names = c(NA, -230L), class = "data.frame")

您可以使用 nest 创建嵌套的 tibble,然后将 kmeans 应用于每个组:

library(tidyverse)
# Number of clusters requested from kmeans() for every year.
n_centers <- 4

# Fit one k-means model per year on the single `value` column.
# kmeans() picks its starting centers at random, so call set.seed() first
# if the cluster assignments need to be reproducible.
x <- results %>%
  as_tibble() %>%
  select(-id_row) %>%
  group_by(year_row) %>%
  nest(.key = "value") %>%
  # One row per year from here on; drop the grouping so later verbs are plain.
  ungroup() %>%
  # Keep only years that kmeans() can split into `n_centers` groups: more rows
  # than centers, and at least `n_centers` distinct values, because kmeans()
  # stops when there are fewer distinct data points than cluster centers.
  filter(map_lgl(
    value,
    ~ nrow(.x) > n_centers && n_distinct(.x[["value"]]) >= n_centers
  )) %>%
  # Select the column by name instead of by position.
  mutate(kmeans = map(
    value,
    ~ kmeans(.x[["value"]], centers = n_centers, iter.max = 10, nstart = 1)
  ))

请注意,我过滤掉了一些年份,因为它们没有足够的观测值。

然后你可以像这样制作一个聚类图:

# Cluster plot for the first year kept in `x`: the nested values of that year,
# labelled with the cluster assignment from the matching kmeans fit.
cluster::clusplot(x$value[[1]], x$kmeans[[1]]$cluster)