如何聚类具有两个变量和出现频率的数据框
How to cluster a data frame with two variables and frequency of occurrence
假设我有一个包含示例的数据框:
structure(list(V1 = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L,
6L, 6L, 6L, 7L, 7L, 7L, 9L, 9L, 9L, 13L, 13L, 13L, 15L, 15L,
18L, 22L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L,
26L, 27L, 27L, 28L, 32L, 32L, 32L, 32L, 32L, 36L, 36L, 36L, 36L,
36L, 36L, 36L, 37L, 37L, 37L, 37L, 37L, 37L, 38L, 38L, 38L, 38L,
38L, 39L, 39L, 39L, 39L, 40L, 40L, 40L, 41L, 41L, 42L, 45L, 45L,
45L, 45L, 47L, 47L, 47L, 48L, 50L, 50L, 51L, 53L, 53L, 54L),
V2 = c(2L, 7L, 20L, 3L, 5L, 6L, 7L, 13L, 15L, 18L, 19L, 20L,
4L, 5L, 6L, 7L, 9L, 12L, 6L, 9L, 12L, 13L, 15L, 18L, 7L,
9L, 13L, 15L, 18L, 9L, 20L, 44L, 12L, 27L, 44L, 15L, 18L,
58L, 16L, 18L, 19L, 23L, 27L, 28L, 29L, 32L, 45L, 47L, 50L,
51L, 52L, 53L, 54L, 55L, 28L, 29L, 29L, 45L, 47L, 53L, 54L,
55L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 38L, 39L, 40L, 41L,
42L, 43L, 39L, 40L, 41L, 42L, 43L, 40L, 41L, 42L, 43L, 41L,
42L, 43L, 42L, 43L, 43L, 47L, 53L, 54L, 55L, 53L, 54L, 55L,
49L, 51L, 52L, 52L, 54L, 55L, 55L), N = c(1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), row.names = c(NA,
-104L), class = c("data.table", "data.frame"))
我应该如何构造它以便能够使用 cor()、dist() 和 hclust() 对它们进行聚类?
供您参考,V1 和 V2 是材料编号(material numbers)。
N = 它们以相同顺序出现在一起的频率。
我也可以将 N 更改为 0 - 1 之间的数字来表示相关性,如果这样更容易的话。
据我所知,我必须先把它转换成如下所示的矩阵,其中 V1 作为行,V2 作为列,N 作为值。但是我不知道该怎么做:
1 2 3 4 ...
1 0 1 1 4
2 1 0 2 2
3 1 4 0 1
4 1 0 3 0
...
"To my knowledge I have to change it to a matrix first, that look like this, where V1 can be the rows and V2 the column and N the values."
聚类分析需要的是距离矩阵,而您上面描述的(频数)矩阵并不是距离矩阵。
我猜你是想让一起出现的材料彼此靠近,而不一起出现的材料彼此远离,所以我建议这样定义材料之间的距离:一起出现时为 1/N,从未一起出现时为 2。这样你会得到如下结果:
> ## df is your data.frame
> dd <- dcast(rbind(df, df[, .(V1 = V2, V2 = V1, N)]), V1~V2, value.var = "N")[, -1]
> dd <- 1/dd
> dd[is.na(dd)] <- 2
> diag(dd) <- 0
> dd <- as.dist(dd)
> dd
1 2 3 4 5 6 7 9 ...
2 1.0
3 2.0 1.0
4 2.0 2.0 0.5
5 2.0 1.0 1.0 2.0
6 2.0 1.0 1.0 2.0 0.5
7 1.0 0.5 1.0 2.0 2.0 1.0
9 2.0 2.0 0.5 2.0 1.0 1.0 0.5
...
> hc <- hclust(dd)
> plot(hc)
使用相似性矩阵的示例:
> ss <- dcast(rbind(df, df[, .(V1 = V2, V2 = V1, N)]), V1~V2, value.var = "N")[, -1]
> ss <- ss/max(ss, na.rm = TRUE)
> ss[is.na(ss)] <- 0
> diag(ss) <- 1
> ss
1 2 3 4 5 6 7 9 ...
1: 1.0 0.5 0.0 0 0.0 0.0 0.5 0.0
2: 0.5 1.0 0.5 0 0.5 0.5 1.0 0.0
3: 0.0 0.5 1.0 1 0.5 0.5 0.5 1.0
4: 0.0 0.0 1.0 1 0.0 0.0 0.0 0.0
5: 0.0 0.5 0.5 0 1.0 1.0 0.0 0.5
6: 0.0 0.5 0.5 0 1.0 1.0 0.5 0.5
7: 0.5 1.0 0.5 0 0.0 0.5 1.0 1.0
8: 0.0 0.0 1.0 0 0.5 0.5 1.0 1.0
9: 0.0 0.0 0.5 0 0.5 0.0 0.0 0.5
10: 0.0 0.5 0.0 0 0.5 0.5 0.0 0.0
...
> dd <- as.dist(1 - ss)
> dd
1 2 3 4 5 6 7 9 ...
2 0.5
3 1.0 0.5
4 1.0 1.0 0.0
5 1.0 0.5 0.5 1.0
6 1.0 0.5 0.5 1.0 0.0
7 0.5 0.0 0.5 1.0 1.0 0.5
9 1.0 1.0 0.0 1.0 0.5 0.5 0.0
...
> hc2 <- hclust(dd)
> plot(hc2)
PAM 例子(pam() 来自 cluster 包,需要先执行 library(cluster)):
> # hclust - 5
> cl <- cutree(hc2, 5)
> summary(as.factor(cl))
1 2 3 4 5
562 1 1 2 1
>
> # pam - 5 with dd
> pam1 <- pam(dd, 5)
> summary(as.factor(pam1$clustering))
1 2 3 4 5
402 105 22 21 17
>
> # pam - 5 with sqrt(ss)
> dd2 <- as.dist(1 - sqrt(ss))
> pam2 <- pam(dd2, 5)
> summary(as.factor(pam2$clustering))
1 2 3 4 5
362 95 23 61 26
假设我有一个包含示例的数据框:
structure(list(V1 = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L,
6L, 6L, 6L, 7L, 7L, 7L, 9L, 9L, 9L, 13L, 13L, 13L, 15L, 15L,
18L, 22L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L,
26L, 27L, 27L, 28L, 32L, 32L, 32L, 32L, 32L, 36L, 36L, 36L, 36L,
36L, 36L, 36L, 37L, 37L, 37L, 37L, 37L, 37L, 38L, 38L, 38L, 38L,
38L, 39L, 39L, 39L, 39L, 40L, 40L, 40L, 41L, 41L, 42L, 45L, 45L,
45L, 45L, 47L, 47L, 47L, 48L, 50L, 50L, 51L, 53L, 53L, 54L),
V2 = c(2L, 7L, 20L, 3L, 5L, 6L, 7L, 13L, 15L, 18L, 19L, 20L,
4L, 5L, 6L, 7L, 9L, 12L, 6L, 9L, 12L, 13L, 15L, 18L, 7L,
9L, 13L, 15L, 18L, 9L, 20L, 44L, 12L, 27L, 44L, 15L, 18L,
58L, 16L, 18L, 19L, 23L, 27L, 28L, 29L, 32L, 45L, 47L, 50L,
51L, 52L, 53L, 54L, 55L, 28L, 29L, 29L, 45L, 47L, 53L, 54L,
55L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 38L, 39L, 40L, 41L,
42L, 43L, 39L, 40L, 41L, 42L, 43L, 40L, 41L, 42L, 43L, 41L,
42L, 43L, 42L, 43L, 43L, 47L, 53L, 54L, 55L, 53L, 54L, 55L,
49L, 51L, 52L, 52L, 54L, 55L, 55L), N = c(1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), row.names = c(NA,
-104L), class = c("data.table", "data.frame"))
我应该如何构造它以便能够使用 cor()、dist() 和 hclust() 对它们进行聚类?
供您参考,V1 和 V2 是材料编号(material numbers)。 N = 它们以相同顺序出现在一起的频率。 如果这样更容易的话,我也可以将 N 更改为 0 到 1 之间的数字来表示相关性。
据我所知,我必须先把它转换成如下所示的矩阵,其中 V1 作为行,V2 作为列,N 作为值。但是我不知道该怎么做:
1 2 3 4 ...
1 0 1 1 4
2 1 0 2 2
3 1 4 0 1
4 1 0 3 0
...
"To my knowledge I have to change it to a matrix first, that look like this, where V1 can be the rows and V2 the column and N the values."
聚类分析需要的是距离矩阵,而您上面描述的(频数)矩阵并不是距离矩阵。
我猜你是想让一起出现的材料彼此靠近,而不一起出现的材料彼此远离,所以我建议这样定义材料之间的距离:一起出现时为 1/N,从未一起出现时为 2。这样你会得到如下结果:
> ## df is your data.frame
> dd <- dcast(rbind(df, df[, .(V1 = V2, V2 = V1, N)]), V1~V2, value.var = "N")[, -1]
> dd <- 1/dd
> dd[is.na(dd)] <- 2
> diag(dd) <- 0
> dd <- as.dist(dd)
> dd
1 2 3 4 5 6 7 9 ...
2 1.0
3 2.0 1.0
4 2.0 2.0 0.5
5 2.0 1.0 1.0 2.0
6 2.0 1.0 1.0 2.0 0.5
7 1.0 0.5 1.0 2.0 2.0 1.0
9 2.0 2.0 0.5 2.0 1.0 1.0 0.5
...
> hc <- hclust(dd)
> plot(hc)
使用相似性矩阵的示例:
> ss <- dcast(rbind(df, df[, .(V1 = V2, V2 = V1, N)]), V1~V2, value.var = "N")[, -1]
> ss <- ss/max(ss, na.rm = TRUE)
> ss[is.na(ss)] <- 0
> diag(ss) <- 1
> ss
1 2 3 4 5 6 7 9 ...
1: 1.0 0.5 0.0 0 0.0 0.0 0.5 0.0
2: 0.5 1.0 0.5 0 0.5 0.5 1.0 0.0
3: 0.0 0.5 1.0 1 0.5 0.5 0.5 1.0
4: 0.0 0.0 1.0 1 0.0 0.0 0.0 0.0
5: 0.0 0.5 0.5 0 1.0 1.0 0.0 0.5
6: 0.0 0.5 0.5 0 1.0 1.0 0.5 0.5
7: 0.5 1.0 0.5 0 0.0 0.5 1.0 1.0
8: 0.0 0.0 1.0 0 0.5 0.5 1.0 1.0
9: 0.0 0.0 0.5 0 0.5 0.0 0.0 0.5
10: 0.0 0.5 0.0 0 0.5 0.5 0.0 0.0
...
> dd <- as.dist(1 - ss)
> dd
1 2 3 4 5 6 7 9 ...
2 0.5
3 1.0 0.5
4 1.0 1.0 0.0
5 1.0 0.5 0.5 1.0
6 1.0 0.5 0.5 1.0 0.0
7 0.5 0.0 0.5 1.0 1.0 0.5
9 1.0 1.0 0.0 1.0 0.5 0.5 0.0
...
> hc2 <- hclust(dd)
> plot(hc2)
PAM 例子(pam() 来自 cluster 包,需要先执行 library(cluster)):
> # hclust - 5
> cl <- cutree(hc2, 5)
> summary(as.factor(cl))
1 2 3 4 5
562 1 1 2 1
>
> # pam - 5 with dd
> pam1 <- pam(dd, 5)
> summary(as.factor(pam1$clustering))
1 2 3 4 5
402 105 22 21 17
>
> # pam - 5 with sqrt(ss)
> dd2 <- as.dist(1 - sqrt(ss))
> pam2 <- pam(dd2, 5)
> summary(as.factor(pam2$clustering))
1 2 3 4 5
362 95 23 61 26