字符列之间的对应关系
Correspondence between character columns
我有一个包含五个字符列的数据框。每列的取值数量有限(分类数据)。在数据集中,某一列的每个值与其他列的各个值共同出现的次数各不相同。
这是一个示例数据集:
# Example data: 10 rows with two numeric columns (ID, card) and five
# character columns. Apart from ID, every column is constant within three
# consecutive runs of 4, 4 and 2 rows, so those columns are built with rep().
d <- local({
  # Expand one value per run into the full 10-element column.
  per_run <- function(values) rep(values, times = c(4, 4, 2))

  structure(
    list(
      ID = c(17, 12, 12, 17, 17, 12, 12, 17, 31, 13),
      card = per_run(c(1, 2, 3)),
      curf = per_run(c("c11", "c12", "c08")),
      mas = per_run(c("m2_indo", "m2_indo", "m3_every")),
      vac = per_run(c("v_100", "v_200", "v_100")),
      scho = per_run(c("s_nope", "s_50", "s_nope")),
      alco = per_run(c("a3_nsol", "a2_thu", "a1_sat"))
    ),
    row.names = c(NA, -10L),
    class = c("tbl_df", "tbl", "data.frame")
  )
})
ID card curf mas vac scho alco
<dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
1 17 1 c11 m2_indo v_100 s_nope a3_nsol
2 12 1 c11 m2_indo v_100 s_nope a3_nsol
3 12 1 c11 m2_indo v_100 s_nope a3_nsol
4 17 1 c11 m2_indo v_100 s_nope a3_nsol
5 17 2 c12 m2_indo v_200 s_50 a2_thu
6 12 2 c12 m2_indo v_200 s_50 a2_thu
7 12 2 c12 m2_indo v_200 s_50 a2_thu
8 17 2 c12 m2_indo v_200 s_50 a2_thu
9 31 3 c08 m3_every v_100 s_nope a1_sat
10 13 3 c08 m3_every v_100 s_nope a1_sat
我想计算列的每个可能值与其他列中的值同时出现的次数。
目标是得到一张类似下面的表格,例如:
col1 col2 No_of_Occurence
c11 m2_indo xxx
c12 m2_indo xxx
c08 m2_indo xxx
c11 v_100 xxx
c12 v_100 xxx
c08 v_100 xxx
...
s_50 a2_thu xxx
我想不出任何合理的计算方法,应该怎么做?
这是一种一次性完成所有字符列的方法,无需提前知道列的名称。
# Reshape to one record per (row, character column). `Row` remembers which
# original row each value came from; where(is.character) selects the columns
# by type, so their names do not have to be known in advance.
long1 <- d %>%
  mutate(Row = row_number()) %>%
  pivot_longer(
    cols = where(is.character),
    names_to = "Col1",
    values_to = "Value1"
  )

# The second side of each pair is the same long table under different names.
long2 <- long1 %>%
  rename(Col2 = Col1, Value2 = Value1)

# Joining on `Row` pairs every value with every value from the same original
# row. Pairs of a column with itself are dropped, then each
# (Value1, Value2) combination is tallied into `N`.
# NOTE(review): each Row matches several records on both sides, so recent
# dplyr versions are expected to warn about a many-to-many join here; the
# fan-out is intentional.
long1 %>%
  left_join(long2, by = "Row") %>%
  filter(Col1 != Col2) %>%
  count(Value1, Value2, name = "N")
# A tibble: 58 x 3
Value1 Value2 N
* <chr> <chr> <int>
1 a1_sat c08 2
2 a1_sat m3_every 2
3 a1_sat s_nope 2
4 a1_sat v_100 2
5 a2_thu c12 4
6 a2_thu m2_indo 4
7 a2_thu s_50 4
8 a2_thu v_200 4
9 a3_nsol c11 4
10 a3_nsol m2_indo 4
# … with 48 more rows
我有一个包含五个字符列的数据框。每列的取值数量有限(分类数据)。在数据集中,某一列的每个值与其他列的各个值共同出现的次数各不相同。
这是一个示例数据集:
# Example data: 10 rows with two numeric columns (ID, card) and five
# character columns. Apart from ID, every column is constant within three
# consecutive runs of 4, 4 and 2 rows, so those columns are built with rep().
d <- local({
  # Expand one value per run into the full 10-element column.
  per_run <- function(values) rep(values, times = c(4, 4, 2))

  structure(
    list(
      ID = c(17, 12, 12, 17, 17, 12, 12, 17, 31, 13),
      card = per_run(c(1, 2, 3)),
      curf = per_run(c("c11", "c12", "c08")),
      mas = per_run(c("m2_indo", "m2_indo", "m3_every")),
      vac = per_run(c("v_100", "v_200", "v_100")),
      scho = per_run(c("s_nope", "s_50", "s_nope")),
      alco = per_run(c("a3_nsol", "a2_thu", "a1_sat"))
    ),
    row.names = c(NA, -10L),
    class = c("tbl_df", "tbl", "data.frame")
  )
})
ID card curf mas vac scho alco
<dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
1 17 1 c11 m2_indo v_100 s_nope a3_nsol
2 12 1 c11 m2_indo v_100 s_nope a3_nsol
3 12 1 c11 m2_indo v_100 s_nope a3_nsol
4 17 1 c11 m2_indo v_100 s_nope a3_nsol
5 17 2 c12 m2_indo v_200 s_50 a2_thu
6 12 2 c12 m2_indo v_200 s_50 a2_thu
7 12 2 c12 m2_indo v_200 s_50 a2_thu
8 17 2 c12 m2_indo v_200 s_50 a2_thu
9 31 3 c08 m3_every v_100 s_nope a1_sat
10 13 3 c08 m3_every v_100 s_nope a1_sat
我想计算列的每个可能值与其他列中的值同时出现的次数。
目标是得到一张类似下面的表格,例如:
col1 col2 No_of_Occurence
c11 m2_indo xxx
c12 m2_indo xxx
c08 m2_indo xxx
c11 v_100 xxx
c12 v_100 xxx
c08 v_100 xxx
...
s_50 a2_thu xxx
我想不出任何合理的计算方法,应该怎么做?
这是一种一次性完成所有字符列的方法,无需提前知道列的名称。
# Reshape to one record per (row, character column). `Row` remembers which
# original row each value came from; where(is.character) selects the columns
# by type, so their names do not have to be known in advance.
long1 <- d %>%
  mutate(Row = row_number()) %>%
  pivot_longer(
    cols = where(is.character),
    names_to = "Col1",
    values_to = "Value1"
  )

# The second side of each pair is the same long table under different names.
long2 <- long1 %>%
  rename(Col2 = Col1, Value2 = Value1)

# Joining on `Row` pairs every value with every value from the same original
# row. Pairs of a column with itself are dropped, then each
# (Value1, Value2) combination is tallied into `N`.
# NOTE(review): each Row matches several records on both sides, so recent
# dplyr versions are expected to warn about a many-to-many join here; the
# fan-out is intentional.
long1 %>%
  left_join(long2, by = "Row") %>%
  filter(Col1 != Col2) %>%
  count(Value1, Value2, name = "N")
# A tibble: 58 x 3
Value1 Value2 N
* <chr> <chr> <int>
1 a1_sat c08 2
2 a1_sat m3_every 2
3 a1_sat s_nope 2
4 a1_sat v_100 2
5 a2_thu c12 4
6 a2_thu m2_indo 4
7 a2_thu s_50 4
8 a2_thu v_200 4
9 a3_nsol c11 4
10 a3_nsol m2_indo 4
# … with 48 more rows