计算数据框中值之间交互的比例
calculate proportion of interactions between values in dataframe
我有一个数据框,其结构与下面这个玩具数据集类似。与其直接处理这些二值(逻辑)列,我更希望生成一个新列,用来表示 value1、value2 和 value3 三列取值的组合(交互),因为总共只有 8 种可能的组合(例如 TRUE.TRUE.TRUE、TRUE.TRUE.FALSE 等)。
具体来说,我想针对每个 subject 与 object 的组合,计算各种取值组合所占的比例。
subject object value1 value2 value3
1 A TRUE TRUE FALSE
1 A TRUE TRUE TRUE
1 B TRUE FALSE TRUE
1 B TRUE FALSE TRUE
1 B TRUE TRUE TRUE
2 B TRUE FALSE FALSE
2 A TRUE TRUE FALSE
2 B FALSE FALSE FALSE
3 A TRUE TRUE FALSE
3 B FALSE TRUE FALSE
3 B TRUE TRUE TRUE
... ... ... ... ...
期望的输出:
subject object combination value
1 A True.True.True .5
1 A True.True.False .5
1 B True.True.True .33
1 B True.False.True .67
...
etc for subject 2 and 3...
试试这个:
library(tidyverse)
#Data
# Toy data set: 11 observations of three logical flags, each recorded for a
# subject/object pair.
df <- data.frame(
  subject = rep(1:3, times = c(5L, 3L, 3L)),
  object = c("A", "A", "B", "B", "B", "B", "A", "B", "A", "B", "B"),
  value1 = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE),
  value2 = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE),
  value3 = c(FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE),
  stringsAsFactors = FALSE # keep `object` as character on every R version
)
#Code
# Proportion of each value1/value2/value3 combination within every
# subject/object pair.
#   1. Collapse the three logical columns into one key such as "TRUE.TRUE.FALSE".
#   2. count() tallies the rows per (subject, object, value) into N and returns
#      an ungrouped result, so no ungroup()/regroup round trip is needed and
#      summarise()'s regrouping message is not triggered.
#   3. Within each subject/object pair, divide N by the pair's total.
# The result is still grouped by subject and object; append ungroup() if later
# steps should not operate per group.
df %>%
  mutate(value = paste(value1, value2, value3, sep = ".")) %>%
  count(subject, object, value, name = "N") %>%
  group_by(subject, object) %>%
  mutate(Prop = N / sum(N))
# A tibble: 10 x 5
# Groups: subject, object [6]
subject object value N Prop
<int> <chr> <chr> <int> <dbl>
1 1 A TRUE.TRUE.FALSE 1 0.5
2 1 A TRUE.TRUE.TRUE 1 0.5
3 1 B TRUE.FALSE.TRUE 2 0.667
4 1 B TRUE.TRUE.TRUE 1 0.333
5 2 A TRUE.TRUE.FALSE 1 1
6 2 B FALSE.FALSE.FALSE 1 0.5
7 2 B TRUE.FALSE.FALSE 1 0.5
8 3 A TRUE.TRUE.FALSE 1 1
9 3 B FALSE.TRUE.FALSE 1 0.5
10 3 B TRUE.TRUE.TRUE 1 0.5
你可以这样做:
# For a vector of combination keys, return the relative frequency of each
# element's key within that vector (one entry per element, named by key).
share_within_group <- function(key) {
  shares <- prop.table(table(key))
  c(shares[key])
}

# Keep every original row; add the combination key (`val`) and its share
# within the row's subject/object pair (`value`).
df %>%
  group_by(subject, object) %>%
  mutate(
    val = str_c(value1, value2, value3, sep = "."),
    value = share_within_group(val)
  )
# A tibble: 11 x 7
# Groups: subject, object [6]
subject object value1 value2 value3 val value
<int> <chr> <lgl> <lgl> <lgl> <chr> <dbl>
1 1 A TRUE TRUE FALSE TRUE.TRUE.FALSE 0.5
2 1 A TRUE TRUE TRUE TRUE.TRUE.TRUE 0.5
3 1 B TRUE FALSE TRUE TRUE.FALSE.TRUE 0.667
4 1 B TRUE FALSE TRUE TRUE.FALSE.TRUE 0.667
5 1 B TRUE TRUE TRUE TRUE.TRUE.TRUE 0.333
6 2 B TRUE FALSE FALSE TRUE.FALSE.FALSE 0.5
7 2 A TRUE TRUE FALSE TRUE.TRUE.FALSE 1
8 2 B FALSE FALSE FALSE FALSE.FALSE.FALSE 0.5
9 3 A TRUE TRUE FALSE TRUE.TRUE.FALSE 1
10 3 B FALSE TRUE FALSE FALSE.TRUE.FALSE 0.5
11 3 B TRUE TRUE TRUE TRUE.TRUE.TRUE 0.5
我有一个数据框,其结构与下面这个玩具数据集类似。与其直接处理这些二值(逻辑)列,我更希望生成一个新列,用来表示 value1、value2 和 value3 三列取值的组合(交互),因为总共只有 8 种可能的组合(例如 TRUE.TRUE.TRUE、TRUE.TRUE.FALSE 等)。
具体来说,我想针对每个 subject 与 object 的组合,计算各种取值组合所占的比例。
subject object value1 value2 value3
1 A TRUE TRUE FALSE
1 A TRUE TRUE TRUE
1 B TRUE FALSE TRUE
1 B TRUE FALSE TRUE
1 B TRUE TRUE TRUE
2 B TRUE FALSE FALSE
2 A TRUE TRUE FALSE
2 B FALSE FALSE FALSE
3 A TRUE TRUE FALSE
3 B FALSE TRUE FALSE
3 B TRUE TRUE TRUE
... ... ... ... ...
期望的输出:
subject object combination value
1 A True.True.True .5
1 A True.True.False .5
1 B True.True.True .33
1 B True.False.True .67
...
etc for subject 2 and 3...
试试这个:
library(tidyverse)
#Data
# Toy data set: 11 observations of three logical flags, each recorded for a
# subject/object pair.
df <- data.frame(
  subject = rep(1:3, times = c(5L, 3L, 3L)),
  object = c("A", "A", "B", "B", "B", "B", "A", "B", "A", "B", "B"),
  value1 = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE),
  value2 = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE),
  value3 = c(FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE),
  stringsAsFactors = FALSE # keep `object` as character on every R version
)
#Code
# Proportion of each value1/value2/value3 combination within every
# subject/object pair.
#   1. Collapse the three logical columns into one key such as "TRUE.TRUE.FALSE".
#   2. count() tallies the rows per (subject, object, value) into N and returns
#      an ungrouped result, so no ungroup()/regroup round trip is needed and
#      summarise()'s regrouping message is not triggered.
#   3. Within each subject/object pair, divide N by the pair's total.
# The result is still grouped by subject and object; append ungroup() if later
# steps should not operate per group.
df %>%
  mutate(value = paste(value1, value2, value3, sep = ".")) %>%
  count(subject, object, value, name = "N") %>%
  group_by(subject, object) %>%
  mutate(Prop = N / sum(N))
# A tibble: 10 x 5
# Groups: subject, object [6]
subject object value N Prop
<int> <chr> <chr> <int> <dbl>
1 1 A TRUE.TRUE.FALSE 1 0.5
2 1 A TRUE.TRUE.TRUE 1 0.5
3 1 B TRUE.FALSE.TRUE 2 0.667
4 1 B TRUE.TRUE.TRUE 1 0.333
5 2 A TRUE.TRUE.FALSE 1 1
6 2 B FALSE.FALSE.FALSE 1 0.5
7 2 B TRUE.FALSE.FALSE 1 0.5
8 3 A TRUE.TRUE.FALSE 1 1
9 3 B FALSE.TRUE.FALSE 1 0.5
10 3 B TRUE.TRUE.TRUE 1 0.5
你可以这样做:
# For a vector of combination keys, return the relative frequency of each
# element's key within that vector (one entry per element, named by key).
share_within_group <- function(key) {
  shares <- prop.table(table(key))
  c(shares[key])
}

# Keep every original row; add the combination key (`val`) and its share
# within the row's subject/object pair (`value`).
df %>%
  group_by(subject, object) %>%
  mutate(
    val = str_c(value1, value2, value3, sep = "."),
    value = share_within_group(val)
  )
# A tibble: 11 x 7
# Groups: subject, object [6]
subject object value1 value2 value3 val value
<int> <chr> <lgl> <lgl> <lgl> <chr> <dbl>
1 1 A TRUE TRUE FALSE TRUE.TRUE.FALSE 0.5
2 1 A TRUE TRUE TRUE TRUE.TRUE.TRUE 0.5
3 1 B TRUE FALSE TRUE TRUE.FALSE.TRUE 0.667
4 1 B TRUE FALSE TRUE TRUE.FALSE.TRUE 0.667
5 1 B TRUE TRUE TRUE TRUE.TRUE.TRUE 0.333
6 2 B TRUE FALSE FALSE TRUE.FALSE.FALSE 0.5
7 2 A TRUE TRUE FALSE TRUE.TRUE.FALSE 1
8 2 B FALSE FALSE FALSE FALSE.FALSE.FALSE 0.5
9 3 A TRUE TRUE FALSE TRUE.TRUE.FALSE 1
10 3 B FALSE TRUE FALSE FALSE.TRUE.FALSE 0.5
11 3 B TRUE TRUE TRUE TRUE.TRUE.TRUE 0.5