循环(或应用?)两个因子水平的所有组合
Looping (or applying?) over all combinations of two factor levels
我的长数据框的前 9 行如下所示:
# Long-format example data: one row per (patient, reviewer) rating.
ptid <- rep(c(3, 4, 5), each = 3)
reviewer <- c("d", "b", "f", "a", "e", "c", "a", "f", "b")
outcome <- rep(c("Yes", "No", "Yes"), times = c(2, 4, 3))
dta <- data.frame(
  ptid = ptid,
  reviewer = reviewer,
  outcome = outcome,
  stringsAsFactors = FALSE
)
ptid 的每个值在 dta 内正好重复 3 次。reviewer 变量有 6 个取值(a 到 f),结果变量是二元的,没有任何缺失数据。
我制作了一个如下所示的空矩阵:
# Empty 6 x 6 matrix (all NA) with reviewers a..f labelling rows and columns.
mat <- matrix(
  NA,
  nrow = 6,
  ncol = 6,
  dimnames = rep(list(letters[1:6]), 2)
)
我想在矩阵的下半部分填入每个审阅者组合彼此达成一致的次数百分比。我可以使用以下代码对矩阵中的位置 [a,b]
执行此操作:
# Percentage agreement between reviewers "a" and "b", computed over the
# patients that both of them rated, stored as text in mat["a", "b"].
combo <- dta[dta$reviewer %in% c("a", "b"), c("ptid", "reviewer", "outcome")]
# One row per ptid and one column per reviewer; a cell is NA when that
# reviewer did not rate the patient, so na.omit() keeps shared patients only.
wide <- combo %>% spread(reviewer, outcome)
wide <- na.omit(wide)
# Count agreements with sum() rather than table(...)[2]: when the comparison
# is all TRUE or all FALSE the table has a single cell and [2] returns NA.
ab_agree <- sum(wide$a == wide$b)
ab <- paste0(signif((ab_agree / nrow(wide)) * 100, 3), "%")
mat["a", "b"] <- ab
我将非常感谢帮助编写一个函数,以最有效的方式填充矩阵的每一列。我最初开始写一个 for 循环,但我觉得用 apply 应该有更好的方法。
我不使用 apply
或 for
循环,而是使用 dplyr
和 tidyr
的函数(因为您已经在使用这些包)。
library(dplyr)
library(tidyr)
首先我得到所有评论者对的所有组合:
# Every ordered pair of distinct reviewers: the full a..f by a..f grid with
# the self-pairs removed.
reviewer_combos <- crossing(
  reviewer1 = letters[1:6],
  reviewer2 = letters[1:6]
) %>%
  filter(reviewer1 != reviewer2)
reviewer_combos
# A tibble: 30 x 2
# reviewer1 reviewer2
# <chr> <chr>
# 1 a b
# 2 a c
# 3 a d
# 4 a e
# 5 a f
# 6 b a
# 7 b c
# 8 b d
# 9 b e
# 10 b f
# ... with 20 more rows
然后用两个 left_join 将 reviewer1 和 reviewer2 的答案加入到数据中,并判断二者是否一致:
# Attach each reviewer's outcomes to every reviewer pair. The second join also
# matches on ptid, so two outcomes are only compared when they refer to the
# same patient; joining on reviewer alone pairs up ratings of different
# patients and distorts the agreement rate.
reviewer_combos <- reviewer_combos %>%
  left_join(dta, by = c("reviewer1" = "reviewer")) %>%
  left_join(
    dta,
    by = c("reviewer2" = "reviewer", "ptid" = "ptid"),
    suffix = c("", "2")
  ) %>%
  # Keep only the patients that reviewer2 rated as well.
  filter(!is.na(outcome2)) %>%
  mutate(agree = as.integer(outcome == outcome2))
reviewer_combos
# A tibble: 18 x 6
# reviewer1 reviewer2 ptid outcome outcome2 agree
# <chr> <chr> <dbl> <chr> <chr> <int>
# 1 a b 5.00 Yes Yes 1
# 2 a c 4.00 No No 1
# 3 a e 4.00 No No 1
# 4 a f 5.00 Yes Yes 1
# 5 b a 5.00 Yes Yes 1
# 6 b d 3.00 Yes Yes 1
# 7 b f 3.00 Yes No 0
# 8 b f 5.00 Yes Yes 1
# 9 c a 4.00 No No 1
# 10 c e 4.00 No No 1
# ... with 8 more rows
最后用group_by
和summarize
确定每个审稿人组的同意百分比,并用spread
以你想要的格式显示它们:
# Share of co-reviewed patients on which each ordered pair agreed, reshaped so
# that reviewer1 runs down the rows and reviewer2 across the columns. Pairs
# that never rated the same patient have no rows here and come out as NA.
reviewer_percentage <- reviewer_combos %>%
  group_by(reviewer1, reviewer2) %>%
  summarize(percentage_agree = sum(agree) / n()) %>%
  spread(reviewer2, percentage_agree)
reviewer_percentage
# A tibble: 6 x 7
# Groups: reviewer1 [6]
# reviewer1 a b c d e f
# * <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 a NA 1.00 1.00 NA 1.00 1.00
# 2 b 1.00 NA NA 1.00 NA 0.500
# 3 c 1.00 NA NA NA 1.00 NA
# 4 d NA 1.00 NA NA NA 0
# 5 e 1.00 NA 1.00 NA NA NA
# 6 f 1.00 0.500 NA 0 NA NA
如果您需要将其作为 matrix
并且上三角部分为 NA
,您可以这样做:
# Convert to a plain numeric matrix with reviewer names on both dimensions,
# then blank out the upper triangle so each pair is reported once.
reviewer_percentage_mat <- reviewer_percentage %>%
  ungroup() %>%
  select(-reviewer1) %>%
  as.matrix()
rownames(reviewer_percentage_mat) <- reviewer_percentage$reviewer1
reviewer_percentage_mat[upper.tri(reviewer_percentage_mat)] <- NA
reviewer_percentage_mat
# a b c d e f
# a NA NA NA NA NA NA
# b 1 NA NA NA NA NA
# c 1 NA NA NA NA NA
# d NA 1.0 NA NA NA NA
# e 1 NA 1 NA NA NA
# f 1 0.5 NA 0 NA NA
数据
# Example data: nine ratings, three reviewers per patient (ptid 3, 4 and 5).
dta <- data.frame(
  ptid = rep(c(3, 4, 5), each = 3),
  reviewer = c("d", "b", "f", "a", "e", "c", "a", "f", "b"),
  outcome = rep(c("Yes", "No", "Yes"), times = c(2, 4, 3)),
  stringsAsFactors = FALSE
)
我的长数据框的前 9 行如下所示:
# Long-format example data: one row per (patient, reviewer) rating.
ptid <- rep(c(3, 4, 5), each = 3)
reviewer <- c("d", "b", "f", "a", "e", "c", "a", "f", "b")
outcome <- rep(c("Yes", "No", "Yes"), times = c(2, 4, 3))
dta <- data.frame(
  ptid = ptid,
  reviewer = reviewer,
  outcome = outcome,
  stringsAsFactors = FALSE
)
ptid 的每个值在 dta 内正好重复 3 次。reviewer 变量有 6 个取值(a 到 f),结果变量是二元的,没有任何缺失数据。
我制作了一个如下所示的空矩阵:
# Empty 6 x 6 matrix (all NA) with reviewers a..f labelling rows and columns.
mat <- matrix(
  NA,
  nrow = 6,
  ncol = 6,
  dimnames = rep(list(letters[1:6]), 2)
)
我想在矩阵的下半部分填入每个审阅者组合彼此达成一致的次数百分比。我可以使用以下代码对矩阵中的位置 [a,b]
执行此操作:
# Percentage agreement between reviewers "a" and "b", computed over the
# patients that both of them rated, stored as text in mat["a", "b"].
combo <- dta[dta$reviewer %in% c("a", "b"), c("ptid", "reviewer", "outcome")]
# One row per ptid and one column per reviewer; a cell is NA when that
# reviewer did not rate the patient, so na.omit() keeps shared patients only.
wide <- combo %>% spread(reviewer, outcome)
wide <- na.omit(wide)
# Count agreements with sum() rather than table(...)[2]: when the comparison
# is all TRUE or all FALSE the table has a single cell and [2] returns NA.
ab_agree <- sum(wide$a == wide$b)
ab <- paste0(signif((ab_agree / nrow(wide)) * 100, 3), "%")
mat["a", "b"] <- ab
我将非常感谢帮助编写一个函数,以最有效的方式填充矩阵的每一列。我最初开始写一个 for 循环,但我觉得用 apply 应该有更好的方法。
我不使用 apply
或 for
循环,而是使用 dplyr
和 tidyr
的函数(因为您已经在使用这些包)。
library(dplyr)
library(tidyr)
首先我得到所有评论者对的所有组合:
# Every ordered pair of distinct reviewers: the full a..f by a..f grid with
# the self-pairs removed.
reviewer_combos <- crossing(
  reviewer1 = letters[1:6],
  reviewer2 = letters[1:6]
) %>%
  filter(reviewer1 != reviewer2)
reviewer_combos
# A tibble: 30 x 2
# reviewer1 reviewer2
# <chr> <chr>
# 1 a b
# 2 a c
# 3 a d
# 4 a e
# 5 a f
# 6 b a
# 7 b c
# 8 b d
# 9 b e
# 10 b f
# ... with 20 more rows
然后用两个 left_join 将 reviewer1 和 reviewer2 的答案加入到数据中,并判断二者是否一致:
# Attach each reviewer's outcomes to every reviewer pair. The second join also
# matches on ptid, so two outcomes are only compared when they refer to the
# same patient; joining on reviewer alone pairs up ratings of different
# patients and distorts the agreement rate.
reviewer_combos <- reviewer_combos %>%
  left_join(dta, by = c("reviewer1" = "reviewer")) %>%
  left_join(
    dta,
    by = c("reviewer2" = "reviewer", "ptid" = "ptid"),
    suffix = c("", "2")
  ) %>%
  # Keep only the patients that reviewer2 rated as well.
  filter(!is.na(outcome2)) %>%
  mutate(agree = as.integer(outcome == outcome2))
reviewer_combos
# A tibble: 18 x 6
# reviewer1 reviewer2 ptid outcome outcome2 agree
# <chr> <chr> <dbl> <chr> <chr> <int>
# 1 a b 5.00 Yes Yes 1
# 2 a c 4.00 No No 1
# 3 a e 4.00 No No 1
# 4 a f 5.00 Yes Yes 1
# 5 b a 5.00 Yes Yes 1
# 6 b d 3.00 Yes Yes 1
# 7 b f 3.00 Yes No 0
# 8 b f 5.00 Yes Yes 1
# 9 c a 4.00 No No 1
# 10 c e 4.00 No No 1
# ... with 8 more rows
最后用group_by
和summarize
确定每个审稿人组的同意百分比,并用spread
以你想要的格式显示它们:
# Share of co-reviewed patients on which each ordered pair agreed, reshaped so
# that reviewer1 runs down the rows and reviewer2 across the columns. Pairs
# that never rated the same patient have no rows here and come out as NA.
reviewer_percentage <- reviewer_combos %>%
  group_by(reviewer1, reviewer2) %>%
  summarize(percentage_agree = sum(agree) / n()) %>%
  spread(reviewer2, percentage_agree)
reviewer_percentage
# A tibble: 6 x 7
# Groups: reviewer1 [6]
# reviewer1 a b c d e f
# * <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 a NA 1.00 1.00 NA 1.00 1.00
# 2 b 1.00 NA NA 1.00 NA 0.500
# 3 c 1.00 NA NA NA 1.00 NA
# 4 d NA 1.00 NA NA NA 0
# 5 e 1.00 NA 1.00 NA NA NA
# 6 f 1.00 0.500 NA 0 NA NA
如果您需要将其作为 matrix
并且上三角部分为 NA
,您可以这样做:
# Convert to a plain numeric matrix with reviewer names on both dimensions,
# then blank out the upper triangle so each pair is reported once.
reviewer_percentage_mat <- reviewer_percentage %>%
  ungroup() %>%
  select(-reviewer1) %>%
  as.matrix()
rownames(reviewer_percentage_mat) <- reviewer_percentage$reviewer1
reviewer_percentage_mat[upper.tri(reviewer_percentage_mat)] <- NA
reviewer_percentage_mat
# a b c d e f
# a NA NA NA NA NA NA
# b 1 NA NA NA NA NA
# c 1 NA NA NA NA NA
# d NA 1.0 NA NA NA NA
# e 1 NA 1 NA NA NA
# f 1 0.5 NA 0 NA NA
数据
# Example data: nine ratings, three reviewers per patient (ptid 3, 4 and 5).
dta <- data.frame(
  ptid = rep(c(3, 4, 5), each = 3),
  reviewer = c("d", "b", "f", "a", "e", "c", "a", "f", "b"),
  outcome = rep(c("Yes", "No", "Yes"), times = c(2, 4, 3)),
  stringsAsFactors = FALSE
)