我如何 运行 对 R 数据框中的多对向量进行多对 Wilcoxon 符号秩检验?
How would I run multiple paired Wilcoxon signed rank tests over many pairs of vectors in an R data frame?
我有一个包含 161 个免疫标记的数据集,每个标记都是数据框中的一个向量。使用 R,我想使用 Wilcoxon 符号秩(配对)测试比较 78 对这些向量。免疫标志物的名称以“_MOM”或“_CB”区分。
这是一个带有示例变量名称的“玩具”数据集:
# Create toy data frame
toydata = data.frame(CCBB_dyad_number=c(1,2,3,4,5,6,7,8,9,10),
cCMV_status = c("cCMV+", "cCMV-", "cCMV-",
"cCMV+", "cCMV+", "cCMV-",
"cCMV-", "cCMV+", "cCMV+",
"cCMV+"),
maternal_CMV_IgM_status = c("negative", "negative", "positive",
"negative", "positive", "negative",
"positive", "positive", "positive",
"negative"),
TB40E_conc_CB = c(1.954727, NA, 1.992956,
1.831331, 1.905936, 2.053446,
2.055809, 1.739377, 2.052576,
1.961838),
AD169r_conc_CB = c(5.86714, 6.469020, 9.387268,
5.733174, 6.480673, 5.176167,
7.548077, 7.209173, 4.944089,
9.667219),
TB40E_conc_MOM = c(7.389400, 5.917861, 7.022016,
8.017846, 10.046830, 7.503896,
6.427719, 9.498801, 7.351678,
6.050478),
AD169r_conc_MOM = c(7.011906, 6.506734, 9.986478,
5.673412, 3.825439, 5.795331,
7.082124, 6.810222, 5.54213,
8.271366)
)
在一些帮助下,我编写了代码来遍历所有 161 个向量,并使用 lapply
:
生成具有 p 值和测试类型的新数据框
# Pull actual names of variables, not just numbers
excluded_vars <- toydata %>%
select(., c(CCBB_dyad_number,
cCMV_status,
maternal_CMV_IgM_status)) %>%
names(.)
var_list <- toydata %>%
select(., -any_of(excluded_vars)) %>%
names(.)
out = lapply(var_list, function(v){
#cat(paste0("Wilcox: ", v, "\n")) #Loop message for checking
fmla <- formula(paste(v, " ~ cCMV_status"))
wilcox.test(fmla, data = toydata, paired = FALSE) %>%
purrr::flatten() %>% #Unnest/convert to plain list
as.data.frame(stringsAsFactors=FALSE) %>% #Set as data frame
mutate(Variable = v) %>% #add new variable column (could also get it from data.name)
select(Variable, W.statistic=W, P.value=p.value, Method=method) %>%
mutate(P.value=scientific(P.value, digits=2, format="e"))
}) %>% #%T>% { names(out) <- var_list } %>% #Didn't actually need this, but could if wanted a named list
purrr::compact() %>% #Remove any empty data frames/list elements (NULL)
dplyr::bind_rows() #Bind list of data frames into single data frame
out$FDR_P.value <- p.adjust(out$P.value, method="fdr", n=length(out$P.value)) %>%
scientific(., digits = 2, format = "e")
col_order <- c("Variable", "W.statistic", "P.value", # Reorder columns for tabling
"FDR_P.value", "Method")
out <- out[, col_order]
kable(out, "html", booktabs = T) %>%
kable_styling(latex_options = c("striped", "scale_down")) # Print output as a nice table
但是,我在思考如何编写代码以通过多个不同的向量对循环签名等级测试时遇到了麻烦。我在想我会提取向量(或者只是向量名称?),像这样:
toy_cCMV_pos <- toydata %>%
filter(cCMV_status == 'cCMV+') %>%
select(., -any_of(excluded_vars))
variable.set1 <- toy_cCMV_pos %>%
select(., ends_with("_MOM"))
variable.set2 <- toy_cCMV_pos %>%
select(., ends_with("_CB"))
有人建议像这样循环遍历向量。但是,我一直收到“未定义的列选择”错误,并且因为我不太明白下面的代码在做什么,所以我无法排除故障。
for (a in variable.set1) {
groups = unique(toy_cCMV_pos[,a])
for (b in variable.set2) {
wilcox.test(x=toy_cCMV_pos[which(toy_cCMV_pos[a]==groups[1]),b],
y=toy_cCMV_pos[which(toy_cCMV_pos[a]==groups[2]),b],
paired=TRUE)
}
}
# Keep getting error "undefined columns selected"
我希望能够像秩和检验一样将结果(包括 p 值)提取到新的数据框中。
谁能帮我想清楚如何做 运行 这些配对测试?
不确定这是否是您要查找的内容,但在这里我对每个前缀组在 CB
和 MOM
之间执行 Wilcoxon 检验。
library(tidyverse)
library(broom)
toydata = data.frame(CCBB_dyad_number=c(1,2,3,4,5,6,7,8,9,10), cCMV_status = c("cCMV+", "cCMV-", "cCMV-", "cCMV+", "cCMV+", "cCMV-", "cCMV-", "cCMV+", "cCMV+", "cCMV+"), maternal_CMV_IgM_status = c("negative", "negative", "positive", "negative", "positive", "negative", "positive", "positive", "positive", "negative"), TB40E_conc_CB = c(1.954727, NA, 1.992956, 1.831331, 1.905936, 2.053446, 2.055809, 1.739377, 2.052576, 1.961838), AD169r_conc_CB = c(5.86714, 6.469020, 9.387268, 5.733174, 6.480673, 5.176167, 7.548077, 7.209173, 4.944089, 9.667219), TB40E_conc_MOM = c(7.389400, 5.917861, 7.022016, 8.017846, 10.046830, 7.503896, 6.427719, 9.498801, 7.351678, 6.050478), AD169r_conc_MOM = c(7.011906, 6.506734, 9.986478, 5.673412, 3.825439, 5.795331, 7.082124, 6.810222, 5.54213, 8.271366))
toydata %>%
as_tibble() %>%
gather("var", "val", -1:-3) %>%
separate(var, c("marker", "conc", "type")) %>%
spread(type, val) %>%
group_by(marker) %>%
summarize(wilcox = tidy(wilcox.test(MOM, CB)))
#> # A tibble: 2 × 2
#> marker wilcox$statistic $p.value $method $alternative
#> <chr> <dbl> <dbl> <chr> <chr>
#> 1 AD169r 49 0.971 Wilcoxon rank sum exact test two.sided
#> 2 TB40E 90 0.0000217 Wilcoxon rank sum exact test two.sided
编辑:原始解决方案按行删除了缺失值,因此一些有效数据被删除得过于领先,结果与其他方法不一致。
更正确的做法是:
library(tidyr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
toydata = data.frame(CCBB_dyad_number=c(1,2,3,4,5,6,7,8,9,10),
cCMV_status = c("cCMV+", "cCMV-", "cCMV-",
"cCMV+", "cCMV+", "cCMV-",
"cCMV-", "cCMV+", "cCMV+",
"cCMV+"),
maternal_CMV_IgM_status = c("negative", "negative", "positive",
"negative", "positive", "negative",
"positive", "positive", "positive",
"negative"),
TB40E_conc_CB = c(1.954727, NA, 1.992956,
1.831331, 1.905936, 2.053446,
2.055809, 1.739377, 2.052576,
1.961838),
AD169r_conc_CB = c(5.86714, 6.469020, 9.387268,
5.733174, 6.480673, 5.176167,
7.548077, 7.209173, 4.944089,
9.667219),
TB40E_conc_MOM = c(7.389400, 5.917861, 7.022016,
8.017846, 10.046830, 7.503896,
6.427719, 9.498801, 7.351678,
6.050478),
AD169r_conc_MOM = c(7.011906, 6.506734, 9.986478,
5.673412, 3.825439, 5.795331,
7.082124, 6.810222, 5.54213,
8.271366))
toydata |>
select(ends_with("MOM"), ends_with("CB")) |>
pivot_longer(everything(),
names_to=c(".value", "group"),
names_sep="_(?!.*_)") |>
pivot_longer(-group,
names_to="variable",
values_to="value") |>
group_by(variable) |>
do(broom::tidy(wilcox.test(.$value ~ .$group, paired=TRUE, na.action=na.pass)))
#> # A tibble: 2 × 5
#> # Groups: variable [2]
#> variable statistic p.value method alternative
#> <chr> <dbl> <dbl> <chr> <chr>
#> 1 AD169r_conc 28 1 Wilcoxon signed rank exact test two.sided
#> 2 TB40E_conc 0 0.00391 Wilcoxon signed rank exact test two.sided
由 reprex package (v2.0.1)
于 2021-09-09 创建
结果与单独计算的结果相符:
> wilcox.test(toydata$TB40E_conc_CB, toydata$TB40E_conc_MOM, paired=TRUE)
Wilcoxon signed rank exact test
data: toydata$TB40E_conc_CB and toydata$TB40E_conc_MOM
V = 0, p-value = 0.003906
alternative hypothesis: true location shift is not equal to 0
和
> wilcox.test(toydata$AD169r_conc_CB, toydata$AD169r_conc_MOM, paired=TRUE)
Wilcoxon signed rank exact test
data: toydata$AD169r_conc_CB and toydata$AD169r_conc_MOM
V = 28, p-value = 1
alternative hypothesis: true location shift is not equal to 0
建议的解决方案的结果是 tibble/dataframe,因此您可以修改它,只选择需要的列。
我有一个包含 161 个免疫标记的数据集,每个标记都是数据框中的一个向量。使用 R,我想使用 Wilcoxon 符号秩(配对)测试比较 78 对这些向量。免疫标志物的名称以“_MOM”或“_CB”区分。
这是一个带有示例变量名称的“玩具”数据集:
# Create toy data frame
toydata = data.frame(CCBB_dyad_number=c(1,2,3,4,5,6,7,8,9,10),
cCMV_status = c("cCMV+", "cCMV-", "cCMV-",
"cCMV+", "cCMV+", "cCMV-",
"cCMV-", "cCMV+", "cCMV+",
"cCMV+"),
maternal_CMV_IgM_status = c("negative", "negative", "positive",
"negative", "positive", "negative",
"positive", "positive", "positive",
"negative"),
TB40E_conc_CB = c(1.954727, NA, 1.992956,
1.831331, 1.905936, 2.053446,
2.055809, 1.739377, 2.052576,
1.961838),
AD169r_conc_CB = c(5.86714, 6.469020, 9.387268,
5.733174, 6.480673, 5.176167,
7.548077, 7.209173, 4.944089,
9.667219),
TB40E_conc_MOM = c(7.389400, 5.917861, 7.022016,
8.017846, 10.046830, 7.503896,
6.427719, 9.498801, 7.351678,
6.050478),
AD169r_conc_MOM = c(7.011906, 6.506734, 9.986478,
5.673412, 3.825439, 5.795331,
7.082124, 6.810222, 5.54213,
8.271366)
)
在一些帮助下,我编写了代码来遍历所有 161 个向量,并使用 lapply
:
# Pull actual names of variables, not just numbers
excluded_vars <- toydata %>%
select(., c(CCBB_dyad_number,
cCMV_status,
maternal_CMV_IgM_status)) %>%
names(.)
var_list <- toydata %>%
select(., -any_of(excluded_vars)) %>%
names(.)
out = lapply(var_list, function(v){
#cat(paste0("Wilcox: ", v, "\n")) #Loop message for checking
fmla <- formula(paste(v, " ~ cCMV_status"))
wilcox.test(fmla, data = toydata, paired = FALSE) %>%
purrr::flatten() %>% #Unnest/convert to plain list
as.data.frame(stringsAsFactors=FALSE) %>% #Set as data frame
mutate(Variable = v) %>% #add new variable column (could also get it from data.name)
select(Variable, W.statistic=W, P.value=p.value, Method=method) %>%
mutate(P.value=scientific(P.value, digits=2, format="e"))
}) %>% #%T>% { names(out) <- var_list } %>% #Didn't actually need this, but could if wanted a named list
purrr::compact() %>% #Remove any empty data frames/list elements (NULL)
dplyr::bind_rows() #Bind list of data frames into single data frame
out$FDR_P.value <- p.adjust(out$P.value, method="fdr", n=length(out$P.value)) %>%
scientific(., digits = 2, format = "e")
col_order <- c("Variable", "W.statistic", "P.value", # Reorder columns for tabling
"FDR_P.value", "Method")
out <- out[, col_order]
kable(out, "html", booktabs = T) %>%
kable_styling(latex_options = c("striped", "scale_down")) # Print output as a nice table
但是,我在思考如何编写代码以通过多个不同的向量对循环签名等级测试时遇到了麻烦。我在想我会提取向量(或者只是向量名称?),像这样:
toy_cCMV_pos <- toydata %>%
filter(cCMV_status == 'cCMV+') %>%
select(., -any_of(excluded_vars))
variable.set1 <- toy_cCMV_pos %>%
select(., ends_with("_MOM"))
variable.set2 <- toy_cCMV_pos %>%
select(., ends_with("_CB"))
有人建议像这样循环遍历向量。但是,我一直收到“未定义的列选择”错误,并且因为我不太明白下面的代码在做什么,所以我无法排除故障。
for (a in variable.set1) {
groups = unique(toy_cCMV_pos[,a])
for (b in variable.set2) {
wilcox.test(x=toy_cCMV_pos[which(toy_cCMV_pos[a]==groups[1]),b],
y=toy_cCMV_pos[which(toy_cCMV_pos[a]==groups[2]),b],
paired=TRUE)
}
}
# Keep getting error "undefined columns selected"
我希望能够像秩和检验一样将结果(包括 p 值)提取到新的数据框中。
谁能帮我想清楚如何做 运行 这些配对测试?
不确定这是否是您要查找的内容,但在这里我对每个前缀组在 CB
和 MOM
之间执行 Wilcoxon 检验。
library(tidyverse)
library(broom)
toydata = data.frame(CCBB_dyad_number=c(1,2,3,4,5,6,7,8,9,10), cCMV_status = c("cCMV+", "cCMV-", "cCMV-", "cCMV+", "cCMV+", "cCMV-", "cCMV-", "cCMV+", "cCMV+", "cCMV+"), maternal_CMV_IgM_status = c("negative", "negative", "positive", "negative", "positive", "negative", "positive", "positive", "positive", "negative"), TB40E_conc_CB = c(1.954727, NA, 1.992956, 1.831331, 1.905936, 2.053446, 2.055809, 1.739377, 2.052576, 1.961838), AD169r_conc_CB = c(5.86714, 6.469020, 9.387268, 5.733174, 6.480673, 5.176167, 7.548077, 7.209173, 4.944089, 9.667219), TB40E_conc_MOM = c(7.389400, 5.917861, 7.022016, 8.017846, 10.046830, 7.503896, 6.427719, 9.498801, 7.351678, 6.050478), AD169r_conc_MOM = c(7.011906, 6.506734, 9.986478, 5.673412, 3.825439, 5.795331, 7.082124, 6.810222, 5.54213, 8.271366))
toydata %>%
as_tibble() %>%
gather("var", "val", -1:-3) %>%
separate(var, c("marker", "conc", "type")) %>%
spread(type, val) %>%
group_by(marker) %>%
summarize(wilcox = tidy(wilcox.test(MOM, CB)))
#> # A tibble: 2 × 2
#> marker wilcox$statistic $p.value $method $alternative
#> <chr> <dbl> <dbl> <chr> <chr>
#> 1 AD169r 49 0.971 Wilcoxon rank sum exact test two.sided
#> 2 TB40E 90 0.0000217 Wilcoxon rank sum exact test two.sided
编辑:原始解决方案按行删除了缺失值,因此一些有效数据被删除得过于领先,结果与其他方法不一致。
更正确的做法是:
library(tidyr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
toydata = data.frame(CCBB_dyad_number=c(1,2,3,4,5,6,7,8,9,10),
cCMV_status = c("cCMV+", "cCMV-", "cCMV-",
"cCMV+", "cCMV+", "cCMV-",
"cCMV-", "cCMV+", "cCMV+",
"cCMV+"),
maternal_CMV_IgM_status = c("negative", "negative", "positive",
"negative", "positive", "negative",
"positive", "positive", "positive",
"negative"),
TB40E_conc_CB = c(1.954727, NA, 1.992956,
1.831331, 1.905936, 2.053446,
2.055809, 1.739377, 2.052576,
1.961838),
AD169r_conc_CB = c(5.86714, 6.469020, 9.387268,
5.733174, 6.480673, 5.176167,
7.548077, 7.209173, 4.944089,
9.667219),
TB40E_conc_MOM = c(7.389400, 5.917861, 7.022016,
8.017846, 10.046830, 7.503896,
6.427719, 9.498801, 7.351678,
6.050478),
AD169r_conc_MOM = c(7.011906, 6.506734, 9.986478,
5.673412, 3.825439, 5.795331,
7.082124, 6.810222, 5.54213,
8.271366))
toydata |>
select(ends_with("MOM"), ends_with("CB")) |>
pivot_longer(everything(),
names_to=c(".value", "group"),
names_sep="_(?!.*_)") |>
pivot_longer(-group,
names_to="variable",
values_to="value") |>
group_by(variable) |>
do(broom::tidy(wilcox.test(.$value ~ .$group, paired=TRUE, na.action=na.pass)))
#> # A tibble: 2 × 5
#> # Groups: variable [2]
#> variable statistic p.value method alternative
#> <chr> <dbl> <dbl> <chr> <chr>
#> 1 AD169r_conc 28 1 Wilcoxon signed rank exact test two.sided
#> 2 TB40E_conc 0 0.00391 Wilcoxon signed rank exact test two.sided
由 reprex package (v2.0.1)
于 2021-09-09 创建结果与单独计算的结果相符:
> wilcox.test(toydata$TB40E_conc_CB, toydata$TB40E_conc_MOM, paired=TRUE)
Wilcoxon signed rank exact test
data: toydata$TB40E_conc_CB and toydata$TB40E_conc_MOM
V = 0, p-value = 0.003906
alternative hypothesis: true location shift is not equal to 0
和
> wilcox.test(toydata$AD169r_conc_CB, toydata$AD169r_conc_MOM, paired=TRUE)
Wilcoxon signed rank exact test
data: toydata$AD169r_conc_CB and toydata$AD169r_conc_MOM
V = 28, p-value = 1
alternative hypothesis: true location shift is not equal to 0
建议的解决方案的结果是 tibble/dataframe,因此您可以修改它,只选择需要的列。