在 R 中创建卡方矩阵
Create chi square matrix in R
谢谢你们的帮助。我对 R 略知一二,但还远不够熟练。我正在尝试从数据框创建一个卡方矩阵(形式与相关矩阵完全一样,只是其中的值换成卡方检验的结果)。我已经做了大量搜索(包括这个页面:Chi-square p value matrix in r,但没有帮助),甚至尝试过循环,但仍然没有成功。这是我目前的情况:
Windows10
上的 R Studio 版本 1.2.5033
数据 - 14 列 3401 行,我只使用 9 列作为卡方,8 个变量是二进制的,1 个是分类的
我已经得到了我需要的卡方结果,但我想将它组合成一个矩阵。由于它是一个标准矩阵,其中列逐渐变小,列的长度不同,因此大多数绑定命令都不起作用。我确实有 plyr,我正在尝试这样做,但失败了。这是代码:
# Example data: 14 numeric vectors with 10 observations each.
CA <- c(1, 0, 0, 1, 1, 1, 0, 1, 0, 0)
Pos <- c(1, 1, 1, 1, 1, 0, 0, 0, 0, 0)
Mon <- c(1, 1, 1, 1, 0, 0, 0, 0, 0, 1)
Sc <- c(0, 0, 0, 0, 1, 1, 0, 0, 1, 0)
ood <- c(1, 1, 1, 1, 0, 1, 1, 0, 0, 1)
# Eco is the only vector here with values other than 0/1
# (presumably the categorical variable mentioned in the question -- confirm).
Eco <- c(1, 2, 4, 6, 7, 3, 2, 5, 7, 7)
Orp <- c(0, 0, 0, 1, 1, 1, 1, 0, 0, 0)
BC <- c(1, 1, 1, 0, 0, 0, 0, 1, 0, 1)
SA <- c(0, 0, 1, 1, 1, 1, 1, 0, 0, 1)
MV <- c(1, 1, 0, 1, 1, 0, 1, 1, 0, 0)
Ad <- c(0, 1, 0, 1, 0, 1, 0, 1, 0, 1)
YR <- c(1, 0, 1, 0, 1, 0, 1, 0, 1, 0)
KC <- c(1, 1, 1, 1, 1, 1, 0, 1, 1, 1)
H <- c(1, 1, 1, 1, 1, 1, 1, 0, 0 ,1)
# Combine the vectors column-wise into a data.frame named `data`.
data <- data.frame(CA, Pos, Mon, Sc, ood, Eco, Orp, BC, SA, MV, Ad, YR, KC, H)
或者,这里是 dput() 数据:
# dput() output of a 9-row data set with the same 14 column names.
# Its class vector starts with "spec_tbl_df"/"tbl_df", so it is a tibble
# rather than a plain data.frame (the `spec` attribute is presumably left
# over from reading the data with readr -- confirm).
# NOTE: the structure(...) expression is not assigned to a name here.
structure(list(CA = c(0, 0, 1, 1, 1, 1, 0, 1, 0), Pos = c(0,
0, 0, 0, 1, 1, 0, 1, 0), Mon = c(1, 0, 1, 1, 0, 1, 1, 1, 1),
Sc = c(0, NA, 0, 0, 1, 1, 1, 1, 0), ood = c(1, 1, 1, 0, 1,
0, 1, 1, 1), Eco = c(7, 6, 7, 0, 1, 6, 5, 1, 3), Orp = c(0,
0, 0, 0, 0, 0, 0, 0, 0), BC = c(1, 0, 1, 1, 1, 1, 1, 1, 1
), SA = c(0, 1, 0, 0, 0, 0, 0, 0, 0), MV = c(15, 13, 16,
12, 16, 14, 11, 18, 12), Ad = c(2, 2, 2, 3, 3, 2, 2, 2, 2
), YR = c(2, 2, 1, 1, 2, 2, 2, 1, 2), KC = c(2, 2, 1, 1,
1, 1, 1, 1, 1), H = c(0, 1, 1, 0, 0, 1, 1, 0, 1)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L), spec = structure(list(
cols = list(CA = structure(list(), class = c("collector_double",
"collector")), Pos = structure(list(), class = c("collector_double",
"collector")), Mon = structure(list(), class = c("collector_double",
"collector")), Sc = structure(list(), class = c("collector_double",
"collector")), ood = structure(list(), class = c("collector_double",
"collector")), Eco = structure(list(), class = c("collector_double",
"collector")), Orp = structure(list(), class = c("collector_double",
"collector")), BC = structure(list(), class = c("collector_double",
"collector")), SA = structure(list(), class = c("collector_double",
"collector")), MV = structure(list(), class = c("collector_double",
"collector")), Ad = structure(list(), class = c("collector_double",
"collector")), YR = structure(list(), class = c("collector_double",
"collector")), KC = structure(list(), class = c("collector_double",
"collector")), H = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
library(tidyverse)
library(plyr)
library(reshape2)
# Chi-square test statistic for x against y, rounded to 2 decimal places.
chisqstat <- function(x, y) {
  test_result <- chisq.test(x, y)
  round(test_result$statistic, digits = 2)
}
# Chi-square test p-value for x against y, rounded to 4 decimal places.
chisqp <- function(x, y) {
  test_result <- chisq.test(x, y)
  round(test_result$p.value, digits = 4)
}
# Pairwise chi-square results collected in a list. For each reference
# variable (passed as y), apply() runs the helper over the selected columns
# of `data` (MARGIN = 2). Entries alternate: statistics first, then p-values,
# for the same reference variable. Each later pair covers fewer columns, so
# the list elements have different lengths.
chisqdata <- list(apply(data[c(2:7, 9, 14)], 2, chisqstat, y = data$CA), # chi-square statistic of each selected column vs. CA (child abuse)
apply(data[c(2:7, 9, 14)], 2, chisqp, y = data$CA), # chi-square p-value of each selected column vs. CA (child abuse)
apply(data[c(3:7, 9, 14)], 2, chisqstat, y = data$Pos),
apply(data[c(3:7, 9, 14)], 2, chisqp, y = data$Pos),
apply(data[c(4:7, 9, 14)], 2, chisqstat, y = data$Mon),
apply(data[c(4:7, 9, 14)], 2, chisqp, y = data$Mon),
apply(data[c(5:7, 9, 14)], 2, chisqstat, y = data$Sc),
apply(data[c(5:7, 9, 14)], 2, chisqp, y = data$Sc),
apply(data[c(6:7, 9, 14)], 2, chisqstat, y = data$ood),
apply(data[c(6:7, 9, 14)], 2, chisqp, y = data$ood),
apply(data[c(7, 9, 14)], 2, chisqstat, y = data$Eco),
apply(data[c(7, 9, 14)], 2, chisqp, y = data$Eco),
apply(data[c(9, 14)], 2, chisqstat, y = data$Orp),
apply(data[c(9, 14)], 2, chisqp, y = data$Orp),
apply(data[c(14)], 2, chisqstat, y = data$SA),
apply(data[c(14)], 2, chisqp, y = data$SA))
# NOTE(review): `rbind.fill.matrix()` is written with parentheses, so it is
# called with no arguments before do.call() runs; do.call() expects the
# function itself (or its name) as its first argument. This is likely part
# of why the attempt fails -- confirm.
do.call(rbind.fill.matrix(), chisqdata)
当我把每一行都做成:
# as.matrix() turns the apply() result into a one-column matrix.
as.matrix(apply(data[c(2:7, 9, 14)], 2, chisqstat, y = data$CA))
我得到了我想要的行名,所以我想知道我是否需要使用 as.matrix。我知道我可以做一个完整的矩阵,其中对角线上方和下方的信息是相同的,但我觉得那样看起来过于杂乱。我想得到这样的结果(这是相关性而不是卡方):
CA Pos Mon Sc ood Eco
Pos 0.20
Mon -0.20 0.60
Sc 0.22 -0.22 -0.65
ood -0.22 0.22 0.65 -0.52
Eco 0.00 -0.18 -0.18 0.38 -0.58
Orp 0.41 0.00 -0.41 0.36 0.09 0.04
我确定我遗漏了一些明显的东西,因此非常感谢您的帮助。或者,也许有更好的方法来执行此操作,再次感谢。
看起来您添加的 dput
数据是 tibble
,这可能与您读取数据的方式有关:
> class(data)
[1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
虽然您之前的示例是普通的 data.frame
(我称之为 df
)。
这会影响 here 中的 chisqmatrix
代码提取列的方式。具体来说:
m[i,j] = chisq.test(x[,i],x[,j],)$p.value
使用简单的 data.frame
,选择第一列将为您提供一个数值向量,这就是您想要的:
> df[,1]
[1] 0 0 1 1 1 1 0 1 0
> class(df[,1])
[1] "numeric"
而对于 tibble
,你得到的是另一个 tibble
:
> data[,1]
# A tibble: 9 x 1
CA
<dbl>
1 0
2 0
3 1
4 1
5 1
6 1
7 0
8 1
9 0
> class(data[,1])
[1] "tbl_df" "tbl" "data.frame"
所以,有两种方法可以解决这个问题。一种方法是使用双括号(例如 df[[1]]
),这将提取一个数字向量:
> data[[1]]
[1] 0 0 1 1 1 1 0 1 0
或者,您可以添加 drop = TRUE
以强制转换为矢量(例如,df[,1, drop = TRUE]
):
> data[,1,drop=T]
[1] 0 0 1 1 1 1 0 1 0
因为 tibble
[
默认为 drop = FALSE
:
https://tibble.tidyverse.org/reference/subsetting.html
因此,这里是重现您想要的结果的相同函数:
# Pairwise chi-square p-value matrix.
#
# x: a data.frame or tibble whose columns are the variables to compare.
# Returns a square matrix whose row and column names are colnames(x).
# Only cells strictly below the diagonal are filled: m[j, i] (j > i) holds
# the p-value of chisq.test() between columns i and j. The diagonal and the
# upper triangle stay NA.
# chisq.test() stops with an error if a column has fewer than two levels.
chisqmatrix <- function(x) {
  names <- colnames(x)
  num <- length(names)
  m <- matrix(nrow = num, ncol = num, dimnames = list(names, names))
  # seq_len() is empty when there are fewer than two columns, so the loops
  # are skipped instead of counting down the way 1:(num - 1) would.
  for (i in seq_len(max(num - 1L, 0L))) {
    for (j in seq(i + 1L, num)) {
      # drop = TRUE makes a tibble return a plain vector, as a data.frame does.
      m[j, i] <- chisq.test(x[, i, drop = TRUE], x[, j, drop = TRUE])$p.value
    }
  }
  m
}
# Compute the pairwise p-value matrix for the six selected columns.
mat <- chisqmatrix(data[c("CA", "Pos", "Mon", "Sc", "ood", "Eco")])
# Drop the first row and the last column: chisqmatrix() only fills cells
# below the diagonal, so those two contain nothing but NA.
mat[-1, -ncol(mat)]
CA Pos Mon Sc ood
Pos 0.2356799 NA NA NA NA
Mon 1.0000000 1.0000000 NA NA NA
Sc 1.0000000 0.1441270 1.0000000 NA NA
ood 0.5303348 1.0000000 1.0000000 1.0000000 NA
Eco 0.4220127 0.2399069 0.6669878 0.1562356 0.2959326
注意:我删除了“Orp”,因为它在示例数据中只有一个水平(取值全为 0),会导致报错;所以你可以替换为:
chisqmatrix(data[c("CA", "Pos", "Mon", "Sc", "ood", "Eco", "Orp")])
如果使用您的完整数据。
如果这对你有用,请告诉我。
谢谢你们的帮助。我对 R 略知一二,但还远不够熟练。我正在尝试从数据框创建一个卡方矩阵(形式与相关矩阵完全一样,只是其中的值换成卡方检验的结果)。我已经做了大量搜索(包括这个页面:Chi-square p value matrix in r,但没有帮助),甚至尝试过循环,但仍然没有成功。这是我目前的情况:
Windows10
上的 R Studio 版本 1.2.5033
数据 - 14 列 3401 行,我只使用 9 列作为卡方,8 个变量是二进制的,1 个是分类的
我已经得到了我需要的卡方结果,但我想将它组合成一个矩阵。由于它是一个标准矩阵,其中列逐渐变小,列的长度不同,因此大多数绑定命令都不起作用。我确实有 plyr,我正在尝试这样做,但失败了。这是代码:
# Example data: 14 numeric vectors with 10 observations each.
CA <- c(1, 0, 0, 1, 1, 1, 0, 1, 0, 0)
Pos <- c(1, 1, 1, 1, 1, 0, 0, 0, 0, 0)
Mon <- c(1, 1, 1, 1, 0, 0, 0, 0, 0, 1)
Sc <- c(0, 0, 0, 0, 1, 1, 0, 0, 1, 0)
ood <- c(1, 1, 1, 1, 0, 1, 1, 0, 0, 1)
# Eco is the only vector here with values other than 0/1
# (presumably the categorical variable mentioned in the question -- confirm).
Eco <- c(1, 2, 4, 6, 7, 3, 2, 5, 7, 7)
Orp <- c(0, 0, 0, 1, 1, 1, 1, 0, 0, 0)
BC <- c(1, 1, 1, 0, 0, 0, 0, 1, 0, 1)
SA <- c(0, 0, 1, 1, 1, 1, 1, 0, 0, 1)
MV <- c(1, 1, 0, 1, 1, 0, 1, 1, 0, 0)
Ad <- c(0, 1, 0, 1, 0, 1, 0, 1, 0, 1)
YR <- c(1, 0, 1, 0, 1, 0, 1, 0, 1, 0)
KC <- c(1, 1, 1, 1, 1, 1, 0, 1, 1, 1)
H <- c(1, 1, 1, 1, 1, 1, 1, 0, 0 ,1)
# Combine the vectors column-wise into a data.frame named `data`.
data <- data.frame(CA, Pos, Mon, Sc, ood, Eco, Orp, BC, SA, MV, Ad, YR, KC, H)
或者,这里是 dput() 数据:
# dput() output of a 9-row data set with the same 14 column names.
# Its class vector starts with "spec_tbl_df"/"tbl_df", so it is a tibble
# rather than a plain data.frame (the `spec` attribute is presumably left
# over from reading the data with readr -- confirm).
# NOTE: the structure(...) expression is not assigned to a name here.
structure(list(CA = c(0, 0, 1, 1, 1, 1, 0, 1, 0), Pos = c(0,
0, 0, 0, 1, 1, 0, 1, 0), Mon = c(1, 0, 1, 1, 0, 1, 1, 1, 1),
Sc = c(0, NA, 0, 0, 1, 1, 1, 1, 0), ood = c(1, 1, 1, 0, 1,
0, 1, 1, 1), Eco = c(7, 6, 7, 0, 1, 6, 5, 1, 3), Orp = c(0,
0, 0, 0, 0, 0, 0, 0, 0), BC = c(1, 0, 1, 1, 1, 1, 1, 1, 1
), SA = c(0, 1, 0, 0, 0, 0, 0, 0, 0), MV = c(15, 13, 16,
12, 16, 14, 11, 18, 12), Ad = c(2, 2, 2, 3, 3, 2, 2, 2, 2
), YR = c(2, 2, 1, 1, 2, 2, 2, 1, 2), KC = c(2, 2, 1, 1,
1, 1, 1, 1, 1), H = c(0, 1, 1, 0, 0, 1, 1, 0, 1)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L), spec = structure(list(
cols = list(CA = structure(list(), class = c("collector_double",
"collector")), Pos = structure(list(), class = c("collector_double",
"collector")), Mon = structure(list(), class = c("collector_double",
"collector")), Sc = structure(list(), class = c("collector_double",
"collector")), ood = structure(list(), class = c("collector_double",
"collector")), Eco = structure(list(), class = c("collector_double",
"collector")), Orp = structure(list(), class = c("collector_double",
"collector")), BC = structure(list(), class = c("collector_double",
"collector")), SA = structure(list(), class = c("collector_double",
"collector")), MV = structure(list(), class = c("collector_double",
"collector")), Ad = structure(list(), class = c("collector_double",
"collector")), YR = structure(list(), class = c("collector_double",
"collector")), KC = structure(list(), class = c("collector_double",
"collector")), H = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
library(tidyverse)
library(plyr)
library(reshape2)
# Chi-square test statistic for x against y, rounded to 2 decimal places.
chisqstat <- function(x, y) {
  test_result <- chisq.test(x, y)
  round(test_result$statistic, digits = 2)
}
# Chi-square test p-value for x against y, rounded to 4 decimal places.
chisqp <- function(x, y) {
  test_result <- chisq.test(x, y)
  round(test_result$p.value, digits = 4)
}
# Pairwise chi-square results collected in a list. For each reference
# variable (passed as y), apply() runs the helper over the selected columns
# of `data` (MARGIN = 2). Entries alternate: statistics first, then p-values,
# for the same reference variable. Each later pair covers fewer columns, so
# the list elements have different lengths.
chisqdata <- list(apply(data[c(2:7, 9, 14)], 2, chisqstat, y = data$CA), # chi-square statistic of each selected column vs. CA (child abuse)
apply(data[c(2:7, 9, 14)], 2, chisqp, y = data$CA), # chi-square p-value of each selected column vs. CA (child abuse)
apply(data[c(3:7, 9, 14)], 2, chisqstat, y = data$Pos),
apply(data[c(3:7, 9, 14)], 2, chisqp, y = data$Pos),
apply(data[c(4:7, 9, 14)], 2, chisqstat, y = data$Mon),
apply(data[c(4:7, 9, 14)], 2, chisqp, y = data$Mon),
apply(data[c(5:7, 9, 14)], 2, chisqstat, y = data$Sc),
apply(data[c(5:7, 9, 14)], 2, chisqp, y = data$Sc),
apply(data[c(6:7, 9, 14)], 2, chisqstat, y = data$ood),
apply(data[c(6:7, 9, 14)], 2, chisqp, y = data$ood),
apply(data[c(7, 9, 14)], 2, chisqstat, y = data$Eco),
apply(data[c(7, 9, 14)], 2, chisqp, y = data$Eco),
apply(data[c(9, 14)], 2, chisqstat, y = data$Orp),
apply(data[c(9, 14)], 2, chisqp, y = data$Orp),
apply(data[c(14)], 2, chisqstat, y = data$SA),
apply(data[c(14)], 2, chisqp, y = data$SA))
# NOTE(review): `rbind.fill.matrix()` is written with parentheses, so it is
# called with no arguments before do.call() runs; do.call() expects the
# function itself (or its name) as its first argument. This is likely part
# of why the attempt fails -- confirm.
do.call(rbind.fill.matrix(), chisqdata)
当我把每一行都做成:
# as.matrix() turns the apply() result into a one-column matrix.
as.matrix(apply(data[c(2:7, 9, 14)], 2, chisqstat, y = data$CA))
我得到了我想要的行名,所以我想知道我是否需要使用 as.matrix。我知道我可以做一个完整的矩阵,其中对角线上方和下方的信息是相同的,但我觉得那样看起来过于杂乱。我想得到这样的结果(这是相关性而不是卡方):
CA Pos Mon Sc ood Eco
Pos 0.20
Mon -0.20 0.60
Sc 0.22 -0.22 -0.65
ood -0.22 0.22 0.65 -0.52
Eco 0.00 -0.18 -0.18 0.38 -0.58
Orp 0.41 0.00 -0.41 0.36 0.09 0.04
我确定我遗漏了一些明显的东西,因此非常感谢您的帮助。或者,也许有更好的方法来执行此操作,再次感谢。
看起来您添加的 dput
数据是 tibble
,这可能与您读取数据的方式有关:
> class(data)
[1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
虽然您之前的示例是普通的 data.frame
(我称之为 df
)。
这会影响 here 中的 chisqmatrix
代码提取列的方式。具体来说:
m[i,j] = chisq.test(x[,i],x[,j],)$p.value
使用简单的 data.frame
,选择第一列将为您提供一个数值向量,这就是您想要的:
> df[,1]
[1] 0 0 1 1 1 1 0 1 0
> class(df[,1])
[1] "numeric"
而对于 tibble
,你得到的是另一个 tibble
:
> data[,1]
# A tibble: 9 x 1
CA
<dbl>
1 0
2 0
3 1
4 1
5 1
6 1
7 0
8 1
9 0
> class(data[,1])
[1] "tbl_df" "tbl" "data.frame"
所以,有两种方法可以解决这个问题。一种方法是使用双括号(例如 df[[1]]
),这将提取一个数字向量:
> data[[1]]
[1] 0 0 1 1 1 1 0 1 0
或者,您可以添加 drop = TRUE
以强制转换为矢量(例如,df[,1, drop = TRUE]
):
> data[,1,drop=T]
[1] 0 0 1 1 1 1 0 1 0
因为 tibble
[
默认为 drop = FALSE
:
https://tibble.tidyverse.org/reference/subsetting.html
因此,这里是重现您想要的结果的相同函数:
# Pairwise chi-square p-value matrix.
#
# x: a data.frame or tibble whose columns are the variables to compare.
# Returns a square matrix whose row and column names are colnames(x).
# Only cells strictly below the diagonal are filled: m[j, i] (j > i) holds
# the p-value of chisq.test() between columns i and j. The diagonal and the
# upper triangle stay NA.
# chisq.test() stops with an error if a column has fewer than two levels.
chisqmatrix <- function(x) {
  names <- colnames(x)
  num <- length(names)
  m <- matrix(nrow = num, ncol = num, dimnames = list(names, names))
  # seq_len() is empty when there are fewer than two columns, so the loops
  # are skipped instead of counting down the way 1:(num - 1) would.
  for (i in seq_len(max(num - 1L, 0L))) {
    for (j in seq(i + 1L, num)) {
      # drop = TRUE makes a tibble return a plain vector, as a data.frame does.
      m[j, i] <- chisq.test(x[, i, drop = TRUE], x[, j, drop = TRUE])$p.value
    }
  }
  m
}
# Compute the pairwise p-value matrix for the six selected columns.
mat <- chisqmatrix(data[c("CA", "Pos", "Mon", "Sc", "ood", "Eco")])
# Drop the first row and the last column: chisqmatrix() only fills cells
# below the diagonal, so those two contain nothing but NA.
mat[-1, -ncol(mat)]
CA Pos Mon Sc ood
Pos 0.2356799 NA NA NA NA
Mon 1.0000000 1.0000000 NA NA NA
Sc 1.0000000 0.1441270 1.0000000 NA NA
ood 0.5303348 1.0000000 1.0000000 1.0000000 NA
Eco 0.4220127 0.2399069 0.6669878 0.1562356 0.2959326
注意:我删除了“Orp”,因为它在示例数据中只有一个水平(取值全为 0),会导致报错;所以你可以替换为:
chisqmatrix(data[c("CA", "Pos", "Mon", "Sc", "ood", "Eco", "Orp")])
如果使用您的完整数据。
如果这对你有用,请告诉我。