对同一数据帧中的多列执行卡方检验
Perform Chi Square Tests on Multiple Columns from the Same Data Frame
我正在尝试编写一个函数,对现有数据框中的列进行分组,为每一列构建关联矩阵(列联表)并进行卡方检验,然后报告每次检验的 p 值。我一直在尝试模仿一个已有示例所使用的方法,但我发现我的 M 值并没有像我预期的那样被构造成矩阵。我不确定这是因为我的列比该示例中的多,还是我只是遗漏了什么。下面是我目前的数据结构示例:
# Six sample rows of the survey data, as produced by dput(). The literal is
# bound to `agent_data_clean_coded`, the name the pipelines in this post read,
# so the example can be run as-is. lubridate provides the "Period" S4 class
# used by the two *_Tenure_Period columns; library() (unlike require()) stops
# immediately if the package is missing.
library(lubridate)
agent_data_clean_coded <- structure(list(ResponseID = c("R_2fpKxLYlxAoplxP", "R_enci4Hwwee9XLSp",
"R_332X6CmsgY6RE5s", "R_3GAI7CSx4a74LVp", "R_2QXRRBh4UCFoHDl",
"R_3gSKU8piHOKWf9E"), region = structure(c(1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Lakeland", "Macon SE", "Other"), class = "factor"),
InCalls_Qrtl = structure(c(7L, 7L, 7L, 7L, 7L, 7L), .Label = c("NA",
"No EDGE Calls", "Bottom Quartile", "Second Quartile", "Third Quartile",
"Top Quartile", "Missing"), class = "factor"), InAHT_Qrtl = structure(c(7L,
7L, 7L, 7L, 7L, 7L), .Label = c("NA", "No EDGE Calls", "Bottom Quartile",
"Second Quartile", "Third Quartile", "Top Quartile", "Missing"
), class = "factor"), InHold_Qrtl = structure(c(7L, 7L, 7L,
7L, 7L, 7L), .Label = c("NA", "No EDGE Calls", "Bottom Quartile",
"Second Quartile", "Third Quartile", "Top Quartile", "Missing"
), class = "factor"), Overall_Tenure_Period = new("Period",
.Data = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), year = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), month = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), day = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), hour = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), minute = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), Recent_Tenure_Period = new("Period",
.Data = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), year = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), month = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), day = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), hour = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), minute = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), VOA_Overall_Tenure_Orig = structure(c(9L,
9L, 9L, 9L, 9L, 9L), .Label = c("< 1 Year", "1 Year - <2 Years",
"2-5 Years", "6-10 Years", "11-15 Years", "16-20 Years",
"21-25 Years", "26 Years or Longer", "Missing"), class = "factor"),
VOA_Overall_Tenure_Mod = structure(c(5L, 5L, 5L, 5L, 5L,
5L), .Label = c("<2 Years", "2-5 Years", "6-10 Years", ">10 Years",
"Missing"), class = "factor"), VOA_Recent_Tenure_Orig = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), VOA_Recent_Tenure_Mod = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), Region = c("Lakeland", "Lakeland", "Lakeland",
"Lakeland", "Lakeland", "Lakeland"), Tenure_Code_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Tenure_Code_Most = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Tenure_Code_Split = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Top = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Top = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Top = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Question = c("Overall, how satisfied are you with using EDGE?",
"Overall, how satisfied are you with using EDGE?", "Overall, how satisfied are you with using EDGE?",
"Overall, how satisfied are you with using EDGE?", "Overall, how satisfied are you with using EDGE?",
"Overall, how satisfied are you with using EDGE?"), Answer = c("Slightly Satisfied",
"Slightly Satisfied", "Slightly Satisfied", "Dissatisfied",
"Completely Dissatisfied", "Slightly Dissatisfied"), Answer_TopBox = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Answer_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Answer_GenSat = c("GenSat",
"GenSat", "GenSat", "Rest", "Rest", "Rest"), Answer_Bottom = c("Rest",
"Rest", "Rest", "Rest", "Bottom", "Rest"), Answer_Bottom2 = c("Rest",
"Rest", "Rest", "Bottom2", "Rest", "Rest"), Answer_GenDissat = c("Rest",
"Rest", "Rest", "GenDissat", "GenDissat", "GenDissat")), row.names = c(NA,
6L), class = "data.frame")
然后我尝试通过执行以下操作来重新创建示例:
# Step 1: count rows for every Region x segment flags x Question x
# Answer_TopBox combination and drop the grouping by converting to a plain
# data frame.
# Step 2: regroup by everything except Region and Answer_TopBox, nest, and
# turn each nested table into a matrix with one row per Answer_TopBox value
# and one column per Region value present in that group.
top_score_tests_agent <- agent_data_clean_coded %>%
  group_by(
    Region, Tenure_Code_Least, Tenure_Code_Most, Tenure_Code_Split,
    InCalls_Least, InCalls_Top, InCalls_Top2,
    InAHT_Least, InAHT_Top, InAHT_Top2,
    InHold_Least, InHold_Top, InHold_Top2,
    Question, Answer_TopBox
  ) %>%
  summarise(freq = n()) %>%
  as.data.frame() %>%
  group_by(
    Tenure_Code_Least, Tenure_Code_Most, Tenure_Code_Split,
    InCalls_Least, InCalls_Top, InCalls_Top2,
    InAHT_Least, InAHT_Top, InAHT_Top2,
    InHold_Least, InHold_Top, InHold_Top2,
    Question
  ) %>%
  nest() %>%
  mutate(M = map(data, function(counts) {
    # Region values become columns; the first remaining column is
    # Answer_TopBox, which is moved into the row names.
    wide <- spread(counts, Region, freq)
    mat <- as.matrix(wide[, -1])
    row.names(mat) <- wide$Answer_TopBox
    mat
  }))
但我发现,当我查看 top_score_tests_agent$M 中的元素(例如 top_score_tests_agent$M[[1]])来检查矩阵是否创建成功时,得到的是以下输出:
# dput() of the inspected matrix: a 1 x 1 integer matrix holding 4, with no
# row names and a single column named "Top Box".
structure(4L, .Dim = c(1L, 1L), .Dimnames = list(NULL, "Top Box"))
我想知道是否有人能看出我哪里做错了、以致无法生成矩阵,或者是否有其他方法可以实现这个目标?
编辑
我采用了 @Wietze314 所写的大部分代码。供日后有兴趣的读者参考,最终代码如下:
# For every Question x Answer_* column pair, run a chi-square test of Region
# against the answer (no continuity correction), keep the p-value, and label
# it with the strictest significance band it falls into.
result2 <- df %>%
  select(Region, Question, starts_with("Answer")) %>%
  gather(segment, answer, -Region, -Question) %>%
  group_by(Question, segment) %>%
  nest() %>%
  mutate(
    test = map(
      data,
      function(d) chisq.test(d$Region, d$answer, correct = FALSE)
    ),
    p = map_dbl(test, function(res) res[["p.value"]]),
    # Each nested ifelse() is only consulted where the previous test was
    # FALSE, so the lower bound of every band is implied.
    Status = ifelse(
      p <= 0.01, "99% Sig Difference",
      ifelse(
        p <= 0.05, "95% Sig Difference",
        ifelse(p <= 0.1, "90% Sig Difference", "Not Significant")
      )
    )
  ) %>%
  select(-data, -test)
这给了我一个看起来像这样的输出:
# dput() of the resulting 10-row tibble: the Question text, the Answer_*
# segment, the p-value and its significance label.
# NOTE(review): the column names here (Segment, pvalue) differ from the ones
# the pipeline produces (segment, p) -- presumably renamed before dput().
structure(list(Question = c("I feel comfortable 'trusting the system' with EDGE",
"EDGE allows me to be more efficient", "The E-learning training (GU Courses)",
"When I have questions about EDGE, I feel confident they will be answered",
"Overall, the training I received prepared me to use EDGE", "Overall, how satisfied are you with using EDGE?",
"The in-person, instructor-led training", "The formal training you received in EDGE",
"EDGE allows me to be more efficient", "I feel comfortable 'trusting the system' with EDGE"
), Segment = c("Answer_GenDissat", "Answer_Bottom", "Answer_GenSat",
"Answer_TopBox", "Answer_GenDissat", "Answer_TopBox", "Answer_Top2",
"Answer_Top2", "Answer_Bottom2", "Answer_GenSat"), pvalue = c(0.231403084430793,
0.299890413606335, 0.00108798852510237, 0.487810952072342, 0.131641662666334,
0.31818165042123, 0.501077891603077, 0.634730681199174, 0.389259022098406,
0.274277276570632), Status = c("Not Significant", "Not Significant",
"99% Sig Difference", "Not Significant", "Not Significant", "Not Significant",
"Not Significant", "Not Significant", "Not Significant", "Not Significant"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-10L))
我想我明白你的目标了。
由于示例数据的 Region 列中只有一个取值,我把数据集复制了一份,并把副本的 Region 改为 "other"。
# library() stops with an error if tidyverse is missing; require() would only
# return FALSE and let the script fail later at the first `%>%`.
library(tidyverse)

# The sample data contains a single Region, so append a copy relabelled
# "other" to have a second group to compare against.
df <- agent_data_clean_coded %>%
  bind_rows(agent_data_clean_coded %>% mutate(Region = "other"))

# Build one answer x Region contingency matrix per Answer* column.
result <- df %>%
  select(Region, starts_with("Answer")) %>%
  gather(question, answer, -Region) %>%
  group_by(question) %>%
  nest() %>%
  mutate(M = map(data, function(dat) {
    dat2 <- dat %>%
      # count() returns an ungrouped table with the tally in column `n`.
      count(Region, answer) %>%
      # fill = 0L: a Region/answer pair that never occurs is a zero count.
      # Without it spread() leaves NA, and chisq.test() rejects a matrix
      # containing NA.
      spread(Region, n, fill = 0L)
    # The first column holds the answer labels; move it into the row names.
    M <- as.matrix(dat2[, -1])
    row.names(M) <- dat2$answer
    M
  }))
我通常会用另一种方式来解决这个问题:
在这种做法中,我还排除了两个 Answer_Top 变量(Answer_TopBox 和 Answer_Top2),因为它们同样只包含一个水平,否则 chisq.test 会报错。这里我直接把原始数据传给 chisq.test,而不是先构建列联表。
# Chi-square test of Region against each remaining Answer* column, run on the
# raw vectors rather than on a pre-built contingency table. Columns whose
# name contains "Top" are dropped first.
result2 <- df %>%
  select(Region, starts_with("Answer")) %>%
  select(-contains("Top")) %>%
  gather(question, answer, -Region) %>%
  group_by(question) %>%
  nest() %>%
  mutate(
    test = map(data, function(d) chisq.test(d$Region, d$answer)),
    p = map_dbl(test, function(res) res[["p.value"]])
  )
我正在尝试编写一个函数,对现有数据框中的列进行分组,为每一列构建关联矩阵(列联表)并进行卡方检验,然后报告每次检验的 p 值。我一直在尝试模仿一个已有示例的做法;下面是我目前的数据结构示例:
# Six sample rows of the survey data, as produced by dput(). The literal is
# bound to `agent_data_clean_coded`, the name the pipelines in this post read,
# so the example can be run as-is. lubridate provides the "Period" S4 class
# used by the two *_Tenure_Period columns; library() (unlike require()) stops
# immediately if the package is missing.
library(lubridate)
agent_data_clean_coded <- structure(list(ResponseID = c("R_2fpKxLYlxAoplxP", "R_enci4Hwwee9XLSp",
"R_332X6CmsgY6RE5s", "R_3GAI7CSx4a74LVp", "R_2QXRRBh4UCFoHDl",
"R_3gSKU8piHOKWf9E"), region = structure(c(1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Lakeland", "Macon SE", "Other"), class = "factor"),
InCalls_Qrtl = structure(c(7L, 7L, 7L, 7L, 7L, 7L), .Label = c("NA",
"No EDGE Calls", "Bottom Quartile", "Second Quartile", "Third Quartile",
"Top Quartile", "Missing"), class = "factor"), InAHT_Qrtl = structure(c(7L,
7L, 7L, 7L, 7L, 7L), .Label = c("NA", "No EDGE Calls", "Bottom Quartile",
"Second Quartile", "Third Quartile", "Top Quartile", "Missing"
), class = "factor"), InHold_Qrtl = structure(c(7L, 7L, 7L,
7L, 7L, 7L), .Label = c("NA", "No EDGE Calls", "Bottom Quartile",
"Second Quartile", "Third Quartile", "Top Quartile", "Missing"
), class = "factor"), Overall_Tenure_Period = new("Period",
.Data = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), year = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), month = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), day = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), hour = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), minute = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), Recent_Tenure_Period = new("Period",
.Data = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), year = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), month = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), day = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), hour = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), minute = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), VOA_Overall_Tenure_Orig = structure(c(9L,
9L, 9L, 9L, 9L, 9L), .Label = c("< 1 Year", "1 Year - <2 Years",
"2-5 Years", "6-10 Years", "11-15 Years", "16-20 Years",
"21-25 Years", "26 Years or Longer", "Missing"), class = "factor"),
VOA_Overall_Tenure_Mod = structure(c(5L, 5L, 5L, 5L, 5L,
5L), .Label = c("<2 Years", "2-5 Years", "6-10 Years", ">10 Years",
"Missing"), class = "factor"), VOA_Recent_Tenure_Orig = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), VOA_Recent_Tenure_Mod = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), Region = c("Lakeland", "Lakeland", "Lakeland",
"Lakeland", "Lakeland", "Lakeland"), Tenure_Code_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Tenure_Code_Most = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Tenure_Code_Split = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Top = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Top = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Top = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Question = c("Overall, how satisfied are you with using EDGE?",
"Overall, how satisfied are you with using EDGE?", "Overall, how satisfied are you with using EDGE?",
"Overall, how satisfied are you with using EDGE?", "Overall, how satisfied are you with using EDGE?",
"Overall, how satisfied are you with using EDGE?"), Answer = c("Slightly Satisfied",
"Slightly Satisfied", "Slightly Satisfied", "Dissatisfied",
"Completely Dissatisfied", "Slightly Dissatisfied"), Answer_TopBox = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Answer_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Answer_GenSat = c("GenSat",
"GenSat", "GenSat", "Rest", "Rest", "Rest"), Answer_Bottom = c("Rest",
"Rest", "Rest", "Rest", "Bottom", "Rest"), Answer_Bottom2 = c("Rest",
"Rest", "Rest", "Bottom2", "Rest", "Rest"), Answer_GenDissat = c("Rest",
"Rest", "Rest", "GenDissat", "GenDissat", "GenDissat")), row.names = c(NA,
6L), class = "data.frame")
然后我尝试通过执行以下操作来重新创建示例:
# Step 1: count rows for every Region x segment flags x Question x
# Answer_TopBox combination and drop the grouping by converting to a plain
# data frame.
# Step 2: regroup by everything except Region and Answer_TopBox, nest, and
# turn each nested table into a matrix with one row per Answer_TopBox value
# and one column per Region value present in that group.
top_score_tests_agent <- agent_data_clean_coded %>%
  group_by(
    Region, Tenure_Code_Least, Tenure_Code_Most, Tenure_Code_Split,
    InCalls_Least, InCalls_Top, InCalls_Top2,
    InAHT_Least, InAHT_Top, InAHT_Top2,
    InHold_Least, InHold_Top, InHold_Top2,
    Question, Answer_TopBox
  ) %>%
  summarise(freq = n()) %>%
  as.data.frame() %>%
  group_by(
    Tenure_Code_Least, Tenure_Code_Most, Tenure_Code_Split,
    InCalls_Least, InCalls_Top, InCalls_Top2,
    InAHT_Least, InAHT_Top, InAHT_Top2,
    InHold_Least, InHold_Top, InHold_Top2,
    Question
  ) %>%
  nest() %>%
  mutate(M = map(data, function(counts) {
    # Region values become columns; the first remaining column is
    # Answer_TopBox, which is moved into the row names.
    wide <- spread(counts, Region, freq)
    mat <- as.matrix(wide[, -1])
    row.names(mat) <- wide$Answer_TopBox
    mat
  }))
但我发现,当我查看 top_score_tests_agent$M 中的元素(例如 top_score_tests_agent$M[[1]])来检查矩阵是否创建成功时,得到的是以下输出:
# dput() of the inspected matrix: a 1 x 1 integer matrix holding 4, with no
# row names and a single column named "Top Box".
structure(4L, .Dim = c(1L, 1L), .Dimnames = list(NULL, "Top Box"))
我想知道是否有人能看出我哪里做错了、以致无法生成矩阵,或者是否有其他方法可以实现这个目标?
编辑
我采用了 @Wietze314 所写的大部分代码。供日后有兴趣的读者参考,最终代码如下:
# For every Question x Answer_* column pair, run a chi-square test of Region
# against the answer (no continuity correction), keep the p-value, and label
# it with the strictest significance band it falls into.
result2 <- df %>%
  select(Region, Question, starts_with("Answer")) %>%
  gather(segment, answer, -Region, -Question) %>%
  group_by(Question, segment) %>%
  nest() %>%
  mutate(
    test = map(
      data,
      function(d) chisq.test(d$Region, d$answer, correct = FALSE)
    ),
    p = map_dbl(test, function(res) res[["p.value"]]),
    # Each nested ifelse() is only consulted where the previous test was
    # FALSE, so the lower bound of every band is implied.
    Status = ifelse(
      p <= 0.01, "99% Sig Difference",
      ifelse(
        p <= 0.05, "95% Sig Difference",
        ifelse(p <= 0.1, "90% Sig Difference", "Not Significant")
      )
    )
  ) %>%
  select(-data, -test)
这给了我一个看起来像这样的输出:
# dput() of the resulting 10-row tibble: the Question text, the Answer_*
# segment, the p-value and its significance label.
# NOTE(review): the column names here (Segment, pvalue) differ from the ones
# the pipeline produces (segment, p) -- presumably renamed before dput().
structure(list(Question = c("I feel comfortable 'trusting the system' with EDGE",
"EDGE allows me to be more efficient", "The E-learning training (GU Courses)",
"When I have questions about EDGE, I feel confident they will be answered",
"Overall, the training I received prepared me to use EDGE", "Overall, how satisfied are you with using EDGE?",
"The in-person, instructor-led training", "The formal training you received in EDGE",
"EDGE allows me to be more efficient", "I feel comfortable 'trusting the system' with EDGE"
), Segment = c("Answer_GenDissat", "Answer_Bottom", "Answer_GenSat",
"Answer_TopBox", "Answer_GenDissat", "Answer_TopBox", "Answer_Top2",
"Answer_Top2", "Answer_Bottom2", "Answer_GenSat"), pvalue = c(0.231403084430793,
0.299890413606335, 0.00108798852510237, 0.487810952072342, 0.131641662666334,
0.31818165042123, 0.501077891603077, 0.634730681199174, 0.389259022098406,
0.274277276570632), Status = c("Not Significant", "Not Significant",
"99% Sig Difference", "Not Significant", "Not Significant", "Not Significant",
"Not Significant", "Not Significant", "Not Significant", "Not Significant"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-10L))
我想我明白你的目标了。
由于示例数据的 Region 列中只有一个取值,我把数据集复制了一份,并把副本的 Region 改为 "other"。
# library() stops with an error if tidyverse is missing; require() would only
# return FALSE and let the script fail later at the first `%>%`.
library(tidyverse)

# The sample data contains a single Region, so append a copy relabelled
# "other" to have a second group to compare against.
df <- agent_data_clean_coded %>%
  bind_rows(agent_data_clean_coded %>% mutate(Region = "other"))

# Build one answer x Region contingency matrix per Answer* column.
result <- df %>%
  select(Region, starts_with("Answer")) %>%
  gather(question, answer, -Region) %>%
  group_by(question) %>%
  nest() %>%
  mutate(M = map(data, function(dat) {
    dat2 <- dat %>%
      # count() returns an ungrouped table with the tally in column `n`.
      count(Region, answer) %>%
      # fill = 0L: a Region/answer pair that never occurs is a zero count.
      # Without it spread() leaves NA, and chisq.test() rejects a matrix
      # containing NA.
      spread(Region, n, fill = 0L)
    # The first column holds the answer labels; move it into the row names.
    M <- as.matrix(dat2[, -1])
    row.names(M) <- dat2$answer
    M
  }))
我通常会用另一种方式来解决这个问题:
在这种做法中,我还排除了两个 Answer_Top 变量(Answer_TopBox 和 Answer_Top2),因为它们同样只包含一个水平,否则 chisq.test 会报错。这里我直接把原始数据传给 chisq.test,而不是先构建列联表。
# Chi-square test of Region against each remaining Answer* column, run on the
# raw vectors rather than on a pre-built contingency table. Columns whose
# name contains "Top" are dropped first.
result2 <- df %>%
  select(Region, starts_with("Answer")) %>%
  select(-contains("Top")) %>%
  gather(question, answer, -Region) %>%
  group_by(question) %>%
  nest() %>%
  mutate(
    test = map(data, function(d) chisq.test(d$Region, d$answer)),
    p = map_dbl(test, function(res) res[["p.value"]])
  )