R中string/character数据的逻辑运算
Logic operations with string/character data in R
我想对我 运行 在 R 中进行的实验进行评分。在这个实验中,受试者被问到多项选择题,只有一个正确答案。他们的回答被保存为 .csv 中的口头数据。这是数据的头部。第一行代表正确答案:
data <- structure(list(PRE_TR.1 = c(1L, 1L, 1L), PRE_TR.2 = c("B", "{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}",
"{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}"
), PRE_TR.3 = c("A", "{\"WQ3\":\"<strong>A. </strong> The number of days on which mean heights were over 71 inches would be greater for the small post office than for the large post office.\"}",
"{\"WQ3\":\"<strong>D.</strong> There is no basis for predicting which post office would have the greater number of days on which mean heights were over 71 inches.\"}"
), PRE_TR.4 = c("B", "{\"WQ4\":\"<strong>B.</strong> The large street\"}",
"{\"WQ4\":\"<strong>B.</strong> The large street\"}"), PRE_RULE_LLN = c("A",
"{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}",
"{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}"
), PRE_IS.1 = c(0L, 2L, 2L), PRE_IS.2 = c(3L, 3L, 3L), PRE_IS.3 = c(0L,
0L, 3L), PRE_IS.4 = c(3L, 3L, 3L), PRE_IS.5 = c(2L, 2L, 1L),
PRE_TR.5 = c("C", "{\"WQ1\":\"<strong>A. </strong> a standard deviation equal to 3 inches\"}",
"{\"WQ1\":\"<strong>D.</strong> Itâââ0šÂ¬Ã¢â0žÂ¢s impossible to predict the value of the standard deviation.\"}"
), PRE_RULE_CLT = c("A", "{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of sample means have a smaller and smaller standard deviation\"}",
"{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of sample means have a smaller and smaller standard deviation\"}"
), PT.1 = c(NA, TRUE, TRUE), PT.2 = c(NA, TRUE, TRUE), PT.3 = c(NA,
TRUE, TRUE), PT.4 = c(NA, TRUE, TRUE), PT.5 = c(NA, TRUE,
TRUE), PT.6 = c(NA, TRUE, TRUE), PT.7 = c(NA, TRUE, TRUE),
PT.8 = c(NA, TRUE, TRUE), PT.9 = c(NA, TRUE, TRUE), PT.10 = c(NA,
TRUE, FALSE), POST_IS.2 = c(3L, 3L, 0L), POST_IS.3 = c(0L,
0L, 3L), POST_IS.4 = c(3L, 3L, 0L), POST_IS.5 = c(2L, 2L,
0L), POST_IS.1 = c(0L, 0L, 0L), POST_TR.1 = c(1L, 1L, 1L),
POST_TR.5 = c("C", "{\"WQ1\":\"<strong>A. </strong> a standard deviation equal to 3 inches\"}",
"{\"WQ1\":\"<strong>D.</strong> Itâââ0šÂ¬Ã¢â0žÂ¢s impossible to predict the value of the standard deviation.\"}"
), POST_TR.2 = c("B", "{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}",
"{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}"
), POST_TR.3 = c("A", "{\"WQ3\":\"<strong>A. </strong> The number of days on which mean heights were over 71 inches would be greater for the small post office than for the large post office.\"}",
"{\"WQ3\":\"<strong>D.</strong> There is no basis for predicting which post office would have the greater number of days on which mean heights were over 71 inches.\"}"
), POST_TR.4 = c("B", "{\"WQ4\":\"<strong>B.</strong> The large street\"}",
"{\"WQ4\":\"<strong>B.</strong> The large street\"}"), POST_RULE_CLT = c("A",
"{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of sample means have a smaller and smaller standard deviation\"}",
"{\"R1\":\"<strong>D.</strong> As the sample size increases, the distribution of sample means have a similar standard deviation to that of the population.\"}"
), POST_RULE_LLN = c("A", "{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}",
"{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}"
), Exp1 = c("", "{\"Q0\":\"You are drawing a sample that is more like the population in size as well as having more subjects accounts for possibly outliers\"}",
"{\"Q0\":\"\nif the sample mean increases, will inevitably get closer to the population mean because the sample size is becoming closer to the population size.\"}"
), Exp2 = c("", "{\"Q0\":\"As your sample size increases, the means calculated will be around the population and outliers won’t throw off the calculation\"}",
"{\"Q0\":\"\"}"), TIME = c(NA, 508432L, 2180078L)), row.names = c(NA,
3L), class = "data.frame")
我需要对这些答案进行评分。评分系统应如下工作:对于所有以“PRE_TR”开头的列,每个参与者(行)应通过将他们在这些列中的正确答案相加而获得“TOTAL PRE_TR”的分数。然后他们需要通过在以“PRE_IS”开头的列中总结他们的正确答案等等来获得“TOTAL PRE_IS”的分数......下面我展示了我的示例输出的头部想要:
output <- structure(list(PRE_TR.1 = c(1L, 1L), PRE_TR.2 = c("B", "{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}"
), PRE_TR.3 = c("A", "{\"WQ3\":\"<strong>A. </strong> The number of days on which mean heights were over 71 inches would be greater for the small post office than for the large post office.\"}"
), PRE_TR.4 = c("B", "{\"WQ4\":\"<strong>B.</strong> The large street\"}"
), PRE_RULE_LLN = c("A", "{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}"
), PRE_IS.1 = c(0L, 2L), PRE_IS.2 = c(3L, 3L), PRE_IS.3 = c(0L,
0L), PRE_IS.4 = c(3L, 3L), PRE_IS.5 = c(2L, 2L), PRE_TR.5 = c("C",
"{\"WQ1\":\"<strong>A. </strong> a standard deviation equal to 3 inches\"}"
), PRE_RULE_CLT = c("A", "{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of sample means have a smaller and smaller standard deviation\"}"
), PT.1 = c(NA, TRUE), PT.2 = c(NA, TRUE), PT.3 = c(NA, TRUE),
PT.4 = c(NA, TRUE), PT.5 = c(NA, TRUE), PT.6 = c(NA, TRUE
), PT.7 = c(NA, TRUE), PT.8 = c(NA, TRUE), PT.9 = c(NA, TRUE
), PT.10 = c(NA, TRUE), POST_IS.2 = c(3L, 3L), POST_IS.3 = c(0L,
0L), POST_IS.4 = c(3L, 3L), POST_IS.5 = c(2L, 2L), POST_IS.1 = c(0L,
0L), POST_TR.1 = c(1L, 1L), POST_TR.5 = c("C", "{\"WQ1\":\"<strong>A. </strong> a standard deviation equal to 3 inches\"}"
), POST_TR.2 = c("B", "{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}"
), POST_TR.3 = c("A", "{\"WQ3\":\"<strong>A. </strong> The number of days on which mean heights were over 71 inches would be greater for the small post office than for the large post office.\"}"
), POST_TR.4 = c("B", "{\"WQ4\":\"<strong>B.</strong> The large street\"}"
), POST_RULE_CLT = c("A", "{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of sample means have a smaller and smaller standard deviation\"}"
), POST_RULE_LLN = c("A", "{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}"
), Exp1 = c("", "{\"Q0\":\"You are drawing a sample that is more like the population in size as well as having more subjects accounts for possibly outliers\"}"
), Exp2 = c("", "{\"Q0\":\"As your sample size increases, the means calculated will be around the population and outliers won’t throw off the calculation\"}"
), TIME = c(NA, 508432L), TOTAL_PRE_TR = c(NA, 4L), TOTAL_PRE_IS = c(NA,
4L), TOTAL_PRE_RULE = c(NA, 1L), TOTAL_T = c(NA, 10L), TOTAL_POST_TR = c(NA,
4L), TOTAL_POST_IS = c(NA, 5L), TOTAL_POST_RULE = c(NA, 2L
)), row.names = 1:2, class = "data.frame")
考虑到口头答案很长,我只想能够select正确选项中的一个词,这样更容易吗?一个算法,例如“如果第 PRE_TR1 列的答案包括“B”,则为总分 TOTAL_PRE_TR.
加一分
这是我的第一遍。这里有改进的余地,但也许它会让你走上正确的方向。这个问题似乎很适合 dplyr
,我很想知道使用那个包的答案。以下是您如何在 base 中执行此操作:
基本思想是使用 grepl()
的模式匹配来检测正确答案。这很简单,因为我们可以使用 apply()
系列将您的响应数据集的每一行与正确答案的数据集进行比较。
在我看来,具有挑战性的一点是仅在一组答案中对响应进行评分。我在这里使用更多模式匹配来查找独特的组,然后在 for 循环中计算每个组内的分数总和。
这些是您期望的结果吗?
# define dataset of correct answers only
ans <- data[1,]
# define dataset of responses only
data <- data[-1,]
# Get column indices for each group of answers
vars <- unique(gsub(pattern = ".?\d+", replacement = "", names(data)))
vars
#> [1] "PRE_TR" "PRE_RULE_LLN" "PRE_IS" "PRE_RULE_CLT"
#> [5] "PT" "POST_IS" "POST_TR" "POST_RULE_CLT"
#> [9] "POST_RULE_LLN" "Ex" "TIME"
colgroups <- sapply(1:length(vars), FUN = function(x) grep(vars[x], names(data)))
# Detect correct answers
scores <- sapply(1:ncol(data), FUN = function(x) ifelse(grepl(pattern = ans[1,x], x = data[,x]),1,0))
scores
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
#> [1,] 1 1 1 1 1 0 1 1 1 1 0 1 NA NA
#> [2,] 1 1 0 1 1 0 1 0 1 0 0 1 NA NA
#> [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26]
#> [1,] NA NA NA NA NA NA NA NA 1 1 1 1
#> [2,] NA NA NA NA NA NA NA NA 0 0 0 0
#> [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37]
#> [1,] 1 1 0 1 1 1 1 1 1 1 NA
#> [2,] 1 1 0 1 0 1 1 1 1 1 NA
# Make output dataframe
output <- data.frame(matrix(ncol = length(vars), nrow = nrow(data)))
names(output) <- vars
output
#> PRE_TR PRE_RULE_LLN PRE_IS PRE_RULE_CLT PT POST_IS POST_TR POST_RULE_CLT
#> 1 NA NA NA NA NA NA NA NA
#> 2 NA NA NA NA NA NA NA NA
#> POST_RULE_LLN Ex TIME
#> 1 NA NA NA
#> 2 NA NA NA
# Sum correct answers one group at a time
scores <- data.frame(scores)
for(i in 1:length(colgroups)){
# We only need to sum if a group has more than one response
# Multiple respones per group
if(length(colgroups[[i]]) > 1){
output[,vars[i]] <- rowSums(scores[,colgroups[[i]]])
} else {
# One response per group
output[,vars[i]] <- scores[,colgroups[[i]]]
}
}
output
#> PRE_TR PRE_RULE_LLN PRE_IS PRE_RULE_CLT PT POST_IS POST_TR POST_RULE_CLT
#> 1 4 1 4 1 NA 5 4 1
#> 2 3 1 2 1 NA 1 3 1
#> POST_RULE_LLN Ex TIME
#> 1 1 2 NA
#> 2 1 2 NA
Created on 2021-10-08 by the reprex package (v2.0.1)
data
library(tidyverse)
fNoCharacter = function(x) !is.character(x)
dfCorrect = data %>% as_tibble() %>% slice_head() %>%
mutate(PRE_TR.1 = PRE_TR.1 %>% paste0()) %>%
mutate_if(fNoCharacter, paste0)
dfAnswers = data %>% as_tibble() %>% slice(2:nrow(.)) %>%
mutate(id = 1:nrow(.)) %>%
mutate_if(fNoCharacter, paste0)
fGetAnswerPRE_TR = function(x) ifelse(is.na(str_match(x, "(<strong>)(.)")[3]), x,
str_match(x, "(<strong>)(.)")[3])
fGetAnswerPRE_TR = Vectorize(fGetAnswerPRE_TR)
fCorrect = function(val, start_w) val==dfCorrect %>%
pivot_longer(starts_with(start_w)) %>% pull(value)
dSumCorrect = function(data, AnsName){
df1<<-data
data %>%
pivot_longer(starts_with(AnsName)) %>%
mutate(Correct = fCorrect(fGetAnswerPRE_TR(value), AnsName)) %>%
pull(Correct) %>% sum()
}
dfAnswers %>% group_by(id) %>%
nest() %>%
mutate(`TOTAL PRE_TR` = map(data, ~dSumCorrect(.x,"PRE_TR")),
`TOTAL PRE_IS` = map(data, ~dSumCorrect(.x,"PRE_IS")),
`TOTAL PRE_RULE` = map(data, ~dSumCorrect(.x,"PRE_RULE")),
`TOTAL PT` = map(data, ~dSumCorrect(.x,"PT")),
`TOTAL POST_IS` = map(data, ~dSumCorrect(.x,"POST_IS")),
`TOTAL POST_TR` = map(data, ~dSumCorrect(.x,"POST_TR")),
`TOTAL POST_RULE` = map(data, ~dSumCorrect(.x,"POST_RULE"))) %>%
unnest(c(`TOTAL PRE_TR`:`TOTAL POST_RULE`))
输出
# A tibble: 2 x 9
# Groups: id [2]
id data `TOTAL PRE_TR` `TOTAL PRE_IS` `TOTAL PRE_RULE` `TOTAL PT` `TOTAL POST_IS` `TOTAL POST_TR` `TOTAL POST_RULE`
<chr> <list> <int> <int> <int> <int> <int> <int> <int>
1 1 <tibble [1 x 37]> 4 4 2 0 5 4 2
2 2 <tibble [1 x 37]> 3 2 2 0 1 3 1
我想对我 运行 在 R 中进行的实验进行评分。在这个实验中,受试者被问到多项选择题,只有一个正确答案。他们的回答被保存为 .csv 中的口头数据。这是数据的头部。第一行代表正确答案:
data <- structure(list(PRE_TR.1 = c(1L, 1L, 1L), PRE_TR.2 = c("B", "{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}",
"{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}"
), PRE_TR.3 = c("A", "{\"WQ3\":\"<strong>A. </strong> The number of days on which mean heights were over 71 inches would be greater for the small post office than for the large post office.\"}",
"{\"WQ3\":\"<strong>D.</strong> There is no basis for predicting which post office would have the greater number of days on which mean heights were over 71 inches.\"}"
), PRE_TR.4 = c("B", "{\"WQ4\":\"<strong>B.</strong> The large street\"}",
"{\"WQ4\":\"<strong>B.</strong> The large street\"}"), PRE_RULE_LLN = c("A",
"{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}",
"{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}"
), PRE_IS.1 = c(0L, 2L, 2L), PRE_IS.2 = c(3L, 3L, 3L), PRE_IS.3 = c(0L,
0L, 3L), PRE_IS.4 = c(3L, 3L, 3L), PRE_IS.5 = c(2L, 2L, 1L),
PRE_TR.5 = c("C", "{\"WQ1\":\"<strong>A. </strong> a standard deviation equal to 3 inches\"}",
"{\"WQ1\":\"<strong>D.</strong> Itâââ0šÂ¬Ã¢â0žÂ¢s impossible to predict the value of the standard deviation.\"}"
), PRE_RULE_CLT = c("A", "{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of sample means have a smaller and smaller standard deviation\"}",
"{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of sample means have a smaller and smaller standard deviation\"}"
), PT.1 = c(NA, TRUE, TRUE), PT.2 = c(NA, TRUE, TRUE), PT.3 = c(NA,
TRUE, TRUE), PT.4 = c(NA, TRUE, TRUE), PT.5 = c(NA, TRUE,
TRUE), PT.6 = c(NA, TRUE, TRUE), PT.7 = c(NA, TRUE, TRUE),
PT.8 = c(NA, TRUE, TRUE), PT.9 = c(NA, TRUE, TRUE), PT.10 = c(NA,
TRUE, FALSE), POST_IS.2 = c(3L, 3L, 0L), POST_IS.3 = c(0L,
0L, 3L), POST_IS.4 = c(3L, 3L, 0L), POST_IS.5 = c(2L, 2L,
0L), POST_IS.1 = c(0L, 0L, 0L), POST_TR.1 = c(1L, 1L, 1L),
POST_TR.5 = c("C", "{\"WQ1\":\"<strong>A. </strong> a standard deviation equal to 3 inches\"}",
"{\"WQ1\":\"<strong>D.</strong> Itâââ0šÂ¬Ã¢â0žÂ¢s impossible to predict the value of the standard deviation.\"}"
), POST_TR.2 = c("B", "{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}",
"{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}"
), POST_TR.3 = c("A", "{\"WQ3\":\"<strong>A. </strong> The number of days on which mean heights were over 71 inches would be greater for the small post office than for the large post office.\"}",
"{\"WQ3\":\"<strong>D.</strong> There is no basis for predicting which post office would have the greater number of days on which mean heights were over 71 inches.\"}"
), POST_TR.4 = c("B", "{\"WQ4\":\"<strong>B.</strong> The large street\"}",
"{\"WQ4\":\"<strong>B.</strong> The large street\"}"), POST_RULE_CLT = c("A",
"{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of sample means have a smaller and smaller standard deviation\"}",
"{\"R1\":\"<strong>D.</strong> As the sample size increases, the distribution of sample means have a similar standard deviation to that of the population.\"}"
), POST_RULE_LLN = c("A", "{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}",
"{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}"
), Exp1 = c("", "{\"Q0\":\"You are drawing a sample that is more like the population in size as well as having more subjects accounts for possibly outliers\"}",
"{\"Q0\":\"\nif the sample mean increases, will inevitably get closer to the population mean because the sample size is becoming closer to the population size.\"}"
), Exp2 = c("", "{\"Q0\":\"As your sample size increases, the means calculated will be around the population and outliers won’t throw off the calculation\"}",
"{\"Q0\":\"\"}"), TIME = c(NA, 508432L, 2180078L)), row.names = c(NA,
3L), class = "data.frame")
我需要对这些答案进行评分。评分系统应如下工作:对于所有以“PRE_TR”开头的列,每个参与者(行)应通过将他们在这些列中的正确答案相加而获得“TOTAL PRE_TR”的分数。然后他们需要通过在以“PRE_IS”开头的列中总结他们的正确答案等等来获得“TOTAL PRE_IS”的分数......下面我展示了我的示例输出的头部想要:
output <- structure(list(PRE_TR.1 = c(1L, 1L), PRE_TR.2 = c("B", "{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}"
), PRE_TR.3 = c("A", "{\"WQ3\":\"<strong>A. </strong> The number of days on which mean heights were over 71 inches would be greater for the small post office than for the large post office.\"}"
), PRE_TR.4 = c("B", "{\"WQ4\":\"<strong>B.</strong> The large street\"}"
), PRE_RULE_LLN = c("A", "{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}"
), PRE_IS.1 = c(0L, 2L), PRE_IS.2 = c(3L, 3L), PRE_IS.3 = c(0L,
0L), PRE_IS.4 = c(3L, 3L), PRE_IS.5 = c(2L, 2L), PRE_TR.5 = c("C",
"{\"WQ1\":\"<strong>A. </strong> a standard deviation equal to 3 inches\"}"
), PRE_RULE_CLT = c("A", "{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of sample means have a smaller and smaller standard deviation\"}"
), PT.1 = c(NA, TRUE), PT.2 = c(NA, TRUE), PT.3 = c(NA, TRUE),
PT.4 = c(NA, TRUE), PT.5 = c(NA, TRUE), PT.6 = c(NA, TRUE
), PT.7 = c(NA, TRUE), PT.8 = c(NA, TRUE), PT.9 = c(NA, TRUE
), PT.10 = c(NA, TRUE), POST_IS.2 = c(3L, 3L), POST_IS.3 = c(0L,
0L), POST_IS.4 = c(3L, 3L), POST_IS.5 = c(2L, 2L), POST_IS.1 = c(0L,
0L), POST_TR.1 = c(1L, 1L), POST_TR.5 = c("C", "{\"WQ1\":\"<strong>A. </strong> a standard deviation equal to 3 inches\"}"
), POST_TR.2 = c("B", "{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}"
), POST_TR.3 = c("A", "{\"WQ3\":\"<strong>A. </strong> The number of days on which mean heights were over 71 inches would be greater for the small post office than for the large post office.\"}"
), POST_TR.4 = c("B", "{\"WQ4\":\"<strong>B.</strong> The large street\"}"
), POST_RULE_CLT = c("A", "{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of sample means have a smaller and smaller standard deviation\"}"
), POST_RULE_LLN = c("A", "{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}"
), Exp1 = c("", "{\"Q0\":\"You are drawing a sample that is more like the population in size as well as having more subjects accounts for possibly outliers\"}"
), Exp2 = c("", "{\"Q0\":\"As your sample size increases, the means calculated will be around the population and outliers won’t throw off the calculation\"}"
), TIME = c(NA, 508432L), TOTAL_PRE_TR = c(NA, 4L), TOTAL_PRE_IS = c(NA,
4L), TOTAL_PRE_RULE = c(NA, 1L), TOTAL_T = c(NA, 10L), TOTAL_POST_TR = c(NA,
4L), TOTAL_POST_IS = c(NA, 5L), TOTAL_POST_RULE = c(NA, 2L
)), row.names = 1:2, class = "data.frame")
考虑到口头答案很长,我只想能够select正确选项中的一个词,这样更容易吗?一个算法,例如“如果第 PRE_TR1 列的答案包括“B”,则为总分 TOTAL_PRE_TR.
加一分这是我的第一遍。这里有改进的余地,但也许它会让你走上正确的方向。这个问题似乎很适合 dplyr
,我很想知道使用那个包的答案。以下是您如何在 base 中执行此操作:
基本思想是使用 grepl()
的模式匹配来检测正确答案。这很简单,因为我们可以使用 apply()
系列将您的响应数据集的每一行与正确答案的数据集进行比较。
在我看来,具有挑战性的一点是仅在一组答案中对响应进行评分。我在这里使用更多模式匹配来查找独特的组,然后在 for 循环中计算每个组内的分数总和。
这些是您期望的结果吗?
# define dataset of correct answers only
ans <- data[1,]
# define dataset of responses only
data <- data[-1,]
# Get column indices for each group of answers
vars <- unique(gsub(pattern = ".?\d+", replacement = "", names(data)))
vars
#> [1] "PRE_TR" "PRE_RULE_LLN" "PRE_IS" "PRE_RULE_CLT"
#> [5] "PT" "POST_IS" "POST_TR" "POST_RULE_CLT"
#> [9] "POST_RULE_LLN" "Ex" "TIME"
colgroups <- sapply(1:length(vars), FUN = function(x) grep(vars[x], names(data)))
# Detect correct answers
scores <- sapply(1:ncol(data), FUN = function(x) ifelse(grepl(pattern = ans[1,x], x = data[,x]),1,0))
scores
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
#> [1,] 1 1 1 1 1 0 1 1 1 1 0 1 NA NA
#> [2,] 1 1 0 1 1 0 1 0 1 0 0 1 NA NA
#> [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26]
#> [1,] NA NA NA NA NA NA NA NA 1 1 1 1
#> [2,] NA NA NA NA NA NA NA NA 0 0 0 0
#> [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37]
#> [1,] 1 1 0 1 1 1 1 1 1 1 NA
#> [2,] 1 1 0 1 0 1 1 1 1 1 NA
# Make output dataframe
output <- data.frame(matrix(ncol = length(vars), nrow = nrow(data)))
names(output) <- vars
output
#> PRE_TR PRE_RULE_LLN PRE_IS PRE_RULE_CLT PT POST_IS POST_TR POST_RULE_CLT
#> 1 NA NA NA NA NA NA NA NA
#> 2 NA NA NA NA NA NA NA NA
#> POST_RULE_LLN Ex TIME
#> 1 NA NA NA
#> 2 NA NA NA
# Sum correct answers one group at a time
scores <- data.frame(scores)
for(i in 1:length(colgroups)){
# We only need to sum if a group has more than one response
# Multiple respones per group
if(length(colgroups[[i]]) > 1){
output[,vars[i]] <- rowSums(scores[,colgroups[[i]]])
} else {
# One response per group
output[,vars[i]] <- scores[,colgroups[[i]]]
}
}
output
#> PRE_TR PRE_RULE_LLN PRE_IS PRE_RULE_CLT PT POST_IS POST_TR POST_RULE_CLT
#> 1 4 1 4 1 NA 5 4 1
#> 2 3 1 2 1 NA 1 3 1
#> POST_RULE_LLN Ex TIME
#> 1 1 2 NA
#> 2 1 2 NA
Created on 2021-10-08 by the reprex package (v2.0.1)
data
library(tidyverse)
fNoCharacter = function(x) !is.character(x)
dfCorrect = data %>% as_tibble() %>% slice_head() %>%
mutate(PRE_TR.1 = PRE_TR.1 %>% paste0()) %>%
mutate_if(fNoCharacter, paste0)
dfAnswers = data %>% as_tibble() %>% slice(2:nrow(.)) %>%
mutate(id = 1:nrow(.)) %>%
mutate_if(fNoCharacter, paste0)
fGetAnswerPRE_TR = function(x) ifelse(is.na(str_match(x, "(<strong>)(.)")[3]), x,
str_match(x, "(<strong>)(.)")[3])
fGetAnswerPRE_TR = Vectorize(fGetAnswerPRE_TR)
fCorrect = function(val, start_w) val==dfCorrect %>%
pivot_longer(starts_with(start_w)) %>% pull(value)
dSumCorrect = function(data, AnsName){
df1<<-data
data %>%
pivot_longer(starts_with(AnsName)) %>%
mutate(Correct = fCorrect(fGetAnswerPRE_TR(value), AnsName)) %>%
pull(Correct) %>% sum()
}
dfAnswers %>% group_by(id) %>%
nest() %>%
mutate(`TOTAL PRE_TR` = map(data, ~dSumCorrect(.x,"PRE_TR")),
`TOTAL PRE_IS` = map(data, ~dSumCorrect(.x,"PRE_IS")),
`TOTAL PRE_RULE` = map(data, ~dSumCorrect(.x,"PRE_RULE")),
`TOTAL PT` = map(data, ~dSumCorrect(.x,"PT")),
`TOTAL POST_IS` = map(data, ~dSumCorrect(.x,"POST_IS")),
`TOTAL POST_TR` = map(data, ~dSumCorrect(.x,"POST_TR")),
`TOTAL POST_RULE` = map(data, ~dSumCorrect(.x,"POST_RULE"))) %>%
unnest(c(`TOTAL PRE_TR`:`TOTAL POST_RULE`))
输出
# A tibble: 2 x 9
# Groups: id [2]
id data `TOTAL PRE_TR` `TOTAL PRE_IS` `TOTAL PRE_RULE` `TOTAL PT` `TOTAL POST_IS` `TOTAL POST_TR` `TOTAL POST_RULE`
<chr> <list> <int> <int> <int> <int> <int> <int> <int>
1 1 <tibble [1 x 37]> 4 4 2 0 5 4 2
2 2 <tibble [1 x 37]> 3 2 2 0 1 3 1