如何使用 stringr 或 grepl 创建新的字符变量？

Question

我有关于学生的科目代码和成绩的数据。每个人必须选修 4 门科目，其中英语是必修课。它由代码 301 表示。

列：SUB1、SUB2 等代表其他模块的主题代码，下一列代表标记。

基于这些代码，我正在尝试做两件事：

第一件事：

如果学生的科目代码为 42、43 和 44，我正在尝试创建一个包含 PCB 的课程列。如果学生的科目代码为 41、42 和 43，则为 PCM。如果代码为 55，则为 Commerce， 54 和 30。

我面临的问题是代码分布在各个列中，我很难将它们标准化。

第二件事：

根据确定的课程，我试图总结每个学生在这些科目中获得的成绩。但是，我也想加上英语成绩。

数据示例：

structure(list(Roll.No. = c(12653771L, 12653813L, 12653787L, 
12653850L, 12653660L, 12653553L, 12653902L, 12653888L, 12653440L, 
12653487L, 12653469L, 12653832L, 12653382L, 12653814L, 12653587L, 
12653508L, 12653449L, 12653445L, 12653776L, 12653806L), SUB = c(301L, 
301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L, 
301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L), MRK = c(93L, 
82L, 85L, 74L, 85L, 80L, 67L, 77L, 78L, 94L, 89L, 65L, 88L, 89L, 
82L, 85L, 91L, 77L, 97L, 76L), SUB.1 = c(30L, 30L, 30L, 30L, 
42L, 41L, 30L, 30L, 41L, 41L, 41L, 30L, 41L, 30L, 42L, 41L, 41L, 
41L, 30L, 30L), MRK.1 = c(74L, 97L, 75L, 73L, 72L, 81L, 62L, 
71L, 63L, 75L, 93L, 74L, 89L, 91L, 67L, 87L, 81L, 94L, 86L, 69L
), SUB.2 = c(48L, 41L, 48L, 54L, 43L, 42L, 48L, 48L, 42L, 42L, 
42L, 41L, 42L, 41L, 43L, 42L, 42L, 42L, 48L, 48L), MRK.2 = c(76L, 
95L, 79L, 74L, 72L, 75L, 67L, 74L, 60L, 72L, 93L, 56L, 79L, 68L, 
68L, 91L, 62L, 75L, 95L, 67L), SUB.3 = c(54L, 54L, 54L, 55L, 
44L, 43L, 54L, 54L, 43L, 43L, 43L, 54L, 43L, 54L, 44L, 43L, 43L, 
43L, 54L, 54L), MRK.3 = c(80L, 96L, 77L, 44L, 94L, 69L, 63L, 
74L, 57L, 67L, 80L, 67L, 72L, 89L, 95L, 87L, 68L, 82L, 94L, 69L
), SUB.4 = c(55L, 55L, 55L, 64L, 265L, 48L, 55L, 55L, 48L, 48L, 
283L, 55L, 48L, 55L, 64L, 283L, 283L, 48L, 55L, 55L), MRK.4 = c(45L, 
95L, 46L, 76L, 91L, 74L, 44L, 52L, 82L, 92L, 92L, 60L, 81L, 49L, 
83L, 89L, 90L, 83L, 93L, 61L), SUB.5 = c(NA, 64L, NA, 41L, 48L, 
NA, NA, NA, NA, NA, NA, 64L, NA, 49L, NA, 48L, 49L, NA, NA, NA
), MRK.5 = c(NA, "97", NA, "AB", "87", NA, NA, NA, NA, NA, NA, 
"71", NA, "97", NA, "83", "98", NA, NA, NA)), row.names = c(NA, 
20L), class = "data.frame")

Answer 1

假设您对参加所有这些课程的学生感兴趣，那么回答您的第一个问题：

I am trying to create a course column consisting of PCB if the student has subject codes 42, 43, and 44. PCM if the student has subject codes 41, 42, and 43. Commerce if the codes are 55, 54, and 30.

library(tidyverse)
df %>%
  mutate(across(-Roll.No., ~ as.numeric(.))) %>%
  rowwise() %>%
  mutate(subject_summary = case_when(sum(c_across(starts_with("SUB")) %in% c(42, 43, 44)) == 3 ~"PCB",
                                     sum(c_across(starts_with("SUB")) %in% c(41, 42, 43)) == 3 ~ "PCM",
                                     sum(c_across(starts_with("SUB")) %in% c(55, 54, 30)) == 3 ~ "Commerce",
                                     TRUE ~ NA_character_)) %>%
  ungroup()

给出：

# A tibble: 20 x 14
   Roll.No.   SUB   MRK SUB.1 MRK.1 SUB.2 MRK.2 SUB.3 MRK.3 SUB.4 MRK.4 SUB.5 MRK.5 subject_summary
      <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>          
 1 12653771   301    93    30    74    48    76    54    80    55    45    NA    NA Commerce       
 2 12653813   301    82    30    97    41    95    54    96    55    95    64    97 Commerce       
 3 12653787   301    85    30    75    48    79    54    77    55    46    NA    NA Commerce       
 4 12653850   301    74    30    73    54    74    55    44    64    76    41    NA Commerce       
 5 12653660   301    85    42    72    43    72    44    94   265    91    48    87 PCB            
 6 12653553   301    80    41    81    42    75    43    69    48    74    NA    NA PCM            
 7 12653902   301    67    30    62    48    67    54    63    55    44    NA    NA Commerce       
 8 12653888   301    77    30    71    48    74    54    74    55    52    NA    NA Commerce       
 9 12653440   301    78    41    63    42    60    43    57    48    82    NA    NA PCM            
10 12653487   301    94    41    75    42    72    43    67    48    92    NA    NA PCM            
11 12653469   301    89    41    93    42    93    43    80   283    92    NA    NA PCM            
12 12653832   301    65    30    74    41    56    54    67    55    60    64    71 Commerce       
13 12653382   301    88    41    89    42    79    43    72    48    81    NA    NA PCM            
14 12653814   301    89    30    91    41    68    54    89    55    49    49    97 Commerce       
15 12653587   301    82    42    67    43    68    44    95    64    83    NA    NA PCB            
16 12653508   301    85    41    87    42    91    43    87   283    89    48    83 PCM            
17 12653449   301    91    41    81    42    62    43    68   283    90    49    98 PCM            
18 12653445   301    77    41    94    42    75    43    82    48    83    NA    NA PCM            
19 12653776   301    97    30    86    48    95    54    94    55    93    NA    NA Commerce       
20 12653806   301    76    30    69    48    67    54    69    55    61    NA    NA Commerce

你的第二个问题可以通过继续上面的代码来回答（但为了方便在这里显示完整的代码）：

df %>%
  mutate(across(-Roll.No., ~ as.numeric(.))) %>%
  rowwise() %>%
  mutate(subject_summary = case_when(sum(c_across(starts_with("SUB")) %in% c(42, 43, 44)) == 3 ~"PCB",
                                     sum(c_across(starts_with("SUB")) %in% c(41, 42, 43)) == 3 ~ "PCM",
                                     sum(c_across(starts_with("SUB")) %in% c(55, 54, 30)) == 3 ~ "Commerce",
                                     TRUE ~ NA_character_)) %>%
  ungroup() %>%
  pivot_longer(cols = -c(Roll.No., subject_summary, SUB, MRK),
               names_pattern = "(.*)(\d{1})",
               names_to = c(".value", "course")) %>%
  group_by(Roll.No.) %>%
  summarize(subject_summary = first(subject_summary),
            grade = case_when(all(subject_summary == "PCB") ~ sum(MRK.[SUB. %in% c(42, 43, 44)]) + first(MRK),
                              all(subject_summary == "PCM") ~ sum(MRK.[SUB. %in% c(41, 42, 43)]) + first(MRK),
                              all(subject_summary == "Commerce") ~ sum(MRK.[SUB. %in% c(55, 54, 30)]) + first(MRK)))

给出：

# A tibble: 20 x 3
   Roll.No. subject_summary grade
      <int> <chr>           <dbl>
 1 12653382 PCM               328
 2 12653440 PCM               258
 3 12653445 PCM               328
 4 12653449 PCM               302
 5 12653469 PCM               355
 6 12653487 PCM               308
 7 12653508 PCM               350
 8 12653553 PCM               305
 9 12653587 PCB               312
10 12653660 PCB               323
11 12653771 Commerce          292
12 12653776 Commerce          370
13 12653787 Commerce          283
14 12653806 Commerce          275
15 12653813 Commerce          370
16 12653814 Commerce          318
17 12653832 Commerce          266
18 12653850 Commerce          265
19 12653888 Commerce          274
20 12653902 Commerce          236

Answer 2

这也应该回答：你的第一个问题：

df <- df %>% 
# convert to numeric and induce NAs in MRK.5 (losing AB)
mutate(MRK.5 = as.numeric(MRK.5)) %>% 
# convert NAs to 0
  mutate(across(where(anyNA), ~ replace_na(., 0))) %>% 
# create a new string of subjects 
  mutate(subjects = str_c(SUB.1, SUB.2, SUB.3, SUB.4, SUB.5, sep = "_")) %>%
# use case_when to specify course 
 mutate(course = case_when(str_detect(subjects, "42") & str_detect(subjects, "43") & str_detect(subjects, "44") ~ "PCB",
                            str_detect(subjects, "41") & str_detect(subjects, "42") & str_detect(subjects, "43") ~ "PCM",
                            str_detect(subjects, "30") & str_detect(subjects, "54") & str_detect(subjects, "55") ~ "Commerce",
                            TRUE ~ "Other"))

我不确定你是想要每门课程的三个科目加上英语的总分，还是所有科目的总分，但经过一番争论，你可以得到以下数据框，从那里你应该能够得到你需要的。

df %>%  select(-subjects) %>% 
# create consistent naming for the pivot separation
  rename(SUB.E = SUB, MRK.E = MRK) %>% 
# pivot into tidy form
 pivot_longer(cols = -c(Roll.No., subjects, course),
               names_pattern = "(\w{3})\.(.{1})",
               names_to = c(".value", "course2"))

给予：

A tibble: 120 × 5
   Roll.No. course   course2     SUB   MRK
      <int> <chr>    <chr> <dbl> <dbl>
 1 12653771 Commerce E       301    93
 2 12653771 Commerce 1        30    74
 3 12653771 Commerce 2        48    76
 4 12653771 Commerce 3        54    80
 5 12653771 Commerce 4        55    45
 6 12653771 Commerce 5         0     0
 7 12653813 Commerce E       301    82
 8 12653813 Commerce 1        30    97
 9 12653813 Commerce 2        41    95
10 12653813 Commerce 3        54    96

要计算每个学生的总分，您需要按条件过滤并使用 group_by 并汇总：

# filter for each condition 
df %>% filter(course == "Commerce" & SUB %in% c(301,30,50,54) | 
                course == "PCB" & SUB %in% c(301,42,43,44)  | 
                course == "PCM" & SUB %in% c(301,41,42,43)) %>% 
# summarise by group 
 group_by(Roll.No.,course) %>% 
  summarise(score = sum(MRK))

将以上部分合并为一个代码块：

df %>% mutate(across(, ~as.numeric(.))) %>%
  mutate(across(where(anyNA), ~ replace_na(., 0))) %>% 
  rename(SUB.E = SUB, MRK.E = MRK) %>% 
  mutate(subjects = str_c(SUB.1, SUB.2, SUB.3, SUB.4, SUB.5, sep = "_")) %>% 
  mutate(course = case_when(str_detect(subjects, "42") & str_detect(subjects, "43") & str_detect(subjects, "44") ~ "PCB",
                            str_detect(subjects, "41") & str_detect(subjects, "42") & str_detect(subjects, "43") ~ "PCM",
                            str_detect(subjects, "30") & str_detect(subjects, "54") & str_detect(subjects, "55") ~ "Commerce",
                            TRUE ~ "Other")) %>% 
  pivot_longer(cols = -c(Roll.No., subjects, course),
               names_pattern = "(\w{3})\.(.{1})",
               names_to = c(".value", "course2")) %>% 
  filter(course == "Commerce" & SUB %in% c(301,30,50,54) | 
           course == "PCB" & SUB %in% c(301,42,43,44)  | 
           course == "PCM" & SUB %in% c(301,41,42,43)) %>% 
  group_by(Roll.No., course) %>% 
  summarise(combined_mark = sum(MRK))

给予：

Roll.No. course   combined_mark
      <dbl> <chr>            <dbl>
 1 12653382 PCM                328
 2 12653440 PCM                258
 3 12653445 PCM                328
 4 12653449 PCM                302
 5 12653469 PCM                355
 6 12653487 PCM                308
 7 12653508 PCM                350
 8 12653553 PCM                305
 9 12653587 PCB                312
10 12653660 PCB                323
11 12653771 Commerce           247
12 12653776 Commerce           277
13 12653787 Commerce           237
14 12653806 Commerce           214
15 12653813 Commerce           275
16 12653814 Commerce           269
17 12653832 Commerce           206
18 12653850 Commerce           221
19 12653888 Commerce           222
20 12653902 Commerce           192

如何使用 stringr 或 grepl 创建新的字符变量？

How to use stringr or grepl to Make new character variable?

r

stringr

dplyr

grepl