将 ifelse 应用于多列并创建新列

apply ifelse to multiple columns and create new column

我有一个包含 4 列职位名称的数据框。对于每一列,我想创建一个新列(category1、category2、category3、category4),其取值取决于职位名称中包含的关键词(例如,如果职位名称包含 "frontend"、"ui" 或 "ux",那么 category1 列应为 1)。我已经用下面的代码逐列手动完成了分类,但我想一次性对全部 4 列进行分类。感谢您的帮助!

# Derive a numeric category (1-10) from keywords found in the job title.
# The rules are tried top to bottom and the first one that matches wins;
# titles that match no rule get NA.
# NOTE(review): the result is written to `data_rel`, but the titles are read
# from `data$job4_clean` -- confirm both refer to the intended data frame.
data_rel$category1 <-
  ifelse(grepl("frontend|ui|ux", data$job4_clean), 1,
  ifelse(grepl("backend", data$job4_clean), 2,
  ifelse(grepl("fullstack", data$job4_clean), 3,
  ifelse(grepl("entwickler|development|application|developer|software", data$job4_clean), 4,
  ifelse(grepl("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning", data$job4_clean), 5,
  ifelse(grepl("research|teaching|akademischer|researcher", data$job4_clean), 6,
  ifelse(grepl("project|manager|product|consultant|consulting", data$job4_clean), 7,
  ifelse(grepl("it|security|technical|tech", data$job4_clean), 8,
  ifelse(grepl("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design", data$job4_clean), 9,
  ifelse(grepl("founder|ceo|partner|chief|executive|cto", data$job4_clean), 10,
         NA))))))))))
# Example data: 20 rows and four job-title columns (job1..job4).
# Titles are kept exactly as given, including leading/trailing spaces and
# misspellings; NA marks a missing title.
data_rel <- structure(
  list(
    job1 = c(
      "phd fellow",
      "java developer intern",
      "optical engineer",
      " dwh bi engineer",
      " software engineer",
      "software developer",
      "data engineer",
      "application software engineer",
      "software developer",
      " web developer",
      "web developer",
      "web developer",
      "software engineer",
      "software engineer",
      " es computer",
      "associate software engineer",
      "fullstack ios developer",
      "technical delivery manager project manager",
      "software architect",
      "software developer"
    ),
    job2 = c(
      "research scientist",
      "analytics analyst",
      " developer",
      " data ml engineer",
      "graduate teaching assistant",
      "software developer",
      "machine learning engineer",
      "akademischer mitarbeiter machine learning and analytics",
      "backend develope",
      "lead php developer",
      "php system analytic software specialist",
      "webcreater",
      "data engineer",
      "software engineer",
      "assistant network administrator",
      "frontend engineer",
      "application infrastructor lead",
      "software engineer",
      "application developer",
      "software developer"
    ),
    job3 = c(
      "data scientist",
      "machine learning engineer",
      "application developer associate manager",
      NA,
      "co founder cto",
      NA,
      NA,
      NA,
      NA,
      NA,
      "lead php sugarcrm developer",
      " php developer",
      "data analysing researcher ",
      NA,
      "application developer consultance",
      "manager l1 ui frontend ",
      " software architect",
      "software engineering manager solution architect",
      "software developer consultance",
      "ai developer"
    ),
    job4 = c(
      NA,
      NA,
      NA,
      NA,
      NA,
      NA,
      NA,
      NA,
      NA,
      NA,
      "software architect development lead",
      "team leader",
      NA,
      NA,
      " application development specialist",
      " associate experience technology",
      NA,
      " software developer",
      "fullstack developer productowner",
      NA
    )
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)

我们可以使用 purrr 包中的 map_dfc 来遍历我们数据集的每一列来检测类别,并按列绑定结果列。我使用 case_when 而不是 ifelse 以获得更好看的输出:

library(dplyr)
library(purrr)

# Classify every column of data_rel: map_dfc() applies the classifier to each
# column and binds the resulting vectors column-wise.
# case_when() uses the first condition that is TRUE for each element, so the
# order of the rules below is their priority; unmatched titles become NA.
data_rel %>%
  map_dfc(function(titles) {
    case_when(
      grepl("frontend|ui|ux", titles) ~ 1,
      grepl("backend", titles) ~ 2,
      grepl("fullstack", titles) ~ 3,
      grepl("entwickler|development|application|developer|software", titles) ~ 4,
      grepl("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning", titles) ~ 5,
      grepl("research|teaching|akademischer|researcher", titles) ~ 6,
      grepl("project|manager|product|consultant|consulting", titles) ~ 7,
      grepl("it|security|technical|tech", titles) ~ 8,
      grepl("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design", titles) ~ 9,
      grepl("founder|ceo|partner|chief|executive|cto", titles) ~ 10,
      TRUE ~ NA_real_
    )
  })

# A tibble: 20 x 4
    job1  job2  job3  job4
   <dbl> <dbl> <dbl> <dbl>
 1    NA     6     5    NA
 2     4     5     5    NA
 3     5     4     4    NA
 4     5     5    NA    NA
 5     4     6    10    NA
 6     4     4    NA    NA
 7     5     5    NA    NA
 8     4     5    NA    NA
 9     4     2    NA    NA
10     4     4    NA    NA
11     4     4     4     4
12     4    NA     4    NA
13     4     5     5    NA
14     4     4    NA    NA
15    NA    NA     4     4
16     4     1     1     8
17     3     4     4    NA
18     7     4     4     4
19     4     4     4     3
20     4     4     4    NA

或者使用 base R:

# Base R version: classify each column of data_rel with sapply().
cbind(sapply(data_rel, function(x) {
  # Apply the rules from lowest to highest priority; each later assignment
  # overrides the earlier ones where its pattern matches, so the rule with
  # the smallest category number wins when several patterns match.
  category <- ifelse(grepl("founder|ceo|partner|chief|executive|cto", x), 10, NA)
  category <- ifelse(grepl("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design", x), 9, category)
  category <- ifelse(grepl("it|security|technical|tech", x), 8, category)
  category <- ifelse(grepl("project|manager|product|consultant|consulting", x), 7, category)
  category <- ifelse(grepl("research|teaching|akademischer|researcher", x), 6, category)
  category <- ifelse(grepl("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning", x), 5, category)
  category <- ifelse(grepl("entwickler|development|application|developer|software", x), 4, category)
  category <- ifelse(grepl("fullstack", x), 3, category)
  category <- ifelse(grepl("backend", x), 2, category)
  category <- ifelse(grepl("frontend|ui|ux", x), 1, category)
  category
}))

你可以用 case_when 写一个函数 -

library(dplyr)

#' Map job titles to an integer category code
#'
#' Each title is tested against the keyword groups below in order, and the
#' code of the first group that matches is used.
#'
#' @param x Vector of job titles to classify.
#' @return Integer codes 1-10, one per element of `x`; `NA` where no keyword
#'   group matches.
change_category <- function(x) {
  # TRUE for each element of x that contains any of the given keywords.
  has <- function(keywords) grepl(keywords, x)

  case_when(
    has("frontend|ui|ux") ~ 1L,
    has("backend") ~ 2L,
    has("fullstack") ~ 3L,
    has("entwickler|development|application|developer|software") ~ 4L,
    has("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning") ~ 5L,
    has("research|teaching|akademischer|researcher") ~ 6L,
    has("project|manager|product|consultant|consulting") ~ 7L,
    has("it|security|technical|tech") ~ 8L,
    has("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design") ~ 9L,
    has("founder|ceo|partner|chief|executive|cto") ~ 10L
  )
}

然后用 across 将它应用到每一列 -

# Add one "<column>_category" column for every existing column by applying
# change_category() to each of them.
# `.cols` is spelled out as everything(): calling across() without `.cols`
# is deprecated as of dplyr 1.1.0.
data_rel %>%
  mutate(
    across(everything(), .fns = change_category, .names = "{col}_category")
  )

您可以将各职位类别的匹配模式存储在一个向量中,然后使用 sapply 配合 grepl 和 match 遍历它们,以获取每个职位名称对应的类别编号。


# Keyword patterns, one per category. The position of a pattern in this
# vector is its category number, so the order also sets match priority.
jobs <- c(
  "frontend|ui|ux",
  "backend",
  "fullstack",
  "entwickler|development|application|developer|software",
  "data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning",
  "research|teaching|akademischer|researcher",
  "project|manager|product|consultant|consulting",
  "it|security|technical|tech",
  "margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design",
  "founder|ceo|partner|chief|executive|cto"
)

# For every column of data_rel, find the index of the first pattern in `jobs`
# that matches each title.
sapply(data_rel, function(x) {
  # One logical column per pattern in `jobs`, one row per element of x.
  hits <- sapply(jobs, grepl, x)
  # Index of the first TRUE in each row; NA when no pattern matched.
  apply(hits, 1, function(row) match(TRUE, row))
})
#      job1 job2 job3 job4
# [1,]   NA    6    5   NA
# [2,]    4    5    5   NA
# [3,]    5    4    4   NA
# [4,]    5    5   NA   NA
# [5,]    4    6   10   NA
# [6,]    4    4   NA   NA
# [7,]    5    5   NA   NA
# [8,]    4    5   NA   NA
# [9,]    4    2   NA   NA
#[10,]    4    4   NA   NA
#[11,]    4    4    4    4
#[12,]    4   NA    4   NA
#[13,]    4    5    5   NA
#[14,]    4    4   NA   NA
#[15,]   NA   NA    4    4
#[16,]    4    1    1    8
#[17,]    3    4    4   NA
#[18,]    7    4    4    4
#[19,]    4    4    4    3
#[20,]    4    4    4   NA

扩展 @zx8754 的评论:您可以将代码放进一个函数中,使用 lapply 将其应用于每一列,最后使用 do.call 把各列重新组合成一个 data.frame。

# Map job titles to a numeric category code (1-10).
#
# Each title is tested against the keyword patterns below; when several
# patterns match, the one listed first (lowest code) wins.
#
# Args:
#   col: vector of job titles (NA entries are allowed).
#
# Returns:
#   A double vector of the same length as `col` holding the category code,
#   or NA where no pattern matches. The result is always of type double,
#   including for empty input or when nothing matches at all.
get_level <- function(col) {
  # Position in this vector == category code.
  # NOTE(review): "margketing" and "markeing" look like misspellings of
  # "marketing"; kept as-is so existing results do not change -- confirm.
  patterns <- c(
    "frontend|ui|ux",
    "backend",
    "fullstack",
    "entwickler|development|application|developer|software",
    "data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning",
    "research|teaching|akademischer|researcher",
    "project|manager|product|consultant|consulting",
    "it|security|technical|tech",
    "margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design",
    "founder|ceo|partner|chief|executive|cto"
  )

  level <- rep(NA_real_, length(col))
  # Walk from the lowest-priority pattern to the highest so that earlier
  # patterns overwrite later ones, i.e. the first matching pattern wins.
  for (code in rev(seq_along(patterns))) {
    level[grepl(patterns[code], col)] <- code
  }
  level
}

# Classify every column of data_rel with get_level(), bind the per-column
# results into one data frame, and name the columns category1, category2, ...
cats <- setNames(
  do.call(cbind.data.frame, lapply(data_rel, get_level)),
  paste0("category", seq_along(data_rel))
)