将 ifelse 应用于多列并创建新列
apply ifelse to multiple columns and create new column
我有一个包含 4 列的数据框,每一列都是职位名称。对于每一列,我想创建一个新列(category1、category2、category3、category4),根据职位名称中包含的关键词进行分类(例如,如果职位名称包含 "frontend"、"ui" 或 "ux",那么 category1 列应该是 1)。我已经用下面的代码逐列手动完成了分类,但我想一次性对全部 4 列进行分类。感谢您的帮助!
# Manual classification of a single column: the first pattern that matches a
# title decides its category code (1-10); titles matching no pattern get NA.
# NOTE(review): the titles are read from `data$job4_clean`, while the result is
# stored in `data_rel$category1` -- confirm this pairing is intended.
data_rel$category1 <- local({
  titles <- data$job4_clean
  ifelse(grepl("frontend|ui|ux", titles), 1,
  ifelse(grepl("backend", titles), 2,
  ifelse(grepl("fullstack", titles), 3,
  ifelse(grepl("entwickler|development|application|developer|software", titles), 4,
  ifelse(grepl("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning", titles), 5,
  ifelse(grepl("research|teaching|akademischer|researcher", titles), 6,
  ifelse(grepl("project|manager|product|consultant|consulting", titles), 7,
  ifelse(grepl("it|security|technical|tech", titles), 8,
  ifelse(grepl("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design", titles), 9,
  ifelse(grepl("founder|ceo|partner|chief|executive|cto", titles), 10,
         NA))))))))))
})
# Example data: 20 rows and four character columns (job1..job4) of job titles.
# Missing titles are NA; some titles carry leading or trailing spaces.
data_rel <- structure(
  list(
    job1 = c(
      "phd fellow", "java developer intern", "optical engineer",
      " dwh bi engineer", " software engineer", "software developer",
      "data engineer", "application software engineer", "software developer",
      " web developer", "web developer", "web developer",
      "software engineer", "software engineer", " es computer",
      "associate software engineer", "fullstack ios developer",
      "technical delivery manager project manager",
      "software architect", "software developer"
    ),
    job2 = c(
      "research scientist", "analytics analyst", " developer",
      " data ml engineer", "graduate teaching assistant", "software developer",
      "machine learning engineer",
      "akademischer mitarbeiter machine learning and analytics",
      "backend develope", "lead php developer",
      "php system analytic software specialist", "webcreater",
      "data engineer", "software engineer", "assistant network administrator",
      "frontend engineer", "application infrastructor lead",
      "software engineer", "application developer", "software developer"
    ),
    job3 = c(
      "data scientist", "machine learning engineer",
      "application developer associate manager", NA, "co founder cto",
      NA, NA, NA, NA, NA,
      "lead php sugarcrm developer", " php developer",
      "data analysing researcher ", NA, "application developer consultance",
      "manager l1 ui frontend ", " software architect",
      "software engineering manager solution architect",
      "software developer consultance", "ai developer"
    ),
    job4 = c(
      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
      "software architect development lead", "team leader", NA, NA,
      " application development specialist",
      " associate experience technology", NA, " software developer",
      "fullstack developer productowner", NA
    )
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)
我们可以使用 purrr
包中的 map_dfc
来遍历我们数据集的每一列来检测类别,并按列绑定结果列。我使用 case_when
而不是 ifelse
以获得更好看的输出:
library(dplyr)
library(purrr)
# Classify every column of `data_rel`; map_dfc() column-binds the per-column
# results into a single tibble.
map_dfc(data_rel, function(titles) {
  case_when(
    grepl("frontend|ui|ux", titles) ~ 1,
    grepl("backend", titles) ~ 2,
    grepl("fullstack", titles) ~ 3,
    grepl("entwickler|development|application|developer|software", titles) ~ 4,
    grepl("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning", titles) ~ 5,
    grepl("research|teaching|akademischer|researcher", titles) ~ 6,
    grepl("project|manager|product|consultant|consulting", titles) ~ 7,
    grepl("it|security|technical|tech", titles) ~ 8,
    grepl("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design", titles) ~ 9,
    grepl("founder|ceo|partner|chief|executive|cto", titles) ~ 10,
    # Fallback when no pattern matched.
    TRUE ~ NA_real_
  )
})
# A tibble: 20 x 4
job1 job2 job3 job4
<dbl> <dbl> <dbl> <dbl>
1 NA 6 5 NA
2 4 5 5 NA
3 5 4 4 NA
4 5 5 NA NA
5 4 6 10 NA
6 4 4 NA NA
7 5 5 NA NA
8 4 5 NA NA
9 4 2 NA NA
10 4 4 NA NA
11 4 4 4 4
12 4 NA 4 NA
13 4 5 5 NA
14 4 4 NA NA
15 NA NA 4 4
16 4 1 1 8
17 3 4 4 NA
18 7 4 4 4
19 4 4 4 3
20 4 4 4 NA
或者使用基础 R(base R):
# Base R variant: sapply() runs the nested ifelse() chain over each column of
# `data_rel`. The first pattern that matches a title decides its code (1-10);
# titles matching no pattern get NA.
cbind(
  sapply(data_rel, function(titles) {
    ifelse(grepl("frontend|ui|ux", titles), 1,
    ifelse(grepl("backend", titles), 2,
    ifelse(grepl("fullstack", titles), 3,
    ifelse(grepl("entwickler|development|application|developer|software", titles), 4,
    ifelse(grepl("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning", titles), 5,
    ifelse(grepl("research|teaching|akademischer|researcher", titles), 6,
    ifelse(grepl("project|manager|product|consultant|consulting", titles), 7,
    ifelse(grepl("it|security|technical|tech", titles), 8,
    ifelse(grepl("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design", titles), 9,
    ifelse(grepl("founder|ceo|partner|chief|executive|cto", titles), 10,
           NA))))))))))
  })
)
你可以用 case_when 写一个函数:
library(dplyr)
# Map a vector of job titles to integer category codes (1L-10L).
#
# x: vector of job titles.
# Returns an integer vector as long as `x`. Conditions are tried top to
# bottom and the first one that holds wins; titles matching no pattern get NA.
change_category <- function(x) {
  # Shorthand: does each title contain a match for `pattern`?
  has <- function(pattern) grepl(pattern, x)

  case_when(
    has("frontend|ui|ux") ~ 1L,
    has("backend") ~ 2L,
    has("fullstack") ~ 3L,
    has("entwickler|development|application|developer|software") ~ 4L,
    has("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning") ~ 5L,
    has("research|teaching|akademischer|researcher") ~ 6L,
    has("project|manager|product|consultant|consulting") ~ 7L,
    has("it|security|technical|tech") ~ 8L,
    has("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design") ~ 9L,
    has("founder|ceo|partner|chief|executive|cto") ~ 10L
  )
}
然后用 across 把它应用到各列:
# Apply change_category() to every column and store each result under the
# name `<column>_category`. `.cols` is passed explicitly because calling
# across() without it is deprecated since dplyr 1.1.0, and `{.col}` is the
# documented placeholder for the current column name.
data_rel %>%
  mutate(across(everything(), change_category, .names = "{.col}_category"))
您可以把各职位类别的匹配模式存储在一个向量中,然后使用 sapply 配合 grepl 和 match 遍历它们,以获取职位类别编号。
# One regular expression per job category. The position in the vector is the
# category number (1-10); when several patterns match, the earliest one wins.
jobs <- c(
  "frontend|ui|ux",
  "backend",
  "fullstack",
  "entwickler|development|application|developer|software",
  "data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning",
  "research|teaching|akademischer|researcher",
  "project|manager|product|consultant|consulting",
  "it|security|technical|tech",
  "margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design",
  "founder|ceo|partner|chief|executive|cto"
)

# For every column, find the index of the first pattern in `jobs` that matches
# each title (NA when none does) and bind the columns into an integer matrix.
# The previous sapply()/apply() combination failed on single-row data because
# the inner sapply() then simplifies to a plain vector without dimensions.
do.call(cbind, lapply(data_rel, function(x) {
  idx <- rep(NA_integer_, length(x))
  # Go from the last pattern to the first so that earlier patterns overwrite
  # later ones, i.e. the first matching pattern is the one that is kept.
  for (i in rev(seq_along(jobs))) {
    idx[grepl(jobs[i], x)] <- i
  }
  idx
}))
# job1 job2 job3 job4
# [1,] NA 6 5 NA
# [2,] 4 5 5 NA
# [3,] 5 4 4 NA
# [4,] 5 5 NA NA
# [5,] 4 6 10 NA
# [6,] 4 4 NA NA
# [7,] 5 5 NA NA
# [8,] 4 5 NA NA
# [9,] 4 2 NA NA
#[10,] 4 4 NA NA
#[11,] 4 4 4 4
#[12,] 4 NA 4 NA
#[13,] 4 5 5 NA
#[14,] 4 4 NA NA
#[15,] NA NA 4 4
#[16,] 4 1 1 8
#[17,] 3 4 4 NA
#[18,] 7 4 4 4
#[19,] 4 4 4 3
#[20,] 4 4 4 NA
在 @zx8754 的评论基础上展开:您可以把代码放进一个函数,用 lapply 应用到每一列,最后用 do.call 把各列重新组合成 data.frame。
# Map a vector of job titles to numeric category codes (1-10).
#
# col: vector of job titles (NA allowed).
# Returns a double vector as long as `col`. Each title gets the number of the
# first pattern below that it matches; titles matching no pattern, and NA
# titles, get NA. The result is always of type double, also when nothing
# matches or `col` is empty (the nested ifelse() version returned a logical
# vector in those cases).
get_level <- function(col) {
  # Patterns in priority order; the position is the category code.
  patterns <- c(
    "frontend|ui|ux",
    "backend",
    "fullstack",
    "entwickler|development|application|developer|software",
    "data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning",
    "research|teaching|akademischer|researcher",
    "project|manager|product|consultant|consulting",
    "it|security|technical|tech",
    "margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design",
    "founder|ceo|partner|chief|executive|cto"
  )

  level <- rep(NA_real_, length(col))
  # Go from the last pattern to the first so that earlier patterns overwrite
  # later ones, i.e. the first matching pattern is the one that is kept.
  for (i in rev(seq_along(patterns))) {
    level[grepl(patterns[i], col)] <- i
  }
  level
}
# Run get_level() on every column, bind the results into one data frame and
# name the columns category1, category2, ...
cats <- setNames(
  do.call(cbind.data.frame, lapply(data_rel, get_level)),
  paste0("category", seq_along(data_rel))
)
我有一个包含 4 列的数据框,每一列都是职位名称。对于每一列,我想创建一个新列(category1、category2、category3、category4),根据职位名称中包含的关键词进行分类(例如,如果职位名称包含 "frontend"、"ui" 或 "ux",那么 category1 列应该是 1)。我已经用下面的代码逐列手动完成了分类,但我想一次性对全部 4 列进行分类。感谢您的帮助!
# Manual classification of a single column: the first pattern that matches a
# title decides its category code (1-10); titles matching no pattern get NA.
# NOTE(review): the titles are read from `data$job4_clean`, while the result is
# stored in `data_rel$category1` -- confirm this pairing is intended.
data_rel$category1 <- local({
  titles <- data$job4_clean
  ifelse(grepl("frontend|ui|ux", titles), 1,
  ifelse(grepl("backend", titles), 2,
  ifelse(grepl("fullstack", titles), 3,
  ifelse(grepl("entwickler|development|application|developer|software", titles), 4,
  ifelse(grepl("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning", titles), 5,
  ifelse(grepl("research|teaching|akademischer|researcher", titles), 6,
  ifelse(grepl("project|manager|product|consultant|consulting", titles), 7,
  ifelse(grepl("it|security|technical|tech", titles), 8,
  ifelse(grepl("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design", titles), 9,
  ifelse(grepl("founder|ceo|partner|chief|executive|cto", titles), 10,
         NA))))))))))
})
# Example data: 20 rows and four character columns (job1..job4) of job titles.
# Missing titles are NA; some titles carry leading or trailing spaces.
data_rel <- structure(
  list(
    job1 = c(
      "phd fellow", "java developer intern", "optical engineer",
      " dwh bi engineer", " software engineer", "software developer",
      "data engineer", "application software engineer", "software developer",
      " web developer", "web developer", "web developer",
      "software engineer", "software engineer", " es computer",
      "associate software engineer", "fullstack ios developer",
      "technical delivery manager project manager",
      "software architect", "software developer"
    ),
    job2 = c(
      "research scientist", "analytics analyst", " developer",
      " data ml engineer", "graduate teaching assistant", "software developer",
      "machine learning engineer",
      "akademischer mitarbeiter machine learning and analytics",
      "backend develope", "lead php developer",
      "php system analytic software specialist", "webcreater",
      "data engineer", "software engineer", "assistant network administrator",
      "frontend engineer", "application infrastructor lead",
      "software engineer", "application developer", "software developer"
    ),
    job3 = c(
      "data scientist", "machine learning engineer",
      "application developer associate manager", NA, "co founder cto",
      NA, NA, NA, NA, NA,
      "lead php sugarcrm developer", " php developer",
      "data analysing researcher ", NA, "application developer consultance",
      "manager l1 ui frontend ", " software architect",
      "software engineering manager solution architect",
      "software developer consultance", "ai developer"
    ),
    job4 = c(
      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
      "software architect development lead", "team leader", NA, NA,
      " application development specialist",
      " associate experience technology", NA, " software developer",
      "fullstack developer productowner", NA
    )
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)
我们可以使用 purrr
包中的 map_dfc
来遍历我们数据集的每一列来检测类别,并按列绑定结果列。我使用 case_when
而不是 ifelse
以获得更好看的输出:
library(dplyr)
library(purrr)
# Classify every column of `data_rel`; map_dfc() column-binds the per-column
# results into a single tibble.
map_dfc(data_rel, function(titles) {
  case_when(
    grepl("frontend|ui|ux", titles) ~ 1,
    grepl("backend", titles) ~ 2,
    grepl("fullstack", titles) ~ 3,
    grepl("entwickler|development|application|developer|software", titles) ~ 4,
    grepl("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning", titles) ~ 5,
    grepl("research|teaching|akademischer|researcher", titles) ~ 6,
    grepl("project|manager|product|consultant|consulting", titles) ~ 7,
    grepl("it|security|technical|tech", titles) ~ 8,
    grepl("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design", titles) ~ 9,
    grepl("founder|ceo|partner|chief|executive|cto", titles) ~ 10,
    # Fallback when no pattern matched.
    TRUE ~ NA_real_
  )
})
# A tibble: 20 x 4
job1 job2 job3 job4
<dbl> <dbl> <dbl> <dbl>
1 NA 6 5 NA
2 4 5 5 NA
3 5 4 4 NA
4 5 5 NA NA
5 4 6 10 NA
6 4 4 NA NA
7 5 5 NA NA
8 4 5 NA NA
9 4 2 NA NA
10 4 4 NA NA
11 4 4 4 4
12 4 NA 4 NA
13 4 5 5 NA
14 4 4 NA NA
15 NA NA 4 4
16 4 1 1 8
17 3 4 4 NA
18 7 4 4 4
19 4 4 4 3
20 4 4 4 NA
或者使用基础 R(base R):
# Base R variant: sapply() runs the nested ifelse() chain over each column of
# `data_rel`. The first pattern that matches a title decides its code (1-10);
# titles matching no pattern get NA.
cbind(
  sapply(data_rel, function(titles) {
    ifelse(grepl("frontend|ui|ux", titles), 1,
    ifelse(grepl("backend", titles), 2,
    ifelse(grepl("fullstack", titles), 3,
    ifelse(grepl("entwickler|development|application|developer|software", titles), 4,
    ifelse(grepl("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning", titles), 5,
    ifelse(grepl("research|teaching|akademischer|researcher", titles), 6,
    ifelse(grepl("project|manager|product|consultant|consulting", titles), 7,
    ifelse(grepl("it|security|technical|tech", titles), 8,
    ifelse(grepl("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design", titles), 9,
    ifelse(grepl("founder|ceo|partner|chief|executive|cto", titles), 10,
           NA))))))))))
  })
)
你可以用 case_when 写一个函数:
library(dplyr)
# Map a vector of job titles to integer category codes (1L-10L).
#
# x: vector of job titles.
# Returns an integer vector as long as `x`. Conditions are tried top to
# bottom and the first one that holds wins; titles matching no pattern get NA.
change_category <- function(x) {
  # Shorthand: does each title contain a match for `pattern`?
  has <- function(pattern) grepl(pattern, x)

  case_when(
    has("frontend|ui|ux") ~ 1L,
    has("backend") ~ 2L,
    has("fullstack") ~ 3L,
    has("entwickler|development|application|developer|software") ~ 4L,
    has("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning") ~ 5L,
    has("research|teaching|akademischer|researcher") ~ 6L,
    has("project|manager|product|consultant|consulting") ~ 7L,
    has("it|security|technical|tech") ~ 8L,
    has("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design") ~ 9L,
    has("founder|ceo|partner|chief|executive|cto") ~ 10L
  )
}
然后用 across 把它应用到各列:
# Apply change_category() to every column and store each result under the
# name `<column>_category`. `.cols` is passed explicitly because calling
# across() without it is deprecated since dplyr 1.1.0, and `{.col}` is the
# documented placeholder for the current column name.
data_rel %>%
  mutate(across(everything(), change_category, .names = "{.col}_category"))
您可以把各职位类别的匹配模式存储在一个向量中,然后使用 sapply 配合 grepl 和 match 遍历它们,以获取职位类别编号。
# One regular expression per job category. The position in the vector is the
# category number (1-10); when several patterns match, the earliest one wins.
jobs <- c(
  "frontend|ui|ux",
  "backend",
  "fullstack",
  "entwickler|development|application|developer|software",
  "data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning",
  "research|teaching|akademischer|researcher",
  "project|manager|product|consultant|consulting",
  "it|security|technical|tech",
  "margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design",
  "founder|ceo|partner|chief|executive|cto"
)

# For every column, find the index of the first pattern in `jobs` that matches
# each title (NA when none does) and bind the columns into an integer matrix.
# The previous sapply()/apply() combination failed on single-row data because
# the inner sapply() then simplifies to a plain vector without dimensions.
do.call(cbind, lapply(data_rel, function(x) {
  idx <- rep(NA_integer_, length(x))
  # Go from the last pattern to the first so that earlier patterns overwrite
  # later ones, i.e. the first matching pattern is the one that is kept.
  for (i in rev(seq_along(jobs))) {
    idx[grepl(jobs[i], x)] <- i
  }
  idx
}))
# job1 job2 job3 job4
# [1,] NA 6 5 NA
# [2,] 4 5 5 NA
# [3,] 5 4 4 NA
# [4,] 5 5 NA NA
# [5,] 4 6 10 NA
# [6,] 4 4 NA NA
# [7,] 5 5 NA NA
# [8,] 4 5 NA NA
# [9,] 4 2 NA NA
#[10,] 4 4 NA NA
#[11,] 4 4 4 4
#[12,] 4 NA 4 NA
#[13,] 4 5 5 NA
#[14,] 4 4 NA NA
#[15,] NA NA 4 4
#[16,] 4 1 1 8
#[17,] 3 4 4 NA
#[18,] 7 4 4 4
#[19,] 4 4 4 3
#[20,] 4 4 4 NA
在 @zx8754 的评论基础上展开:您可以把代码放进一个函数,用 lapply 应用到每一列,最后用 do.call 把各列重新组合成 data.frame。
# Map a vector of job titles to numeric category codes (1-10).
#
# col: vector of job titles (NA allowed).
# Returns a double vector as long as `col`. Each title gets the number of the
# first pattern below that it matches; titles matching no pattern, and NA
# titles, get NA. The result is always of type double, also when nothing
# matches or `col` is empty (the nested ifelse() version returned a logical
# vector in those cases).
get_level <- function(col) {
  # Patterns in priority order; the position is the category code.
  patterns <- c(
    "frontend|ui|ux",
    "backend",
    "fullstack",
    "entwickler|development|application|developer|software",
    "data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning",
    "research|teaching|akademischer|researcher",
    "project|manager|product|consultant|consulting",
    "it|security|technical|tech",
    "margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design",
    "founder|ceo|partner|chief|executive|cto"
  )

  level <- rep(NA_real_, length(col))
  # Go from the last pattern to the first so that earlier patterns overwrite
  # later ones, i.e. the first matching pattern is the one that is kept.
  for (i in rev(seq_along(patterns))) {
    level[grepl(patterns[i], col)] <- i
  }
  level
}
# Run get_level() on every column, bind the results into one data frame and
# name the columns category1, category2, ...
cats <- setNames(
  do.call(cbind.data.frame, lapply(data_rel, get_level)),
  paste0("category", seq_along(data_rel))
)