仅在 R (dplyr) 中满足特定条件的情况下,用另一列数据替换一列中的某些数据
Replacing some data from a column with another column data only where a certain condition is met in R (dplyr)
我有一个包含 30 多列和 10000 多行的扩展数据框。今天我想关注两栏:languages
和 languages2
:
languages languages2
Spanish NA
Spanish NA
Other (specify) French
Other (specify) German
Other (specify) Russian
English NA
Other (specify) Portuguese
English NA
(...)
这是我需要的:
languages
Spanish
Spanish
French
German
Russian
English
Portuguese
English
(...)
我正在使用来自 dplyr
的 mutate 函数寻找答案
# Overwrite `languages` with `languages2` only on rows whose `languages`
# value is exactly "Other (specify)"; every other row keeps its value.
# (The original snippet was missing the closing parenthesis of mutate().)
data %>%
  mutate(
    languages = if_else(languages == "Other (specify)", languages2, languages)
  )
对于更大的数据,您可能需要处理其他情况,例如避免用 NA
覆盖原有数据,从而保留 Other
值。此外,最好也处理第一列可能包含 Other()
或 Other, lang
的情况。您可以考虑使用正则表达式,或者先对第一列进行预处理。
library("tidyverse")

# Toy data: one regular value, one "Other" with a replacement available,
# and one "Other" whose replacement is missing.
dta <- tribble(
  ~lang1,    ~lang2,
  "Spanish", NA,
  "Other",   "English",
  "Other",   NA
)

# Take lang2 only when it is present AND lang1 starts with "Other";
# otherwise keep lang1, so an "Other" with no replacement is preserved.
dta %>%
  mutate(
    lang1 = case_when(
      !is.na(lang2) & grepl("^Other,*", lang1) ~ lang2,
      TRUE ~ lang1
    )
  )
另一种可能性(如果您希望根据第 2 列而不是第 1 列来判断):
library(dplyr)

# Sample data: `languages2` is filled in only where `languages` is
# "Other (specify)".
df <- data.frame(
  languages = c(
    "Spanish", "Spanish", "Other (specify)", "Other (specify)",
    "Other (specify)", "English", "Other (specify)", "English"
  ),
  languages2 = c(
    NA, NA, "French", "German", "Russian", NA, "Portuguese", NA
  )
)

# Decide from the second column: keep `languages` where `languages2` is
# missing, otherwise take `languages2`.
df %>%
  mutate(languages = if_else(is.na(languages2), languages, languages2))
languages languages2
1 Spanish <NA>
2 Spanish <NA>
3 French French
4 German German
5 Russian Russian
6 English <NA>
7 Portuguese Portuguese
8 English <NA>
使用dplyr
,我们可以用NA
替换Other (specify)
,然后使用coalesce
:
library(dplyr)

# na_if() blanks out "Other (specify)", then coalesce() fills each blank with
# the first non-missing candidate: languages2 if available, otherwise the
# original "Other (specify)" label, so a missing languages2 never produces NA.
df %>%
  mutate(
    languages = coalesce(
      na_if(languages, "Other (specify)"),
      languages2,
      "Other (specify)"
    )
  ) %>%
  select(languages)
输出
languages
1 Spanish
2 Spanish
3 French
4 German
5 Russian
6 English
7 Portuguese
8 English
一个 tidyverse
选项是使用 str_replace_all
将 Other (specify)
替换为 languages2
中的值。
library(tidyverse)

# The pattern is a regular expression, so the literal parentheses must be
# escaped. Inside an R string literal the backslash itself has to be doubled
# ("\\("); a single "\(" is rejected by the parser as an unrecognized escape.
df %>%
  mutate(
    languages = str_replace_all(languages, "Other \\(specify\\)", languages2)
  ) %>%
  select(languages)
数据
# Example data: eight rows, two character columns. `languages2` is filled in
# only on the rows where `languages` is "Other (specify)".
df <- data.frame(
  languages = c(
    "Spanish", "Spanish", "Other (specify)", "Other (specify)",
    "Other (specify)", "English", "Other (specify)", "English"
  ),
  languages2 = c(
    NA, NA, "French", "German", "Russian", NA, "Portuguese", NA
  ),
  stringsAsFactors = FALSE
)
基准
但是,如果您有大量数据并且需要更快的速度,那么您可以考虑 base R,它比 dplyr
或 data.table
更快。
# Time each proposed solution on `df`, 1000 evaluations per expression.
# Entries are named after the approach they reproduce.
bm <- microbenchmark::microbenchmark(
  # case_when with a regex on `languages` plus an NA guard on `languages2`.
  Konrad = mutate(
    df,
    languages = case_when(
      grepl("^Other,*", languages) & !is.na(languages2) ~ languages2,
      TRUE ~ languages
    )
  ),
  # if_else keyed on the exact "Other (specify)" label.
  langtang = df %>%
    mutate(
      languages = if_else(languages == "Other (specify)", languages2, languages)
    ),
  # if_else keyed on whether `languages2` is present.
  valentina = df %>%
    mutate(languages = if_else(!is.na(languages2), languages2, languages)),
  # Regex replacement; the backslashes are doubled because "\(" is not a
  # valid escape in an R string literal.
  andrew_stringr = df %>%
    mutate(
      languages = str_replace_all(languages, "Other \\(specify\\)", languages2)
    ),
  # na_if + coalesce, falling back to the original label.
  andrew_coalesce = df %>%
    mutate(
      languages = coalesce(
        na_if(languages, "Other (specify)"),
        languages2,
        "Other (specify)"
      )
    ),
  # Base R subset-assignment on a copy of df.
  andrew_baseR = {
    df1 <- df
    df1[df1$languages == "Other (specify)", "languages"] <-
      df1[df1$languages == "Other (specify)", "languages2"]
  },
  # Base R ifelse() evaluated inside with().
  andrew_baseR_with = {
    df2 <- df
    df2$languages <- with(
      df2,
      ifelse(languages == "Other (specify)", languages2, languages)
    )
  },
  # data.table update by reference; namespace-qualified so the conversion
  # does not depend on data.table being attached.
  andrew_datatable = {
    dt <- data.table::as.data.table(df)
    dt[languages == "Other (specify)", languages := languages2]
  },
  times = 1000
)
我有一个包含 30 多列和 10000 多行的扩展数据框。今天我想关注两栏:languages
和 languages2
:
languages languages2
Spanish NA
Spanish NA
Other (specify) French
Other (specify) German
Other (specify) Russian
English NA
Other (specify) Portuguese
English NA
(...)
这是我需要的:
languages
Spanish
Spanish
French
German
Russian
English
Portuguese
English
(...)
我正在使用来自 dplyr
的 mutate 函数寻找答案
# Overwrite `languages` with `languages2` only on rows whose `languages`
# value is exactly "Other (specify)"; every other row keeps its value.
# (The original snippet was missing the closing parenthesis of mutate().)
data %>%
  mutate(
    languages = if_else(languages == "Other (specify)", languages2, languages)
  )
对于更大的数据,您可能需要处理其他情况,例如避免用 NA
覆盖原有数据,从而保留 Other
值。此外,最好也处理第一列可能包含 Other()
或 Other, lang
的情况。您可以考虑使用正则表达式,或者先对第一列进行预处理。
library("tidyverse")

# Toy data: one regular value, one "Other" with a replacement available,
# and one "Other" whose replacement is missing.
dta <- tribble(
  ~lang1,    ~lang2,
  "Spanish", NA,
  "Other",   "English",
  "Other",   NA
)

# Take lang2 only when it is present AND lang1 starts with "Other";
# otherwise keep lang1, so an "Other" with no replacement is preserved.
dta %>%
  mutate(
    lang1 = case_when(
      !is.na(lang2) & grepl("^Other,*", lang1) ~ lang2,
      TRUE ~ lang1
    )
  )
另一种可能性(如果您希望根据第 2 列而不是第 1 列来判断):
library(dplyr)

# Sample data: `languages2` is filled in only where `languages` is
# "Other (specify)".
df <- data.frame(
  languages = c(
    "Spanish", "Spanish", "Other (specify)", "Other (specify)",
    "Other (specify)", "English", "Other (specify)", "English"
  ),
  languages2 = c(
    NA, NA, "French", "German", "Russian", NA, "Portuguese", NA
  )
)

# Decide from the second column: keep `languages` where `languages2` is
# missing, otherwise take `languages2`.
df %>%
  mutate(languages = if_else(is.na(languages2), languages, languages2))
languages languages2
1 Spanish <NA>
2 Spanish <NA>
3 French French
4 German German
5 Russian Russian
6 English <NA>
7 Portuguese Portuguese
8 English <NA>
使用dplyr
,我们可以用NA
替换Other (specify)
,然后使用coalesce
:
library(dplyr)

# na_if() blanks out "Other (specify)", then coalesce() fills each blank with
# the first non-missing candidate: languages2 if available, otherwise the
# original "Other (specify)" label, so a missing languages2 never produces NA.
df %>%
  mutate(
    languages = coalesce(
      na_if(languages, "Other (specify)"),
      languages2,
      "Other (specify)"
    )
  ) %>%
  select(languages)
输出
languages
1 Spanish
2 Spanish
3 French
4 German
5 Russian
6 English
7 Portuguese
8 English
一个 tidyverse
选项是使用 str_replace_all
将 Other (specify)
替换为 languages2
中的值。
library(tidyverse)

# The pattern is a regular expression, so the literal parentheses must be
# escaped. Inside an R string literal the backslash itself has to be doubled
# ("\\("); a single "\(" is rejected by the parser as an unrecognized escape.
df %>%
  mutate(
    languages = str_replace_all(languages, "Other \\(specify\\)", languages2)
  ) %>%
  select(languages)
数据
# Example data: eight rows, two character columns. `languages2` is filled in
# only on the rows where `languages` is "Other (specify)".
df <- data.frame(
  languages = c(
    "Spanish", "Spanish", "Other (specify)", "Other (specify)",
    "Other (specify)", "English", "Other (specify)", "English"
  ),
  languages2 = c(
    NA, NA, "French", "German", "Russian", NA, "Portuguese", NA
  ),
  stringsAsFactors = FALSE
)
基准
但是,如果您有大量数据并且需要更快的速度,那么您可以考虑 base R,它比 dplyr
或 data.table
更快。
# Time each proposed solution on `df`, 1000 evaluations per expression.
# Entries are named after the approach they reproduce.
bm <- microbenchmark::microbenchmark(
  # case_when with a regex on `languages` plus an NA guard on `languages2`.
  Konrad = mutate(
    df,
    languages = case_when(
      grepl("^Other,*", languages) & !is.na(languages2) ~ languages2,
      TRUE ~ languages
    )
  ),
  # if_else keyed on the exact "Other (specify)" label.
  langtang = df %>%
    mutate(
      languages = if_else(languages == "Other (specify)", languages2, languages)
    ),
  # if_else keyed on whether `languages2` is present.
  valentina = df %>%
    mutate(languages = if_else(!is.na(languages2), languages2, languages)),
  # Regex replacement; the backslashes are doubled because "\(" is not a
  # valid escape in an R string literal.
  andrew_stringr = df %>%
    mutate(
      languages = str_replace_all(languages, "Other \\(specify\\)", languages2)
    ),
  # na_if + coalesce, falling back to the original label.
  andrew_coalesce = df %>%
    mutate(
      languages = coalesce(
        na_if(languages, "Other (specify)"),
        languages2,
        "Other (specify)"
      )
    ),
  # Base R subset-assignment on a copy of df.
  andrew_baseR = {
    df1 <- df
    df1[df1$languages == "Other (specify)", "languages"] <-
      df1[df1$languages == "Other (specify)", "languages2"]
  },
  # Base R ifelse() evaluated inside with().
  andrew_baseR_with = {
    df2 <- df
    df2$languages <- with(
      df2,
      ifelse(languages == "Other (specify)", languages2, languages)
    )
  },
  # data.table update by reference; namespace-qualified so the conversion
  # does not depend on data.table being attached.
  andrew_datatable = {
    dt <- data.table::as.data.table(df)
    dt[languages == "Other (specify)", languages := languages2]
  },
  times = 1000
)