仅过滤 R 中列中的特定字符串
Filter only specific strings from a column in R
我有一个数据集,其中一列的值以逗号分隔。我需要解析此列中的每个值并仅保留特定值并删除其他值。
我拥有的代码和数据是:
# Sample data: a 4-row data frame with one factor column of comma-separated regions.
myDf <- structure(list(GeogPreferences = structure(1:4, .Label = c("Central and East Europe, Europe, North America, West Europe, US",
"Europe, North America, West Europe, US", "Global, North America",
"Northeast, Southeast, West, US"), class = "factor")), .Names = "GeogPreferences", class = "data.frame", row.names = c(NA,
-4L))
# The only region names that should be kept.
regionInterest <- c("Americas", "North America", "US", "Northeast","Southeast","West","Midwest","Southwest")
# Attempted per-row filter (this is the code that fails).
k<-lapply(as.character(myDf$GeogPreferences),function(x) {
# Split the row on commas and strip surrounding whitespace from each token.
z<-trimws(unlist(strsplit(x,split = ",")))
# NOTE: ifelse() returns a vector as long as its test, and the "no" branch
# returns z itself, so no token is ever dropped here.
z <- ifelse((z %in% regionInterest), z[z %in% regionInterest], z)
})
# unlist(k) has one element per token (15 in total), not one per row (4),
# which is why this assignment raises the error quoted below.
myDf$GeogPreferences<-unlist(k)
这是我收到的错误:
Error in `$<-.data.frame`(`*tmp*`, "GeogPreferences", value = c("Central and East Europe",
: replacement has 15 rows, data has 4
我的数据集如下所示:
GeogPreferences
1 Central and East Europe, Europe, North America, West Europe, US
2 Europe, North America, West Europe, US
3 Global, North America
4 Northeast, Southeast, West, US
如果该列包含 regionInterest
中的任何字符串,我想保留该字符串,否则我想删除它。
我期望的输出是:
GeogPreferences
1 North America, US
2 North America, US
3 North America
4 Northeast, Southeast, West, US
有人可以帮我解决我做错的事吗?谢谢!
您收到的错误是由于 strsplit
创建的行多于输入的 df。同样在您的 ifelse
声明中,您在 FALSE
上返回 z 所以它没有按照您的预期进行。
这里有 tidyr
/dplyr
解决您问题的方法。
# Split each row into one row per region, trim whitespace, then collapse the
# regions of interest back into one comma-separated string per original row.
myDf %>%
  mutate(id = row_number()) %>%
  separate_rows(GeogPreferences, sep = ",") %>%
  mutate(GeogPreferences = trimws(GeogPreferences)) %>%
  group_by(id) %>%
  # Subset inside summarize() instead of calling filter() first: a row without
  # any region of interest then yields "" rather than vanishing from the
  # result, so the output always has one row per input row.
  summarize(
    GeogPreferences = toString(
      GeogPreferences[GeogPreferences %in% c(
        "Americas", "North America", "US", "Northeast",
        "Southeast", "West", "Midwest", "Southwest"
      )]
    )
  ) %>%
  select(-id)
# A tibble: 4 × 1
GeogPreferences
<chr>
1 North America, US
2 North America, US
3 North America
4 Northeast, Southeast, West, US
您可能应该先拆分数据,然后再对拆分后的数据取子集。
这既会提高效率(因为 strsplit
是矢量化的),而且每次拆分得到的向量长度也就无关紧要了。此外,trimws
并不需要,它只会使您的代码效率低下。相反,在 ", "
上拆分,同时指定 fixed = TRUE
。这将使 strsplit
的速度提高大约 10 倍,因为它不会使用正则表达式来拆分。
以下方法仅使用 base R:
# Split every row on the literal ", " in a single vectorised strsplit() call,
# keep only the tokens listed in regionInterest, and stack the per-row strings
# into a one-column character matrix
# (you can use `rbind.data.frame` instead if you don't want a matrix).
do.call(
  rbind,
  lapply(
    strsplit(as.character(myDf$GeogPreferences), ", ", fixed = TRUE),
    function(tokens) toString(tokens[is.element(tokens, regionInterest)])
  )
)
# [,1]
# [1,] "North America, US"
# [2,] "North America, US"
# [3,] "North America"
# [4,] "Northeast, Southeast, West, US"
不过,上面的解决方案(类似于你自己的)仍然是一个按行的解决方案。相反,我们可以尝试通过按列操作来达到相同的效果。所谓 "columnwise"(按列),我的意思是:如果我们对转置后的拆分结果进行操作,迭代次数将是 myDf$GeogPreferences
中最长句子的大小(即我们用于拆分的逗号数量),这应该明显小于数据中的行数。
下面是使用 data.table::tstrsplit 的示例:
# Column-wise variant: tstrsplit() returns the transposed split, i.e. one
# vector per token position (short rows are padded with NA).
tmp <- data.table::tstrsplit(myDf$GeogPreferences, ", ", fixed = TRUE)
# Blank out every token that is not of interest (NA padding included), then
# paste the positions back together row by row.
# An empty-string placeholder is used instead of NA so that the clean-up below
# can never touch real text such as a region name ending in "NA".
res <- do.call(paste,
               c(sep = ", ",
                 lapply(tmp, function(x) replace(x, !x %in% regionInterest, ""))))
# Collapse runs of separators left by blanked tokens, then drop a leading or
# trailing separator; rows with no match end up as "".
gsub("^, |, $", "", gsub("(, )+", ", ", res))
# [1] "North America, US" "North America, US" "North America" "Northeast, Southeast, West, US"
这是 10 万行数据集的简单基准测试
bigDF <- myDf[sample(nrow(myDf), 1e5, replace = TRUE),, drop = FALSE]
library(dplyr)
library(tidyr)
library(data.table)
# Benchmark candidate: dplyr/tidyr implementation.
#
# x: data frame with a comma-separated `GeogPreferences` column.
# Returns a one-column tibble with one row per input row, holding the regions
# of interest joined by ", " ("" when a row contains none of them).
tidyverse <- function(x) {
  regions_of_interest <- c(
    "Americas", "North America", "US", "Northeast",
    "Southeast", "West", "Midwest", "Southwest"
  )
  x %>%
    mutate(id = row_number()) %>%
    separate_rows(GeogPreferences, sep = ",") %>%
    mutate(GeogPreferences = trimws(GeogPreferences)) %>%
    group_by(id) %>%
    # Subset inside summarize() rather than filter() beforehand, so a row
    # without any match is kept (as "") instead of being dropped.
    summarize(
      GeogPreferences = toString(
        GeogPreferences[GeogPreferences %in% regions_of_interest]
      )
    ) %>%
    select(-id)
}
# Benchmark candidate: row-by-row base R implementation.
#
# x: data frame with a comma-separated `GeogPreferences` column.
# Relies on the global `regionInterest` character vector.
# Returns a character vector with one element per row of `x`, holding the
# regions of interest joined by ", " ("" when a row contains none of them).
MF <- function(x) {
  k <- lapply(as.character(x$GeogPreferences), function(row) {
    z <- trimws(unlist(strsplit(row, split = ",")))
    z[z %in% regionInterest]
  })
  # vapply() guarantees a character vector even for zero rows, where sapply()
  # would silently return an empty list.
  vapply(k, paste, character(1), collapse = ", ")
}
# Benchmark candidate: base R, one vectorised strsplit() on the literal ", ".
#
# x: data frame with a comma-separated `GeogPreferences` column.
# Relies on the global `regionInterest` character vector.
# Returns a one-column character matrix with one row per row of `x`.
DA1 <- function(x) {
  tokens_by_row <- strsplit(as.character(x$GeogPreferences), ", ", fixed = TRUE)
  kept_by_row <- lapply(tokens_by_row, function(tokens) {
    toString(tokens[tokens %in% regionInterest])
  })
  do.call(rbind, kept_by_row)
}
# Benchmark candidate: column-wise implementation using data.table::tstrsplit.
#
# x: data frame with a comma-separated `GeogPreferences` column.
# Relies on the global `regionInterest` character vector.
# Returns a character vector with one element per row of `x`, holding the
# regions of interest joined by ", " ("" when a row contains none of them).
DA2 <- function(x) {
  # One vector per token position; short rows are padded with NA.
  tmp <- data.table::tstrsplit(x$GeogPreferences, ", ", fixed = TRUE)
  # Blank out unwanted tokens (NA padding included). An empty string is used
  # instead of NA so the clean-up cannot damage real text ending in "NA" and
  # rows without a match do not come back as the literal string "NA".
  kept <- lapply(tmp, function(col) replace(col, !col %in% regionInterest, ""))
  res <- do.call(paste, c(sep = ", ", kept))
  # Collapse separator runs left by blanked tokens, then trim the ends.
  res <- gsub("(, )+", ", ", res)
  gsub("^, |, $", "", res)
}
system.time(tidyverse(bigDF))
# user system elapsed
# 17.67 0.01 17.91
system.time(MF(bigDF))
# user system elapsed
# 15.52 0.00 15.70
system.time(DA1(bigDF))
# user system elapsed
# 0.97 0.00 1.00
system.time(DA2(bigDF))
# user system elapsed
# 0.25 0.00 0.25
所以其他两个解决方案的运行时间超过 15 秒,而我的两个解决方案的运行时间不到一秒。
如果您更喜欢更接近您的方法的解决方案,请将其更改为
# Region names that should be kept.
regionInterest <- c("Americas", "North America", "US",
                    "Northeast", "Southeast", "West", "Midwest", "Southwest")
# For every row: split on commas, trim whitespace, and keep only the tokens
# that appear in regionInterest.
k <- lapply(as.character(myDf$GeogPreferences), function(x) {
  z <- trimws(unlist(strsplit(x, split = ",")))
  # this makes sure you only use z which are in regionInterest
  z[z %in% regionInterest]
})
# paste with collapse creates one value out of a vector of strings separated
# by the collapse argument; vapply() guarantees exactly one string per row
# (sapply() would return an empty list for a zero-row data frame).
myDf$GeogPreferences <- vapply(k, paste, character(1), collapse = ", ")
希望对您有所帮助
我有一个数据集,其中一列的值以逗号分隔。我需要解析此列中的每个值并仅保留特定值并删除其他值。
我拥有的代码和数据是:
# Sample data: a 4-row data frame with one factor column of comma-separated regions.
myDf <- structure(list(GeogPreferences = structure(1:4, .Label = c("Central and East Europe, Europe, North America, West Europe, US",
"Europe, North America, West Europe, US", "Global, North America",
"Northeast, Southeast, West, US"), class = "factor")), .Names = "GeogPreferences", class = "data.frame", row.names = c(NA,
-4L))
# The only region names that should be kept.
regionInterest <- c("Americas", "North America", "US", "Northeast","Southeast","West","Midwest","Southwest")
# Attempted per-row filter (this is the code that fails).
k<-lapply(as.character(myDf$GeogPreferences),function(x) {
# Split the row on commas and strip surrounding whitespace from each token.
z<-trimws(unlist(strsplit(x,split = ",")))
# NOTE: ifelse() returns a vector as long as its test, and the "no" branch
# returns z itself, so no token is ever dropped here.
z <- ifelse((z %in% regionInterest), z[z %in% regionInterest], z)
})
# unlist(k) has one element per token (15 in total), not one per row (4),
# which is why this assignment raises the error quoted below.
myDf$GeogPreferences<-unlist(k)
这是我收到的错误:
Error in `$<-.data.frame`(`*tmp*`, "GeogPreferences", value = c("Central and East Europe",
: replacement has 15 rows, data has 4
我的数据集如下所示:
GeogPreferences
1 Central and East Europe, Europe, North America, West Europe, US
2 Europe, North America, West Europe, US
3 Global, North America
4 Northeast, Southeast, West, US
如果该列包含 regionInterest
中的任何字符串,我想保留该字符串,否则我想删除它。
我期望的输出是:
GeogPreferences
1 North America, US
2 North America, US
3 North America
4 Northeast, Southeast, West, US
有人可以帮我解决我做错的事吗?谢谢!
您收到的错误是由于 strsplit
创建的行多于输入的 df。同样在您的 ifelse
声明中,您在 FALSE
上返回 z 所以它没有按照您的预期进行。
这里有 tidyr
/dplyr
解决您问题的方法。
# Split each row into one row per region, trim whitespace, then collapse the
# regions of interest back into one comma-separated string per original row.
myDf %>%
  mutate(id = row_number()) %>%
  separate_rows(GeogPreferences, sep = ",") %>%
  mutate(GeogPreferences = trimws(GeogPreferences)) %>%
  group_by(id) %>%
  # Subset inside summarize() instead of calling filter() first: a row without
  # any region of interest then yields "" rather than vanishing from the
  # result, so the output always has one row per input row.
  summarize(
    GeogPreferences = toString(
      GeogPreferences[GeogPreferences %in% c(
        "Americas", "North America", "US", "Northeast",
        "Southeast", "West", "Midwest", "Southwest"
      )]
    )
  ) %>%
  select(-id)
# A tibble: 4 × 1
GeogPreferences
<chr>
1 North America, US
2 North America, US
3 North America
4 Northeast, Southeast, West, US
您可能应该先拆分数据,然后再对拆分后的数据取子集。
这既会提高效率(因为 strsplit
是矢量化的),而且每次拆分得到的向量长度也就无关紧要了。此外,trimws
并不需要,它只会使您的代码效率低下。相反,在 ", "
上拆分,同时指定 fixed = TRUE
。这将使 strsplit
的速度提高大约 10 倍,因为它不会使用正则表达式来拆分。
以下方法仅使用 base R:
# Split every row on the literal ", " in a single vectorised strsplit() call,
# keep only the tokens listed in regionInterest, and stack the per-row strings
# into a one-column character matrix
# (you can use `rbind.data.frame` instead if you don't want a matrix).
do.call(
  rbind,
  lapply(
    strsplit(as.character(myDf$GeogPreferences), ", ", fixed = TRUE),
    function(tokens) toString(tokens[is.element(tokens, regionInterest)])
  )
)
# [,1]
# [1,] "North America, US"
# [2,] "North America, US"
# [3,] "North America"
# [4,] "Northeast, Southeast, West, US"
不过,上面的解决方案(类似于你自己的)仍然是一个按行的解决方案。相反,我们可以尝试通过按列操作来达到相同的效果。所谓 "columnwise"(按列),我的意思是:如果我们对转置后的拆分结果进行操作,迭代次数将是 myDf$GeogPreferences
中最长句子的大小(即我们用于拆分的逗号数量),这应该明显小于数据中的行数。
下面是使用 data.table::tstrsplit 的示例:
# Column-wise variant: tstrsplit() returns the transposed split, i.e. one
# vector per token position (short rows are padded with NA).
tmp <- data.table::tstrsplit(myDf$GeogPreferences, ", ", fixed = TRUE)
# Blank out every token that is not of interest (NA padding included), then
# paste the positions back together row by row.
# An empty-string placeholder is used instead of NA so that the clean-up below
# can never touch real text such as a region name ending in "NA".
res <- do.call(paste,
               c(sep = ", ",
                 lapply(tmp, function(x) replace(x, !x %in% regionInterest, ""))))
# Collapse runs of separators left by blanked tokens, then drop a leading or
# trailing separator; rows with no match end up as "".
gsub("^, |, $", "", gsub("(, )+", ", ", res))
# [1] "North America, US" "North America, US" "North America" "Northeast, Southeast, West, US"
这是 10 万行数据集的简单基准测试
bigDF <- myDf[sample(nrow(myDf), 1e5, replace = TRUE),, drop = FALSE]
library(dplyr)
library(tidyr)
library(data.table)
# Benchmark candidate: dplyr/tidyr implementation.
#
# x: data frame with a comma-separated `GeogPreferences` column.
# Returns a one-column tibble with one row per input row, holding the regions
# of interest joined by ", " ("" when a row contains none of them).
tidyverse <- function(x) {
  regions_of_interest <- c(
    "Americas", "North America", "US", "Northeast",
    "Southeast", "West", "Midwest", "Southwest"
  )
  x %>%
    mutate(id = row_number()) %>%
    separate_rows(GeogPreferences, sep = ",") %>%
    mutate(GeogPreferences = trimws(GeogPreferences)) %>%
    group_by(id) %>%
    # Subset inside summarize() rather than filter() beforehand, so a row
    # without any match is kept (as "") instead of being dropped.
    summarize(
      GeogPreferences = toString(
        GeogPreferences[GeogPreferences %in% regions_of_interest]
      )
    ) %>%
    select(-id)
}
# Benchmark candidate: row-by-row base R implementation.
#
# x: data frame with a comma-separated `GeogPreferences` column.
# Relies on the global `regionInterest` character vector.
# Returns a character vector with one element per row of `x`, holding the
# regions of interest joined by ", " ("" when a row contains none of them).
MF <- function(x) {
  k <- lapply(as.character(x$GeogPreferences), function(row) {
    z <- trimws(unlist(strsplit(row, split = ",")))
    z[z %in% regionInterest]
  })
  # vapply() guarantees a character vector even for zero rows, where sapply()
  # would silently return an empty list.
  vapply(k, paste, character(1), collapse = ", ")
}
# Benchmark candidate: base R, one vectorised strsplit() on the literal ", ".
#
# x: data frame with a comma-separated `GeogPreferences` column.
# Relies on the global `regionInterest` character vector.
# Returns a one-column character matrix with one row per row of `x`.
DA1 <- function(x) {
  tokens_by_row <- strsplit(as.character(x$GeogPreferences), ", ", fixed = TRUE)
  kept_by_row <- lapply(tokens_by_row, function(tokens) {
    toString(tokens[tokens %in% regionInterest])
  })
  do.call(rbind, kept_by_row)
}
# Benchmark candidate: column-wise implementation using data.table::tstrsplit.
#
# x: data frame with a comma-separated `GeogPreferences` column.
# Relies on the global `regionInterest` character vector.
# Returns a character vector with one element per row of `x`, holding the
# regions of interest joined by ", " ("" when a row contains none of them).
DA2 <- function(x) {
  # One vector per token position; short rows are padded with NA.
  tmp <- data.table::tstrsplit(x$GeogPreferences, ", ", fixed = TRUE)
  # Blank out unwanted tokens (NA padding included). An empty string is used
  # instead of NA so the clean-up cannot damage real text ending in "NA" and
  # rows without a match do not come back as the literal string "NA".
  kept <- lapply(tmp, function(col) replace(col, !col %in% regionInterest, ""))
  res <- do.call(paste, c(sep = ", ", kept))
  # Collapse separator runs left by blanked tokens, then trim the ends.
  res <- gsub("(, )+", ", ", res)
  gsub("^, |, $", "", res)
}
system.time(tidyverse(bigDF))
# user system elapsed
# 17.67 0.01 17.91
system.time(MF(bigDF))
# user system elapsed
# 15.52 0.00 15.70
system.time(DA1(bigDF))
# user system elapsed
# 0.97 0.00 1.00
system.time(DA2(bigDF))
# user system elapsed
# 0.25 0.00 0.25
所以其他两个解决方案的运行时间超过 15 秒,而我的两个解决方案的运行时间不到一秒。
如果您更喜欢更接近您的方法的解决方案,请将其更改为
# Region names that should be kept.
regionInterest <- c("Americas", "North America", "US",
                    "Northeast", "Southeast", "West", "Midwest", "Southwest")
# For every row: split on commas, trim whitespace, and keep only the tokens
# that appear in regionInterest.
k <- lapply(as.character(myDf$GeogPreferences), function(x) {
  z <- trimws(unlist(strsplit(x, split = ",")))
  # this makes sure you only use z which are in regionInterest
  z[z %in% regionInterest]
})
# paste with collapse creates one value out of a vector of strings separated
# by the collapse argument; vapply() guarantees exactly one string per row
# (sapply() would return an empty list for a zero-row data frame).
myDf$GeogPreferences <- vapply(k, paste, character(1), collapse = ", ")
希望对您有所帮助