将文本列转换为 r 中的向量
Turning a text column into a vector in r
我想看看文本列是否有超出“a”和“b”指定值的元素
specified_value=c("a","b")
df=data.frame(key=c(1,2,3,4),text=c("a,b,c","a,d","1,2","a,b")
df_out=data.frame(key=c(1,2,3),text=c("c","d","1,2",NA))
这是我试过的:
df=df%>%mutate(text_vector=strsplit(text, split=","),
extra=text_vector[which(!text_vector %in% specified_value)])
但这行不通,有什么建议吗?
我们可以用分隔符,
和separate_rows
拆分'text',用'key'分组,得到不在'specified_value'中的元素setdiff
和 paste
将它们放在一起 (toString
),然后进行连接以获取原始数据集中的其他列
library(dplyr) # >= 1.0.0
library(tidyr)
df %>%
separate_rows(text) %>%
group_by(key) %>%
summarise(extra = toString(setdiff(text, specified_value))) %>%
left_join(df) %>%
mutate(extra = na_if(extra, ""))
# A tibble: 4 x 3
# key extra text
# <dbl> <chr> <chr>
#1 1 c a,b,c
#2 2 d a,d
#3 3 1, 2 1,2
#4 4 <NA> a,b
使用 setdiff
.
df$outside <- sapply({
x <- lapply(strsplit(df$text, ","), setdiff, specified_value)
replace(x, lengths(x) == 0, NA)},
paste, collapse=",")
df
# key text outside
# 1 1 a,b,c c
# 2 2 a,d d
# 3 3 1,2 1,2
# 4 4 a,b NA
数据:
df <- structure(list(key = c(1, 2, 3, 4), text = c("a,b,c", "a,d",
"1,2", "a,b")), class = "data.frame", row.names = c(NA, -4L))
specified_value <- c("a", "b")
使用stringi::stri_split_fixed
library(stringi)
!all(stri_split_fixed("a,b", ",", simplify=T) %in% specified_value) #FALSE
!all(stri_split_fixed("a,b,c", ",", simplify=T) %in% specified_value) #TRUE
使用正则表达式而不用逗号分割数据的选项:
#Collapse the specified_value in one string and remove from text
df$text1 <- gsub(paste0(specified_value, collapse = "|"), '', df$text)
#Remove extra commas
df$text1 <- gsub('(?<![a-z0-9]),', '', df$text1, perl = TRUE)
df
# key text text1
#1 1 a,b,c c
#2 2 a,d d
#3 3 1,2 1,2
#4 4 a,b
我想看看文本列是否有超出“a”和“b”指定值的元素
specified_value=c("a","b")
df=data.frame(key=c(1,2,3,4),text=c("a,b,c","a,d","1,2","a,b")
df_out=data.frame(key=c(1,2,3),text=c("c","d","1,2",NA))
这是我试过的:
df=df%>%mutate(text_vector=strsplit(text, split=","),
extra=text_vector[which(!text_vector %in% specified_value)])
但这行不通,有什么建议吗?
我们可以用分隔符,
和separate_rows
拆分'text',用'key'分组,得到不在'specified_value'中的元素setdiff
和 paste
将它们放在一起 (toString
),然后进行连接以获取原始数据集中的其他列
library(dplyr) # >= 1.0.0
library(tidyr)
df %>%
separate_rows(text) %>%
group_by(key) %>%
summarise(extra = toString(setdiff(text, specified_value))) %>%
left_join(df) %>%
mutate(extra = na_if(extra, ""))
# A tibble: 4 x 3
# key extra text
# <dbl> <chr> <chr>
#1 1 c a,b,c
#2 2 d a,d
#3 3 1, 2 1,2
#4 4 <NA> a,b
使用 setdiff
.
df$outside <- sapply({
x <- lapply(strsplit(df$text, ","), setdiff, specified_value)
replace(x, lengths(x) == 0, NA)},
paste, collapse=",")
df
# key text outside
# 1 1 a,b,c c
# 2 2 a,d d
# 3 3 1,2 1,2
# 4 4 a,b NA
数据:
df <- structure(list(key = c(1, 2, 3, 4), text = c("a,b,c", "a,d",
"1,2", "a,b")), class = "data.frame", row.names = c(NA, -4L))
specified_value <- c("a", "b")
使用stringi::stri_split_fixed
library(stringi)
!all(stri_split_fixed("a,b", ",", simplify=T) %in% specified_value) #FALSE
!all(stri_split_fixed("a,b,c", ",", simplify=T) %in% specified_value) #TRUE
使用正则表达式而不用逗号分割数据的选项:
#Collapse the specified_value in one string and remove from text
df$text1 <- gsub(paste0(specified_value, collapse = "|"), '', df$text)
#Remove extra commas
df$text1 <- gsub('(?<![a-z0-9]),', '', df$text1, perl = TRUE)
df
# key text text1
#1 1 a,b,c c
#2 2 a,d d
#3 3 1,2 1,2
#4 4 a,b