检查向量中的单词是否出现在数据框不同列的同一行中
checking whether a word from a vector appears in the same row in different columns of a data frame
我正在排查我的数据,想检查某个名字是否同时出现在同一行(即同一条观测)的两个不同列中:
# Example data: four observations, each with two free-text name columns.
df1 <- data.frame(
  text1 = rep(c("John Jay Jakson", "John Jack Jakson"), times = c(3, 1)),
  text2 = rep("Jerry Jack Jameson", times = 4)
)
# Lookup table holding the names to search for.
df2 <- data.frame(
  names = c("John", "Jay", "Jackson", "Jerry", "Jack", "Jameson")
)
我想出的代码如下
# Logical matrix with one row per df1 row and one column per name; a cell is
# TRUE when that name is found in both text1 and text2 of that row.
data.check <- sapply(df2$names, function(nm) {
  grepl(nm, df1$text1) & grepl(nm, df1$text2)
})
或者:
# Positions of the TRUE cells in the per-name match matrix.
which(
  sapply(df2$names, function(nm) {
    grepl(nm, df1$text1) & grepl(nm, df1$text2)
  })
)
但这些并不是筛选数据的最佳方法。相反,我想在 df1 中有一个新列 df1$check,它根据 df1$text1 和 df1$text2 下的每一行是否具有相同的名称来保存 1/0。
我知道将此代码分配给新列是行不通的:
# Attempt that does not give the desired column (kept verbatim): sapply()
# returns a logical matrix with one column per name in df2$names, not a
# single 1/0 value per row of df1.
df1$check = sapply(df2$names, function(x) (grepl(x, df1$text1) & grepl(x, df1$text2))==TRUE)
它在第 4 行给出了错误的结果,而这一行本应为 TRUE。
感谢任何帮助,谢谢。
我包括一个 dplyr 方法:
# import required libraries
library(dplyr)
library(stringr)

# Example data (two more rows than in the question).
df1 <- data.frame(
  text1 = c(
    "John Jay Jakson",
    "John Jay Jakson",
    "John Jay Jakson",
    "John Jack Jakson", "Peter", "John Snow"
  ),
  text2 = c(
    "Jerry Jack Jameson",
    "Jerry Jack Jameson",
    "Jerry Jack Jameson",
    "Jerry Jack Jameson", "Peter", "Clay Snow"
  )
)
df2 <- data.frame(
  names = c("John", "Jay", "Jackson", "Jerry", "Jack", "Jameson")
)

# Plain vector of the names to look for.
v2 <- as.vector(df2$names)

# Create a new column `check` that is "1" when the SAME name occurs in both
# text1 and text2 of a row, and "0" otherwise.
# Each name is tested on its own and the per-name results are OR-ed together.
# Matching "any name in text1" and "any name in text2" independently (one
# big "a|b|c" pattern per column) would wrongly flag rows such as
# "John ..." / "Jerry ..." that do not share a name.
# `init` keeps the result an all-FALSE vector of full length if v2 is empty.
# The column stays character ("1"/"0"); rows with NA text fall through to "0".
df1 <- df1 %>%
  mutate(
    check = case_when(
      Reduce(
        `|`,
        lapply(v2, function(nm) {
          str_detect(text1, nm) & str_detect(text2, nm)
        }),
        init = logical(length(text1))
      ) ~ "1",
      TRUE ~ "0"
    )
  )
OP 代码中 `sapply` 返回的输出是一个逻辑 `matrix`。
> sapply(df2$names, function(x) (grepl(x, df1$text1) & grepl(x, df1$text2)))
John Jay Jackson Jerry Jack Jameson
[1,] FALSE FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE FALSE TRUE FALSE
`matrix` 的每一行都应该归约为一个逻辑值,从而得到一个向量。我们可以用 `rowSums` 包裹逻辑矩阵,再将各行之和转换为逻辑向量(`> 0`),最后用一元 `+` 将其强制转换为二进制(`TRUE` -> 1,`FALSE` -> 0):
# Build the per-name match matrix, count the matches in each row, and flag
# rows with at least one name present in both columns as 1 (otherwise 0).
df1$check <- +(
  rowSums(
    sapply(df2$names, function(nm) {
      in_text1 <- grepl(nm, df1$text1)
      in_text2 <- grepl(nm, df1$text2)
      in_text1 & in_text2
    })
  ) > 0
)
df1$check
[1] 0 0 0 1
另一种选择是使用 `lapply` 循环并返回一个 `list`,再用 `Reduce` 配合 `|` 将其归约为一个向量:
# OR the per-name logical vectors together across all names, then convert the
# resulting TRUE/FALSE flags to 1/0 with unary plus.
df1$check <- +(
  Reduce(
    `|`,
    lapply(df2$names, function(nm) {
      grepl(nm, df1$text1) & grepl(nm, df1$text2)
    })
  )
)
我正在排查我的数据,想检查某个名字是否同时出现在同一行(即同一条观测)的两个不同列中:
# Example data: four observations, each with two free-text name columns.
df1 <- data.frame(
  text1 = rep(c("John Jay Jakson", "John Jack Jakson"), times = c(3, 1)),
  text2 = rep("Jerry Jack Jameson", times = 4)
)
# Lookup table holding the names to search for.
df2 <- data.frame(
  names = c("John", "Jay", "Jackson", "Jerry", "Jack", "Jameson")
)
我想出的代码如下
# Logical matrix with one row per df1 row and one column per name; a cell is
# TRUE when that name is found in both text1 and text2 of that row.
data.check <- sapply(df2$names, function(nm) {
  grepl(nm, df1$text1) & grepl(nm, df1$text2)
})
或者:
# Positions of the TRUE cells in the per-name match matrix.
which(
  sapply(df2$names, function(nm) {
    grepl(nm, df1$text1) & grepl(nm, df1$text2)
  })
)
但这些并不是筛选数据的最佳方法。相反,我想在 df1 中有一个新列 df1$check,它根据 df1$text1 和 df1$text2 下的每一行是否具有相同的名称来保存 1/0。
我知道将此代码分配给新列是行不通的:
# Attempt that does not give the desired column (kept verbatim): sapply()
# returns a logical matrix with one column per name in df2$names, not a
# single 1/0 value per row of df1.
df1$check = sapply(df2$names, function(x) (grepl(x, df1$text1) & grepl(x, df1$text2))==TRUE)
它在第 4 行给出了错误的结果,而这一行本应为 TRUE。
感谢任何帮助,谢谢。
我包括一个 dplyr 方法:
# import required libraries
library(dplyr)
library(stringr)

# Example data (two more rows than in the question).
df1 <- data.frame(
  text1 = c(
    "John Jay Jakson",
    "John Jay Jakson",
    "John Jay Jakson",
    "John Jack Jakson", "Peter", "John Snow"
  ),
  text2 = c(
    "Jerry Jack Jameson",
    "Jerry Jack Jameson",
    "Jerry Jack Jameson",
    "Jerry Jack Jameson", "Peter", "Clay Snow"
  )
)
df2 <- data.frame(
  names = c("John", "Jay", "Jackson", "Jerry", "Jack", "Jameson")
)

# Plain vector of the names to look for.
v2 <- as.vector(df2$names)

# Create a new column `check` that is "1" when the SAME name occurs in both
# text1 and text2 of a row, and "0" otherwise.
# Each name is tested on its own and the per-name results are OR-ed together.
# Matching "any name in text1" and "any name in text2" independently (one
# big "a|b|c" pattern per column) would wrongly flag rows such as
# "John ..." / "Jerry ..." that do not share a name.
# `init` keeps the result an all-FALSE vector of full length if v2 is empty.
# The column stays character ("1"/"0"); rows with NA text fall through to "0".
df1 <- df1 %>%
  mutate(
    check = case_when(
      Reduce(
        `|`,
        lapply(v2, function(nm) {
          str_detect(text1, nm) & str_detect(text2, nm)
        }),
        init = logical(length(text1))
      ) ~ "1",
      TRUE ~ "0"
    )
  )
OP 代码中 `sapply` 返回的输出是一个逻辑 `matrix`。
> sapply(df2$names, function(x) (grepl(x, df1$text1) & grepl(x, df1$text2)))
John Jay Jackson Jerry Jack Jameson
[1,] FALSE FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE FALSE TRUE FALSE
`matrix` 的每一行都应该归约为一个逻辑值,从而得到一个向量。我们可以用 `rowSums` 包裹逻辑矩阵,再将各行之和转换为逻辑向量(`> 0`),最后用一元 `+` 将其强制转换为二进制(`TRUE` -> 1,`FALSE` -> 0):
# Build the per-name match matrix, count the matches in each row, and flag
# rows with at least one name present in both columns as 1 (otherwise 0).
df1$check <- +(
  rowSums(
    sapply(df2$names, function(nm) {
      in_text1 <- grepl(nm, df1$text1)
      in_text2 <- grepl(nm, df1$text2)
      in_text1 & in_text2
    })
  ) > 0
)
df1$check
[1] 0 0 0 1
另一种选择是使用 `lapply` 循环并返回一个 `list`,再用 `Reduce` 配合 `|` 将其归约为一个向量:
# OR the per-name logical vectors together across all names, then convert the
# resulting TRUE/FALSE flags to 1/0 with unary plus.
df1$check <- +(
  Reduce(
    `|`,
    lapply(df2$names, function(nm) {
      grepl(nm, df1$text1) & grepl(nm, df1$text2)
    })
  )
)