使用 strsplit 处理组合词,创建用于对数据框取子集的逻辑向量
Create logical list with strsplit on combined words to subset data frame
我已尝试根据特定列的条件对我的数据框进行子集化。为此,我需要为此列的每一行创建 TRUE 或 FALSE 信息。但是此列中的某些行包含组合词,我的代码无法检测到它们。
p <- sapply(strsplit(test$hashtags, split=","), function(x)any(x%in%"evet"))
当您检查示例数据时,您可以很容易地看到第 5、7、8 行有特定的单词,但它们显示为 FALSE。
我尝试在我的代码中添加 "unlist" 命令,但它对我不起作用。
p <- sapply(unlist(strsplit(test$hashtags, split=",")), function(x)any(x%in%"evet"))
即使有多个单词,我也需要根据特定单词为组合行创建一个 FALSE 或 TRUE 条件。
提前致谢。
示例数据:
test <- structure(list(created_at = structure(c(1489636860, 1489636860,
1489636860, 1489636860, 1489636860, 1489636860, 1489636860, 1489636860,
1489636860, 1489636860), class = c("POSIXct", "POSIXt"), tzone = "GMT"),
user.screen_name = c("bilge_bilir", "memetozturk93", "Byomeraslan",
"tmremolar", "orhanyilmaz_77", "tamdere", "EriVatan", "BaySancaktar",
"zeynepmekik", "EriVatan"), entities.hashtags = list(structure(list(
indices = list(c(84L, 90L)), text = "Hayır"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(65L, 70L)), text = "evet"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(98L, 103L)), text = "Evet"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(98L, 104L)), text = "Hayır"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(28L, 33L), c(45L, 50L), c(89L, 94L)),
text = c("EVET", "EVET", "EVET")), .Names = c("indices",
"text"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
indices = list(c(38L, 43L)), text = "EVET"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(20L, 29L), c(36L, 46L), c(89L, 94L)),
text = c("Dirilişe", "Yükselişe", "Evet")), .Names = c("indices",
"text"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
indices = list(c(10L, 15L), c(16L, 20L), c(21L, 26L),
c(27L, 31L)), text = c("Evet", "Eri", "Beli", "Yes"
)), .Names = c("indices", "text"), class = "data.frame", row.names = c(NA,
4L)), structure(list(indices = list(c(125L, 130L)), text = "Evet"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(102L, 107L)), text = "EVET"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L)), retweeted_status.created_at = c("Thu Mar 16 03:49:15 +0000 2017",
"Wed Mar 15 23:57:44 +0000 2017", "Wed Mar 15 21:07:54 +0000 2017",
"Wed Mar 15 20:54:43 +0000 2017", "Wed Mar 15 14:41:15 +0000 2017",
"Wed Mar 15 23:07:43 +0000 2017", "Wed Mar 15 15:41:06 +0000 2017",
NA, "Wed Mar 15 11:13:15 +0000 2017", "Wed Mar 15 16:37:13 +0000 2017"
), entities.user_mentions = list(structure(list(indices = list(
c(3L, 16L), c(18L, 30L), c(44L, 55L), c(56L, 71L), c(72L,
83L)), screen_name = c("seremgiz8289", "bilge_bilir",
"OduncuTimi", "yalcinvelioglu", "OPTlMlst_Z"), id = c(301944248,
2189106581, 2756465282, 2668851081, 2734161237), id_str = c("301944248",
"2189106581", "2756465282", "2668851081", "2734161237"),
name = c("ATA KIZI HAYIR DİYOR", "Bilge Eryuz", "OduncuTimi ®",
"Yalçın Velioğlu", "OPTlMlst_Z")), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = c(NA,
5L)), structure(list(indices = list(c(3L, 16L)), screen_name = "kendimce_ben",
id = 2322523731, id_str = "2322523731", name = "İzzet#EVET/\U0001f1f9\U0001f1f7"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 12L)), screen_name = "omrolcay",
id = 360420809L, id_str = "360420809", name = "Ömer Olcay"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 18L)), screen_name = "mehmet_asassoy",
id = 3151503430, id_str = "3151503430", name = "Mehmet Asassoy"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 17L), c(120L, 132L
)), screen_name = c("sevincbeykent", "yigitbulutt"),
id = c(538364458L, 256065299L), id_str = c("538364458",
"256065299"), name = c("Sevinç", "YİĞİT BULUT"
)), .Names = c("indices", "screen_name", "id", "id_str",
"name"), class = "data.frame", row.names = 1:2), structure(list(
indices = list(c(3L, 13L)), screen_name = "AKsamet54",
id = 313205928L, id_str = "313205928", name = "Samet ÇELİK"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 18L)), screen_name = "HayataTebessum",
id = 2911157237, id_str = "2911157237", name = "Meryem"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(0L, 9L)), screen_name = "4qet1dil",
id = 536676261L, id_str = "536676261", name = "KerenGo"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 18L)), screen_name = "akkadinantalya",
id = 1898504755L, id_str = "1898504755", name = "AK Kadın Antalya"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 15L)), screen_name = "menes__2010",
id = 186968367L, id_str = "186968367", name = "#EVET☪ ياسين ☝"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L)),
hashtags = c("hayir", "evet", "evet", "hayir", "c(\"evet\", \"evet\", \"evet\")",
"evet", "c(\"dirilise\", \"yukselise\", \"evet\")", "c(\"evet\", \"eri\", \"beli\", \"yes\")",
"evet", "evet"), mentions = list(c("seremgiz8289", "bilge_bilir",
"OduncuTimi", "yalcinvelioglu", "OPTlMlst_Z"), "kendimce_ben",
"omrolcay", "mehmet_asassoy", c("sevincbeykent", "yigitbulutt"
), "AKsamet54", "HayataTebessum", "4qet1dil", "akkadinantalya",
"menes__2010")), .Names = c("created_at", "user.screen_name",
"entities.hashtags", "retweeted_status.created_at", "entities.user_mentions",
"hashtags", "mentions"), row.names = c(NA, 10L), class = "data.frame")
这主要是由 `hashtags` 列的生成方式造成的。它原本存储为一个由字符向量组成的列表,被强制转换为字符后就变成了这种结构。
例如,参见
list(c("A", "B", "C"))
#[[1]]
#[1] "A" "B" "C"
as.character(list(c("A", "B", "C")))
#[1] "c(\"A\", \"B\", \"C\")"
检查数据框上的单个元素会给出相同的结构。
test$hashtags[5]
#[1] "c(\"evet\", \"evet\", \"evet\")"
因此,如果您无法回头更改 `hashtags` 列的生成方式,可以改用 `grepl`,这样还能省去对 `strsplit` 和 `sapply` 的调用。
grepl("evet", test$hashtags)
#[1] FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
我会在这里使用 `grepl`:
# Flag each row that mentions "evet" in at least one of its comma-separated
# pieces. any() collapses the per-piece matches into a single TRUE/FALSE per
# row (a bare grepl() here would yield a ragged list, which cannot be used to
# subset the data frame), and vapply() guarantees a logical vector.
p <- vapply(
  strsplit(test$hashtags, split = ","),
  function(pieces) any(grepl("evet", pieces)),
  logical(1)
)
如果你真的想匹配独立的单词 `evet`,那么请使用单词边界:
# Flag each row that contains "evet" as a standalone word in at least one of
# its comma-separated pieces. The backslashes must be doubled: in an R string
# "\b" is the backspace character, whereas "\\b" reaches the regex engine as
# the word-boundary anchor. any() reduces the per-piece matches to one
# TRUE/FALSE per row, and vapply() guarantees a logical vector.
p <- vapply(
  strsplit(test$hashtags, split = ","),
  function(pieces) any(grepl("\\bevet\\b", pieces)),
  logical(1)
)
我们可以使用 `str_detect` 创建一个逻辑索引列:
library(tidyverse)
out <- test %>%
mutate(ind = str_detect(hashtags, pattern = "evet"))
out$ind
#[1] FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
如果我们需要得到每个词的逻辑索引
# Split each hashtags string into its individual words, then test every word
# for "evet"; `ind` becomes a list column holding one logical vector per row.
# The backslash is doubled because "\w" is not a valid escape sequence in an
# R string literal and would fail to parse.
test %>%
  mutate(ind = str_extract_all(hashtags, "\\w+") %>%
    map(str_detect, pattern = "evet"))
我已尝试根据特定列的条件对我的数据框进行子集化。为此,我需要为此列的每一行创建 TRUE 或 FALSE 信息。但是此列中的某些行包含组合词,我的代码无法检测到它们。
p <- sapply(strsplit(test$hashtags, split=","), function(x)any(x%in%"evet"))
当您检查示例数据时,您可以很容易地看到第 5、7、8 行有特定的单词,但它们显示为 FALSE。
我尝试在我的代码中添加 "unlist" 命令,但它对我不起作用。
p <- sapply(unlist(strsplit(test$hashtags, split=",")), function(x)any(x%in%"evet"))
即使有多个单词,我也需要根据特定单词为组合行创建一个 FALSE 或 TRUE 条件。 提前致谢。
示例数据:
test <- structure(list(created_at = structure(c(1489636860, 1489636860,
1489636860, 1489636860, 1489636860, 1489636860, 1489636860, 1489636860,
1489636860, 1489636860), class = c("POSIXct", "POSIXt"), tzone = "GMT"),
user.screen_name = c("bilge_bilir", "memetozturk93", "Byomeraslan",
"tmremolar", "orhanyilmaz_77", "tamdere", "EriVatan", "BaySancaktar",
"zeynepmekik", "EriVatan"), entities.hashtags = list(structure(list(
indices = list(c(84L, 90L)), text = "Hayır"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(65L, 70L)), text = "evet"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(98L, 103L)), text = "Evet"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(98L, 104L)), text = "Hayır"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(28L, 33L), c(45L, 50L), c(89L, 94L)),
text = c("EVET", "EVET", "EVET")), .Names = c("indices",
"text"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
indices = list(c(38L, 43L)), text = "EVET"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(20L, 29L), c(36L, 46L), c(89L, 94L)),
text = c("Dirilişe", "Yükselişe", "Evet")), .Names = c("indices",
"text"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
indices = list(c(10L, 15L), c(16L, 20L), c(21L, 26L),
c(27L, 31L)), text = c("Evet", "Eri", "Beli", "Yes"
)), .Names = c("indices", "text"), class = "data.frame", row.names = c(NA,
4L)), structure(list(indices = list(c(125L, 130L)), text = "Evet"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L), structure(list(
indices = list(c(102L, 107L)), text = "EVET"), .Names = c("indices",
"text"), class = "data.frame", row.names = 1L)), retweeted_status.created_at = c("Thu Mar 16 03:49:15 +0000 2017",
"Wed Mar 15 23:57:44 +0000 2017", "Wed Mar 15 21:07:54 +0000 2017",
"Wed Mar 15 20:54:43 +0000 2017", "Wed Mar 15 14:41:15 +0000 2017",
"Wed Mar 15 23:07:43 +0000 2017", "Wed Mar 15 15:41:06 +0000 2017",
NA, "Wed Mar 15 11:13:15 +0000 2017", "Wed Mar 15 16:37:13 +0000 2017"
), entities.user_mentions = list(structure(list(indices = list(
c(3L, 16L), c(18L, 30L), c(44L, 55L), c(56L, 71L), c(72L,
83L)), screen_name = c("seremgiz8289", "bilge_bilir",
"OduncuTimi", "yalcinvelioglu", "OPTlMlst_Z"), id = c(301944248,
2189106581, 2756465282, 2668851081, 2734161237), id_str = c("301944248",
"2189106581", "2756465282", "2668851081", "2734161237"),
name = c("ATA KIZI HAYIR DİYOR", "Bilge Eryuz", "OduncuTimi ®",
"Yalçın Velioğlu", "OPTlMlst_Z")), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = c(NA,
5L)), structure(list(indices = list(c(3L, 16L)), screen_name = "kendimce_ben",
id = 2322523731, id_str = "2322523731", name = "İzzet#EVET/\U0001f1f9\U0001f1f7"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 12L)), screen_name = "omrolcay",
id = 360420809L, id_str = "360420809", name = "Ömer Olcay"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 18L)), screen_name = "mehmet_asassoy",
id = 3151503430, id_str = "3151503430", name = "Mehmet Asassoy"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 17L), c(120L, 132L
)), screen_name = c("sevincbeykent", "yigitbulutt"),
id = c(538364458L, 256065299L), id_str = c("538364458",
"256065299"), name = c("Sevinç", "YİĞİT BULUT"
)), .Names = c("indices", "screen_name", "id", "id_str",
"name"), class = "data.frame", row.names = 1:2), structure(list(
indices = list(c(3L, 13L)), screen_name = "AKsamet54",
id = 313205928L, id_str = "313205928", name = "Samet ÇELİK"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 18L)), screen_name = "HayataTebessum",
id = 2911157237, id_str = "2911157237", name = "Meryem"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(0L, 9L)), screen_name = "4qet1dil",
id = 536676261L, id_str = "536676261", name = "KerenGo"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 18L)), screen_name = "akkadinantalya",
id = 1898504755L, id_str = "1898504755", name = "AK Kadın Antalya"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L),
structure(list(indices = list(c(3L, 15L)), screen_name = "menes__2010",
id = 186968367L, id_str = "186968367", name = "#EVET☪ ياسين ☝"), .Names = c("indices",
"screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L)),
hashtags = c("hayir", "evet", "evet", "hayir", "c(\"evet\", \"evet\", \"evet\")",
"evet", "c(\"dirilise\", \"yukselise\", \"evet\")", "c(\"evet\", \"eri\", \"beli\", \"yes\")",
"evet", "evet"), mentions = list(c("seremgiz8289", "bilge_bilir",
"OduncuTimi", "yalcinvelioglu", "OPTlMlst_Z"), "kendimce_ben",
"omrolcay", "mehmet_asassoy", c("sevincbeykent", "yigitbulutt"
), "AKsamet54", "HayataTebessum", "4qet1dil", "akkadinantalya",
"menes__2010")), .Names = c("created_at", "user.screen_name",
"entities.hashtags", "retweeted_status.created_at", "entities.user_mentions",
"hashtags", "mentions"), row.names = c(NA, 10L), class = "data.frame")
这主要是由 `hashtags` 列的生成方式造成的。它原本存储为一个由字符向量组成的列表,被强制转换为字符后就变成了这种结构。
例如,参见
list(c("A", "B", "C"))
#[[1]]
#[1] "A" "B" "C"
as.character(list(c("A", "B", "C")))
#[1] "c(\"A\", \"B\", \"C\")"
检查数据框上的单个元素会给出相同的结构。
test$hashtags[5]
#[1] "c(\"evet\", \"evet\", \"evet\")"
因此,如果您无法回头更改 `hashtags` 列的生成方式,可以改用 `grepl`,这样还能省去对 `strsplit` 和 `sapply` 的调用。
grepl("evet", test$hashtags)
#[1] FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
我会在这里使用 `grepl`:
# Flag each row that mentions "evet" in at least one of its comma-separated
# pieces. any() collapses the per-piece matches into a single TRUE/FALSE per
# row (a bare grepl() here would yield a ragged list, which cannot be used to
# subset the data frame), and vapply() guarantees a logical vector.
p <- vapply(
  strsplit(test$hashtags, split = ","),
  function(pieces) any(grepl("evet", pieces)),
  logical(1)
)
如果你真的想匹配独立的单词 `evet`,那么请使用单词边界:
# Flag each row that contains "evet" as a standalone word in at least one of
# its comma-separated pieces. The backslashes must be doubled: in an R string
# "\b" is the backspace character, whereas "\\b" reaches the regex engine as
# the word-boundary anchor. any() reduces the per-piece matches to one
# TRUE/FALSE per row, and vapply() guarantees a logical vector.
p <- vapply(
  strsplit(test$hashtags, split = ","),
  function(pieces) any(grepl("\\bevet\\b", pieces)),
  logical(1)
)
我们可以使用 `str_detect` 创建一个逻辑索引列:
library(tidyverse)
out <- test %>%
mutate(ind = str_detect(hashtags, pattern = "evet"))
out$ind
#[1] FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
如果我们需要得到每个词的逻辑索引
# Split each hashtags string into its individual words, then test every word
# for "evet"; `ind` becomes a list column holding one logical vector per row.
# The backslash is doubled because "\w" is not a valid escape sequence in an
# R string literal and would fail to parse.
test %>%
  mutate(ind = str_extract_all(hashtags, "\\w+") %>%
    map(str_detect, pattern = "evet"))