在 R 中使用 xpath 查找包含变音符号的文本
Find text including umlaut with xpath in R
我想通过 XPath 的 text() 找出文本中含有变音符号(“Umlaute”)的节点。
library(xml2)
library(rvest)
# Parse a minimal HTML fragment whose text contains an umlaut.
doc <- xml2::read_html("<p>Über uns </p>")
# Look for the phrase in the serialised (character) form of the document.
grepl("Über uns", as.character(doc))
# Run the same pattern match against the parsed document object itself.
grepl("Über uns", doc)
问题:
如何提取包含文本“Über uns”的节点?
尝试了什么:
https://forum.fhem.de/index.php?topic=96254.0
Java XPath umlaut/vowel parsing
# Attempt 1 (reported as not working): contains() with the literal umlaut.
xp <- paste0("//*[contains(text(), 'Über uns')]")
doc %>% html_nodes(xpath = xp)

# Attempt 2 (reported as not working): map the umlaut to a plain "U" inside
# XPath with translate() and compare the result for equality.
xp <- paste0("//*[translate(text(), 'Ü', 'U') = 'Uber uns']")
doc %>% html_nodes(xpath = xp)

# Attempt 3 (reported as not working): same contains() query as attempt 1.
xp <- paste0("//*[contains(text(), 'Über uns')]")
doc %>% html_nodes(xpath = xp)

# Workaround that works, though a pure XPath solution would be preferred:
# serialise the document, transliterate the umlaut in R, re-parse it, and
# then query for the transliterated text.
doc2 <- xml2::read_html(
  gsub(pattern = "Ü", replacement = "Ue", x = as.character(doc))
)
xp <- paste0("//*[contains(text(), 'Ueber uns')]")
doc2 %>% html_nodes(xpath = xp)
这听起来像是一个编码问题;在 en_US.UTF-8 区域设置下它可以正常工作。也许可以将您的默认文本编码更改为 UTF-8(例如在 RStudio 中:工具 - 全局选项 - 代码 - 保存 - 默认文本编码),或者临时切换区域设置:
library(xml2)
library(rvest)
# Remember the current character-type locale so it can be restored at the end.
old.locale <- Sys.getlocale("LC_CTYPE")

# Case 1: a non-UTF-8 locale. The XPath query returns an empty node set.
Sys.setlocale("LC_CTYPE", "C") # using non-UTF-8 encoding
#> [1] "C"
doc <- "<p>Über uns </p>" %>% xml2::read_html()
xp <- paste0("//*[contains(text(), 'Über uns')]")
html_nodes(x = doc, xpath = xp)
#> {xml_nodeset (0)}

# Case 2: a UTF-8 locale. The identical query now finds the <p> node.
Sys.setlocale("LC_CTYPE", "en_US.UTF-8") # using UTF-8 encoding
#> [1] "en_US.UTF-8"
doc <- "<p>Über uns </p>" %>% xml2::read_html()
xp <- paste0("//*[contains(text(), 'Über uns')]")
html_nodes(x = doc, xpath = xp)
#> {xml_nodeset (1)}
#> [1] <p>Über uns </p>

# Put the locale back to what it was before the demonstration.
Sys.setlocale("LC_CTYPE", old.locale)
#> [1] "en_US.UTF-8"
我想通过 XPath 的 text() 找出文本中含有变音符号(“Umlaute”)的节点。
library(xml2)
library(rvest)
# Parse a minimal HTML fragment whose text contains an umlaut.
doc <- xml2::read_html("<p>Über uns </p>")
# Look for the phrase in the serialised (character) form of the document.
grepl("Über uns", as.character(doc))
# Run the same pattern match against the parsed document object itself.
grepl("Über uns", doc)
问题:
如何提取包含文本“Über uns”的节点?
尝试了什么:
https://forum.fhem.de/index.php?topic=96254.0
Java XPath umlaut/vowel parsing
# Attempt 1 (reported as not working): contains() with the literal umlaut.
xp <- paste0("//*[contains(text(), 'Über uns')]")
doc %>% html_nodes(xpath = xp)

# Attempt 2 (reported as not working): map the umlaut to a plain "U" inside
# XPath with translate() and compare the result for equality.
xp <- paste0("//*[translate(text(), 'Ü', 'U') = 'Uber uns']")
doc %>% html_nodes(xpath = xp)

# Attempt 3 (reported as not working): same contains() query as attempt 1.
xp <- paste0("//*[contains(text(), 'Über uns')]")
doc %>% html_nodes(xpath = xp)

# Workaround that works, though a pure XPath solution would be preferred:
# serialise the document, transliterate the umlaut in R, re-parse it, and
# then query for the transliterated text.
doc2 <- xml2::read_html(
  gsub(pattern = "Ü", replacement = "Ue", x = as.character(doc))
)
xp <- paste0("//*[contains(text(), 'Ueber uns')]")
doc2 %>% html_nodes(xpath = xp)
这听起来像是一个编码问题;在 en_US.UTF-8 区域设置下它可以正常工作。也许可以将您的默认文本编码更改为 UTF-8(例如在 RStudio 中:工具 - 全局选项 - 代码 - 保存 - 默认文本编码),或者临时切换区域设置:
library(xml2)
library(rvest)
# Remember the current character-type locale so it can be restored at the end.
old.locale <- Sys.getlocale("LC_CTYPE")

# Case 1: a non-UTF-8 locale. The XPath query returns an empty node set.
Sys.setlocale("LC_CTYPE", "C") # using non-UTF-8 encoding
#> [1] "C"
doc <- "<p>Über uns </p>" %>% xml2::read_html()
xp <- paste0("//*[contains(text(), 'Über uns')]")
html_nodes(x = doc, xpath = xp)
#> {xml_nodeset (0)}

# Case 2: a UTF-8 locale. The identical query now finds the <p> node.
Sys.setlocale("LC_CTYPE", "en_US.UTF-8") # using UTF-8 encoding
#> [1] "en_US.UTF-8"
doc <- "<p>Über uns </p>" %>% xml2::read_html()
xp <- paste0("//*[contains(text(), 'Über uns')]")
html_nodes(x = doc, xpath = xp)
#> {xml_nodeset (1)}
#> [1] <p>Über uns </p>

# Put the locale back to what it was before the demonstration.
Sys.setlocale("LC_CTYPE", old.locale)
#> [1] "en_US.UTF-8"