从网页中提取地理坐标/数值
Extracting geographic coordinates/numerics from webpage
我有一些代码,用来尝试从某个网站收集信息:
我可以使用以下代码连接并读取 HTML 数据(感谢这个链接):
# Load the browser-automation and HTML-parsing packages.
library(RSelenium)
library(rvest)

# Start a Selenium server with a Firefox client and keep a handle on the client.
rD <- rsDriver(browser = "firefox", port = 4536L)
remDr <- rD[["client"]]

# Open the listing page.
url <- "https://www.fotocasa.es/es/comprar/viviendas/a-bana/todas-las-zonas/l"
remDr$navigate(url)

# Accept the cookies.
remDr$findElement(
  using = "xpath",
  value = "/html/body/div[1]/div[4]/div/div/div/footer/div/button[2]"
)$clickElement()

# Click on "Zona".
remDr$findElement(
  using = "xpath",
  value = '//*[@id="App"]/div[2]/div/div[2]/div[3]/div/div[1]/div'
)$clickElement()

# Parse the current page source into an HTML document.
html_full_page <- read_html(remDr$getPageSource()[[1]])
我在尝试收集其中一项数据时卡住了。我运行了以下代码,并想从结果中提取数字部分:
# List every <label> found inside the geographic-search checkbox items.
html_nodes(
  html_nodes(html_full_page, ".re-GeographicSearchNext-checkboxItem"),
  "label"
)
{xml_nodeset (4)}
[1] <label class="sui-AtomCheckbox sui-AtomCheckbox--medium is-checked"><span class="sui-AtomIcon sui-AtomIcon--small sui-AtomIcon--currentColor"><span><svg viewbox="0 0 24 24"><path d="M19.2 5.4a1 1 0 0 1 1.669 1.095L ...
[2] <label class="re-GeographicSearchNext-checkboxItem-label" name="geoSearch-724,12,15,487,0,15007,0,0,0"><span class="re-GeographicSearchNext-checkboxItem-literal">A Baña</span><span class="re-GeographicSearchNext-ch ...
[3] <label class="sui-AtomCheckbox sui-AtomCheckbox--medium"><input type="checkbox" id="geoSearch-724,12,15,487,0,15056,0,0,0" name="geoSearch-724,12,15,487,0,15056,0,0,0" intermediate=""></label>
[4] <label class="re-GeographicSearchNext-checkboxItem-label" name="geoSearch-724,12,15,487,0,15056,0,0,0"><span class="re-GeographicSearchNext-checkboxItem-literal">Negreira</span><span class="re-GeographicSearchNext- ...
即数据中的 geoSearch 部分。
我试图从这部分代码中获取以下内容:
-724,12,15,487,0,15007,0,0,0
-724,12,15,487,0,15056,0,0,0
> [1] "<a class=\"re-GeographicSearchNext-checkboxItem re-GeographicSearchNext-checkboxItem--has-separator\" title=\"A Baña\"
> href=\"/es/comprar/viviendas/a-bana/todas-las-zonas/l\"><div
> class=\"sui-MoleculeCheckboxField\"><div class=\"sui-MoleculeField
> sui-MoleculeField--inline sui-MoleculeField--inline-reverse
> sui-MoleculeField--fullWidth\">\n<div
> class=\"sui-MoleculeField-labelContainer\">\n<label
> class=\"sui-AtomCheckbox sui-AtomCheckbox--medium\"><input
> type=\"checkbox\" id=\"geoSearch-724,12,15,487,0,15007,0,0,0\"
> name=\"geoSearch-724,12,15,487,0,15007,0,0,0\"
> intermediate=\"\"></label><div
> class=\"sui-MoleculeField-nodeLabelContainer\"><label
> class=\"re-GeographicSearchNext-checkboxItem-label\"
> name=\"geoSearch-724,12,15,487,0,15007,0,0,0\"><span
> class=\"re-GeographicSearchNext-checkboxItem-literal\">A
> Baña</span><span class=\"re-GeographicSearchNext-checkboxItem-count
> re-GeographicSearchNext-checkboxItem-count-is-child\">17</span></label></div>\n</div>\n<div
> class=\"sui-MoleculeField-inputContainer
> sui-MoleculeField-inputContainer--aligned\"></div>\n</div></div></a>"
>
> [2] "<a class=\"re-GeographicSearchNext-checkboxItem re-GeographicSearchNext-checkboxItem--has-separator\"
> title=\"Negreira\"
> href=\"/es/comprar/viviendas/negreira/todas-las-zonas/l\"><div
> class=\"sui-MoleculeCheckboxField\"><div class=\"sui-MoleculeField
> sui-MoleculeField--inline sui-MoleculeField--inline-reverse
> sui-MoleculeField--fullWidth\">\n<div
> class=\"sui-MoleculeField-labelContainer\">\n<label
> class=\"sui-AtomCheckbox sui-AtomCheckbox--medium\"><input
> type=\"checkbox\" id=\"geoSearch-724,12,15,487,0,15056,0,0,0\"
> name=\"geoSearch-724,12,15,487,0,15056,0,0,0\"
> intermediate=\"\"></label><div
> class=\"sui-MoleculeField-nodeLabelContainer\"><label
> class=\"re-GeographicSearchNext-checkboxItem-label\"
> name=\"geoSearch-724,12,15,487,0,15056,0,0,0\"><span
> class=\"re-GeographicSearchNext-checkboxItem-literal\">Negreira</span><span class=\"re-GeographicSearchNext-checkboxItem-count
> re-GeographicSearchNext-checkboxItem-count-is-child\">52</span></label></div>\n</div>\n<div
> class=\"sui-MoleculeField-inputContainer
> sui-MoleculeField-inputContainer--aligned\"></div>\n</div></div></a>"
下面的代码应该可以做到:
# Extract the numeric geoSearch codes from the `name` attribute of the
# checkbox-item labels.
#
# Only <label> elements that actually carry a `name` attribute are selected,
# so html_attr() yields no NA values. This avoids na.omit(), which would leave
# "na.action" and class "omit" attributes on the resulting character vector.
# The "geoSearch-" prefix is removed only where it starts the value.
html_full_page %>%
  html_nodes(".re-GeographicSearchNext-checkboxItem") %>%
  html_nodes("label[name]") %>%
  html_attr("name") %>%
  sub("^geoSearch-", "", .)
# [1] "724,12,15,487,0,15007,0,0,0" "724,12,15,487,0,15056,0,0,0"
我有一些代码,用来尝试从某个网站收集信息:
我可以使用以下代码连接并读取 HTML 数据(感谢这个链接):
# Load the browser-automation and HTML-parsing packages.
library(RSelenium)
library(rvest)

# Start a Selenium server with a Firefox client and keep a handle on the client.
rD <- rsDriver(browser = "firefox", port = 4536L)
remDr <- rD[["client"]]

# Open the listing page.
url <- "https://www.fotocasa.es/es/comprar/viviendas/a-bana/todas-las-zonas/l"
remDr$navigate(url)

# Accept the cookies.
remDr$findElement(
  using = "xpath",
  value = "/html/body/div[1]/div[4]/div/div/div/footer/div/button[2]"
)$clickElement()

# Click on "Zona".
remDr$findElement(
  using = "xpath",
  value = '//*[@id="App"]/div[2]/div/div[2]/div[3]/div/div[1]/div'
)$clickElement()

# Parse the current page source into an HTML document.
html_full_page <- read_html(remDr$getPageSource()[[1]])
我在尝试收集其中一项数据时卡住了。我运行了以下代码,并想从结果中提取数字部分:
# List every <label> found inside the geographic-search checkbox items.
html_nodes(
  html_nodes(html_full_page, ".re-GeographicSearchNext-checkboxItem"),
  "label"
)
{xml_nodeset (4)}
[1] <label class="sui-AtomCheckbox sui-AtomCheckbox--medium is-checked"><span class="sui-AtomIcon sui-AtomIcon--small sui-AtomIcon--currentColor"><span><svg viewbox="0 0 24 24"><path d="M19.2 5.4a1 1 0 0 1 1.669 1.095L ...
[2] <label class="re-GeographicSearchNext-checkboxItem-label" name="geoSearch-724,12,15,487,0,15007,0,0,0"><span class="re-GeographicSearchNext-checkboxItem-literal">A Baña</span><span class="re-GeographicSearchNext-ch ...
[3] <label class="sui-AtomCheckbox sui-AtomCheckbox--medium"><input type="checkbox" id="geoSearch-724,12,15,487,0,15056,0,0,0" name="geoSearch-724,12,15,487,0,15056,0,0,0" intermediate=""></label>
[4] <label class="re-GeographicSearchNext-checkboxItem-label" name="geoSearch-724,12,15,487,0,15056,0,0,0"><span class="re-GeographicSearchNext-checkboxItem-literal">Negreira</span><span class="re-GeographicSearchNext- ...
即数据中的 geoSearch 部分。
我试图从这部分代码中获取以下内容:
-724,12,15,487,0,15007,0,0,0
-724,12,15,487,0,15056,0,0,0
> [1] "<a class=\"re-GeographicSearchNext-checkboxItem re-GeographicSearchNext-checkboxItem--has-separator\" title=\"A Baña\" > href=\"/es/comprar/viviendas/a-bana/todas-las-zonas/l\"><div > class=\"sui-MoleculeCheckboxField\"><div class=\"sui-MoleculeField > sui-MoleculeField--inline sui-MoleculeField--inline-reverse > sui-MoleculeField--fullWidth\">\n<div > class=\"sui-MoleculeField-labelContainer\">\n<label > class=\"sui-AtomCheckbox sui-AtomCheckbox--medium\"><input > type=\"checkbox\" id=\"geoSearch-724,12,15,487,0,15007,0,0,0\" > name=\"geoSearch-724,12,15,487,0,15007,0,0,0\" > intermediate=\"\"></label><div > class=\"sui-MoleculeField-nodeLabelContainer\"><label > class=\"re-GeographicSearchNext-checkboxItem-label\" > name=\"geoSearch-724,12,15,487,0,15007,0,0,0\"><span > class=\"re-GeographicSearchNext-checkboxItem-literal\">A > Baña</span><span class=\"re-GeographicSearchNext-checkboxItem-count > re-GeographicSearchNext-checkboxItem-count-is-child\">17</span></label></div>\n</div>\n<div > class=\"sui-MoleculeField-inputContainer > sui-MoleculeField-inputContainer--aligned\"></div>\n</div></div></a>" > > [2] "<a class=\"re-GeographicSearchNext-checkboxItem re-GeographicSearchNext-checkboxItem--has-separator\" > title=\"Negreira\" > href=\"/es/comprar/viviendas/negreira/todas-las-zonas/l\"><div > class=\"sui-MoleculeCheckboxField\"><div class=\"sui-MoleculeField > sui-MoleculeField--inline sui-MoleculeField--inline-reverse > sui-MoleculeField--fullWidth\">\n<div > class=\"sui-MoleculeField-labelContainer\">\n<label > class=\"sui-AtomCheckbox sui-AtomCheckbox--medium\"><input > type=\"checkbox\" id=\"geoSearch-724,12,15,487,0,15056,0,0,0\" > name=\"geoSearch-724,12,15,487,0,15056,0,0,0\" > intermediate=\"\"></label><div > class=\"sui-MoleculeField-nodeLabelContainer\"><label > class=\"re-GeographicSearchNext-checkboxItem-label\" > name=\"geoSearch-724,12,15,487,0,15056,0,0,0\"><span > class=\"re-GeographicSearchNext-checkboxItem-literal\">Negreira</span><span 
class=\"re-GeographicSearchNext-checkboxItem-count > re-GeographicSearchNext-checkboxItem-count-is-child\">52</span></label></div>\n</div>\n<div > class=\"sui-MoleculeField-inputContainer > sui-MoleculeField-inputContainer--aligned\"></div>\n</div></div></a>"
下面的代码应该可以做到:
# Extract the numeric geoSearch codes from the `name` attribute of the
# checkbox-item labels.
#
# Only <label> elements that actually carry a `name` attribute are selected,
# so html_attr() yields no NA values. This avoids na.omit(), which would leave
# "na.action" and class "omit" attributes on the resulting character vector.
# The "geoSearch-" prefix is removed only where it starts the value.
html_full_page %>%
  html_nodes(".re-GeographicSearchNext-checkboxItem") %>%
  html_nodes("label[name]") %>%
  html_attr("name") %>%
  sub("^geoSearch-", "", .)
# [1] "724,12,15,487,0,15007,0,0,0" "724,12,15,487,0,15056,0,0,0"