如何使用 Selenium 抓取交互式网页
How to web scrape an interactive web page with RSelenium
我想抓取此网页上的所有房源信息。
当我运行下面的代码时,只得到了页面上 1 个房源的详细信息。
library(tidyverse)
library(rvest)
library(RSelenium)
library(stringr)
# Launch a Selenium-controlled Chrome session and grab its client handle.
rD <- rsDriver(
  browser = "chrome",
  port = 4234L,
  chromever = "99.0.4844.51"
)
remDr <- rD[["client"]]

# Open page 3 of the ImmobilienScout24 result list (houses for sale, Munich)
# and parse the HTML that the browser rendered.
goTo <- remDr$navigate("https://www.immobilienscout24.de/Suche/de/bayern/muenchen/haus-kaufen?pagenumber=3")
page_source <- remDr$getPageSource(goTo)[[1]]
Lego <- read_html(page_source)

# html_element() yields only the FIRST node matching each selector, so every
# value below describes a single listing.
rooms <- html_text(html_element(Lego, ".iLQwFF+ .iLQwFF .jXuiQ"))
address <- html_text(html_element(Lego, "#skip-to-resultlist .hdZkVR"))
cost <- html_text(
  html_element(
    Lego,
    ".result-list-entry__primary-criterion:nth-child(1) .font-highlight"
  )
)
surface <- html_text(
  html_element(
    Lego,
    ".result-list-entry__primary-criterion:nth-child(2) .font-highlight"
  )
)
href <- html_attr(
  html_element(Lego, "a.result-list-entry__brand-title-container "),
  "href"
)

# Prefix the site root onto the extracted href to form the listing link.
apt_link <- paste0("https://www.immobilienscout24.de", href)

# Collect the scraped fields into one data frame.
Munich_flat <- data.frame(
  apt_link = apt_link,
  rooms = rooms,
  surface = surface,
  cost = cost,
  address = address
)
结果是这样的。
我该如何抓取此页面上的所有房源?
提前谢谢你。
您唯一需要做的就是将 html_element 更改为 html_elements,因为前者只会获得一个节点,而后者会获得所有节点。
# Parse the HTML currently rendered in the Selenium browser session.
Lego <- read_html(remDr$getPageSource(goTo)[[1]])

# html_elements() (plural) returns every node matching the selector, so each
# vector below holds one entry per match instead of only the first one.
rooms <- Lego %>%
  html_elements('div > div.grid-item.result-list-entry__data-container > div > div.result-list-entry__criteria > a > div > dl:nth-child(3) > dd ') %>%
  html_text()
address <- Lego %>%
  html_elements('.result-list-entry__address') %>%
  html_text()
cost <- Lego %>%
  html_elements(".result-list-entry__primary-criterion:nth-child(1) .font-highlight") %>%
  html_text()
surface <- Lego %>%
  html_elements(".result-list-entry__primary-criterion:nth-child(2) .font-highlight") %>%
  html_text()
href <- Lego %>%
  html_elements("a.result-list-entry__brand-title-container ") %>%
  html_attr('href')

# Rebuild the links from the full href vector. Without this step apt_link is
# either undefined or a stale length-1 value from an earlier single-node run,
# which data.frame() would silently recycle into every row.
apt_link <- paste0("https://www.immobilienscout24.de", href)

# data.frame() needs compatible column lengths; if a selector matches a
# different number of nodes than the others, it stops with an error.
Munich_flat <- data.frame(apt_link, rooms, surface, cost, address)
head(Munich_flat)
apt_link rooms surface cost address
1 https://www.immobilienscout24.de/expose/132791623 37 Zi.37 1.023 m² 10.600.000 € Schlotthauerstraße xx, Untere Au, München
2 https://www.immobilienscout24.de/expose/132872500 5 Zi.5 119,31 m² 1.590.000 € Neupasing, München
3 https://www.immobilienscout24.de/expose/132882219 4 Zi.4 148 m² 1.150.000 € Waldtrudering, München
4 https://www.immobilienscout24.de/expose/132647642 5 Zi.5 148 m² 1.290.000 € Lerchenau-West, München
5 https://www.immobilienscout24.de/expose/132538727 10 Zi.10 840 m² 8.780.000 € Am alten südlichen Friedhof, München
我想抓取此网页上的所有房源信息。
当我运行下面的代码时,只得到了页面上 1 个房源的详细信息。
library(tidyverse)
library(rvest)
library(RSelenium)
library(stringr)
# Launch a Selenium-controlled Chrome session and grab its client handle.
rD <- rsDriver(
  browser = "chrome",
  port = 4234L,
  chromever = "99.0.4844.51"
)
remDr <- rD[["client"]]

# Open page 3 of the ImmobilienScout24 result list (houses for sale, Munich)
# and parse the HTML that the browser rendered.
goTo <- remDr$navigate("https://www.immobilienscout24.de/Suche/de/bayern/muenchen/haus-kaufen?pagenumber=3")
page_source <- remDr$getPageSource(goTo)[[1]]
Lego <- read_html(page_source)

# html_element() yields only the FIRST node matching each selector, so every
# value below describes a single listing.
rooms <- html_text(html_element(Lego, ".iLQwFF+ .iLQwFF .jXuiQ"))
address <- html_text(html_element(Lego, "#skip-to-resultlist .hdZkVR"))
cost <- html_text(
  html_element(
    Lego,
    ".result-list-entry__primary-criterion:nth-child(1) .font-highlight"
  )
)
surface <- html_text(
  html_element(
    Lego,
    ".result-list-entry__primary-criterion:nth-child(2) .font-highlight"
  )
)
href <- html_attr(
  html_element(Lego, "a.result-list-entry__brand-title-container "),
  "href"
)

# Prefix the site root onto the extracted href to form the listing link.
apt_link <- paste0("https://www.immobilienscout24.de", href)

# Collect the scraped fields into one data frame.
Munich_flat <- data.frame(
  apt_link = apt_link,
  rooms = rooms,
  surface = surface,
  cost = cost,
  address = address
)
结果是这样的。
我该如何抓取此页面上的所有房源?提前谢谢你。
您唯一需要做的就是将 html_element 更改为 html_elements,因为前者只会获得一个节点,而后者会获得所有节点。
# Parse the HTML currently rendered in the Selenium browser session.
Lego <- read_html(remDr$getPageSource(goTo)[[1]])

# html_elements() (plural) returns every node matching the selector, so each
# vector below holds one entry per match instead of only the first one.
rooms <- Lego %>%
  html_elements('div > div.grid-item.result-list-entry__data-container > div > div.result-list-entry__criteria > a > div > dl:nth-child(3) > dd ') %>%
  html_text()
address <- Lego %>%
  html_elements('.result-list-entry__address') %>%
  html_text()
cost <- Lego %>%
  html_elements(".result-list-entry__primary-criterion:nth-child(1) .font-highlight") %>%
  html_text()
surface <- Lego %>%
  html_elements(".result-list-entry__primary-criterion:nth-child(2) .font-highlight") %>%
  html_text()
href <- Lego %>%
  html_elements("a.result-list-entry__brand-title-container ") %>%
  html_attr('href')

# Rebuild the links from the full href vector. Without this step apt_link is
# either undefined or a stale length-1 value from an earlier single-node run,
# which data.frame() would silently recycle into every row.
apt_link <- paste0("https://www.immobilienscout24.de", href)

# data.frame() needs compatible column lengths; if a selector matches a
# different number of nodes than the others, it stops with an error.
Munich_flat <- data.frame(apt_link, rooms, surface, cost, address)
head(Munich_flat)
apt_link rooms surface cost address
1 https://www.immobilienscout24.de/expose/132791623 37 Zi.37 1.023 m² 10.600.000 € Schlotthauerstraße xx, Untere Au, München
2 https://www.immobilienscout24.de/expose/132872500 5 Zi.5 119,31 m² 1.590.000 € Neupasing, München
3 https://www.immobilienscout24.de/expose/132882219 4 Zi.4 148 m² 1.150.000 € Waldtrudering, München
4 https://www.immobilienscout24.de/expose/132647642 5 Zi.5 148 m² 1.290.000 € Lerchenau-West, München
5 https://www.immobilienscout24.de/expose/132538727 10 Zi.10 840 m² 8.780.000 € Am alten südlichen Friedhof, München