read_html returns NA
I have been looking for a way to scrape a website with R for a month, but with no luck.
Here is what I tried in R, but the result is still #NA. I don't know why.
library(rvest)
url<-read_html("https://www.richbourse.com/common/mouvements/technique/SPHC")
col_name<- url%>%
html_nodes("th")%>%
html_text()
data <- url%>%
html_nodes("td") %>%
html_text()
data
#Na
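As a quick sanity check, we can count the table-related nodes in the downloaded HTML (a minimal sketch; page_static is just a throwaway name). If the counts are all 0, the table is most likely built by JavaScript after the page loads, which read_html alone cannot see.
library(rvest)
#Count table-related nodes in the static HTML; 0 suggests the table is JS-rendered
page_static <- read_html("https://www.richbourse.com/common/mouvements/technique/SPHC")
length(html_elements(page_static, "table"))
length(html_elements(page_static, "th"))
length(html_elements(page_static, "td"))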
When I try the following code:
url<-"https://www.richbourse.com/common/mouvements/technique/SPHC"
read_html(url)
# {html_document}
# <html lang="fr-FR">
# [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<meta name="csrf-para ...
# [2] <body>\n\n\n<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-KK5FBSF"height ...
Thanks for all the suggestions. I tried the approach from another question to get the table, but a few small issues remain.
Code review: only two problems are left:
1- convert the dates to Date format
2- get the volume column
Still thinking about these (a sketch for the date conversion follows the code below).
library("httr")
library(stringr)
library(dplyr)
library(tidyr)
link <- "https://www.richbourse.com/common/mouvements/technique/SPHC"
#Get link
page <- GET(link)
#Data before decoding
page[["content"]]
After decoding, the data becomes a character string:
page <- content(page,as="text",encoding = 'UTF-8')
page
#Split the page source into lines
p=unlist(strsplit(page,split = '\n'))
#Line 641 holds the embedded price data; keep the part after the ':'
p=unlist(strsplit(p[[641]],split = ':'))
p[2]
##Keep a copy as a backup
pp<-p[2]
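Hard-coding line 641 is fragile if the page layout ever changes. As a hedged alternative sketch (the names lines_all and pp_alt are placeholders, and the "[[" pattern is an assumption about how the embedded array looks in the source), the data line can be located by pattern instead:
#Same split as above, but find the data line by pattern rather than by index
lines_all <- unlist(strsplit(page, split = '\n'))
idx <- grep("\\[\\[", lines_all)                  #lines containing the start of a nested array
pp_alt <- sub("^[^:]*:", "", lines_all[idx[1]])   #keep everything after the first ':'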
Clean the data:
#Make sure there are no spaces left in the string
pp1<-gsub(" ", "", pp)
#Split on every "]," so that each price row becomes one element
pp_split1<-strsplit(pp1, split = "],")
##Transform to data frame
pp_split2<- as.data.frame(pp_split1)
#Use a loop to remove the remaining square brackets in each row
for (i in 1 : nrow(pp_split2)){
pp_split2[i,1]<- gsub("\\[|\\]", "", pp_split2[i,1])
}
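Since gsub is vectorised, the loop above can also be written as a single call over the whole column; a minimal equivalent sketch:
#Equivalent vectorised form of the loop (same result, no explicit index)
pp_split2[,1] <- gsub("\\[|\\]", "", pp_split2[,1])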
#Now split the single column into 5 columns
#And convert the numbers to numeric
colnames(pp_split2)<-c("unique")
pp_split2<-separate(pp_split2, col = unique, into = c("Date", "Open", "High", "Low", "Close"), sep = ",")
pp_split2$Open<-as.numeric(pp_split2$Open)
pp_split2$High<-as.numeric(pp_split2$High)
pp_split2$Low<-as.numeric(pp_split2$Low)
pp_split2$Close<-as.numeric(pp_split2$Close)
### If we want we can transform it to tibble
pp_split3<- as_tibble(pp_split2)
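For the first remaining problem (getting the Date column into Date format), here is a hedged sketch. It assumes the dates arrive either as epoch milliseconds (common for chart data embedded in a page) or as a plain text date such as "12/05/2021"; adjust the format string to whatever pp_split3$Date actually contains. The volume column is not covered here, since it depends on where the page embeds it.
if (all(grepl("^[0-9]+$", pp_split3$Date))) {
  #Epoch milliseconds -> Date
  pp_split3$Date <- as.Date(as.POSIXct(as.numeric(pp_split3$Date) / 1000,
                                       origin = "1970-01-01", tz = "UTC"))
} else {
  #Plain text date; change "%d/%m/%Y" if the real format differs
  pp_split3$Date <- as.Date(pp_split3$Date, format = "%d/%m/%Y")
}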
The attached images show what the table looks like on the website. Figure 1 shows where to click to open the data table.
Figure 2 shows the table itself. After scraping, I lose the volume column and all the dates are turned into character.
There do not seem to be any nodes with those names. If you use html_elements (the help page says it supersedes html_nodes) and pick a name that does exist in the page, for example "li", you get results:
url<-read_html("https://www.richbourse.com/common/mouvements/technique/SPHC")
col_name<- url%>%
html_elements("li")%>%
html_text()
col_name
#------console output-----------
[1] ""
[2] ""
[3] ""
[4] ""
[5] " Newsletter"
[6] " Contact"
[7] " Inscription"
[8] " Connexion"
[9] "Marchés Publications officielles\nActualités\nActions\nIndices\nObligations\nPalmarès hebdomadaires"
[10] "Publications officielles"
#---- snipped the remaining 76 items-------
It also works for the name "option":
col_name<- url%>%
html_elements("option")%>%
html_text()
col_name
#------------------------
[1] " -- action --" "PALM COTE D'IVOIRE"
[3] "SAPH COTE D'IVOIRE" "SICOR COTE D'IVOIRE"
[5] "SOGB COTE D'IVOIRE" "SUCRIVOIRE COTE D'IVOIRE"
[7] "SETAO COTE D'IVOIRE" "BERNABE COTE D'IVOIRE"
[9] "CFAO MOTORS COTE D'IVOIRE" "SERVAIR ABIDJAN COTE D'IVOIRE"
#---- snipped remaining 39 items-------
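To double-check which element names the static HTML actually contains, a small diagnostic sketch with xml2 tallies every tag in the downloaded document; if "table", "th" and "td" do not appear, that is why html_nodes("td") came back empty:
library(xml2)
doc <- read_html("https://www.richbourse.com/common/mouvements/technique/SPHC")
#Count every distinct tag name present in the static HTML
sort(table(xml_name(xml_find_all(doc, "//*"))), decreasing = TRUE)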