read_html returns NA
I have been looking for a way to scrape a website with R for a month, but with no luck.
Here is what I tried in R, but the result is still #NA. I don't know why.
library(rvest)
url<-read_html("https://www.richbourse.com/common/mouvements/technique/SPHC")
col_name<- url%>%
html_nodes("th")%>%
html_text()
data <- url%>%
html_nodes("td") %>%
html_text()
data
#Na
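As a quick sanity check, we can count the table-related nodes in the downloaded HTML (a minimal sketch; page_static is just a throwaway name). If the counts are all 0, the table is most likely built by JavaScript after the page loads, which read_html alone cannot see.
library(rvest)
#Count table-related nodes in the static HTML; 0 suggests the table is JS-rendered
page_static <- read_html("https://www.richbourse.com/common/mouvements/technique/SPHC")
length(html_elements(page_static, "table"))
length(html_elements(page_static, "th"))
length(html_elements(page_static, "td"))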
When I try the following code:
url<-"https://www.richbourse.com/common/mouvements/technique/SPHC"
read_html(url)
# {html_document}
# <html lang="fr-FR">
# [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<meta name="csrf-para ...
# [2] <body>\n\n\n<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-KK5FBSF"height ...
Thanks for all the suggestions. I tried the approach from another question to get the table, but a few small issues remain.
Code review: only two problems are left:
1- convert the dates to Date format
2- get the volume column
Still thinking about these (a sketch for the date conversion follows the code below).
library("httr")
library(stringr)
library(dplyr)
library(tidyr)
link <- "https://www.richbourse.com/common/mouvements/technique/SPHC"
#Get link
page <- GET(link)
#Data before decoding
page[["content"]]
After decoding, the data becomes a character string:
page <- content(page,as="text",encoding = 'UTF-8')
page
#Split the page source into lines
p=unlist(strsplit(page,split = '\n'))
#Line 641 holds the embedded price data; keep the part after the ':'
p=unlist(strsplit(p[[641]],split = ':'))
p[2]
##Keep a copy as a backup
pp<-p[2]
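Hard-coding line 641 is fragile if the page layout ever changes. As a hedged alternative sketch (the names lines_all and pp_alt are placeholders, and the "[[" pattern is an assumption about how the embedded array looks in the source), the data line can be located by pattern instead:
#Same split as above, but find the data line by pattern rather than by index
lines_all <- unlist(strsplit(page, split = '\n'))
idx <- grep("\\[\\[", lines_all)                  #lines containing the start of a nested array
pp_alt <- sub("^[^:]*:", "", lines_all[idx[1]])   #keep everything after the first ':'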
Clean the data:
#Make sure there are no spaces left in the string
pp1<-gsub(" ", "", pp)
#Split on every "]," so that each price row becomes one element
pp_split1<-strsplit(pp1, split = "],")
##Transform to data frame
pp_split2<- as.data.frame(pp_split1)
#Use a loop to remove the remaining square brackets in each row
for (i in 1 : nrow(pp_split2)){
pp_split2[i,1]<- gsub("\\[|\\]", "", pp_split2[i,1])
}
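Since gsub is vectorised, the loop above can also be written as a single call over the whole column; a minimal equivalent sketch:
#Equivalent vectorised form of the loop (same result, no explicit index)
pp_split2[,1] <- gsub("\\[|\\]", "", pp_split2[,1])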
#Now split the single column into 5 columns
#And convert the numbers to numeric
colnames(pp_split2)<-c("unique")
pp_split2<-separate(pp_split2, col = unique, into = c("Date", "Open", "High", "Low", "Close"), sep = ",")
pp_split2$Open<-as.numeric(pp_split2$Open)
pp_split2$High<-as.numeric(pp_split2$High)
pp_split2$Low<-as.numeric(pp_split2$Low)
pp_split2$Close<-as.numeric(pp_split2$Close)
### If we want we can transform it to tibble
pp_split3<- as_tibble(pp_split2)
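For the first remaining problem (getting the Date column into Date format), here is a hedged sketch. It assumes the dates arrive either as epoch milliseconds (common for chart data embedded in a page) or as a plain text date such as "12/05/2021"; adjust the format string to whatever pp_split3$Date actually contains. The volume column is not covered here, since it depends on where the page embeds it.
if (all(grepl("^[0-9]+$", pp_split3$Date))) {
  #Epoch milliseconds -> Date
  pp_split3$Date <- as.Date(as.POSIXct(as.numeric(pp_split3$Date) / 1000,
                                       origin = "1970-01-01", tz = "UTC"))
} else {
  #Plain text date; change "%d/%m/%Y" if the real format differs
  pp_split3$Date <- as.Date(pp_split3$Date, format = "%d/%m/%Y")
}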
The attached images show what the table looks like on the website. Figure 1 shows where to click to open the data table.
Figure 2 shows the table itself. After scraping, I lose the volume column and all the dates are turned into character.
There do not seem to be any nodes with those names. If you use html_elements (the help page says it supersedes html_nodes) and pick a name that does exist in the page, for example "li", you get results:
url<-read_html("https://www.richbourse.com/common/mouvements/technique/SPHC")
col_name<- url%>%
html_elements("li")%>%
html_text()
col_name
#------console output-----------
[1] ""
[2] ""
[3] ""
[4] ""
[5] " Newsletter"
[6] " Contact"
[7] " Inscription"
[8] " Connexion"
[9] "Marchés Publications officielles\nActualités\nActions\nIndices\nObligations\nPalmarès hebdomadaires"
[10] "Publications officielles"
#---- snipped the remaining 76 items-------
It also works for the name "option":
col_name<- url%>%
html_elements("option")%>%
html_text()
col_name
#------------------------
[1] " -- action --" "PALM COTE D'IVOIRE"
[3] "SAPH COTE D'IVOIRE" "SICOR COTE D'IVOIRE"
[5] "SOGB COTE D'IVOIRE" "SUCRIVOIRE COTE D'IVOIRE"
[7] "SETAO COTE D'IVOIRE" "BERNABE COTE D'IVOIRE"
[9] "CFAO MOTORS COTE D'IVOIRE" "SERVAIR ABIDJAN COTE D'IVOIRE"
#---- snipped remaining 39 items-------
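To double-check which element names the static HTML actually contains, a small diagnostic sketch with xml2 tallies every tag in the downloaded document; if "table", "th" and "td" do not appear, that is why html_nodes("td") came back empty:
library(xml2)
doc <- read_html("https://www.richbourse.com/common/mouvements/technique/SPHC")
#Count every distinct tag name present in the static HTML
sort(table(xml_name(xml_find_all(doc, "//*"))), decreasing = TRUE)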