文本挖掘——从网站中提取数据
Text mining - Data extraction from website
谁能帮忙从下方链接中提取评论及其位置?我已成功编写代码,但无法将这两项数据(评论及其位置)导出到 Excel 的两个单独列中。我是 R 的新手。
我用过的代码:
# Scrape each post's "Location" field and message text from a forum thread
# and write them side by side to a CSV file.

# Download the raw HTML, one vector element per line. The URL has to be a
# single unbroken string: a line break inside the literal becomes part of
# the URL and the request fails.
thepage <- readLines("http://www.subaruoutback.org/forums/104-gen-4-2010-2014/22586-440-watt-9-speaker-harman-kardon-premium-audio-system-sound-quality.html")

# Location ----
grep("Location", thepage)
thepage[2738]
# Four leading tabs, then the location text captured up to the closing tag.
mypattern <- "\t\t\t\t<div>Location: ([^<]*)</div>"
# NOTE(review): 2738 is a hard-coded line offset taken from one download of
# the page; confirm it still points at the first post before relying on it.
datalines <- grep(mypattern, thepage[2738:length(thepage)], value = TRUE)
datalines
# Cut the matched substring(s) out of `s` using the positions from gregexpr().
getexpr <- function(s, g) substring(s, g, g + attr(g, "match.length") - 1)
gg <- gregexpr(mypattern, datalines)
matches <- mapply(getexpr, datalines, gg)
# "\\1" is the backreference to the capture group. A single-backslash "\1"
# is the control character U+0001, which would replace every match with junk.
result <- gsub(mypattern, "\\1", matches)
result
names(result) <- NULL
result[1:10]
class(result)

# Review data ----
library(XML)
library(httr)
raw2 <- htmlTreeParse(thepage, useInternalNodes = TRUE)
# The XML package function is xpathApply(), and the XPath 1.0 function is
# spelled starts-with().
data <- xpathApply(raw2, "//div[starts-with(@id,'post_message')]", xmlValue)
data <- unlist(data)
data
class(data)

# data.frame() silently recycles vectors whose lengths divide evenly, which
# would pair locations with the wrong posts; fail loudly instead.
if (length(result) != length(data)) {
  stop(
    "Found ", length(result), " locations but ", length(data),
    " posts; they cannot be paired row by row.",
    call. = FALSE
  )
}
df <- data.frame(result, data)
write.csv(df, "D:/Important files/R Practice/texts/output9.csv")
My Expected output:![My expected output format][1]
这是我使用 rvest 和 stringr 得出的结果:
# Scrape post texts and poster locations from a forum thread into a data
# frame with one row per post.
library("rvest")
library("stringr")

url <- "http://www.subaruoutback.org/forums/104-gen-4-2010-2014/22586-440-watt-9-speaker-harman-kardon-premium-audio-system-sound-quality.html"

# Fetch and parse the page once and reuse it for both extractions.
# read_html() replaces html(), which was deprecated and later removed
# from rvest.
page <- read_html(url)

# Post bodies: strip line breaks and tabs, then trim surrounding whitespace.
text <- page %>%
  html_nodes(".main-column-text") %>%
  html_text() %>%
  str_replace_all("\r|\n|\t", "") %>%
  str_trim()

# Poster sidebar: keep only the "Location:..." fragment (NA when absent).
loc <- page %>%
  html_nodes(".main-column-picture") %>%
  html_text() %>%
  str_extract("Location:[(a-z),:; -(A-Z)]+")

df <- data.frame(text, loc, stringsAsFactors = FALSE)

# Drop rows that are page chrome rather than real posts. Filter on the data
# frame's own column so the subsetting does not depend on the global `text`.
df <- df[df$text != "QuoteQuick Reply", ]
df <- df[!grepl("^Go to first new post", df$text), ]
谁能帮忙从下方链接中提取评论及其位置?我已成功编写代码,但无法将这两项数据(评论及其位置)导出到 Excel 的两个单独列中。我是 R 的新手。
我用过的代码:
# Scrape each post's "Location" field and message text from a forum thread
# and write them side by side to a CSV file.

# Download the raw HTML, one vector element per line. The URL has to be a
# single unbroken string: a line break inside the literal becomes part of
# the URL and the request fails.
thepage <- readLines("http://www.subaruoutback.org/forums/104-gen-4-2010-2014/22586-440-watt-9-speaker-harman-kardon-premium-audio-system-sound-quality.html")

# Location ----
grep("Location", thepage)
thepage[2738]
# Four leading tabs, then the location text captured up to the closing tag.
mypattern <- "\t\t\t\t<div>Location: ([^<]*)</div>"
# NOTE(review): 2738 is a hard-coded line offset taken from one download of
# the page; confirm it still points at the first post before relying on it.
datalines <- grep(mypattern, thepage[2738:length(thepage)], value = TRUE)
datalines
# Cut the matched substring(s) out of `s` using the positions from gregexpr().
getexpr <- function(s, g) substring(s, g, g + attr(g, "match.length") - 1)
gg <- gregexpr(mypattern, datalines)
matches <- mapply(getexpr, datalines, gg)
# "\\1" is the backreference to the capture group. A single-backslash "\1"
# is the control character U+0001, which would replace every match with junk.
result <- gsub(mypattern, "\\1", matches)
result
names(result) <- NULL
result[1:10]
class(result)

# Review data ----
library(XML)
library(httr)
raw2 <- htmlTreeParse(thepage, useInternalNodes = TRUE)
# The XML package function is xpathApply(), and the XPath 1.0 function is
# spelled starts-with().
data <- xpathApply(raw2, "//div[starts-with(@id,'post_message')]", xmlValue)
data <- unlist(data)
data
class(data)

# data.frame() silently recycles vectors whose lengths divide evenly, which
# would pair locations with the wrong posts; fail loudly instead.
if (length(result) != length(data)) {
  stop(
    "Found ", length(result), " locations but ", length(data),
    " posts; they cannot be paired row by row.",
    call. = FALSE
  )
}
df <- data.frame(result, data)
write.csv(df, "D:/Important files/R Practice/texts/output9.csv")
My Expected output:![My expected output format][1]
这是我使用 rvest 和 stringr 得出的结果:
# Scrape post texts and poster locations from a forum thread into a data
# frame with one row per post.
library("rvest")
library("stringr")

url <- "http://www.subaruoutback.org/forums/104-gen-4-2010-2014/22586-440-watt-9-speaker-harman-kardon-premium-audio-system-sound-quality.html"

# Fetch and parse the page once and reuse it for both extractions.
# read_html() replaces html(), which was deprecated and later removed
# from rvest.
page <- read_html(url)

# Post bodies: strip line breaks and tabs, then trim surrounding whitespace.
text <- page %>%
  html_nodes(".main-column-text") %>%
  html_text() %>%
  str_replace_all("\r|\n|\t", "") %>%
  str_trim()

# Poster sidebar: keep only the "Location:..." fragment (NA when absent).
loc <- page %>%
  html_nodes(".main-column-picture") %>%
  html_text() %>%
  str_extract("Location:[(a-z),:; -(A-Z)]+")

df <- data.frame(text, loc, stringsAsFactors = FALSE)

# Drop rows that are page chrome rather than real posts. Filter on the data
# frame's own column so the subsetting does not depend on the global `text`.
df <- df[df$text != "QuoteQuick Reply", ]
df <- df[!grepl("^Go to first new post", df$text), ]