使用 RedditExtractoR 在 R 中抓取 Reddit

Question

我正在尝试抓取 Reddit 数据（我是网络抓取的新手，在 R 方面还算不错）。 RedditExtractor 包有一个很好的功能，可以完成我需要的 90%，但它没有获取与发表评论的用户相关的 "flair"。我正在尝试使用包的功能，但我有点不知所措。

有一些 Reddit 主题的例子 here。我想我正在寻找 XML:

这些位中的文本

<span class="flair flair-orthodox" title="Eastern Orthodox">Eastern Orthodox</span>

我已经粘贴了 reddit_content() 函数的代码以及我认为应该放置额外代码的注释，但我不太确定从这里到哪里去。目前，函数 returns 是一个包含评论、时间戳、用户等列的数据框。如果存在，我还需要它来生成具有用户风格的评论。提前致谢！

redd_content_flair <- function (URL, wait_time = 2) 
{
  if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
    stop("invalid URL parameter")
  }
  GetAttribute = function(node, feature) {
    Attribute = node$data[[feature]]
    replies = node$data$replies
    reply.nodes = if (is.list(replies)) 
      replies$data$children
    else NULL
    return(list(Attribute, lapply(reply.nodes, function(x) {
      GetAttribute(x, feature)
    })))
  }
  get.structure = function(node, depth = 0) {
    if (is.null(node)) {
      return(list())
    }
    filter = is.null(node$data$author)
    replies = node$data$replies
    reply.nodes = if (is.list(replies)) 
      replies$data$children
    else NULL
    return(list(paste0(filter, " ", depth), lapply(1:length(reply.nodes), 
                                                   function(x) get.structure(reply.nodes[[x]], paste0(depth, 
                                                                                                      "_", x)))))
  }
  data_extract = data.frame(id = numeric(), structure = character(), 
                            post_date = as.Date(character()), comm_date = as.Date(character()), 
                            num_comments = numeric(), subreddit = character(), upvote_prop = numeric(), 
                            post_score = numeric(), author = character(), user = character(), 
                            comment_score = numeric(), controversiality = numeric(), 
                            comment = character(), title = character(), post_text = character(), 
                            link = character(), domain = character(), 
                            #flair = character(),
                            URL = character())
  pb = utils::txtProgressBar(min = 0, max = length(URL), style = 3)
  for (i in seq(URL)) {
    if (!grepl("^https?://(.*)", URL[i])) 
      URL[i] = paste0("https://www.", gsub("^.*(reddit\..*$)", 
                                           "\1", URL[i]))
    if (!grepl("\?ref=search_posts$", URL[i])) 
      URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
    X = paste0(gsub("\?ref=search_posts$", "", URL[i]), 
               ".json?limit=500")
    raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, warn = FALSE)), 
                        error = function(e) NULL)
    if (is.null(raw_data)) {
      Sys.sleep(min(1, wait_time))
      raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, 
                                                      warn = FALSE)), error = function(e) NULL)
    }
    if (is.null(raw_data) == FALSE) {
      meta.node = raw_data[[1]]$data$children[[1]]$data
      main.node = raw_data[[2]]$data$children
      if (min(length(meta.node), length(main.node)) > 0) {
        structure = unlist(lapply(1:length(main.node), 
                                  function(x) get.structure(main.node[[x]], x)))
        TEMP = data.frame(id = NA, structure = gsub("FALSE ", 
                                                    "", structure[!grepl("TRUE", structure)]), 
                          post_date = format(as.Date(as.POSIXct(meta.node$created_utc, 
                                                                origin = "1970-01-01")), "%d-%m-%y"),
                          comm_date = format(as.Date(as.POSIXct(unlist(lapply(main.node, 
                                                                                                                                                          function(x) {
                                                                                                                                                            GetAttribute(x, "created_utc")
                                                                                                                                                          })), origin = "1970-01-01")), "%d-%m-%y"), 
                          num_comments = meta.node$num_comments,
                          subreddit = ifelse(is.null(meta.node$subreddit), 
                                                                                    "UNKNOWN", meta.node$subreddit), upvote_prop = meta.node$upvote_ratio, 
                          post_score = meta.node$score, author = meta.node$author, 
                          user = unlist(lapply(main.node, function(x) {
                            GetAttribute(x, "author")
                          })),
                          comment_score = unlist(lapply(main.node, 
                                                             function(x) {
                                                               GetAttribute(x, "score")
                                                             })),
                          controversiality = unlist(lapply(main.node, 
                                                                                                   function(x) {
                                                                                                     GetAttribute(x, "controversiality")
                                                                                                   })),
                          comment = unlist(lapply(main.node, function(x) {
                                                                                                     GetAttribute(x, "body")
                                                                                                   })),
                          title = meta.node$title, post_text = meta.node$selftext, 
                          link = meta.node$url, domain = meta.node$domain, 
                          #flair = unlist(lapply(main.node, function(x) {GetAttribute(x, "flair")})),
                          URL = URL[i], stringsAsFactors = FALSE)
        TEMP$id = 1:nrow(TEMP)
        if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0) 
          data_extract = rbind(TEMP, data_extract)
        else print(paste("missed", i, ":", URL[i]))
      }
    }
    utils::setTxtProgressBar(pb, i)
    Sys.sleep(min(2, wait_time))
  }
  close(pb)
  return(data_extract)
}

编辑：我还想为 "parent" 评论获取 URL，它看起来像

这样的标签

<p class="parent"><a name="d3t1p1r"></a></p>

Answer 1

我想出了一个临时解决方案。为了 post 诚实，我将 post 放在这里。问题是未按原样设置函数来处理 NULL JSON 值。这是一个快速修复。

大约中间有两条 raw_data = 行。您需要将 nullValue = 'your null text' 参数添加到 fromJSON 函数。然后，您可以使用与其他地方相同的构造，向空数据框和 TEMP 数据框添加您想要的任何元数据。在下面的函数中，我添加了用户的风格文本和父评论的 ID。

（请注意，不稳定的缩进来自原始函数...我将其保留原样以防止意外更改某些内容。）

reddit.fixed <- function (URL, wait_time = 2) 
{
  if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
    stop("invalid URL parameter")
  }
  GetAttribute = function(node, feature) {
    Attribute = node$data[[feature]]
    replies = node$data$replies
    reply.nodes = if (is.list(replies)) 
      replies$data$children
    else NULL
    return(list(Attribute, lapply(reply.nodes, function(x) {
      GetAttribute(x, feature)
    })))
  }
  get.structure = function(node, depth = 0) {
    if (is.null(node)) {
      return(list())
    }
    filter = is.null(node$data$author)
    replies = node$data$replies
    reply.nodes = if (is.list(replies)) 
      replies$data$children
    else NULL
    return(list(paste0(filter, " ", depth), lapply(1:length(reply.nodes), 
                                                   function(x) get.structure(reply.nodes[[x]], paste0(depth, 
                                                                                                      "_", x)))))
  }
  data_extract = data.frame(id = numeric(), structure = character(), 
                            post_date = as.Date(character()), comm_date = as.Date(character()), 
                            num_comments = numeric(), subreddit = character(), upvote_prop = numeric(), 
                            post_score = numeric(), author = character(), user = character(), 
                            comment_score = numeric(), controversiality = numeric(), 
                            comment = character(), title = character(), post_text = character(), 
                            link = character(), domain = character(), URL = character(), flair = character(), parent = character())
  pb = utils::txtProgressBar(min = 0, max = length(URL), style = 3)
  for (i in seq(URL)) {
    if (!grepl("^https?://(.*)", URL[i])) 
      URL[i] = paste0("https://www.", gsub("^.*(reddit\..*$)", 
                                           "\1", URL[i]))
    if (!grepl("\?ref=search_posts$", URL[i])) 
      URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
    X = paste0(gsub("\?ref=search_posts$", "", URL[i]), 
               ".json?limit=500")
    raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, warn = FALSE), nullValue = "none"), 
                        error = function(e) NULL)
    if (is.null(raw_data)) {
      Sys.sleep(min(1, wait_time))
      raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, 
                                                      warn = FALSE), nullValue = "none"), error = function(e) NULL)
    }
    if (is.null(raw_data) == FALSE) {
      meta.node = raw_data[[1]]$data$children[[1]]$data
      main.node = raw_data[[2]]$data$children
      if (min(length(meta.node), length(main.node)) > 0) {
        structure = unlist(lapply(1:length(main.node), 
                                  function(x) get.structure(main.node[[x]], x)))
        TEMP = data.frame(id = NA, structure = gsub("FALSE ", 
                                                    "", structure[!grepl("TRUE", structure)]), 
                          post_date = format(as.Date(as.POSIXct(meta.node$created_utc, 
                                                                origin = "1970-01-01")), "%d-%m-%y"), comm_date = format(as.Date(as.POSIXct(unlist(lapply(main.node, 
                                                                                                                                                          function(x) {
                                                                                                                                                            GetAttribute(x, "created_utc")
                                                                                                                                                          })), origin = "1970-01-01")), "%d-%m-%y"), 
                          num_comments = meta.node$num_comments, subreddit = ifelse(is.null(meta.node$subreddit), 
                                                                                    "UNKNOWN", meta.node$subreddit), upvote_prop = meta.node$upvote_ratio, 
                          post_score = meta.node$score, author = meta.node$author, 
                          user = unlist(lapply(main.node, function(x) {
                            GetAttribute(x, "author")
                          })), comment_score = unlist(lapply(main.node, 
                                                             function(x) {
                                                               GetAttribute(x, "score")
                                                             })), controversiality = unlist(lapply(main.node, 
                                                                                                   function(x) {
                                                                                                     GetAttribute(x, "controversiality")
                                                                                                   })), comment = unlist(lapply(main.node, function(x) {
                                                                                                     GetAttribute(x, "body")
                                                                                                   })), title = meta.node$title, post_text = meta.node$selftext, 
                          link = meta.node$url, domain = meta.node$domain, 
                          URL = URL[i], 
                          flair = unlist(lapply(main.node, function(x) {
                            GetAttribute(x, "author_flair_text")
                          })),
                          parent = unlist(lapply(main.node, function(x) {GetAttribute(x, "parent_id")})),
                          stringsAsFactors = FALSE)
        TEMP$id = 1:nrow(TEMP)
        if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0) 
          data_extract = rbind(TEMP, data_extract)
        else print(paste("missed", i, ":", URL[i]))
      }
    }
    utils::setTxtProgressBar(pb, i)
    Sys.sleep(min(2, wait_time))
  }
  close(pb)
  return(data_extract)
}

使用 RedditExtractoR 在 R 中抓取 Reddit

Scraping Reddit in R with RedditExtractoR

r

reddit

web-scraping