Memory leaks in R?

I am trying to parse some sites using the RCurl and XML packages. According to Task Manager, the rsession process is using about 12 GB after roughly an hour of execution. After calling

rm(list = ls())
gc()

all of my memory is still being used by rsession. What is the problem? Here is the code:

  library(XML)
  library(RCurl)
  rm(list = ls())
  if (file.exists('cookies.txt')){
        file.remove('cookies.txt')
  }
  if (file.exists('words.csv')){
        file.remove('words.csv')
  }

  doGet <- function(url, encode = 'windows-1251') {
        html <- tryCatch(
              {
                    getURL(url, curl=curl, .encoding = encode)
              }, warning = function(w) {
                    print(paste('warning: ', url, w))
              }, error = function(e) {
                    print(paste('error: ', url, e))
              }, finally = {}
        )
        write(x = html, file = '~tmp.html', append = F)
        htmlTreeParse(file = '~tmp.html', useInternalNodes = T, encoding = encode)
  }

  makeURL <- function(url) {
        paste(url_base, url, sep = "")
  }

  parse.morph <- function(n){
        val <- xmlValue(n, encoding = 'UTF-8')
        res <- tolower(gsub(" |-", "", strsplit(val, ':')[[1]][[2]]))
        rm(val)
        res
  }

  morphToList <- function(morphs) {
        print(paste(morphs, collapse=''))
        res <- list()
        res$prefix = unlist(strsplit(morphs[1], split = ';'))
        res$base =   unlist(strsplit(morphs[2], split = ';'))
        res$suffix = unlist(strsplit(morphs[3], split = ';'))
        res$ending = unlist(strsplit(morphs[4], split = ';'))
        res
  }

  indexOf <- function(val, str) {
        grep(val, strsplit(str, "")[[1]])
  }

  parse.word <- function(page) {
        xpathSApply(page, "//div[@class='word-article']/ul/li", parse.morph) 
  }

  append <- function(df, m) {
        tmp <- data.frame(p1 =  m$prefix[3], p2 =  m$prefix[2], p3 =  m$prefix[1], 
                          b1 = m$base[1],   b2 = m$base[2],  
                          s1 = m$suffix[1], s2 = m$suffix[2], s3 = m$suffix[3], s4 = m$suffix[4], 
                          e1 = m$ending[1], e2 = m$ending[2], e3 = m$ending[3])
        rbind(df, tmp)
  }

  parsePage <- function(page) {
        words.url <- xpathSApply(page, "//tr[contains(@class, 'row')]/td/a", xmlGetAttr, 'href') 
        df <- data.frame(p1 = c(), p2 = c(), p3 = c(), b1 = c(), b2 = c(),  s1 = c(), s2 = c(), s3 = c(), s4 = c(), e1 = c(), e2 = c(), e3 = c())
        for(word.url in words.url) {
              page <- doGet(makeURL(word.url))
              word.morphs <- parse.word(page)
              df <- append(df, morphToList(word.morphs))
        }
        return(df)
  }

  saveWords <- function(df, fileName) {
        write.table(file = fileName, x = df, append = T, row.names = F, col.names = F, quote = T, sep = ',')
  }

  url_base <- 'http://slovonline.ru'
  url_addr <- makeURL('/slovar_sostav')
  agent<-"Mozilla/5.0"

  curl<-getCurlHandle()
  curlSetOpt(curl = curl, cookiejar='cookies.txt', useragent='Mozilla/5.0', followlocation=T)


  index <- doGet(url_addr)
  lrs.url <- xpathSApply(index, "//div[@class = 'nletters all']/a", xmlGetAttr, 'href') 

  for (letter in lrs.url[1:2]) {
        page <- doGet(makeURL(letter))
        table <- parsePage(page)
        pages.url <- c(letter, xpathSApply(page, "//div[@class = 'npages']/a", xmlGetAttr, 'href'))
        saveWords(df = table, fileName = 'words.csv')
        for (page.url in pages.url) {
              page <- doGet(makeURL(page.url))
              table <- parsePage(page)
              saveWords(df = table, fileName = 'words.csv')
        }
  }

The XML package is known to have memory management issues, as a Stack Overflow search reveals (examples here, here and here). Duncan Temple Lang, the author and maintainer of the package, went as far as writing a paper about its memory usage issues.
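
Before switching packages, it is worth trying to release every parsed document explicitly with the XML package's free() as soon as you are finished with it. That is not guaranteed to stop the growth, but it drops the C-level libxml2 memory that gc() alone often fails to reclaim. Here is a sketch of the question's parsePage with explicit freeing (word.page is renamed only to avoid shadowing the page argument):

  parsePage <- function(page) {
        words.url <- xpathSApply(page, "//tr[contains(@class, 'row')]/td/a", xmlGetAttr, 'href')
        df <- data.frame()
        for (word.url in words.url) {
              word.page <- doGet(makeURL(word.url))
              word.morphs <- parse.word(word.page)
              df <- append(df, morphToList(word.morphs))
              free(word.page)   # release the libxml2 document for this word page
        }
        df
  }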

You could try Hadley Wickham's xml2 package, which promises better memory management than XML. I have not personally verified that claim.
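
If you go the xml2 route, the fetch-and-parse step could look roughly like this (a minimal sketch, not tested against the site; it reuses the XPath expressions from the question and lets read_html replace both getURL and the temporary-file round trip):

  library(xml2)

  url_base <- 'http://slovonline.ru'

  # read_html() downloads and parses in one step; the resulting document is
  # released by R's garbage collector when it is no longer referenced.
  index <- read_html(paste0(url_base, '/slovar_sostav'), encoding = 'windows-1251')

  # same XPath as in the question, using xml2's accessors
  lrs.url <- xml_attr(xml_find_all(index, "//div[@class = 'nletters all']/a"), 'href')

  parse.word <- function(page) {
        vals <- xml_text(xml_find_all(page, "//div[@class='word-article']/ul/li"))
        tolower(gsub(" |-", "", sapply(strsplit(vals, ':'), `[`, 2)))
  }

Note that read_html does not replicate the cookie jar or user-agent settings from the curl handle; if the site needs them, you would still download with RCurl (or httr) and pass the raw HTML string to read_html.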

The last time I had to scrape a large amount of web data (around 20k pages), I decided to rewrite the whole thing in Python. xml2 did not exist back then.

Another approach I have tried is launching the R script from a shell loop, so that each rsession process exits before it runs out of memory. It worked reasonably well, although it is a bit clunky.

For anyone interested, here is an overview of the algorithm (a minimal sketch follows the list):

  • In the shell:
    • check whether the file with the special name exists
    • if it does not, run R on the script file
    • repeat
  • In the R script:
    • take a sample of addresses from the "yet-to-process" pool
    • if the sample is empty (the "yet-to-process" pool is exhausted), create the file with the special name (signalling the shell to stop) and finish
    • for each address: process it and remove it from the "yet-to-process" pool
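
A minimal sketch of that setup, under assumed names (done.flag for the special file, urls_todo.rds for the "yet-to-process" pool, scrape_batch.R for the script) and with a placeholder processPage() standing in for the real download-and-parse code from the question; the shell side just reruns Rscript scrape_batch.R in a loop until done.flag appears:

  # scrape_batch.R -- processes one batch per rsession and exits, so the shell
  # loop (for example: until [ -f done.flag ]; do Rscript scrape_batch.R; done)
  # keeps starting fresh processes with a clean address space.
  # done.flag, urls_todo.rds and the batch size of 50 are placeholder choices.

  processPage <- function(url) {
        # stand-in for the real download-and-parse logic from the question
        data.frame(url = url)
  }

  pool <- readRDS('urls_todo.rds')        # the "yet-to-process" pool

  if (length(pool) == 0) {
        file.create('done.flag')          # signal the shell loop to stop
        quit(save = 'no')
  }

  batch <- head(pool, 50)                 # take a sample of addresses

  for (url in batch) {
        df <- processPage(url)
        write.table(df, file = 'words.csv', append = TRUE,
                    row.names = FALSE, col.names = FALSE, sep = ',')
  }

  # drop the processed addresses from the pool and persist it for the next run
  saveRDS(setdiff(pool, batch), 'urls_todo.rds')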