Recursion Depth Exceeded, pickle and BeautifulSoup
I want to pickle the HTML from websites. I save the HTML into a list and then try to pickle it. One example of such a list is the HTML from brckhmptn.com/tour. Of course that site has a lot of HTML; could that be the cause of my error? The whole script is below, but the error is raised in the last few lines. I am using Python 3.6.1.
Traceback (most recent call last):
  File "./showNotifier.py", line 128, in <module>
    scrape_html(combinedUrls)
  File "./showNotifier.py", line 125, in scrape_html
    pickle.dump(sites, hf)
RecursionError: maximum recursion depth exceeded
#!/home/steph/unix/miniconda3/bin/python
'''
By Stephen Wist
stphnwist@gmail.com

ShowNotifier
---------------------
script takes cmd line args to:
    indicate URLs to add
default behaviour is checking if new shows were added
'''
import requests
import pickle
import sys
import argparse
import os
import urllib.request
from bs4 import BeautifulSoup

urlFile = "urls"
htmlFile = "htmls"

# take in cmd line args
parseArgs = argparse.ArgumentParser(description='Add URLs to cache.')
parseArgs.add_argument('urls', type=str, nargs='*', help='URLs to be added.')

# assign cmd line args to urls.urls
urls = parseArgs.parse_args()

# this function makes sure all files are in place
def status_report():
    # this should be the case only the first time the
    # script is run
    if (os.path.getsize(urlFile) == 0):
        urlFileExists = 0
        # create urlFile if it does not exist
        if (not urls.urls):
            print ("No data in the url file. Run the script again but include url(s) on the command line.\n\
e.g. ./showNotifier.py www.myfavoriteband.com")
            sys.exit()
    else:
        urlFileExists = 1

    # these files should never be deleted, but just in case
    if (not os.path.isfile(urlFile)):
        f = open("urls","w")
        f.close()
    if (not os.path.isfile(htmlFile)):
        f = open("htmls","w")
        f.close()

    return urlFileExists

urlFileExists = status_report()

# grab the urls in urlFile, or make
# urlFile if it does not exist
def read_urls(urlFileExists):
    print(urlFileExists)
    # assign all urls in urlFile to prevUrls
    if (urlFileExists == 1):
        uf = open(urlFile, "rb")
        prevUrls = pickle.load(uf)
        uf.close()
        return prevUrls
    else:
        return 1

prevUrls = read_urls(urlFileExists)
print("prevUrls: {}\n".format(prevUrls))

# we will need to check if the user has
# entered a url that is already stored
# and ignore it so the contents of the stored
# urls must be known
def compare_urls(prevUrls, newUrls):
    # no urls were stored in urlFile,
    # so just move on with the script
    if (prevUrls == 1):
        return newUrls
    # iterate over all urls given on cmd line
    # check for membership in the set of
    # stored urls and remove them if the
    # test is true
    else:
        for url in newUrls:
            if (url in prevUrls):
                newUrls.remove(url)
                print ("duplicate url {} found, ignoring it.\n".format(url))
        combinedUrls = newUrls + prevUrls
        return combinedUrls

combinedUrls = compare_urls(prevUrls, urls.urls)
print(type(combinedUrls))
print("combinedUrls: {}\n".format(combinedUrls))
print("combo urls[0]: {}\n".format(combinedUrls[0]))
print(type(combinedUrls[0]))

# write all unique urls to file
def write_urls(combinedUrls):
    uf = open(urlFile, "wb")
    pickle.dump(combinedUrls, uf)
    uf.close()
    return 0

write_urls(combinedUrls)

# TODO:
# visit sites, store their HTML in a list (for now)
def scrape_html(combinedUrls):
    sites = []
    # could this loop be shortened to a fancy list comprehension
    # or lambda expression?
    for url in combinedUrls:
        print(url)
        response = urllib.request.urlopen(url)
        html = response.read()
        soup = BeautifulSoup(html, "html.parser")
        sites.append(soup)
    hf = open(htmlFile, "wb")
    pickle.dump(sites, hf)
    hf.close()
    return 0

scrape_html(combinedUrls)
Try:
import sys
sys.setrecursionlimit(10000)
10,000 recursions should be plenty.
What is happening is that somewhere, something is calling itself over and over again. Each of those calls counts as one recursion. Python puts a limit on this to stop a program from running forever. While hitting the limit is usually a sign of a bug, you can raise it as high as you see fit, since your program may legitimately recurse an unusually large number of times.
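If you go this route, a minimal sketch of how it might be wired into the script above could look like the following. The helper name dump_sites and the 10000 figure are illustrative assumptions, not part of the original code:

import sys
import pickle

# Sketch only: temporarily raise the interpreter's recursion ceiling while
# pickling deeply nested BeautifulSoup objects. The 10000 figure is a guess;
# deep documents may need more, and very large limits risk exhausting the C stack.
def dump_sites(sites, path="htmls", limit=10000):
    old_limit = sys.getrecursionlimit()
    sys.setrecursionlimit(max(old_limit, limit))
    try:
        with open(path, "wb") as hf:
            pickle.dump(sites, hf)
    finally:
        # restore the previous limit so the rest of the program is unaffected
        sys.setrecursionlimit(old_limit)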
The simplest way is to convert the soup to a string first and pickle that:
import pickle
from bs4 import BeautifulSoup

# ... now we have soup

# dump
with open('my_soup.pkl', 'wb') as f:
    pickle.dump(str(soup), f)

# load
with open('my_soup.pkl', 'rb') as f:
    str_soup = pickle.load(f)
re_soup = BeautifulSoup(str_soup, 'lxml')

# check if they are the same
assert soup == re_soup, 'Houston we have a problem'
# we should not see anything if they are the same.
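Applied to the scrape_html function from the question, the same idea might look roughly like this sketch: it stores str(soup) instead of the soup object, so only a flat list of strings gets pickled. The names mirror the original script; the htmlFile default is an assumption for illustration:

import pickle
import urllib.request
from bs4 import BeautifulSoup

# Sketch only: keep plain strings in the list so pickle never has to walk
# BeautifulSoup's deeply nested parse tree.
def scrape_html(combinedUrls, htmlFile="htmls"):
    sites = []
    for url in combinedUrls:
        response = urllib.request.urlopen(url)
        html = response.read()
        soup = BeautifulSoup(html, "html.parser")
        sites.append(str(soup))   # store the serialized markup, not the soup object
    with open(htmlFile, "wb") as hf:
        pickle.dump(sites, hf)    # a flat list of strings pickles without deep recursion
    return 0

When the HTML is needed again, each stored string can simply be parsed back with BeautifulSoup(stored_string, "html.parser").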
Try changing this:
soup = BeautifulSoup(html, "html.parser")
to
soup = BeautifulSoup(str(html), "html.parser")