Timeout not specified in Python, but attribute is there
from bs4 import BeautifulSoup
import urllib.request
import re

def getLinks(url):
    html_page = urllib.request.urlopen(url)
    soup = BeautifulSoup(html_page, "html.parser")
    links = []
    for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
        links.append(link.get('href'))
    return links

anchors = getLinks("http://madisonmemorial.org/")
for anchor in anchors:
    happens = urllib.request.urlopen(anchor)
    if happens.getcode() == "404":
        print(happens.getcode())
# Click on links and return responses
countMe = len(anchors)
for anchor in anchors:
    i = getLinks(anchor)
    happens = urllib.request.urlopen(i, timeout = 2)
    if happens.getcode() == "404":
        print(happens.getcode())
    countMe += len(i)
print(countMe)
So I really don't know what to say here... I thought setting up a web scraper would be simple, but it's turned into a real challenge. The second for loop (the first one with the anchor in anchors argument) works fine and returns codes... it's the last for loop that's giving me problems... specifically the line that reads:

happens = urllib.request.urlopen(i, timeout = 2)

Why does the program time out on that line, when the exact same line in the for loop above it doesn't time out? And when it does time out, it times out dozens of times. I've looked at this question, but that doesn't really help because it's about building a networking app; I did get my try/except syntax and logic down from that question, though. I've looked at this question, but it didn't really help me because it wasn't really applicable to the issue, and I looked at this SO question while trying to accomplish something slightly different.
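For reference, here is a minimal sketch of the try/except guard I mean around a single urlopen call (the check_link helper is just illustrative, not code from my scraper). Note that urlopen raises HTTPError for a 404 rather than returning it, and getcode() returns an int, not a string:

import socket
import urllib.error
import urllib.request

def check_link(url, timeout=2):
    """Illustrative: return the HTTP status code for url, or None on failure."""
    try:
        response = urllib.request.urlopen(url, timeout=timeout)
        return response.getcode()  # an int, e.g. 200 -- not the string "404"
    except urllib.error.HTTPError as e:
        return e.code  # urlopen raises HTTPError for 4xx/5xx instead of returning
    except (socket.timeout, urllib.error.URLError):
        return None  # timed out, DNS failure, refused connection, etc.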
The code below does what you want. Note that it can follow links recursively, and you need to specify how deep you want that recursion to go.
import requests
import re
from bs4 import BeautifulSoup

def getLinks(url):
    response = requests.get(url)
    if response.status_code != 200:
        return []
    html_page = response.content
    soup = BeautifulSoup(html_page, "html.parser")
    links = []
    for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
        links.append(link.get('href'))
    # remove duplicates
    links = list(set(links))
    return links

def count_dead_links(url, recursion_depth=0):
    count = 0
    for link in getLinks(url):
        response = requests.get(link)
        if response.status_code == 404:
            count += 1
        else:
            if recursion_depth > 0:
                count += count_dead_links(link, recursion_depth - 1)
    return count

# returns count of dead links on the page
print(count_dead_links("http://madisonmemorial.org/"))
# returns count of dead links on the page plus all the dead links
# on all the pages that result after following links that work.
print(count_dead_links("http://madisonmemorial.org/", 1))
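Since the original problem was timeouts: requests.get also accepts a timeout parameter, and slow or unreachable servers raise subclasses of requests.exceptions.RequestException rather than hanging forever. A minimal sketch of how one might guard the per-link check (the link_is_dead helper is an assumption for illustration, not part of the code above):

import requests

def link_is_dead(link, timeout=2):
    """Illustrative: treat a 404, a timeout, or a connection error as a dead link.

    requests.exceptions.Timeout and ConnectionError are both subclasses of
    requests.exceptions.RequestException, so one except clause covers both.
    """
    try:
        response = requests.get(link, timeout=timeout)
        return response.status_code == 404
    except requests.exceptions.RequestException:
        return True  # unreachable or too slow -- count it as dead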