Timeout not specified in Python, but attribute is there
from bs4 import BeautifulSoup
import urllib.request
import re

def getLinks(url):
    html_page = urllib.request.urlopen(url)
    soup = BeautifulSoup(html_page, "html.parser")
    links = []
    for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
        links.append(link.get('href'))
    return links

anchors = getLinks("http://madisonmemorial.org/")
for anchor in anchors:
    happens = urllib.request.urlopen(anchor)
    if happens.getcode() == "404":
        print(happens.getcode())
# Click on links and return responses
countMe = len(anchors)
for anchor in anchors:
    i = getLinks(anchor)
    happens = urllib.request.urlopen(i, timeout = 2)
    if happens.getcode() == "404":
        print(happens.getcode())
    countMe += len(i)
print(countMe)
So I really don't know what to say here... I thought setting up a web scraper would be simple, but it's turned into a real challenge. The second for loop (the first one with the anchor in anchors argument) works fine and returns codes... it's the last for loop that's giving me problems... specifically the line that reads:

happens = urllib.request.urlopen(i, timeout = 2)

Why does the program time out on that line, when the exact same line in the for loop above it doesn't time out? And when it does time out, it times out dozens of times. I've looked at this question, but that doesn't really help because it's about building a networking app; I did get my try/except syntax and logic down from that question, though. I've looked at this question, but it didn't really help me because it wasn't really applicable to the issue, and I looked at this SO question while trying to accomplish something slightly different.
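For reference, here is a minimal sketch of the try/except guard I mean around a single urlopen call (the check_link helper is just illustrative, not code from my scraper). Note that urlopen raises HTTPError for a 404 rather than returning it, and getcode() returns an int, not a string:

import socket
import urllib.error
import urllib.request

def check_link(url, timeout=2):
    """Illustrative: return the HTTP status code for url, or None on failure."""
    try:
        response = urllib.request.urlopen(url, timeout=timeout)
        return response.getcode()  # an int, e.g. 200 -- not the string "404"
    except urllib.error.HTTPError as e:
        return e.code  # urlopen raises HTTPError for 4xx/5xx instead of returning
    except (socket.timeout, urllib.error.URLError):
        return None  # timed out, DNS failure, refused connection, etc.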
The code below does what you want. Note that it can follow links recursively, and you need to specify how deep you want that recursion to go.
import requests
import re
from bs4 import BeautifulSoup

def getLinks(url):
    response = requests.get(url)
    if response.status_code != 200:
        return []
    html_page = response.content
    soup = BeautifulSoup(html_page, "html.parser")
    links = []
    for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
        links.append(link.get('href'))
    # remove duplicates
    links = list(set(links))
    return links

def count_dead_links(url, recursion_depth=0):
    count = 0
    for link in getLinks(url):
        response = requests.get(link)
        if response.status_code == 404:
            count += 1
        else:
            if recursion_depth > 0:
                count += count_dead_links(link, recursion_depth - 1)
    return count

# returns count of dead links on the page
print(count_dead_links("http://madisonmemorial.org/"))
# returns count of dead links on the page plus all the dead links
# on all the pages that result after following links that work.
print(count_dead_links("http://madisonmemorial.org/", 1))
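Since the original problem was timeouts: requests.get also accepts a timeout parameter, and slow or unreachable servers raise subclasses of requests.exceptions.RequestException rather than hanging forever. A minimal sketch of how one might guard the per-link check (the link_is_dead helper is an assumption for illustration, not part of the code above):

import requests

def link_is_dead(link, timeout=2):
    """Illustrative: treat a 404, a timeout, or a connection error as a dead link.

    requests.exceptions.Timeout and ConnectionError are both subclasses of
    requests.exceptions.RequestException, so one except clause covers both.
    """
    try:
        response = requests.get(link, timeout=timeout)
        return response.status_code == 404
    except requests.exceptions.RequestException:
        return True  # unreachable or too slow -- count it as dead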