如何将国外网页(如gem5官方文档)下载到本地?
How can I download foreign web pages (such as the official gem5 documentation) for offline use?
gem5(http://www.gem5.org/)的官方文档在国内一直无法正常访问。即使用了VPN,我发现也只能偶尔访问,而且每次都要等很长时间才能打开,还经常出现页面无法访问的错误。所以我想把这些文档下载下来,以便离线访问。我在网上找了很多工具,但它们都只能下载国内的网页,下载国外网页时总是显示无响应。既然这些工具都不行,我想先用Python抓取所有的网页链接,然后根据这些链接逐个下载。我用了网上的一段代码,这段代码在国内网站上能正常输出链接,但换成gem5官方文档的链接时,又出现了无响应的错误,这快把我逼疯了。我不知道该去哪里问这些问题,谁能告诉我该怎么做?
不仅是gem5的官方文档,还有很多国外的网页版文档我也想下载下来随时查阅,有什么办法吗?我把我用的代码上传到了GitHub(https://github.com/Yujie-Cui/bin/blob/master/getLink.py),谁能帮我看看应该怎么修改?
# __author__ = 'Administrat
# coding=utf-8
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random
import io
import os
import sys
from urllib import request
from urllib.request import urlopen
import urllib
# Set of pages; not referenced by any of the functions visible in this file.
pages = set()
# Seed from the operating system / current time. Passing a datetime object to
# random.seed() is rejected with TypeError on Python 3.11+, so rely on the
# default seeding instead.
random.seed()
# Re-wrap stdout so that everything printed is encoded as GB18030.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
# Request headers sent with every page fetch (browser-like User-Agent).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
def getInternalLinks(bsObj, includeUrl):
    """Return the absolute URLs of all same-site links found in a parsed page.

    Args:
        bsObj: Parsed document exposing ``find_all(name, href=pattern)`` whose
            results carry an ``attrs`` mapping (a BeautifulSoup object in this
            script).
        includeUrl: Any URL on the site of interest; only its scheme and host
            are used.

    Returns:
        A list of unique URLs in document order. Hrefs that start with "/"
        are prefixed with ``scheme://host``; hrefs that already contain that
        prefix are returned unchanged.
    """
    parsed = urlparse(includeUrl)
    siteRoot = parsed.scheme + "://" + parsed.netloc
    # Escape the site root so that "." in the host name only matches a dot.
    pattern = re.compile("^(/|.*" + re.escape(siteRoot) + ")")

    internalLinks = []
    seen = set()
    for link in bsObj.find_all("a", href=pattern):
        href = link.attrs['href']
        if href is None:
            continue
        absolute = siteRoot + href if href.startswith("/") else href
        # De-duplicate on the absolute form; comparing the raw href would let
        # the same relative link through more than once.
        if absolute not in seen:
            seen.add(absolute)
            internalLinks.append(absolute)
    return internalLinks
def getExternalLinks(bsObj, excludeUrl):
    """Return the links in a parsed page that point away from ``excludeUrl``.

    Args:
        bsObj: Parsed document exposing ``find_all(name, href=pattern)`` whose
            results carry an ``attrs`` mapping (a BeautifulSoup object in this
            script).
        excludeUrl: Text identifying the current site; any href containing it
            is left out.

    Returns:
        A list of unique hrefs, in document order, that start with "http" or
        "www" and do not contain ``excludeUrl``.
    """
    # Escape excludeUrl so it is matched literally; unescaped, "." would match
    # any character and unrelated look-alike hosts would be dropped.
    pattern = re.compile("^(http|www)((?!" + re.escape(excludeUrl) + ").)*$")

    externalLinks = []
    seen = set()
    for link in bsObj.find_all("a", href=pattern):
        href = link.attrs['href']
        if href is not None and href not in seen:
            seen.add(href)
            externalLinks.append(href)
    return externalLinks
def getRandomExternalLink(startingPage, timeout=30):
    """Fetch a page and return one randomly chosen external link from it.

    When a page has no external links, a random internal link is followed and
    the search continues from there.

    Args:
        startingPage: URL of the first page to fetch.
        timeout: Seconds to wait on each request before giving up, so that an
            unresponsive host raises instead of hanging indefinitely.

    Returns:
        The href of a randomly selected external link.

    Raises:
        ValueError: If a page offers neither external nor internal links.
        urllib.error.URLError: If a page cannot be fetched.
    """
    currentPage = startingPage
    # Iterate instead of recursing: a site with few external links could
    # otherwise exhaust the interpreter's recursion limit.
    while True:
        req = request.Request(currentPage, headers=headers)
        # The context manager closes the HTTP response once it has been read.
        with urlopen(req, timeout=timeout) as response:
            bsObj = BeautifulSoup(response.read(), "html.parser")

        parsed = urlparse(currentPage)
        externalLinks = getExternalLinks(bsObj, parsed.netloc)
        if externalLinks:
            return random.choice(externalLinks)

        print("没有外部链接,准备遍历整个网站")
        domain = parsed.scheme + "://" + parsed.netloc
        internalLinks = getInternalLinks(bsObj, domain)
        if not internalLinks:
            raise ValueError("no external or internal links found on " + currentPage)
        currentPage = random.choice(internalLinks)
def followExternalOnly(startingSite, maxHops=None):
    """Hop from site to site by repeatedly following a random external link.

    Args:
        startingSite: URL of the page to start from.
        maxHops: Optional upper bound on the number of hops. ``None`` (the
            default) keeps hopping until fetching a page raises.

    Each chosen link is printed before it is followed.
    """
    currentSite = startingSite
    hops = 0
    # A loop replaces the original self-recursion, which had no base case and
    # would eventually fail with RecursionError.
    while maxHops is None or hops < maxHops:
        currentSite = getRandomExternalLink(currentSite)
        print("随机外链是: " + currentSite)
        hops += 1
# Every external / internal link discovered so far, shared across calls.
allExtLinks = set()
allIntLinks = set()


def getAllExternalLinks(siteUrl, proxies=None, timeout=30):
    """Crawl a site from ``siteUrl`` and record every link that is found.

    Newly discovered external links are added to ``allExtLinks`` and newly
    discovered internal links to ``allIntLinks``; each one is printed when it
    is first seen. Every new internal link is then fetched in turn.

    Args:
        siteUrl: URL of the page where the crawl starts.
        proxies: Mapping of URL scheme to proxy address handed to
            ``urllib.request.ProxyHandler``. ``None`` keeps the proxy that
            was hard-coded in this script; pass ``{}`` to connect directly.
        timeout: Seconds to wait on each request before giving up.

    Side effects:
        Installs the proxy-aware opener as the process-wide default for
        ``urllib.request.urlopen``.
    """
    if proxies is None:
        # Default preserved from the original script. NOTE(review): only the
        # 'http' scheme is mapped, so https URLs are not sent through it.
        proxies = {'http': '183.77.250.45:3128'}
    proxy_handler = urllib.request.ProxyHandler(proxies)
    # Build and install the opener once per crawl rather than once per page.
    opener = urllib.request.build_opener(urllib.request.HTTPHandler, proxy_handler)
    urllib.request.install_opener(opener)

    # The start page counts as visited so it is not fetched a second time
    # when another page links back to it.
    allIntLinks.add(siteUrl)

    # An explicit stack replaces recursion, whose depth grew with the number
    # of pages and could exceed the interpreter's recursion limit.
    pending = [siteUrl]
    while pending:
        pageUrl = pending.pop()
        try:
            req = request.Request(pageUrl, headers=headers)
            with opener.open(req, timeout=timeout) as response:
                rawHtml = response.read()
        except (OSError, ValueError) as err:
            # One unreachable or malformed URL should not abort the crawl.
            print("fetch failed: " + pageUrl + " (" + str(err) + ")")
            continue

        bsObj = BeautifulSoup(rawHtml, "html.parser")
        parsed = urlparse(pageUrl)
        domain = parsed.scheme + "://" + parsed.netloc
        internalLinks = getInternalLinks(bsObj, domain)
        externalLinks = getExternalLinks(bsObj, domain)

        # Collect external links.
        for link in externalLinks:
            if link not in allExtLinks:
                allExtLinks.add(link)
                print("extern url: " + link)

        # Collect internal links and queue the new ones for crawling.
        newPages = []
        for link in internalLinks:
            if link not in allIntLinks:
                print("intern url: " + link)
                allIntLinks.add(link)
                newPages.append(link)
        # Reversed so that the first link on the page is crawled first.
        pending.extend(reversed(newPages))
# Alternative entry points kept for reference:
# followExternalOnly("http://bbs.3s001.com/forum-36-1.html")
# allIntLinks.add("http://bbs.3s001.com/forum-36-1.html")
# Only start crawling when the file is run as a script, so that importing it
# does not trigger network access.
if __name__ == "__main__":
    getAllExternalLinks("http://www.gem5.org/documentation/learning_gem5/introduction/")
Q:不知道去哪里问这些问题?
A:因为只有在中国才会遇到这个问题,你最好去中文技术社区问,比如V2EX。不在中国的人很难理解我们的问题 T T
Q:谁能告诉我应该怎么做?
A:作为开源社区的一员,我提供另一个角度来解决这个问题。现在大部分文档都是社区编写的,你可以直接拿到文档的源文件。我不使用gem5,但我找到了这个:如何将 gem5-website 下载到本地。