我在跟着一个 Python 网络抓取教程学习:教程使用 urllib,而我使用 requests,请帮我把教程中的代码转换过来
Following a web-scraping tutorial in Python: they use urllib, I'm using requests — help translating the instructions
我觉得到目前为止我已经把教程里的代码转换得差不多了,但现在卡住了。我没有太多的编程知识和技能。
import requests
from bs4 import BeautifulSoup
def make_soup(url, timeout=10):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before requests gives up.
            requests applies no timeout by default, so without one a
            stalled server would block the script forever.

    Returns:
        A BeautifulSoup object built from the response text with the
        built-in "html.parser" backend.
    """
    thepage = requests.get(url, timeout=timeout)
    soupdata = BeautifulSoup(thepage.text, "html.parser")
    return soupdata
# For every <img> on the page, work out the absolute URL to download
# (`image`) and the base name to save it under (`filename`).
from urllib.parse import urljoin

BASE_URL = "https://uwaterloo.ca"

i = 1  # counter used to name images that carry no alt text
soup = make_soup(BASE_URL)
for img in soup.find_all("img"):
    temp = img.get("src")
    if not temp:
        # .get() returns None for an <img> without src; nothing to download.
        continue
    # urljoin resolves "/path", "//host/path" and "relative/path" against
    # the site and returns already-absolute URLs unchanged. Plain string
    # concatenation would turn "//host/path" into an invalid address.
    image = urljoin(BASE_URL, temp)
    nametemp = img.get("alt")
    if nametemp:
        filename = nametemp
    else:
        # alt is either missing (None) or empty: fall back to the counter.
        filename = str(i)
        i += 1
这是我迷失方向的地方
import urllib.request

# Download the image and save it as "<filename>.jpeg". The download happens
# first so a failed request does not leave an empty file behind, and both
# with-blocks close their resource even if an exception is raised.
with urllib.request.urlopen(image) as response:
    data = response.read()
with open(filename + ".jpeg", "wb") as imagefile:
    imagefile.write(data)
只需将urllib逻辑替换为requests.get并将内容写入文件即可:
# Fetch before opening the file so a failed download does not leave an
# empty "<filename>.jpeg" behind.
response = requests.get(image, timeout=10)
# Raise on 4xx/5xx instead of saving an HTML error page as an image.
response.raise_for_status()
# .content is the raw response body as bytes, hence the binary "wb" mode.
with open(filename + ".jpeg", "wb") as f:
    f.write(response.content)
`f.write(requests.get(image).content)` 与 urllib 代码所做的事情等价。使用上下文管理器 with 意味着文件会被自动关闭。
我们还可以使用 CSS 选择器和 str.format 稍微改进一下代码:
try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2 fallback

import requests
from bs4 import BeautifulSoup
def make_soup(url, timeout=10):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before requests gives up.
            requests applies no timeout by default, so without one a
            stalled server would block the script forever.

    Returns:
        A BeautifulSoup object built from the response text with the
        built-in "html.parser" backend.
    """
    thepage = requests.get(url, timeout=timeout)
    soupdata = BeautifulSoup(thepage.text, "html.parser")
    return soupdata
# Download every image that has a src attribute and save it as
# "<alt text><number>.jpeg" in the current directory.
BASE_URL = "https://uwaterloo.ca"

soup = make_soup(BASE_URL)
# enumerate gives each image its own number, so two images sharing the same
# alt text (or having none) can never overwrite each other's file.
for i, img in enumerate(soup.select("img[src]"), 1):
    # urljoin resolves root-relative, protocol-relative and plain relative
    # src values against the site and returns absolute URLs unchanged.
    temp = urljoin(BASE_URL, img["src"])
    # The selector only guarantees src; alt may be absent, so use .get().
    alt = img.get("alt") or ""
    # A "/" in the alt text would be read by open() as a directory separator.
    alt = alt.replace("/", "_")
    # Fetch before opening the file so a failed download leaves no empty file.
    response = requests.get(temp, timeout=10)
    with open("{}{}.jpeg".format(alt, i), "wb") as f:
        f.write(response.content)
我觉得到目前为止我已经把教程里的代码转换得差不多了,但现在卡住了。我没有太多的编程知识和技能。
import requests
from bs4 import BeautifulSoup
def make_soup(url, timeout=10):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before requests gives up.
            requests applies no timeout by default, so without one a
            stalled server would block the script forever.

    Returns:
        A BeautifulSoup object built from the response text with the
        built-in "html.parser" backend.
    """
    thepage = requests.get(url, timeout=timeout)
    soupdata = BeautifulSoup(thepage.text, "html.parser")
    return soupdata
# For every <img> on the page, work out the absolute URL to download
# (`image`) and the base name to save it under (`filename`).
from urllib.parse import urljoin

BASE_URL = "https://uwaterloo.ca"

i = 1  # counter used to name images that carry no alt text
soup = make_soup(BASE_URL)
for img in soup.find_all("img"):
    temp = img.get("src")
    if not temp:
        # .get() returns None for an <img> without src; nothing to download.
        continue
    # urljoin resolves "/path", "//host/path" and "relative/path" against
    # the site and returns already-absolute URLs unchanged. Plain string
    # concatenation would turn "//host/path" into an invalid address.
    image = urljoin(BASE_URL, temp)
    nametemp = img.get("alt")
    if nametemp:
        filename = nametemp
    else:
        # alt is either missing (None) or empty: fall back to the counter.
        filename = str(i)
        i += 1
这是我迷失方向的地方
import urllib.request

# Download the image and save it as "<filename>.jpeg". The download happens
# first so a failed request does not leave an empty file behind, and both
# with-blocks close their resource even if an exception is raised.
with urllib.request.urlopen(image) as response:
    data = response.read()
with open(filename + ".jpeg", "wb") as imagefile:
    imagefile.write(data)
只需将urllib逻辑替换为requests.get并将内容写入文件即可:
# Fetch before opening the file so a failed download does not leave an
# empty "<filename>.jpeg" behind.
response = requests.get(image, timeout=10)
# Raise on 4xx/5xx instead of saving an HTML error page as an image.
response.raise_for_status()
# .content is the raw response body as bytes, hence the binary "wb" mode.
with open(filename + ".jpeg", "wb") as f:
    f.write(response.content)
`f.write(requests.get(image).content)` 与 urllib 代码所做的事情等价。使用上下文管理器 with 意味着文件会被自动关闭。
我们还可以使用 CSS 选择器和 str.format 稍微改进一下代码:
import requests
from bs4 import BeautifulSoup
from urlparse import urljoin
def make_soup(url, timeout=10):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before requests gives up.
            requests applies no timeout by default, so without one a
            stalled server would block the script forever.

    Returns:
        A BeautifulSoup object built from the response text with the
        built-in "html.parser" backend.
    """
    thepage = requests.get(url, timeout=timeout)
    soupdata = BeautifulSoup(thepage.text, "html.parser")
    return soupdata
# Download every image that has a src attribute and save it as
# "<alt text><number>.jpeg" in the current directory.
BASE_URL = "https://uwaterloo.ca"

soup = make_soup(BASE_URL)
# enumerate gives each image its own number, so two images sharing the same
# alt text (or having none) can never overwrite each other's file.
for i, img in enumerate(soup.select("img[src]"), 1):
    # urljoin resolves root-relative, protocol-relative and plain relative
    # src values against the site and returns absolute URLs unchanged.
    temp = urljoin(BASE_URL, img["src"])
    # The selector only guarantees src; alt may be absent, so use .get().
    alt = img.get("alt") or ""
    # A "/" in the alt text would be read by open() as a directory separator.
    alt = alt.replace("/", "_")
    # Fetch before opening the file so a failed download leaves no empty file.
    response = requests.get(temp, timeout=10)
    with open("{}{}.jpeg".format(alt, i), "wb") as f:
        f.write(response.content)