How can I avoid breaking the loop in web scraping?
import bs4 as bs
import urllib.request
import csv
import requests
import re
from urllib.request import urlopen
from urllib.error import HTTPError
import pandas as pd
import time
import urllib.error

book_ids = ["9781408110416", "9789604249671", "9781405950305"]

def get_description(book_id):
    my_urls = 'https://www.bookdepository.com/Enid-Blytons-Christmas-Tales-Enid-Blyton/' + book_id
    source = urlopen(my_urls).read()
    soup = bs.BeautifulSoup(source, 'lxml')
    description = soup.find('div', class_='item-img-content')
    if description:
        return description
    else:
        return "[No description]"

for book_id in book_ids:
    print(book_id)
    print(get_description(book_id))
    time.sleep(2)
The error I get:
HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
Hi, I am writing a script to scrape image URLs from bookdepository. My main problem is that some books return a 404 because they are not on the platform.
How can I make the script just move on to the next ID without stopping the whole loop?
Thanks in advance.
You can put the code inside your get_description() function in a try/except block, like this:
def get_description(book_id):
    my_urls = 'https://www.bookdepository.com/Enid-Blytons-Christmas-Tales-Enid-Blyton/' + book_id
    try:
        source = urlopen(my_urls).read()
        soup = bs.BeautifulSoup(source, 'lxml')
        description = soup.find('div', class_='item-img-content')
        if description:
            return description
        else:
            return "[No description]"
    except HTTPError:
        # A 404 (or any other HTTP error) lands here instead of
        # propagating up and killing the loop in the caller.
        return "[HTTP error: book not found]"
Now, if a request raises an HTTP error, the function returns a placeholder for that book_id instead of crashing, and the loop moves on to the next one. (HTTPError is already imported in your script via `from urllib.error import HTTPError`; catching it specifically is safer than a bare `except`, which would also swallow things like KeyboardInterrupt.)
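If you prefer to avoid exceptions altogether, a rough alternative is to use the requests library (which your script already imports) and check the HTTP status code yourself before parsing. The URL and CSS class below are taken from your question; the placeholder strings are just illustrative:

import time
import bs4 as bs
import requests

book_ids = ["9781408110416", "9789604249671", "9781405950305"]
BASE_URL = 'https://www.bookdepository.com/Enid-Blytons-Christmas-Tales-Enid-Blyton/'

def get_description(book_id):
    # requests.get() does not raise on a 404; inspect the status code instead
    response = requests.get(BASE_URL + book_id)
    if response.status_code != 200:
        return "[HTTP %d: book not found]" % response.status_code
    soup = bs.BeautifulSoup(response.text, 'lxml')
    description = soup.find('div', class_='item-img-content')
    return description if description else "[No description]"

for book_id in book_ids:
    print(book_id)
    print(get_description(book_id))
    time.sleep(2)  # be polite to the server between requests

Either way, the key point is the same: handle the failure inside get_description() so that one missing book can never terminate the for loop.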