Python: 'IndexError: list index out of range' when parsing large arrays with BeautifulSoup
I keep getting the following error:
Traceback (most recent call last):
  File "C:\Users\User\Documents\Project.py", line 100, in <module>
    parseData(array)
  File "C:\Users\User\Documents\Project.py", line 91, in parseData
    name2 = pageSoup.findAll('div', {'class': 'item-title'})[0].string
IndexError: list index out of range
The array passed to the function contains several thousand URLs. When I test with a shorter array of a few hundred, it works fine and finishes without problems. I'm not sure why it fails when a larger array is used as input.
def parseData(urls):
    f = io.open('output.txt', 'a', encoding='utf-8')
    for url in urls:
        response = urllib.request.urlopen(url)
        responseContent = response.read()
        pageSoup = BeautifulSoup(responseContent, 'html.parser', from_encoding="utf-8")
        if 'https://example.com' in url:
            name = pageSoup.findAll('h3', {'class': 'tb-main-title'})[0].string
            price = pageSoup.findAll('em', {'class': 'tb-rmb-num'})[0].string
            link = url
            print('Retriving data from ' + str(link) + '...\n' + str(name) + ':' + str(price))
            f.write('\n' + str(link) + '\n' + str(name) + '\n' + str(price) + '\n')
        elif 'https://example2.com' in url:
            name2 = pageSoup.findAll('div', {'class': 'item-title'})[0].string
            price2 = pageSoup.findAll('span', {'class': 'cur-price'})[0].string
            print('Retriving data from ' + str(link) + '...\n' + str(name2) + ':' + str(price2))
            f.write('\n' + str(link) + '\n' + str(name2) + '\n' + str(price2) + '\n')
Thanks for taking the time to look at this, any help is much appreciated! :)
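One way to narrow down a failure that only appears on long runs is to save the HTML of the page that raised, since the server may be returning something unexpected (an error page, a captcha, a redirect) by that point. A minimal diagnostic sketch around one of the failing lookups; the failed_page.html filename is just an illustration:

try:
    name2 = pageSoup.findAll('div', {'class': 'item-title'})[0].string
except IndexError:
    # Dump the offending page so it can be inspected by hand
    with open('failed_page.html', 'w', encoding='utf-8') as dump:
        dump.write(pageSoup.prettify())
    print('No item-title found on ' + url)
    raise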
There is an IndexError; I think both problems can be solved this way:
import urllib.request
from bs4 import BeautifulSoup

def parseData(url):
    with urllib.request.urlopen(url) as response:
        if response:
            responseContent = response.read()
            pageSoup = BeautifulSoup(responseContent, 'html.parser', from_encoding="utf-8")
            link = url
            if 'https://example.com' in url:
                try:
                    name1 = pageSoup.findAll('h3', {'class': 'tb-main-title'})[0].string
                    price1 = pageSoup.findAll('em', {'class': 'tb-rmb-num'})[0].string
                except IndexError:
                    pass  # page is missing the expected elements; yield nothing
                else:
                    print('Retrieving data from ' + str(link) + '...\n' + str(name1) + ':' + str(price1))
                    yield (link, name1, price1)
            elif 'https://example2.com' in url:
                try:
                    name2 = pageSoup.findAll('div', {'class': 'item-title'})[0].string
                    price2 = pageSoup.findAll('span', {'class': 'cur-price'})[0].string
                except IndexError:
                    pass  # page is missing the expected elements; yield nothing
                else:
                    print('Retrieving data from ' + str(link) + '...\n' + str(name2) + ':' + str(price2))
                    yield (link, name2, price2)

urls = ["list of urls here"]

if __name__ == "__main__":
    with open('output.txt', 'a', encoding='utf-8') as f:
        for url_ in urls:
            # parseData is a generator: iterate it; it yields nothing
            # for pages where the lookups failed
            for link, name, price in parseData(url_):
                f.write('\n' + str(link) + '\n' + str(name) + '\n' + str(price) + '\n')
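Because parseData is a generator, a page whose lookups fail simply yields nothing and the caller's loop skips it instead of crashing. Note that the caller must iterate over parseData(url_); writing link, name, price = parseData(url_) would fail, because a generator object cannot be unpacked into three values.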
This code adds a context manager and a few checks that avoid some of the errors.
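Another way to avoid the IndexError entirely is to use find() instead of indexing the list returned by findAll(): find() returns None when nothing matches, which can be checked explicitly. A minimal sketch using the selectors from the question:

# find() returns the first match or None, so no IndexError is possible
name_tag = pageSoup.find('div', {'class': 'item-title'})
price_tag = pageSoup.find('span', {'class': 'cur-price'})
if name_tag is not None and price_tag is not None:
    name = name_tag.string
    price = price_tag.string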
An improved version of the answer above:
import urllib.request
from bs4 import BeautifulSoup
from collections import namedtuple

Data = namedtuple('Data', 'link name price')

def parseData(url):
    link = None
    name = None
    price = None
    with urllib.request.urlopen(url) as response:
        if response:
            # BeautifulSoup can read the file-like response object directly,
            # so response.read() is no longer needed
            pageSoup = BeautifulSoup(response, 'html.parser', from_encoding="utf-8")
            if 'https://example.com' in url:
                try:
                    name = pageSoup.findAll('h3', {'class': 'tb-main-title'})[0].string
                    price = pageSoup.findAll('em', {'class': 'tb-rmb-num'})[0].string
                except IndexError:
                    pass  # leave name/price as None; the caller checks for that
            elif 'https://example2.com' in url:
                try:
                    name = pageSoup.findAll('div', {'class': 'item-title'})[0].string
                    price = pageSoup.findAll('span', {'class': 'cur-price'})[0].string
                except IndexError:
                    pass  # leave name/price as None; the caller checks for that
            link = url
    print('Retrieving data from ' + str(link) + '...\n' + str(name) + ':' + str(price))
    return Data(link=link, name=name, price=price)

urls = ["https://www.yahoo.com", "https://www.google.com"]

if __name__ == "__main__":
    for url_ in urls:
        data = parseData(url_)
        if data.link and data.name and data.price:
            with open('output.txt', 'a', encoding='utf-8') as f:
                f.write('\n' + str(data.link) + '\n' + str(data.name) + '\n' + str(data.price) + '\n')
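As for why the original code works for a few hundred URLs but fails on several thousand: a likely cause (though it cannot be confirmed from the traceback alone) is that after many rapid requests the server starts returning error or rate-limit pages that lack the expected elements, so findAll(...)[0] hits an empty list. A sketch that throttles requests and skips pages that fail to load; the fetch name and the one-second delay are illustrative, not part of the original code:

import time
import urllib.error
import urllib.request

def fetch(url, delay=1.0):
    # Pause between requests so long runs don't hammer the server;
    # the delay value here is only an example and should be tuned.
    time.sleep(delay)
    try:
        with urllib.request.urlopen(url) as response:
            return response.read()
    except (urllib.error.HTTPError, urllib.error.URLError) as exc:
        print('Skipping ' + url + ': ' + str(exc))
        return None

The bytes returned by fetch() can then be handed to BeautifulSoup as before, with a check for None replacing the silent crash when a download fails.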