Python : IndexError: list index out of range after modifying code
Python : IndexError: list index out of range after modifying code
我的代码应该提供以下格式的输出。
我尝试修改代码,但我把它弄坏了。
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
class Driver:
    """Own one headless Chrome WebDriver and quit it on garbage collection."""

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Un-comment next line to suppress logging:
        # options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        # __del__ also runs when __init__ raised before self.driver was
        # assigned (e.g. Chrome failed to start); guard so that case does not
        # pile an AttributeError on top of the real failure.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()  # clean up driver when we are cleaned up
threadLocal = threading.local()


def create_driver():
    """Return the calling thread's WebDriver, creating it on first use.

    The Driver wrapper is cached on ``threadLocal`` under the attribute
    ``the_driver``; later calls from the same thread reuse that wrapper and
    return its ``driver`` attribute.
    """
    wrapper = getattr(threadLocal, 'the_driver', None)
    if wrapper is not None:
        return wrapper.driver
    wrapper = Driver()
    threadLocal.the_driver = wrapper
    return wrapper.driver
class GameData:
    """Column-oriented accumulator for scraped matches.

    Every attribute is a list; parse_data appends one value to each list
    per match, so position i across the lists describes a single match.
    """

    def __init__(self):
        # Creation order matters: the main script passes the instance
        # __dict__ to pd.DataFrame, so this is also the column order.
        for column in ('date', 'time', 'game', 'score', 'home_odds',
                       'draw_odds', 'away_odds', 'country', 'league'):
            setattr(self, column, [])
def parse_data(url):
    """Scrape one page into a GameData (the version posted in the question).

    Loads ``url`` in the calling thread's browser, reads the first HTML table
    with pandas, and takes ONE country/league pair from the first
    ``th`` with class ``first2 tl`` found by BeautifulSoup; that pair is then
    attached to every row.

    Returns a GameData, or None when the KeyError handler fires or when no
    such ``th`` is found.
    """
    try:
        browser = create_driver()
        browser.get(url)
        # Only the first table on the page is used.
        df = pd.read_html(browser.page_source)[0]
    except KeyError:
        # NOTE(review): only KeyError is handled; any other exception raised
        # above propagates to the caller -- confirm which exception
        # read_html raises when the page has no table.
        print('KeyError')
        return None
    html = browser.page_source
    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    # NOTE(review): the second dict is passed as find()'s third positional
    # argument, so it is probably not applied as an id filter -- confirm
    # against the BeautifulSoup find() signature.
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    # Indexes 1 and 2 require at least three <a> tags in the header cell;
    # the traceback quoted after this listing shows count[2] raising
    # IndexError.
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        # itertuples() puts the index at row[0], so row[1] is the first column.
        if not isinstance(row[1], str):
            # Non-string first cell (e.g. NaN): not a usable row.
            continue
        elif ':' not in row[1]:
            # No colon, so not a time: remember the text before the first
            # '-' as the date for the rows that follow.
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        # The same page-level country/league is repeated on every row.
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data
# URLs go here
urls = {
    "https://www.oddsportal.com/matches/soccer/20210903/",
}

if __name__ == '__main__':
    # To limit the number of browsers we will use
    # (set to a large number if you don't want a limit):
    MAX_BROWSERS = 5
    # ThreadPool rejects a size of 0, so keep at least one worker even when
    # `urls` is empty.
    pool = ThreadPool(max(1, min(MAX_BROWSERS, len(urls))))
    frames = []
    try:
        for game_data in pool.imap(parse_data, urls):
            if game_data is None:
                continue
            frames.append(pd.DataFrame(game_data.__dict__))
    finally:
        # Release the worker threads even if a page raised.
        pool.close()
        pool.join()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame page by page. An empty frame (rather than None) keeps
    # the print/head calls below working when nothing was scraped.
    results = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    print(results)
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc
    gc.collect()  # a little extra insurance
    print(results.head())
我收到这个错误:
Traceback (most recent call last):
File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_13.py", line 107, in <module>
for game_data in pool.imap(parse_data, urls):
File "C:\Program Files\Python39\lib\multiprocessing\pool.py", line 870, in next
raise value
File "C:\Program Files\Python39\lib\multiprocessing\pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_13.py", line 72, in parse_data
league = count[2].text
IndexError: list index out of range
结果通常采用以下格式:
date time game score home_odds draw_odds away_odds country league
0 None 15:30 Wolves - Manchester Utd 0:1 393/100 69/25 39/50 England Premier League
1 None 13:00 Burnley - Leeds 1:1 231/100 64/25 123/100 England Premier League
2 None 13:00 Tottenham - Watford 1:0 23/50 87/25 709/100 England Premier League
3 28 Aug 2021 16:30 Liverpool - Chelsea 1:1 29/20 59/25 207/100 England Premier League
4 28 Aug 2021 14:00 Aston Villa - Brentford 1:1 109/100 58/25 74/25 England Premier League
5 28 Aug 2021 14:00 Brighton - Everton 0:2 33/25 113/50 239/100 England Premier League
6 28 Aug 2021 14:00 Newcastle - Southampton 2:2 73/50 257/100 189/100 England Premier League
如何获取数据?
详细:
我有一段代码可以循环抓取“下一场比赛”(next matches)的各个 url,我想修改它。oddsportal 上 'next matches' 对应的 XPath 是://*[@id="col-content"]/div[3]/div/div/span
这个图片。
请帮忙
我看到的唯一日期是页面顶部的标题“下一场足球比赛:”。
我看不出您最初使用 df = pd.read_html(browser.page_source)[0]
创建数据框、随后再迭代该数据框有什么意义;您应该直接迭代主 table 的 <tr> 标签。如果操作正确,您最终会在国家和联赛列中获得正确的值。
我还更改了一些变量名称,以更准确地反映它们所持有的值类型。此外,我已经稍微简化了您在 HTML 层次结构中的导航,认识到具有 id 属性的元素在文档中必须是唯一的,因此您可以通过该 id 直接检索它而不必先检索它的 parent.
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
import re
class Driver:
    """Own one headless Chrome WebDriver and quit it on garbage collection."""

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Suppress logging (comment this line out to see it again):
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        # __del__ also runs when __init__ raised before self.driver was
        # assigned (e.g. Chrome failed to start); guard so that case does not
        # pile an AttributeError on top of the real failure.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()  # clean up driver when we are cleaned up
threadLocal = threading.local()


def create_driver():
    """Return the calling thread's WebDriver, creating it on first use.

    The Driver wrapper is cached on ``threadLocal`` under the attribute
    ``the_driver``; later calls from the same thread reuse that wrapper and
    return its ``driver`` attribute.
    """
    wrapper = getattr(threadLocal, 'the_driver', None)
    if wrapper is not None:
        return wrapper.driver
    wrapper = Driver()
    threadLocal.the_driver = wrapper
    return wrapper.driver
class GameData:
    """Column-oriented accumulator for scraped matches.

    Every attribute is a list; parse_data appends one value to each list
    per match, so position i across the lists describes a single match.
    """

    def __init__(self):
        # Creation order matters: the main script passes the instance
        # __dict__ to pd.DataFrame, so this is also the column order.
        for column in ('date', 'time', 'game', 'score', 'home_odds',
                       'draw_odds', 'away_odds', 'country', 'league'):
            setattr(self, column, [])
def generate_matches(table):
    """Yield one tuple per match row of the main results table.

    Rows whose class list contains ``dark`` are treated as tournament
    headers: the first two ``<a>`` tags inside their ``th`` (class
    ``first2 tl``) give the country and league for the match rows that
    follow. Rows whose class list contains ``deactivate`` are match rows.

    Args:
        table: the parsed ``<table>`` element (a BeautifulSoup tag).

    Yields:
        ``(cell0, cell1, cell2, cell3, cell4, cell5, country, league)`` --
        the text of the first six ``<td>`` cells plus the current header
        values. ``country`` / ``league`` are None when no usable header has
        been seen for the row. Match rows with fewer than six cells are
        skipped.
    """
    # Start with "unknown" so a match row that precedes any header does not
    # hit an unbound local.
    country = league = None
    for tr_tag in table.find_all('tr'):
        tr_class = tr_tag.attrs.get('class', ())
        if 'dark' in tr_class:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.find_all('a') if th_tag is not None else []
            # Reset on every header so a malformed header never lets the
            # previous tournament's values leak onto the next matches.
            country = a_tags[0].text if len(a_tags) > 0 else None
            league = a_tags[1].text if len(a_tags) > 1 else None
        elif 'deactivate' in tr_class:
            td_tags = tr_tag.find_all('td')
            if len(td_tags) < 6:
                continue
            yield (*(td_tag.text for td_tag in td_tags[:6]), country, league)
def parse_data(url):
    """Scrape one matches page into a GameData.

    Loads ``url`` in the calling thread's browser, finds the ``table-main``
    table inside the ``col-content`` div, and records every match yielded by
    generate_matches. The date for all rows is taken from the end of the
    page's ``<h1>`` text.

    Args:
        url: address of the page to load.

    Returns:
        A GameData. It has no rows when the expected table is missing, and
        its ``date`` entries are None when no date can be read from the
        heading.
    """
    browser = create_driver()
    browser.get(url)
    soup = bs(browser.page_source, "lxml")
    game_data = GameData()

    div = soup.find('div', {'id': 'col-content'})
    table = div.find('table', {'class': 'table-main'}) if div is not None else None
    if table is None:
        # The page did not have the expected layout; report it and return an
        # empty result instead of failing with an opaque AttributeError.
        print(f'No match table found at {url}')
        return game_data

    h1_tag = soup.find('h1')
    # strip() so trailing whitespace in the heading cannot defeat the `$`.
    m = re.search(r'\d+ \w+ \d{4}$', h1_tag.text.strip()) if h1_tag is not None else None
    game_date = m[0] if m else None

    # Same order as the tuples yielded by generate_matches.
    columns = ('time', 'game', 'score', 'home_odds', 'draw_odds',
               'away_odds', 'country', 'league')
    for row in generate_matches(table):
        game_data.date.append(game_date)
        for column, value in zip(columns, row):
            getattr(game_data, column).append(value)
    return game_data
# URLs go here
urls = {
    "https://www.oddsportal.com/matches/soccer/20210903/",
}

if __name__ == '__main__':
    # To limit the number of browsers we will use
    # (set to a large number if you don't want a limit):
    MAX_BROWSERS = 5
    # ThreadPool rejects a size of 0, so keep at least one worker even when
    # `urls` is empty.
    pool = ThreadPool(max(1, min(MAX_BROWSERS, len(urls))))
    frames = []
    try:
        for game_data in pool.imap(parse_data, urls):
            # Tolerate a parser that signals "nothing usable" with None.
            if game_data is None:
                continue
            frames.append(pd.DataFrame(game_data.__dict__))
    finally:
        # Release the worker threads even if a page raised.
        pool.close()
        pool.join()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame page by page.
    results = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    print(results)
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc
    gc.collect()  # a little extra insurance
打印:
date time game score home_odds draw_odds away_odds country league
0 03 Sep 2021 00:00 Petrolera - Dep. Pasto 2:3 -128 +245 +334 Colombia Copa Colombia
1 03 Sep 2021 00:00 Jalapa - Export Sebaco 0:2 -137 +266 +307 Nicaragua Liga Primera
2 03 Sep 2021 00:00 Venezuela - Argentina 1:3 +799 +376 -270 World World Cup 2022
3 03 Sep 2021 00:05 Canada - Honduras 1:1 -196 +290 +597 World World Cup 2022
4 03 Sep 2021 01:00 Peru - Uruguay 1:1 +231 +204 +140 World World Cup 2022
.. ... ... ... ... ... ... ... ... ...
219 03 Sep 2021 23:00 Greenville - Toronto FC II 3:0 -147 +263 +363 USA USL League One
220 03 Sep 2021 23:30 Nashville SC - New York City 3:1 +166 +235 +166 USA MLS
221 03 Sep 2021 23:30 Philadelphia Union - New England Revolution 0:1 +164 +256 +154 USA MLS
222 03 Sep 2021 23:30 Louisville City - FC Tulsa 0:1 -233 +394 +459 USA USL Championship
223 03 Sep 2021 23:30 Tampa Bay - Oakland Roots 3:0 -227 +320 +573 USA USL Championship
[224 rows x 9 columns]
我的代码应该提供以下格式的输出。
我尝试修改代码,但我把它弄坏了。
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
class Driver:
    """Own one headless Chrome WebDriver and quit it on garbage collection."""

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Un-comment next line to suppress logging:
        # options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        # __del__ also runs when __init__ raised before self.driver was
        # assigned (e.g. Chrome failed to start); guard so that case does not
        # pile an AttributeError on top of the real failure.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()  # clean up driver when we are cleaned up
threadLocal = threading.local()


def create_driver():
    """Return the calling thread's WebDriver, creating it on first use.

    The Driver wrapper is cached on ``threadLocal`` under the attribute
    ``the_driver``; later calls from the same thread reuse that wrapper and
    return its ``driver`` attribute.
    """
    wrapper = getattr(threadLocal, 'the_driver', None)
    if wrapper is not None:
        return wrapper.driver
    wrapper = Driver()
    threadLocal.the_driver = wrapper
    return wrapper.driver
class GameData:
    """Column-oriented accumulator for scraped matches.

    Every attribute is a list; parse_data appends one value to each list
    per match, so position i across the lists describes a single match.
    """

    def __init__(self):
        # Creation order matters: the main script passes the instance
        # __dict__ to pd.DataFrame, so this is also the column order.
        for column in ('date', 'time', 'game', 'score', 'home_odds',
                       'draw_odds', 'away_odds', 'country', 'league'):
            setattr(self, column, [])
def parse_data(url):
    """Scrape one page into a GameData (the version posted in the question).

    Loads ``url`` in the calling thread's browser, reads the first HTML table
    with pandas, and takes ONE country/league pair from the first
    ``th`` with class ``first2 tl`` found by BeautifulSoup; that pair is then
    attached to every row.

    Returns a GameData, or None when the KeyError handler fires or when no
    such ``th`` is found.
    """
    try:
        browser = create_driver()
        browser.get(url)
        # Only the first table on the page is used.
        df = pd.read_html(browser.page_source)[0]
    except KeyError:
        # NOTE(review): only KeyError is handled; any other exception raised
        # above propagates to the caller -- confirm which exception
        # read_html raises when the page has no table.
        print('KeyError')
        return None
    html = browser.page_source
    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    # NOTE(review): the second dict is passed as find()'s third positional
    # argument, so it is probably not applied as an id filter -- confirm
    # against the BeautifulSoup find() signature.
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    # Indexes 1 and 2 require at least three <a> tags in the header cell;
    # the traceback quoted after this listing shows count[2] raising
    # IndexError.
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        # itertuples() puts the index at row[0], so row[1] is the first column.
        if not isinstance(row[1], str):
            # Non-string first cell (e.g. NaN): not a usable row.
            continue
        elif ':' not in row[1]:
            # No colon, so not a time: remember the text before the first
            # '-' as the date for the rows that follow.
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        # The same page-level country/league is repeated on every row.
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data
# URLs go here
urls = {
    "https://www.oddsportal.com/matches/soccer/20210903/",
}

if __name__ == '__main__':
    # To limit the number of browsers we will use
    # (set to a large number if you don't want a limit):
    MAX_BROWSERS = 5
    # ThreadPool rejects a size of 0, so keep at least one worker even when
    # `urls` is empty.
    pool = ThreadPool(max(1, min(MAX_BROWSERS, len(urls))))
    frames = []
    try:
        for game_data in pool.imap(parse_data, urls):
            if game_data is None:
                continue
            frames.append(pd.DataFrame(game_data.__dict__))
    finally:
        # Release the worker threads even if a page raised.
        pool.close()
        pool.join()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame page by page. An empty frame (rather than None) keeps
    # the print/head calls below working when nothing was scraped.
    results = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    print(results)
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc
    gc.collect()  # a little extra insurance
    print(results.head())
我收到这个错误:
Traceback (most recent call last):
File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_13.py", line 107, in <module>
for game_data in pool.imap(parse_data, urls):
File "C:\Program Files\Python39\lib\multiprocessing\pool.py", line 870, in next
raise value
File "C:\Program Files\Python39\lib\multiprocessing\pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_13.py", line 72, in parse_data
league = count[2].text
IndexError: list index out of range
结果通常采用以下格式:
date time game score home_odds draw_odds away_odds country league
0 None 15:30 Wolves - Manchester Utd 0:1 393/100 69/25 39/50 England Premier League
1 None 13:00 Burnley - Leeds 1:1 231/100 64/25 123/100 England Premier League
2 None 13:00 Tottenham - Watford 1:0 23/50 87/25 709/100 England Premier League
3 28 Aug 2021 16:30 Liverpool - Chelsea 1:1 29/20 59/25 207/100 England Premier League
4 28 Aug 2021 14:00 Aston Villa - Brentford 1:1 109/100 58/25 74/25 England Premier League
5 28 Aug 2021 14:00 Brighton - Everton 0:2 33/25 113/50 239/100 England Premier League
6 28 Aug 2021 14:00 Newcastle - Southampton 2:2 73/50 257/100 189/100 England Premier League
如何获取数据?
详细:
我有一段代码可以循环抓取“下一场比赛”(next matches)的各个 url,我想修改它。oddsportal 上 'next matches' 对应的 XPath 是://*[@id="col-content"]/div[3]/div/div/span
这个图片。
请帮忙
我看到的唯一日期是页面顶部的标题“下一场足球比赛:”。
我看不出您最初使用 df = pd.read_html(browser.page_source)[0]
创建数据框、随后再迭代该数据框有什么意义;您应该直接迭代主 table 的 <tr> 标签。如果操作正确,您最终会在国家和联赛列中获得正确的值。
我还更改了一些变量名称,以更准确地反映它们所持有的值类型。此外,我已经稍微简化了您在 HTML 层次结构中的导航,认识到具有 id 属性的元素在文档中必须是唯一的,因此您可以通过该 id 直接检索它而不必先检索它的 parent.
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
import re
class Driver:
    """Own one headless Chrome WebDriver and quit it on garbage collection."""

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Suppress logging (comment this line out to see it again):
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        # __del__ also runs when __init__ raised before self.driver was
        # assigned (e.g. Chrome failed to start); guard so that case does not
        # pile an AttributeError on top of the real failure.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()  # clean up driver when we are cleaned up
threadLocal = threading.local()


def create_driver():
    """Return the calling thread's WebDriver, creating it on first use.

    The Driver wrapper is cached on ``threadLocal`` under the attribute
    ``the_driver``; later calls from the same thread reuse that wrapper and
    return its ``driver`` attribute.
    """
    wrapper = getattr(threadLocal, 'the_driver', None)
    if wrapper is not None:
        return wrapper.driver
    wrapper = Driver()
    threadLocal.the_driver = wrapper
    return wrapper.driver
class GameData:
    """Column-oriented accumulator for scraped matches.

    Every attribute is a list; parse_data appends one value to each list
    per match, so position i across the lists describes a single match.
    """

    def __init__(self):
        # Creation order matters: the main script passes the instance
        # __dict__ to pd.DataFrame, so this is also the column order.
        for column in ('date', 'time', 'game', 'score', 'home_odds',
                       'draw_odds', 'away_odds', 'country', 'league'):
            setattr(self, column, [])
def generate_matches(table):
    """Yield one tuple per match row of the main results table.

    Rows whose class list contains ``dark`` are treated as tournament
    headers: the first two ``<a>`` tags inside their ``th`` (class
    ``first2 tl``) give the country and league for the match rows that
    follow. Rows whose class list contains ``deactivate`` are match rows.

    Args:
        table: the parsed ``<table>`` element (a BeautifulSoup tag).

    Yields:
        ``(cell0, cell1, cell2, cell3, cell4, cell5, country, league)`` --
        the text of the first six ``<td>`` cells plus the current header
        values. ``country`` / ``league`` are None when no usable header has
        been seen for the row. Match rows with fewer than six cells are
        skipped.
    """
    # Start with "unknown" so a match row that precedes any header does not
    # hit an unbound local.
    country = league = None
    for tr_tag in table.find_all('tr'):
        tr_class = tr_tag.attrs.get('class', ())
        if 'dark' in tr_class:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.find_all('a') if th_tag is not None else []
            # Reset on every header so a malformed header never lets the
            # previous tournament's values leak onto the next matches.
            country = a_tags[0].text if len(a_tags) > 0 else None
            league = a_tags[1].text if len(a_tags) > 1 else None
        elif 'deactivate' in tr_class:
            td_tags = tr_tag.find_all('td')
            if len(td_tags) < 6:
                continue
            yield (*(td_tag.text for td_tag in td_tags[:6]), country, league)
def parse_data(url):
    """Scrape one matches page into a GameData.

    Loads ``url`` in the calling thread's browser, finds the ``table-main``
    table inside the ``col-content`` div, and records every match yielded by
    generate_matches. The date for all rows is taken from the end of the
    page's ``<h1>`` text.

    Args:
        url: address of the page to load.

    Returns:
        A GameData. It has no rows when the expected table is missing, and
        its ``date`` entries are None when no date can be read from the
        heading.
    """
    browser = create_driver()
    browser.get(url)
    soup = bs(browser.page_source, "lxml")
    game_data = GameData()

    div = soup.find('div', {'id': 'col-content'})
    table = div.find('table', {'class': 'table-main'}) if div is not None else None
    if table is None:
        # The page did not have the expected layout; report it and return an
        # empty result instead of failing with an opaque AttributeError.
        print(f'No match table found at {url}')
        return game_data

    h1_tag = soup.find('h1')
    # strip() so trailing whitespace in the heading cannot defeat the `$`.
    m = re.search(r'\d+ \w+ \d{4}$', h1_tag.text.strip()) if h1_tag is not None else None
    game_date = m[0] if m else None

    # Same order as the tuples yielded by generate_matches.
    columns = ('time', 'game', 'score', 'home_odds', 'draw_odds',
               'away_odds', 'country', 'league')
    for row in generate_matches(table):
        game_data.date.append(game_date)
        for column, value in zip(columns, row):
            getattr(game_data, column).append(value)
    return game_data
# URLs go here
urls = {
    "https://www.oddsportal.com/matches/soccer/20210903/",
}

if __name__ == '__main__':
    # To limit the number of browsers we will use
    # (set to a large number if you don't want a limit):
    MAX_BROWSERS = 5
    # ThreadPool rejects a size of 0, so keep at least one worker even when
    # `urls` is empty.
    pool = ThreadPool(max(1, min(MAX_BROWSERS, len(urls))))
    frames = []
    try:
        for game_data in pool.imap(parse_data, urls):
            # Tolerate a parser that signals "nothing usable" with None.
            if game_data is None:
                continue
            frames.append(pd.DataFrame(game_data.__dict__))
    finally:
        # Release the worker threads even if a page raised.
        pool.close()
        pool.join()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame page by page.
    results = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    print(results)
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc
    gc.collect()  # a little extra insurance
打印:
date time game score home_odds draw_odds away_odds country league
0 03 Sep 2021 00:00 Petrolera - Dep. Pasto 2:3 -128 +245 +334 Colombia Copa Colombia
1 03 Sep 2021 00:00 Jalapa - Export Sebaco 0:2 -137 +266 +307 Nicaragua Liga Primera
2 03 Sep 2021 00:00 Venezuela - Argentina 1:3 +799 +376 -270 World World Cup 2022
3 03 Sep 2021 00:05 Canada - Honduras 1:1 -196 +290 +597 World World Cup 2022
4 03 Sep 2021 01:00 Peru - Uruguay 1:1 +231 +204 +140 World World Cup 2022
.. ... ... ... ... ... ... ... ... ...
219 03 Sep 2021 23:00 Greenville - Toronto FC II 3:0 -147 +263 +363 USA USL League One
220 03 Sep 2021 23:30 Nashville SC - New York City 3:1 +166 +235 +166 USA MLS
221 03 Sep 2021 23:30 Philadelphia Union - New England Revolution 0:1 +164 +256 +154 USA MLS
222 03 Sep 2021 23:30 Louisville City - FC Tulsa 0:1 -233 +394 +459 USA USL Championship
223 03 Sep 2021 23:30 Tampa Bay - Oakland Roots 3:0 -227 +320 +573 USA USL Championship
[224 rows x 9 columns]