BeautifulSoup、请求、数据帧保存到 Excel 数组错误
BeautifulSoup, Requests, Dataframe Saving to Excel arrays error
我是 Python 的新手,正在帮助完成一个学校项目。任何帮助深表感谢。谢谢。当它到达 2004 年和 2003 年时出现错误。这是由 result_list 列表引起的。错误是 "ValueError: arrays must all be same length"。我怎样才能引入解决这个问题的代码。分数很重要....
import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup
#from openpyxl.writer.excel import ExcelWriter
import openpyxl
#from openpyxl import load_workbook
import csv
year_id = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003']
i=0
while i <= len(year_id)-1:
url = 'https://lehighsports.com/sports/mens-soccer/schedule/' + str(year_id[i])
lehigh = requests.get(url).text
soup = BeautifulSoup(lehigh,'lxml')
date_list = []
for date in soup.find_all('div',class_="sidearm-schedule-game-opponent-date"):
date_list.append(date.get_text(strip=True, separator=' '))
name_list = []
for name in soup.find_all('div',class_="sidearm-schedule-game-opponent-name"):
name_list.append(name.get_text(strip=True, separator=' '))
result_list = []
for result in soup.find_all('div',class_="sidearm-schedule-game-result"):
result_list.append(result.get_text(strip=True, separator=' '))
opp_list = []
for opp in soup.find_all('div',class_="sidearm-schedule-game-opponent-text"):
opp_list.append(opp.get_text(strip=True, separator=' '))
conf_list = []
for conf in soup.find_all('div',class_="sidearm-schedule-game-conference-conference"):
conf_list.append(conf.get_text(strip=True))
dict = {'date':date_list,'opponent':name_list,'result':result_list,'list':opp_list,'conference':conf_list}
df = pd.DataFrame(dict)
workbook1 = openpyxl.load_workbook('lehigh.xlsx')
writer = pd.ExcelWriter('lehigh.xlsx', engine='openpyxl')
writer.book = workbook1
df.to_excel(writer, sheet_name=str(year_id[i]),index=False,startrow=0,startcol=0)
writer.save()
writer.close()
i = i+1
Code is updated:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from itertools import zip_longest
d = []
n = []
res = []
op = []
yr = []
with requests.Session() as req:
for year in range(2003, 2020):
print(f"Extracting Year# {year}")
r = req.get(
f"https://lehighsports.com/sports/mens-soccer/schedule/{year}")
if r.status_code == 200:
soup = BeautifulSoup(r.text, 'html.parser')
for date in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-date flex-item-1'}):
d.append(date.get_text(strip=True, separator=" "))
for name in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-name'}):
n.append(name.get_text(strip=True))
for result in soup.findAll("div", {'class': 'sidearm-schedule-game-result'}):
result = result.get_text(strip=True)
res.append(result)
if len(d) != len(res):
res.append("None")
for opp in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-text'}):
op.append(opp.get_text(strip=True, separator=' '))
yr.append(year)
data = []
for items in zip_longest(yr, d, n, res, op):
data.append(items)
df = pd.DataFrame(data, columns=['Year', 'Date', 'Name', 'Result', 'Opponent']).to_excel(
'lehigh.xlsx', index=False)
输出:check-online
几件事:
- 您不需要遍历索引。只需简单地遍历列表
- 你得到错误的原因是结果列表的长度是 23,而你的其他列表的长度是 24。所以你需要弄清楚如何处理空值,并处理它们是否落下 (他们可能并不总是最后一个条目)
我会怎么做,我会抓取每一行,然后为此拉取数据,而不是将每个实体拉到列表中。然后,我获取网站上的所有这些行并创建一个 table,并制作一个 table 的列表(每年 1 个 table)。处理缺失数据的方法是使用 try/except。我还添加了一个小函数(找到 here),它将获取 table 的列表并将它们写入单独的 excel 工作表。
import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup
#from openpyxl.writer.excel import ExcelWriter
import openpyxl
#from openpyxl import load_workbook
import csv
year_id = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003']
results = []
for year in year_id:
url = 'https://lehighsports.com/sports/mens-soccer/schedule/' + year
print (url)
lehigh = requests.get(url).text
soup = BeautifulSoup(lehigh,'lxml')
rows = soup.find_all('div',class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
sheet = pd.DataFrame()
for row in rows:
date = row.find('div',class_="sidearm-schedule-game-opponent-date").text.strip()
name = row.find('div',class_="sidearm-schedule-game-opponent-name").text.strip()
opp = row.find('div',class_="sidearm-schedule-game-opponent-text").text.strip()
conf = row.find('div',class_="sidearm-schedule-game-conference-conference").text.strip()
try:
result = row.find('div',class_="sidearm-schedule-game-result").text.strip()
except:
result = ''
df = pd.DataFrame([[year,date,name,opp,conf,result]], columns=['year','date','opponent','list','conference','result'])
sheet = sheet.append(df,sort=True).reset_index(drop=True)
results.append(sheet)
def save_xls(list_dfs, xls_path):
with ExcelWriter(xls_path) as writer:
for n, df in enumerate(list_dfs):
df.to_excel(writer,'%s' %year_id[n],index=False,)
writer.save()
save_xls(results,'lehigh.xlsx')
我是 Python 的新手,正在帮助完成一个学校项目。任何帮助深表感谢。谢谢。当它到达 2004 年和 2003 年时出现错误。这是由 result_list 列表引起的。错误是 "ValueError: arrays must all be same length"。我怎样才能引入解决这个问题的代码。分数很重要....
import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup
#from openpyxl.writer.excel import ExcelWriter
import openpyxl
#from openpyxl import load_workbook
import csv
year_id = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003']
i=0
while i <= len(year_id)-1:
url = 'https://lehighsports.com/sports/mens-soccer/schedule/' + str(year_id[i])
lehigh = requests.get(url).text
soup = BeautifulSoup(lehigh,'lxml')
date_list = []
for date in soup.find_all('div',class_="sidearm-schedule-game-opponent-date"):
date_list.append(date.get_text(strip=True, separator=' '))
name_list = []
for name in soup.find_all('div',class_="sidearm-schedule-game-opponent-name"):
name_list.append(name.get_text(strip=True, separator=' '))
result_list = []
for result in soup.find_all('div',class_="sidearm-schedule-game-result"):
result_list.append(result.get_text(strip=True, separator=' '))
opp_list = []
for opp in soup.find_all('div',class_="sidearm-schedule-game-opponent-text"):
opp_list.append(opp.get_text(strip=True, separator=' '))
conf_list = []
for conf in soup.find_all('div',class_="sidearm-schedule-game-conference-conference"):
conf_list.append(conf.get_text(strip=True))
dict = {'date':date_list,'opponent':name_list,'result':result_list,'list':opp_list,'conference':conf_list}
df = pd.DataFrame(dict)
workbook1 = openpyxl.load_workbook('lehigh.xlsx')
writer = pd.ExcelWriter('lehigh.xlsx', engine='openpyxl')
writer.book = workbook1
df.to_excel(writer, sheet_name=str(year_id[i]),index=False,startrow=0,startcol=0)
writer.save()
writer.close()
i = i+1
Code is updated:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from itertools import zip_longest
d = []
n = []
res = []
op = []
yr = []
with requests.Session() as req:
for year in range(2003, 2020):
print(f"Extracting Year# {year}")
r = req.get(
f"https://lehighsports.com/sports/mens-soccer/schedule/{year}")
if r.status_code == 200:
soup = BeautifulSoup(r.text, 'html.parser')
for date in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-date flex-item-1'}):
d.append(date.get_text(strip=True, separator=" "))
for name in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-name'}):
n.append(name.get_text(strip=True))
for result in soup.findAll("div", {'class': 'sidearm-schedule-game-result'}):
result = result.get_text(strip=True)
res.append(result)
if len(d) != len(res):
res.append("None")
for opp in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-text'}):
op.append(opp.get_text(strip=True, separator=' '))
yr.append(year)
data = []
for items in zip_longest(yr, d, n, res, op):
data.append(items)
df = pd.DataFrame(data, columns=['Year', 'Date', 'Name', 'Result', 'Opponent']).to_excel(
'lehigh.xlsx', index=False)
输出:check-online
几件事:
- 您不需要遍历索引。只需简单地遍历列表
- 你得到错误的原因是结果列表的长度是 23,而你的其他列表的长度是 24。所以你需要弄清楚如何处理空值,并处理它们是否落下 (他们可能并不总是最后一个条目)
我会怎么做,我会抓取每一行,然后为此拉取数据,而不是将每个实体拉到列表中。然后,我获取网站上的所有这些行并创建一个 table,并制作一个 table 的列表(每年 1 个 table)。处理缺失数据的方法是使用 try/except。我还添加了一个小函数(找到 here),它将获取 table 的列表并将它们写入单独的 excel 工作表。
import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup
#from openpyxl.writer.excel import ExcelWriter
import openpyxl
#from openpyxl import load_workbook
import csv
year_id = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003']
results = []
for year in year_id:
url = 'https://lehighsports.com/sports/mens-soccer/schedule/' + year
print (url)
lehigh = requests.get(url).text
soup = BeautifulSoup(lehigh,'lxml')
rows = soup.find_all('div',class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
sheet = pd.DataFrame()
for row in rows:
date = row.find('div',class_="sidearm-schedule-game-opponent-date").text.strip()
name = row.find('div',class_="sidearm-schedule-game-opponent-name").text.strip()
opp = row.find('div',class_="sidearm-schedule-game-opponent-text").text.strip()
conf = row.find('div',class_="sidearm-schedule-game-conference-conference").text.strip()
try:
result = row.find('div',class_="sidearm-schedule-game-result").text.strip()
except:
result = ''
df = pd.DataFrame([[year,date,name,opp,conf,result]], columns=['year','date','opponent','list','conference','result'])
sheet = sheet.append(df,sort=True).reset_index(drop=True)
results.append(sheet)
def save_xls(list_dfs, xls_path):
with ExcelWriter(xls_path) as writer:
for n, df in enumerate(list_dfs):
df.to_excel(writer,'%s' %year_id[n],index=False,)
writer.save()
save_xls(results,'lehigh.xlsx')