Appending results with Pandas and BeautifulSoup
Question: I have a list of websites that I want BeautifulSoup and Pandas to pull table data from, and I want to append the results of every iteration to the same xlsx or csv file. My current code below iterates over each of the 3 sites, but the final product is only the last page scraped. If I remove my export step and just print df, I can see all 3 pages of data, so I'm not sure how to correctly append each iteration to my output file.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from time import gmtime, strftime

# Pass in the URLs
url = ["https://www.nfl.com/standings/league/2021/reg", "https://www.nfl.com/standings/league/2020/reg", "https://www.nfl.com/standings/league/2019/reg"]

for site in url:
    # Load the page html
    page = requests.get(site)
    soup = BeautifulSoup(page.text, 'lxml')
    # Get all the table data
    table = soup.find('table', {'summary': 'Standings - Detailed View'})
    headers = []
    for i in table.find_all('th'):
        title = i.text.strip()
        headers.append(title)
    # Dataframe the headers into columns
    df = pd.DataFrame(columns=headers)
    # TR for the rows, TD for the values
    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        length = len(df)
        df.loc[length] = row_data

    # Write the collected data out to an Excel file
    dateTime = strftime("%d%b%Y_%H%M", gmtime())
    writer = pd.ExcelWriter(dateTime + "Z" + ".xlsx")
    df.to_excel(writer)
    writer.save()
    print('[*] Data successfully written to Excel File.')
Try the following. You need to collect the dataframe from each url, concatenate them, and then write the combined df to excel. This should work, but it is untested. See the inline comments.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from time import gmtime, strftime

# Pass in the URLs
url = ["https://www.nfl.com/standings/league/2021/reg", "https://www.nfl.com/standings/league/2020/reg", "https://www.nfl.com/standings/league/2019/reg"]

df_hold_list = []  # collect each dataframe separately

for site in url:
    # Load the page html
    page = requests.get(site)
    soup = BeautifulSoup(page.text, 'lxml')
    # Get all the table data
    table = soup.find('table', {'summary': 'Standings - Detailed View'})
    headers = []
    for i in table.find_all('th'):
        title = i.text.strip()
        headers.append(title)
    # Dataframe the headers into columns
    df = pd.DataFrame(columns=headers)
    # TR for the rows, TD for the values
    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        length = len(df)
        df.loc[length] = row_data
    df_hold_list.append(df)  # add each df to the list

# Each season's table has the same columns, so stack them row-wise (axis=0)
final_df = pd.concat(df_hold_list, axis=0, ignore_index=True)

# Write the collected data out to an Excel file -- moved out of the loop,
# so it runs once with all the data
dateTime = strftime("%d%b%Y_%H%M", gmtime())
with pd.ExcelWriter(dateTime + "Z" + ".xlsx") as writer:
    final_df.to_excel(writer)  # write final_df to excel
print('[*] Data successfully written to Excel File.')