使用 pandas read_html 抓取时将 table 行分隔为 2
Separate table row to 2 when scraping with pandas read_html
使用 pandas read_html()
时无法正确获取 行 格式。我正在寻找对方法本身或底层 html(通过 bs4 抓取)进行调整以获得所需的输出。
当前输出:
(注意是1行包含两种类型的数据。理想情况下应该分成2行如下)
期望:
复制问题的代码:
# Reproduction script: pandas read_html() collapses the two <p> tags in each
# table cell into a single row; the bs4 fetch below is the starting point for
# the fixes in the answers.
import requests
import pandas as pd
from bs4 import BeautifulSoup  # alternatively

url = "http://ufcstats.com/fight-details/bb15c0a2911043bd"
df = pd.read_html(url)[-1]  # last table
df.columns = [str(i) for i in range(len(df.columns))]

# to get the html via bs4
# NOTE(review): the Access-Control-* entries are CORS *response* headers and
# have no effect when sent by a client; only User-Agent matters here.
headers = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Max-Age": "3600",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
}
# BUG FIX: requests.get(url, headers) binds the dict to the second positional
# parameter `params`, sending it as a query string instead of HTTP headers.
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, "html.parser")
table_html = soup.find_all("table", {"class": "b-fight-details__table"})[-1]
如何(快速)修复 beautifulsoup
您可以使用 table
中的 headers 创建一个 dict
,然后遍历每个 td
以附加其中 p 标签里存储的值列表:
# Map each column header to the list of per-fighter values held in the
# two <p> tags of the corresponding cell, then build the DataFrame.
column_names = [th.text.strip() for th in table_html.select('tr th')]
cells = table_html.select('tr:has(td) td')
data = {
    column_names[idx]: [p.text.strip() for p in cell.select('p')]
    for idx, cell in enumerate(cells)
}
pd.DataFrame.from_dict(data)
例子
# Full example: fetch the fight page, grab the last stats table, and rebuild
# it as one row per fighter by splitting the two <p> tags inside each cell.
import requests
import pandas as pd
from bs4 import BeautifulSoup  # alternatively

url = "http://ufcstats.com/fight-details/bb15c0a2911043bd"

# to get the html via bs4
headers = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Max-Age": "3600",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
}
# BUG FIX: pass the dict via the `headers` keyword; positionally it would be
# bound to `params` and serialized into the query string.
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, "html.parser")
table_html = soup.find_all("table", {"class": "b-fight-details__table"})[-1]

# One dict entry per column; each value is the list of the cell's <p> texts,
# so the two fighters end up in two separate rows.
data = {}
header = [x.text.strip() for x in table_html.select('tr th')]
for i, td in enumerate(table_html.select('tr:has(td) td')):
    data[header[i]] = [x.text.strip() for x in td.select('p')]
pd.DataFrame.from_dict(data)
输出
Fighter
Sig. str
Sig. str. %
Head
Body
Leg
Distance
Clinch
Ground
Joanne Wood
27 of 68
39%
8 of 36
3 of 7
16 of 25
26 of 67
1 of 1
0 of 0
Taila Santos
30 of 60
50%
21 of 46
3 of 7
6 of 7
19 of 42
0 of 0
11 of 18
与使用枚举确定行数的想法类似,但使用 :-soup-contains
定位 table,然后使用 nth-child
选择器在列表理解期间提取相关行。 pandas
将列表的结果列表转换为 DataFrame。这里假设新增的行会遵循与当前这 2 行相同的模式。
# Alternative approach: locate the "Significant Strikes" table via
# :-soup-contains, then pull one output row per <p> in the first cell
# using nth-child selectors.
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

response = requests.get('http://ufcstats.com/fight-details/bb15c0a2911043bd')
soup = bs(response.content, 'lxml')

# The stats table immediately follows the section whose <p> mentions
# "Significant Strikes".
table = soup.select_one(
    '.js-fight-section:has(p:-soup-contains("Significant Strikes")) + table')

# One output row per <p> in the first cell (i.e. one per fighter).
row_markers = table.select('tr:nth-child(1) > td:nth-child(1) > p')
rows = []
for n, _ in enumerate(row_markers):
    cells = table.select(f'tr:nth-child(1) td p:nth-child({n+1})')
    rows.append([cell.text.strip() for cell in cells])

column_labels = [th.text.strip() for th in table.select('th')]
df = pd.DataFrame(rows, columns=column_labels)
print(df)
使用 pandas read_html()
时无法正确获取 行 格式。我正在寻找对方法本身或底层 html(通过 bs4 抓取)进行调整以获得所需的输出。
当前输出:
期望:
复制问题的代码:
# Reproduction script (duplicate section): read_html() merges both fighters'
# stats into one row; the bs4 fetch is the basis for the fixes below.
import requests
import pandas as pd
from bs4 import BeautifulSoup  # alternatively

url = "http://ufcstats.com/fight-details/bb15c0a2911043bd"
df = pd.read_html(url)[-1]  # last table
df.columns = [str(i) for i in range(len(df.columns))]

# to get the html via bs4
# NOTE(review): the Access-Control-* entries are CORS *response* headers;
# sending them from a client is a no-op.
headers = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Max-Age": "3600",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
}
# BUG FIX: must be headers=headers; positionally the dict is treated as
# `params` and sent in the query string rather than as HTTP headers.
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, "html.parser")
table_html = soup.find_all("table", {"class": "b-fight-details__table"})[-1]
如何(快速)修复 beautifulsoup
您可以使用 table
中的 headers 创建一个 dict
,然后遍历每个 td
以附加其中 p 标签里存储的值列表:
# Build {header -> [p-tag texts]} for each cell of the data row, then
# hand the mapping to pandas.
column_names = [th.text.strip() for th in table_html.select('tr th')]
cells = table_html.select('tr:has(td) td')
data = {
    column_names[idx]: [p.text.strip() for p in cell.select('p')]
    for idx, cell in enumerate(cells)
}
pd.DataFrame.from_dict(data)
例子
# Full example (duplicate section): fetch the page, take the last stats
# table, and split each cell's two <p> tags into one row per fighter.
import requests
import pandas as pd
from bs4 import BeautifulSoup  # alternatively

url = "http://ufcstats.com/fight-details/bb15c0a2911043bd"

# to get the html via bs4
headers = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Max-Age": "3600",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
}
# BUG FIX: use the `headers` keyword; the second positional parameter of
# requests.get is `params`, which would turn the dict into a query string.
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, "html.parser")
table_html = soup.find_all("table", {"class": "b-fight-details__table"})[-1]

# One dict entry per column header; each value lists the cell's <p> texts,
# giving two DataFrame rows (one per fighter).
data = {}
header = [x.text.strip() for x in table_html.select('tr th')]
for i, td in enumerate(table_html.select('tr:has(td) td')):
    data[header[i]] = [x.text.strip() for x in td.select('p')]
pd.DataFrame.from_dict(data)
输出
Fighter | Sig. str | Sig. str. % | Head | Body | Leg | Distance | Clinch | Ground |
---|---|---|---|---|---|---|---|---|
Joanne Wood | 27 of 68 | 39% | 8 of 36 | 3 of 7 | 16 of 25 | 26 of 67 | 1 of 1 | 0 of 0 |
Taila Santos | 30 of 60 | 50% | 21 of 46 | 3 of 7 | 6 of 7 | 19 of 42 | 0 of 0 | 11 of 18 |
与使用枚举确定行数的想法类似,但使用 :-soup-contains
定位 table,然后使用 nth-child
选择器在列表理解期间提取相关行。 pandas
将列表的结果列表转换为 DataFrame。这里假设新增的行会遵循与当前这 2 行相同的模式。
# Alternative approach (duplicate section): find the "Significant Strikes"
# table with :-soup-contains and extract one row per fighter via nth-child.
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

response = requests.get('http://ufcstats.com/fight-details/bb15c0a2911043bd')
soup = bs(response.content, 'lxml')

# The stats table directly follows the section containing the
# "Significant Strikes" paragraph.
table = soup.select_one(
    '.js-fight-section:has(p:-soup-contains("Significant Strikes")) + table')

# The first cell holds one <p> per fighter: that count fixes the row count.
row_markers = table.select('tr:nth-child(1) > td:nth-child(1) > p')
rows = []
for n, _ in enumerate(row_markers):
    cells = table.select(f'tr:nth-child(1) td p:nth-child({n+1})')
    rows.append([cell.text.strip() for cell in cells])

column_labels = [th.text.strip() for th in table.select('th')]
df = pd.DataFrame(rows, columns=column_labels)
print(df)