在 Python beautifulsoup 中正确抓取基于列 headers 的数据

Question

我正在尝试从 url 中获取数据。我的代码片段正在运行，但我需要更多想法来改进它。在我试图抓取的页面中，有 7 列 headers 但有些行只有 5 个数据。有没有更好的方法从列中获取数据（Parent Txn Hash, Type, From, Tom Value）？

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import re, random, ctypes
import requests, os
from time import sleep
from time import strftime

user_agent_list = [
"header = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0Gecko/20100101 Firefox/86.0'}",
"header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'}",
"header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15'}",
"header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'}",
"header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'}",
"header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36'}"
]
header = random.choice(user_agent_list)
trim = re.compile(r'[^\d,.]+')


url = "https://bscscan.com/txsInternal?a=0xbb4cdb9cbd36b01bd1cbaebf2de08d9173bc095c&ps=100&p=1"

reqblockdetails = requests.get(url,header, timeout=10)
soupblockdetails = BeautifulSoup(reqblockdetails.content, 'html.parser')
rowsblockdetails = soupblockdetails.findAll('table')[0].findAll('tr', limit=100)
for row in rowsblockdetails[1:]:
    rowr = row.find_all('td')[0].text[0:]
    if rowr != "":
        ptxnhash = row.find_all('td')[3].text[0:]  #-- is on [3]
        ptxnhash = str(ptxnhash).strip()
        exectype1 = row.find_all('td')[4].text[0:]
        source1 = row.find_all('td')[5].text[0:]
        destination1 = row.find_all('td')[7].text[0:]
        amount1 = row.find_all('td')[8].text[0:]
        amount1 = trim.sub('', amount1)
        rptblockdetails = str("%s" % ptxnhash + "\t%s" % source1 + " \t%s" % destination1 + " \t%s" % amount1)
        print (rptblockdetails)
        
    ptxnhash = row.find_all('td')[2].text[0:]  #-- while here is on [2]
    ptxnhash = str(ptxnhash).strip()
    if ptxnhash.startswith ('0x'):
        txnhashdetails = ptxnhash.strip()
        exectype = row.find_all('td')[3].text[0:]
        source = row.find_all('td')[4].text[0:]
        source = source.strip()
        destination = row.find_all('td')[6].text[0:]
        destination = str(destination).replace("['", "").replace("']", "")
        destination = destination.strip()
        amount = row.find_all('td')[7].text[0:]
        amount = trim.sub('', amount)       
        rptblockdetails = str("%s" % ptxnhash + " \t%s" % source  + " \t%s" % destination + " \t%s" % amount)
        print (rptblockdetails)

当前输出：

0xf864a52f67b55252bac44ff5a4187cae9c3044245ddc3e094286d1dc5876175f  Binance: WBNB Token     PancakeSwap: Router v2  0.123538253669756
0x7be9b2e0ea4c88853e91d2534aeb9542f9234313cc305085e6c808c806a87cbf  Binance: WBNB Token     PancakeSwap: Router v2  0.091183472723697
0x7be9b2e0ea4c88853e91d2534aeb9542f9234313cc305085e6c808c806a87cbf  Binance: WBNB Token     PancakeSwap: Router v2  0.027077169875612
0x02ab699bc69f09ee1ab315259260a8e9baa84bc1f72e9d285426f8be5c7e1c9a  Binance: WBNB Token     PancakeSwap: Router v2  0.560826883730127
0xe3fd62ca86f93a780fb63eb17356e28d4dc1d8d5431736750b9a03c181bd449a  Binance: WBNB Token     PancakeSwap: Router v2  0.031760167285687

Answer 1

此示例将从页面获取所有数据并创建 pandas DataFrame：

import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://bscscan.com/txsInternal?a=0xbb4cdb9cbd36b01bd1cbaebf2de08d9173bc095c&ps=100&p=1"

soup = BeautifulSoup(requests.get(url).content, "html.parser")

all_data = []
for row in soup.select("tr:has(td)"):
    tds = [td.get_text(strip=True) for td in row.select("td")]
    if len(tds) == 9:
        tds.pop(6)
        tds.pop(1)
    else:
        block = row.find_previous(
            lambda tag: tag.name == "td"
            and "bg-soft-secondary" in tag.attrs.get("class", [])
            and tag.find("a")
        )
        tds[0] = block.get_text(strip=True)
        tds[1] = block.find_next("td").find_next("td").get_text(strip=True)
        tds.pop(5)

    all_data.append(tds)

df = pd.DataFrame(
    all_data,
    columns=["Block", "Age", "Parent Txn Hash", "Type", "From", "To", "Value"],
)
print(df)
df.to_csv("data.csv", index=False)

打印：

      Block          Age                                                     Parent Txn Hash  Type                                        From                                          To                   Value
0   9655148  39 secs ago  0xe4757e4012ac1e5a2560174f9db3d74dd85fcda5c513b251301ef2239a7b0da1  call                         Binance: WBNB Token  0xcf0febd3f17cef5b47b0cd257acf6025c5bff3b7   0.032619188744045 BNB
1   9655148  39 secs ago  0xdaba21a2133a1c9244271f6ace04484d89bb69bf0fc0cebea9da83facf7c9156  call                                Mdex: Router                         Binance: WBNB Token                 0.5 BNB
2   9655148  39 secs ago  0x623f5eb7ee7452f2deebc5d228e6d426f7283773455905ae933cc5a3f294a8bb  call                         Binance: WBNB Token                      PancakeSwap: Router v2                0.78 BNB
3   9655148  39 secs ago  0x5e5bed183342a82ce1ab1d978d4b9a4059b0a7903736c6ea33ca332034c2fcc1  call                         Binance: WBNB Token                      PancakeSwap: Router v2   0.014049013918158 BNB
4   9655148  39 secs ago  0x407f274d8a45f5cf1bca80b4dcc1c3ab2c70f3281ebc77276d617b84aef257a2  call                      PancakeSwap: Router v2                         Binance: WBNB Token               0.395 BNB
5   9655148  39 secs ago  0xa6172d1aad2938d7640cef2b4189bc85c17f89ca621917fb06c351bc1ef3d9c4  call                         Binance: WBNB Token                      PancakeSwap: Router v2   0.661559048339038 BNB

...

并创建 data.csv（来自 LibreOffice 的屏幕截图）：

在 Python beautifulsoup 中正确抓取基于列 headers 的数据

Grabbing data based on column headers properly in Python beautifulsoup

python

beautifulsoup

dataformat

web-scraping

python-3.x