Formatting Python BeautifulSoup data and removing duplicate first-column values
I have the following snippet of code that already works, but I'd like to clean up the formatting by removing the repeated first-column values so the output is more readable.
from urllib.parse import urljoin  # urljoin lives in urllib.parse
from bs4 import BeautifulSoup
import random
import requests
from time import sleep

url = 'https://bscscan.com/tokentxns'

# Plain User-Agent strings; one is picked at random per request
user_agent_list = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
]

headers = {'User-Agent': random.choice(user_agent_list)}
pausesleep = float(random.randint(10000, 30000)) / 10000  # 1-3 s delay, for sleep(pausesleep) between requests

req = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(req.content, 'html.parser')

rows = soup.find_all('table')[0].find_all('tr')
for row in rows[1:]:  # skip the header row
    tds = row.find_all('td')
    txnhash = tds[1].text
    age = tds[2].text
    value = tds[7].text
    token = tds[8].text
    link = urljoin(url, tds[8].find('a')['href'])
    print(txnhash + " " + value + " " + token)
Current output:
0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915 899.885819768 TrusterCoin (TSC)
0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915 0.62679168 Wrapped BNB (WBNB)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398 388,214,984,514.909719227 WoofCoin (WOOF)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398 0.003 Wrapped BNB (WBNB)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b 26.737674146727101117 Binance-Peg ... (BUSD)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b 1.251364193609566793 Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b 0.03997685638568537 Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b 0.041171860015645402 Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b 0.089939749761843203 Wrapped BNB (WBNB)
Desired output:
0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915 899.885819768 TrusterCoin (TSC)
0.62679168 Wrapped BNB (WBNB)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398 388,214,984,514.909719227 WoofCoin (WOOF)
0.003 Wrapped BNB (WBNB)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b 26.737674146727101117 Binance-Peg ... (BUSD)
1.251364193609566793 Binance-Peg ... (ADA)
0.03997685638568537 Binance-Peg ... (ADA)
0.041171860015645402 Binance-Peg ... (ADA)
0.089939749761843203 Wrapped BNB (WBNB)
Try this:
from urllib.parse import urljoin  # urljoin lives in urllib.parse, not urllib.request
from bs4 import BeautifulSoup
import random
import requests
from time import sleep

url = 'https://bscscan.com/tokentxns'

user_agent_list = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
]

headers = {'User-Agent': random.choice(user_agent_list)}
pausesleep = float(random.randint(10000, 30000)) / 10000

req = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(req.content, 'html.parser')

rows = soup.find_all('table')[0].find_all('tr')
ne = []  # transaction hashes already printed
for row in rows[1:]:  # skip the header row
    tds = row.find_all('td')
    txnhash = tds[1].text
    age = tds[2].text
    value = tds[7].text
    token = tds[8].text
    link = urljoin(url, tds[8].find('a')['href'])
    if txnhash not in ne:
        ne.append(txnhash)
        print(txnhash, end=" ")
    else:  # keeps the columns roughly aligned; drop the else if you don't want the padding
        print("\t\t\t", end=" ")
    print(value + " " + token)
We build up a list of txnhash values in ne, then check each new txnhash against that list; only the first occurrence of a hash prints it.
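One caveat on the alignment: three tabs won't necessarily match the width of a 66-character transaction hash, so the continuation rows can drift. A minimal variant (a sketch on made-up sample rows, not live scraped data) pads with spaces equal to the hash length instead:

# Sketch: same dedup idea on hard-coded sample tuples (hypothetical data),
# padding repeats with spaces so the value/token columns stay aligned.
sample_rows = [
    ("0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915", "899.885819768", "TrusterCoin (TSC)"),
    ("0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915", "0.62679168", "Wrapped BNB (WBNB)"),
    ("0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398", "0.003", "Wrapped BNB (WBNB)"),
]
ne = []
for txnhash, value, token in sample_rows:
    if txnhash not in ne:
        ne.append(txnhash)
        print(txnhash, end=" ")
    else:
        # pad to the exact hash width instead of printing tabs
        print(" " * len(txnhash), end=" ")
    print(value + " " + token)

If the table were long, a set would give O(1) membership checks (ne = set() with ne.add(txnhash)), but for the ~25 rows per page here a list is fine.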