格式化 Python beautifulsoup 数据并删除重复的第一列值

Formatting Python beautifulsoup data and remove duplicates first columns values

我有以下已经可以使用的代码片段,但是我想通过删除一些重复的第一列数据来清理格式,并使其更具可读性。

from urllib.request import Request, urlopen
from urllib.parse import urljoin  # urljoin lives in urllib.parse, not urllib.request
from bs4 import BeautifulSoup
import re, random, ctypes
import requests
from time import sleep

url = 'https://bscscan.com/tokentxns'
# Each entry must be a real header dict. The originals were whole
# "header = {'User-Agent': ...}" strings, which requests cannot use.
user_agent_list = [
    {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36'},
]

header = random.choice(user_agent_list)
pausesleep = float(random.randint(10000, 30000)) / 10000  # random delay in [1.0, 3.0) seconds

# Pass the dict via the `headers=` keyword; as a second positional argument
# it would be sent as the query-string `params` instead of as HTTP headers.
req = requests.get(url, headers=header, timeout=10)
soup = BeautifulSoup(req.content, 'html.parser')
rows = soup.find_all('table')[0].find_all('tr')

# Skip the header row; column layout: 1=txn hash, 7=value, 8=token name.
for row in rows[1:]:
    tds = row.find_all('td')
    txnhash = tds[1].text
    value = tds[7].text
    token = tds[8].text
    link = urljoin(url, tds[8].find('a')['href'])  # resolve relative token link
    print(str(txnhash) + "  " + str(value) + "   " + str(token))

当前输出:

0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915  899.885819768    TrusterCoin (TSC)
0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915  0.62679168    Wrapped BNB (WBNB)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398  388,214,984,514.909719227    WoofCoin (WOOF)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398  0.003    Wrapped BNB (WBNB)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  26.737674146727101117    Binance-Peg ... (BUSD)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  1.251364193609566793    Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  0.03997685638568537    Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  0.041171860015645402    Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  0.089939749761843203    Wrapped BNB (WBNB)

需要改进:

0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915  899.885819768                 TrusterCoin (TSC)
                                                                    0.62679168                    Wrapped BNB (WBNB)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398  388,214,984,514.909719227     WoofCoin (WOOF)
                                                                    0.003                         Wrapped BNB (WBNB)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  26.737674146727101117         Binance-Peg ... (BUSD)
                                                                    1.251364193609566793          Binance-Peg ... (ADA)
                                                                    0.03997685638568537           Binance-Peg ... (ADA)
                                                                    0.041171860015645402          Binance-Peg ... (ADA)
                                                                    0.089939749761843203          Wrapped BNB (WBNB)

试试这个:

from urllib.request import Request, urlopen
from urllib.parse import urljoin  # BUGFIX: urljoin is in urllib.parse; importing it from urllib.request raises ImportError
from bs4 import BeautifulSoup
import re, random, ctypes
import requests
from time import sleep

url = 'https://bscscan.com/tokentxns'
# Real header dicts (the original "header = {...}" strings are unusable by requests).
user_agent_list = [
    {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36'},
]

header = random.choice(user_agent_list)
pausesleep = float(random.randint(10000, 30000)) / 10000  # random delay in [1.0, 3.0) seconds

# Use headers= keyword (a positional second argument becomes query params).
req = requests.get(url, headers=header, timeout=10)
soup = BeautifulSoup(req.content, 'html.parser')
rows = soup.find_all('table')[0].find_all('tr')

HASH_WIDTH = 66  # a 0x-prefixed tx hash is 66 chars; pad so repeated rows line up

seen = set()  # O(1) membership test instead of the original list's O(n) scan
for row in rows[1:]:
    tds = row.find_all('td')
    txnhash = tds[1].text
    value = tds[7].text
    token = tds[8].text
    link = urljoin(url, tds[8].find('a')['href'])
    if txnhash not in seen:
        # First occurrence of this hash: print it, padded to a fixed width.
        seen.add(txnhash)
        print(txnhash.ljust(HASH_WIDTH), end="  ")
    else:
        # Repeat: blank column of the same width keeps values aligned,
        # unlike the original "\t\t\t" which drifts out of alignment.
        print(" " * HASH_WIDTH, end="  ")
    # Pad the value column too, matching the desired sample output.
    print(value.ljust(29) + " " + token)

思路:用一个集合记录已经见过的 txnhash,每行输出前先检查当前 txnhash 是否已出现过;若是首次出现则打印哈希值,否则只打印等宽的空白占位,从而对齐各列并避免重复。