python 中的网络抓取工具 beautifulsoup 个不同的标签

web scraper beautifulsoup different tags in python

我正在尝试抓取股票网站,抓取行业和行业(下一个问题)并将其添加到 csv。我在 1 页上获得了我想要的信息,但下一页不同,所以这就是我卡住的地方

share_details1 = soup.find('a', href="../Industry/Industry_Data.php?s=100") 结果:基础材料 我想找到范围从 100 - 1300 到数百的标签,如 href="../Industry/Industry_Data.php?s=200" 300 400 500 600 等等到 1300

from bs4 import BeautifulSoup as bs
import csv
import requests

LSE = 'https://csimarket.com/stocks/at_glance.php?code=aa'

def get_stocks():
    with open('tickers.csv') as ticker_file:
        return list(map(lambda ticker: ticker.strip(), ticker_file))


def to_csv(stocks):
    with open('stocks.csv', 'w') as sectors:
        writer = csv.writer(sectors)
        writer.writerow(stocks[0].keys())
        for stock in stocks:
            writer.writerow(stock.values())


def get_soup(url):
    return bs(requests.get(url).text, 'html.parser')

def get_sector(ticker):
    soup = get_soup(LSE + ticker)
    try:
        share_details1 = soup.find('a', href="../Industry/Industry_Data.php?s=100")
        messy = share_details1.find("span")
        messy.decompose()
        sector = share_details1.text.strip()

    except:
        print('No sector information availible for ', ticker)
        return {'ticker': ticker, 'sector': ''}

    print(ticker, sector)
    return {'ticker': ticker, 'sector': sector}


def get_industry(ticker):
    soup1 = get_soup(LSE + ticker)
    try:
        share_details1 = soup1.find('a', href="../Industry/Industry_Data.php?ind=104")
        messy = share_details1.find("span")
        messy.decompose()
        industry = share_details1.text.strip()

    except:
        print('No industry information availible for ', ticker)
        return {'ticker': ticker, 'industry': ''}

    print(ticker, industry)
    return {'ticker': ticker, 'industry': industry}


if __name__ == '__main__':
    to_csv(list(map(lambda ticker: get_sector(ticker), get_stocks())))
    # to_csv(list(map(lambda ticker: get_industry(ticker), get_stocks())))

这里是 csv 的样本

ticker,sector
A,
AA,Basic Materials
AADI,
AAIC,
AAL,
AAN,
AAOI,
AAON,
AAP,
AAPL,
AAT,
AAU,Basic Materials
AAWW,
AB,
ABB,
ABBV,
ABC,
ABCB,
ABCL,
ABEO,
ABEV,
ABG,
ABIO,
ABM,
ABMD,
ABNB,
ABOS,
ABR,
ABSI,
ABST,
ABT,
ABTX,
ABUS,
ACA,Basic Materials
ACAD,
ACB,
ACC,
ACCD,
ACCO,Basic Materials
ACEL,
ACER,
ACET,
ACEV,
ACGL,
ACH,Basic Materials
ACHC,
ACHR,
ACHV,
ACI,
ACIU,

看起来这些 href 是动态的。你最好只寻找 'Sector''Industry' 然后解析它。

您也可以使用正则表达式提取该信息。但这是我的解决方法。

from bs4 import BeautifulSoup as bs
import pandas as pd
import re
import requests

LSE = 'https://csimarket.com/stocks/at_glance.php?code='

def get_stocks():
    df = pd.read_csv('tickers.csv')
    return list(df['ticker'])


def to_csv(stocks):
    df = pd.DataFrame(stocks)
    df.to_csv('stocks.csv', index=False)

def get_soup(url):
    return bs(requests.get(url).text, 'html.parser')

def get_sector(ticker):
    soup = get_soup(LSE + ticker)
    try:
        sector = soup.find('span', text='Sector').find_next('a').text.replace('\n','').replace('•','').strip()
    except:
        print('No sector information availible for ', ticker)
        return {'ticker': ticker, 'sector': ''}

    print(ticker, sector)
    return {'ticker': ticker, 'sector': sector}


def get_industry(ticker):
    soup1 = get_soup(LSE + ticker)
    try:
        industry = soup1.find('span', text='Industry').find_next('a').text.replace('\n','').replace('•','').strip()
    except:
        print('No industry information availible for ', ticker)
        return {'ticker': ticker, 'industry': ''}

    print(ticker, industry)
    return {'ticker': ticker, 'industry': industry}


if __name__ == '__main__':
    to_csv(list(map(lambda ticker: get_sector(ticker), get_stocks())))
    # to_csv(list(map(lambda ticker: get_industry(ticker), get_stocks())))