Python BeautifulSoup "weird" 错误

Python BeautifulSoup "weird" errors

我在迷雾中的某个地方纠结了自己运行这些代码给我“奇怪”的错误,似乎我缺少一个模块,但即使在多次阅读错误消息后似乎也无法让它工作.

有人知道这里出了什么问题吗? 新年快乐,提前致谢!

import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    


    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
        return soup


def get_detail_data(soup):
    try:
        product = soup.find('span',{'class':'a-size-large product-title-word-break'}).text
    except:
        product = ''

    try:
        price = soup.find('span',{'class':'a-size-medium a-color-price priceBlockBuyingPriceString'}).text.strip()
        currency, price = p.split(' ')
    except:
        currency = ''
        price = ''
    try:
        amount = soup.find('span', class_='a-size-medium a-color-state').find('a').text.strip()

    except:
        amount = ''

    data = {
        'product': product,
        'price': price,
        'currency': currency,
        'amount': amount,
    }
    return data

def get_index_data(soup):
    try:
        links = soup.find_all('a',class_='a-link-normal a-text-normal')
    except:
        links = []
    urls = [item.get('href') for item in links]

    return urls

def write_csv(data, url):
    with open('hardware.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)

        row = [data['title'], data['price'], data['currency'], data['amount'], url]

        writer.writerow(row)



def main():
    url = 'https://www.amazon.se/s?k=grafikkort&page=1'

    products = get_index_data(get_page(url))

    for link in products:
        data = get_detail_data(get_page(link))
        write_csv(data, link)

if __name__ == '__main__':
    main()

以及错误消息。

Traceback (most recent call last):
  File "scrp.py", line 75, in <module>
    main()
  File "scrp.py", line 71, in main
    data = get_detail_data(get_page(link))
  File "scrp.py", line 7, in get_page
    response = requests.get(url)
  File "/usr/lib/python3/dist-packages/requests/api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "/usr/lib/python3/dist-packages/requests/api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "/usr/lib/python3/dist-packages/requests/sessions.py", line 519, in request
    prep = self.prepare_request(req)
  File "/usr/lib/python3/dist-packages/requests/sessions.py", line 452, in prepare_request
    p.prepare(
  File "/usr/lib/python3/dist-packages/requests/models.py", line 313, in prepare
    self.prepare_url(url, params)
  File "/usr/lib/python3/dist-packages/requests/models.py", line 387, in prepare_url
    raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '/ASUS-NVIDIA-GeForce-grafikkort-kylning/dp/B07489XSJP?dchild=1': No schema supplied. Perhaps you meant http:///ASUS-NVIDIA-GeForce-grafikkort-kylning/dp/B07489XSJP?dchild=1?

这里发生的事情是您仅从 products 中获得 URL 后缀,例如 /ASUS-NVIDIA-GeForce-grafikkort-kylning.

一个快速的解决方案是在所有网址前加上 `'https://amazon.se':

def main():
    url = 'https://www.amazon.se/s?k=grafikkort&page=1'

    products = get_index_data(get_page(url))

    for link in products:
        data = get_detail_data(get_page('https://www.amazon.se' + link))
        write_csv(data, link)