抓取 Goodreads 版本

Scrape Goodreads editions

我正在尝试根据一个 ISBN 列表抓取 Goodreads 上的各个版本,但代码并没有抓到所有版本,而且有些“版本”实际上是从页面上抓下来的奇怪代码。由于 Goodreads 现在不再提供 API,只能另找变通办法。代码如下:

def get_isbn():
    """Return the list of ISBNs to look up (an empty placeholder list here)."""
    return []

def get_page(base_url, data):
    """Send a GET request to *base_url* with *data* as the query parameters.

    Returns whatever ``requests.get`` returns for that request.
    """
    return requests.get(base_url, params=data)

def get_editions_details(isbn):
    """Search Goodreads for *isbn* and locate the link to its editions list.

    Returns a ``(editions_url, editions_count, isbn)`` tuple. The count is
    the last space-separated token of the link text with any surrounding
    parentheses stripped, converted to ``int``.
    """
    response = get_page("https://www.goodreads.com/search", {'q': isbn})
    soup = bs(response.text, 'lxml')

    # The anchor inside the "otherEditionsLink" div carries both the
    # relative URL of the editions list and the count in its text.
    anchor = soup.find("div", class_="otherEditionsLink").find("a")
    editions_url = f"https://www.goodreads.com{anchor['href']}"
    count_token = anchor.text.strip().split(' ')[-1]

    return editions_url, int(count_token.strip('()')), isbn

def get_editions_urls(ed_details):
    """Append the URL and rating of each listed paperback edition to a file.

    Args:
        ed_details: ``(url, ed_num, isbn)`` tuple as returned by
            ``get_editions_details``.

    One line per edition is appended to ``urls_files/<isbn>_urls.txt``.
    The file is opened in append mode and the directory is not created
    here, so it must already exist.
    """
    import time  # local import: only needed for the pause between requests

    # Unpack the tuple with the information about the editions
    url, ed_num, isbn = ed_details

    # 100 editions are requested per page, so ed_num // 100 + 1 pages
    # are visited for a book with ed_num editions.
    page_count = (ed_num // 100) + 1
    for page in range(page_count):
        r = requests.get(url, params={
            'page': str(page + 1),
            'per_page': '100',
            'filter_by_format': 'Paperback',
            'utf8': "%E2%9C%93"})

        soup = bs(r.text, 'lxml')

        editions = soup.find_all("div", class_="editionData")

        with open(f"urls_files/{isbn}_urls.txt", 'a') as fp:
            for book in editions:
                item = book.find("a", class_="bookTitle")
                if item is None:
                    # Entry without a title link: nothing to record.
                    continue
                # The last "dataValue" div of an entry holds the rating text.
                rating = book.find_all("div", class_="dataValue")[-1].text
                # ws_ptrn is a pattern defined outside this function.
                rating = re.sub(ws_ptrn, '', rating)
                fp.write(f"https://www.goodreads.com{item['href']}" +
                         f"  rating: {rating}\n")

        # Leave some time to the goodreads server between the requests
        if page + 1 < page_count:
            time.sleep(1)

if __name__ == "__main__":
    import os  # local import: only needed to prepare the output directory

    # get_editions_urls writes into urls_files/, so make sure it exists.
    os.makedirs("urls_files", exist_ok=True)

    isbns = get_isbn()

    for isbn in isbns:
        ed_details = get_editions_details(isbn)
        # Fetch the editions of this ISBN and store their URLs.
        get_editions_urls(ed_details)

是的,页面上默认显示的似乎是某一类版本的数量(不确定是哪一类……它显示的是 191……而精装本是 192)。无论如何,你的代码读到这个数字后,就只会浏览第 1 页和第 2 页。但你查询的是平装本,而平装本共有 227 个版本。所以你的代码只遍历了 2 页,在 227 个平装版中只返回了 200 个。

最简单的方法是不要把要浏览的页数写死。让它不断翻到下一页,直到没有版本可处理为止(即使用一个 while 循环,一直持续到最后一页……在这里,由于每页请求 100 条,一旦某一页的版本数少于 100 个,就说明已经到了最后一页)。

我还把它变成了 csv 而不是 txt(对我来说更容易调试),但如果你愿意,你可以将它切换回 txt。

这本书有 227 个平装版,如你所见,下面的代码返回了 227 条:


import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import os
import time
import re

def get_isbn():
    """Return the ISBNs whose Goodreads editions are to be scraped."""
    return ['9788845210662']

def get_page(base_url, data):
    """Fetch *base_url* with *data* as the query parameters.

    A browser-like ``User-Agent`` header is sent with the request.

    Returns:
        The ``requests`` response, or ``None`` when the request itself
        fails (connection error, invalid URL, ...).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
    try:
        return requests.get(base_url, headers=headers, params=data)
    except requests.RequestException:
        # Report a failed request as None instead of raising.
        return None

def get_editions_details(isbn):
    """Find the editions-list link for *isbn* through the Goodreads search.

    Returns a ``(editions_url, editions_count, isbn)`` tuple. The count is
    taken from the last space-separated token of the link text, with
    parentheses stripped and converted to ``int``.
    """
    search_page = get_page("https://www.goodreads.com/search", {'q': isbn})
    soup = bs(search_page.text, 'html.parser')

    # The "otherEditionsLink" div wraps the anchor to the editions list.
    anchor = soup.find("div", class_="otherEditionsLink").find("a")
    editions_url = f"https://www.goodreads.com{anchor['href']}"
    count_token = anchor.text.strip().split(' ')[-1]

    return editions_url, int(count_token.strip('()')), isbn

def get_editions_urls(ed_details):
    """Collect the URL and rating text of every paperback edition of a book.

    Pages of 100 editions are requested one after another until a page
    comes back with fewer than 100 entries, which is treated as the last
    page. The page count is therefore not derived from ``ed_num``.

    Args:
        ed_details: ``(url, ed_num, isbn)`` tuple as returned by
            ``get_editions_details``; only ``url`` is used for the requests.

    Returns:
        A list of ``{'item': <edition url>, 'rating': <rating text>}``
        dicts, one per edition found.
    """
    per_page = 100
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

    rows = []
    # Unpack the tuple with the information about the editions
    # (isbn is only needed by the txt alternative kept in comments below).
    url, _ed_num, isbn = ed_details

    page = 1
    while True:
        print('Page: ', page)
        r = requests.get(url, headers=headers, params={
            'page': str(page),
            'per_page': str(per_page),
            'filter_by_format': 'Paperback',
            # if you want all editions change above line to 'filter_by_format': '',
            'utf8': "%E2%9C%93"})

        soup = bs(r.text, 'html.parser')

        editions = soup.find_all("div", class_="editionData")

        # Alternative that writes a txt file instead of collecting rows:
        # with open(f"urls_files/{isbn}_urls.txt", 'a') as fp:
        #     for book in editions:
        #         item = book.find("a", class_="bookTitle")
        #         rating = book.find_all("div", class_="dataValue")[-1].text
        #         rating = ' '.join(rating.split())
        #         fp.write(f"https://www.goodreads.com{item['href']}" +
        #                  f"  rating: {rating}\n")
        for book in editions:
            item = book.find("a", class_="bookTitle")
            if item is None:
                # Entry without a title link: nothing to record.
                continue
            # The last "dataValue" div of an entry holds the rating text;
            # collapse its internal whitespace to single spaces.
            rating = book.find_all("div", class_="dataValue")[-1].text
            rating = ' '.join(rating.split())
            rows.append({'item': f"https://www.goodreads.com{item['href']}",
                         'rating': f'{rating}'})

        # A short page means there is nothing left to fetch.
        if len(editions) < per_page:
            break

        page += 1
        # Leave some time to the goodreads server between the requests
        time.sleep(1)
    return rows

if __name__ == "__main__":
    # The CSV files are written into urls_files/, so make sure it exists.
    os.makedirs("urls_files", exist_ok=True)

    isbns = get_isbn()

    for isbn in isbns:
        ed_details = get_editions_details(isbn)
        rows = get_editions_urls(ed_details)
        # One CSV per ISBN, with an 'item' and a 'rating' column.
        df = pd.DataFrame(rows)
        df.to_csv(f"urls_files/{isbn}_urls.csv", index=False)


                                                  item                  rating
0    https://www.goodreads.com/book/show/119073.The...  4.12 (276,636 ratings)
1    https://www.goodreads.com/book/show/10522.Il_n...    4.26 (7,445 ratings)
2    https://www.goodreads.com/book/show/71565.El_n...    4.26 (6,129 ratings)
3         https://www.goodreads.com/book/show/16082109      4.02 (968 ratings)
4    https://www.goodreads.com/book/show/9269618-g-...    4.38 (1,429 ratings)
..                                                 ...                     ...
222  https://www.goodreads.com/book/show/58462415-t...         4.00 (1 rating)
223  https://www.goodreads.com/book/show/57406880-e...         0.0 (0 ratings)
224  https://www.goodreads.com/book/show/40944198-e...         0.0 (0 ratings)
225  https://www.goodreads.com/book/show/29927352-n...         0.0 (0 ratings)
226  https://www.goodreads.com/book/show/29242657-i...         4.00 (1 rating)

[227 rows x 2 columns]