Scrape Goodreads editions
I'm trying to fetch the Goodreads editions for a list of ISBNs, but my code doesn't get all of the editions, and some of the "editions" are actually stray markup scraped from the page. Since Goodreads no longer offers an API, some workaround is needed. Code:
import os
import re
import time

import requests
from bs4 import BeautifulSoup as bs

# ws_ptrn is used below but was not defined in the posted snippet;
# assumed to be a precompiled whitespace pattern.
ws_ptrn = re.compile(r'\s+')


def get_isbn():
    isbns = []
    return isbns


def get_page(base_url, data):
    r = requests.get(base_url, params=data)
    return r


def get_editions_details(isbn):
    data = {'q': isbn}
    book_url = get_page("https://www.goodreads.com/search", data)
    soup = bs(book_url.text, 'lxml')
    ed_item = soup.find("div", class_="otherEditionsLink").find("a")
    ed_link = f"https://www.goodreads.com{ed_item['href']}"
    ed_num = ed_item.text.strip().split(' ')[-1].strip('()')
    return (ed_link, int(ed_num), isbn)


def get_editions_urls(ed_details):
    # Unpack the tuple with the information about the editions
    url, ed_num, isbn = ed_details
    # Navigate through all pages for books with more than 100 editions
    for page in range((ed_num // 100) + 1):
        r = requests.get(url, params={
            'page': str(page + 1),
            'per_page': '100',
            'filter_by_format': 'Paperback',
            'utf8': "%E2%9C%93"})
        soup = bs(r.text, 'lxml')
        editions = soup.find_all("div", class_="editionData")
        with open(f"urls_files/{isbn}_urls.txt", 'a') as fp:
            for book in editions:
                item = book.find("a", class_="bookTitle")
                rating = book.find_all("div", class_="dataValue")[-1].text
                rating = re.sub(ws_ptrn, '', rating)
                fp.write(f"https://www.goodreads.com{item['href']}" +
                         f" rating: {rating}\n")
        # Give the Goodreads server some time between requests
        time.sleep(2)


if __name__ == "__main__":
    try:
        os.mkdir('./urls_files')
    except Exception:
        pass
    isbns = get_isbn()
    for isbn in isbns:
        ed_details = get_editions_details(isbn)
        get_editions_urls(ed_details)
Yeah, it seems to default to returning the edition count for some format (not sure which one, though... it says 191... but Hardcover is 192). In any case, your code sees that number and decides to only walk pages 1 and 2, yet you query for the Paperback editions, of which there are 227. So your code goes through 2 pages and returns only 200 of the 227 paperback editions.

The simplest fix is not to hardcode the number of pages to walk. Keep going to the next page until you run out of editions to process, i.e. wrap it in a while loop that runs until you hit the last page; in this case, since you get 100 editions per page, that means stopping once a page returns fewer than 100. A minimal sketch of that stopping condition follows below.
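Here is that open-ended pagination pattern in isolation, with fetch_editions standing in as a hypothetical request-and-parse helper (it is not part of the code above):

def collect_all_editions(fetch_editions, per_page=100):
    # fetch_editions(page) should return the list of edition entries
    # for a given 1-based page number.
    editions = []
    page = 1
    while True:
        batch = fetch_editions(page)
        editions.extend(batch)
        # A page with fewer than per_page results has to be the last one.
        if len(batch) < per_page:
            break
        page += 1
    return editions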
I also switched the output to csv instead of txt (easier for me to debug), but you can switch it back to txt if you want.

So, this book has 227 paperback editions and, as you can see, this returns all 227:
Code:
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}


def get_isbn():
    isbns = ['9788845210662']
    return isbns


def get_page(base_url, data):
    try:
        r = requests.get(base_url, headers=HEADERS, params=data)
    except requests.RequestException:
        r = None
    return r


def get_editions_details(isbn):
    data = {'q': isbn}
    book_url = get_page("https://www.goodreads.com/search", data)
    soup = bs(book_url.text, 'html.parser')
    ed_item = soup.find("div", class_="otherEditionsLink").find("a")
    ed_link = f"https://www.goodreads.com{ed_item['href']}"
    ed_num = ed_item.text.strip().split(' ')[-1].strip('()')
    return (ed_link, int(ed_num), isbn)


def get_editions_urls(ed_details):
    rows = []
    # Unpack the tuple with the information about the editions
    url, ed_num, isbn = ed_details
    # Keep requesting pages until one comes back with fewer than 100 editions
    end_of_list = False
    page = 0
    while not end_of_list:
        print('Page: ', page + 1)
        r = requests.get(url, headers=HEADERS, params={
            'page': str(page + 1),
            'per_page': '100',
            'filter_by_format': 'Paperback',
            # if you want all editions, change the line above to 'filter_by_format': '',
            'utf8': "%E2%9C%93"})
        soup = bs(r.text, 'html.parser')
        editions = soup.find_all("div", class_="editionData")
        print(len(editions))
        if len(editions) < 100:
            end_of_list = True
        # Original txt output, kept in case you want to switch back from csv:
        # with open(f"urls_files/{isbn}_urls.txt", 'a') as fp:
        #     for book in editions:
        #         item = book.find("a", class_="bookTitle")
        #         rating = book.find_all("div", class_="dataValue")[-1].text
        #         rating = ' '.join(rating.split())
        #         fp.write(f"https://www.goodreads.com{item['href']}" +
        #                  f" rating: {rating}\n")
        for book in editions:
            item = book.find("a", class_="bookTitle")
            rating = book.find_all("div", class_="dataValue")[-1].text
            rating = ' '.join(rating.split())
            row = {'item': f"https://www.goodreads.com{item['href']}",
                   'rating': f'{rating}'}
            rows.append(row)
        # Give the Goodreads server some time between requests
        time.sleep(2)
        page += 1
    return rows


if __name__ == "__main__":
    try:
        os.mkdir('./urls_files')
    except Exception:
        pass
    isbns = get_isbn()
    for isbn in isbns:
        ed_details = get_editions_details(isbn)
        rows = get_editions_urls(ed_details)
        df = pd.DataFrame(rows)
        df.to_csv(f"urls_files/{isbn}_urls.csv", index=False)
Output:
print(df)
item rating
0 https://www.goodreads.com/book/show/119073.The... 4.12 (276,636 ratings)
1 https://www.goodreads.com/book/show/10522.Il_n... 4.26 (7,445 ratings)
2 https://www.goodreads.com/book/show/71565.El_n... 4.26 (6,129 ratings)
3 https://www.goodreads.com/book/show/16082109 4.02 (968 ratings)
4 https://www.goodreads.com/book/show/9269618-g-... 4.38 (1,429 ratings)
.. ... ...
222 https://www.goodreads.com/book/show/58462415-t... 4.00 (1 rating)
223 https://www.goodreads.com/book/show/57406880-e... 0.0 (0 ratings)
224 https://www.goodreads.com/book/show/40944198-e... 0.0 (0 ratings)
225 https://www.goodreads.com/book/show/29927352-n... 0.0 (0 ratings)
226 https://www.goodreads.com/book/show/29242657-i... 4.00 (1 rating)
[227 rows x 2 columns]
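If you later need the rating as numbers, the rating strings in the CSV follow the "X.XX (N ratings)" shape shown above, so a small regex pass can split them into an average and a vote count (a sketch, assuming that shape holds for every row; split_rating is a hypothetical helper, not part of the code above):

import pandas as pd

def split_rating(df):
    # Pull e.g. "4.12 (276,636 ratings)" apart into a float and an int.
    extracted = df['rating'].str.extract(r'([\d.]+)\s*\(([\d,]+)\s+ratings?\)')
    df['avg_rating'] = extracted[0].astype(float)
    df['num_ratings'] = extracted[1].str.replace(',', '', regex=False).astype(int)
    return df

df = split_rating(pd.read_csv("urls_files/9788845210662_urls.csv"))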