使用 python 从 div 抓取 h3
scraping h3 from div using python
我想使用 Python 3.6,从 DIV 中抓取 H3 标题 - 来自页面:
https://player.bfi.org.uk/search/rentals?q=&sort=title&page=1
注意页码变化,增量为1。
我一直没能成功返回或定位到这些标题。
# Asker's attempt: download the listing page and look for the film cards in the returned HTML.
from requests import get
url = 'https://player.bfi.org.uk/search/rentals?q=&sort=title&page=1'
# Plain HTTP GET of the search page (page=1 in the query string).
response = get(url)
from bs4 import BeautifulSoup
# Parse the response body with the lxml parser.
html_soup = BeautifulSoup(response.text, 'lxml')
# Bare expression: it only displays the type in an interactive session and has no effect in a script.
type(html_soup)
# Look up the <div> elements carrying the rentals card classes.
movie_containers = html_soup.find_all('div', class_ = 'card card--rentals')
# Report the container type and how many divs were matched.
print(type(movie_containers))
print(len(movie_containers))
我也试过遍历它们:
# NOTE(review): `page` is not defined anywhere in this snippet - presumably the
# parsed document object; confirm what it is before relying on this call.
# The loop variable must be `div`, because that is the name the body uses.
for div in page("div.card__content"):
    # Print the stripped text of the card's <h3 class="card__title"> element.
    print(div.select_one("h3.card__title").text.strip())
任何帮助都会很棒。
谢谢,
我期望得到每一页上每部电影的标题,以及该电影的链接,例如:https://player.bfi.org.uk/rentals/film/watch-akenfield-1975-online
您遇到的问题其实并不在于查找 div —— 我认为这一步您做得没错。但是,当您尝试使用以下代码访问该网站时:
# Same request as in the question: a plain HTTP GET of the search page.
from requests import get
url = 'https://player.bfi.org.uk/search/rentals?q=&sort=title&page=1'
response = get(url)
响应实际上并不包含您在浏览器中看到的全部内容。您可以通过检查 'card' in response.text 的结果是否为 False 来确认这一点。
这很可能是因为所有卡片都是在页面加载之后才通过 JavaScript 加载的,因此仅靠 requests
库获取到的基础内容不足以得到您想要抓取的全部信息。
我建议您可以尝试查看该网站如何加载所有卡片 - 浏览器开发工具中的“网络”选项卡可能会有所帮助。
该页面是通过向另一个 url 发起 xhr 请求来加载内容的,所以您漏掉了这部分内容。您可以模仿页面所使用的 xhr POST 请求,并修改所发送的 post json。如果您修改 size,
就会得到更多的结果。
import requests
# Query body copied from the XHR request the search page sends; "size" controls
# how many hits come back in one response and "from" is the offset of the first hit.
data = {
    "size": 1480,
    "from": 0,
    "sort": "sort_title",
    "aggregations": {
        "genre": {"terms": {"field": "genre.raw", "size": 10}},
        "captions": {"terms": {"field": "captions"}},
        "decade": {
            "terms": {
                "field": "decade.raw",
                "order": {"_term": "asc"},
                "size": 20,
            }
        },
        "bbfc": {"terms": {"field": "bbfc_rating", "size": 10}},
        "english": {"terms": {"field": "english"}},
        "audio_desc": {"terms": {"field": "audio_desc"}},
        "colour": {"terms": {"field": "colour"}},
        "mono": {"terms": {"field": "mono"}},
        "fiction": {"terms": {"field": "fiction"}},
    },
    "min_score": 0.5,
    "query": {
        "bool": {
            "must": {"match_all": {}},
            "must_not": [],
            "should": [],
            "filter": {"term": {"pillar.raw": "rentals"}},
        }
    },
}

# POST the query as JSON and fail loudly on an HTTP error instead of trying to
# decode an error page as JSON.
response = requests.post('https://search-es.player.bfi.org.uk/prod-films/_search', json=data)
response.raise_for_status()
r = response.json()

# Each hit keeps the film record under '_source'; its 'url' is site-relative.
for film in r['hits']['hits']:
    print(film['_source']['title'], 'https://player.bfi.org.uk' + film['_source']['url'])
rentals 的实际结果总数就在返回的 json 里,即 r['hits']['total'],
所以您可以先发一次初始请求,把 size 设成一个远大于预期的数字,再检查是否还需要另一次请求,
然后通过修改 from 和 size 来取回所有剩余的结果。
import requests
import pandas as pd
SEARCH_URL = 'https://search-es.player.bfi.org.uk/prod-films/_search'
SITE_ROOT = 'https://player.bfi.org.uk'

# Page size of the first request; deliberately larger than the expected total.
initial_count = 10000
# Accumulates one [title, absolute link] pair per film.
results = []


def add_results(r):
    """Append a ``[title, link]`` pair to ``results`` for every hit in ``r``.

    ``r`` is the decoded JSON search response; each hit stores the film record
    under ``'_source'`` and its ``'url'`` is relative to the site root.
    """
    for film in r['hits']['hits']:
        source = film['_source']
        results.append([source['title'], SITE_ROOT + source['url']])


def total_hits(r):
    """Return the total hit count reported by the search response ``r``.

    Accepts both a plain number and the object form ``{"value": n, ...}`` that
    newer Elasticsearch versions return for ``hits.total``.
    """
    total = r['hits']['total']
    if isinstance(total, dict):
        total = total['value']
    return int(total)


def search(session, payload):
    """POST ``payload`` as JSON to the search endpoint and return the decoded reply.

    Raises ``requests.HTTPError`` on a non-2xx status instead of trying to
    decode an error page as JSON.
    """
    response = session.post(SEARCH_URL, json=payload)
    response.raise_for_status()
    return response.json()


def main():
    """Collect every rentals title/link pair and return them as a DataFrame."""
    data = {
        "size": initial_count,
        "from": 0,
        "sort": "sort_title",
        "aggregations": {
            "genre": {"terms": {"field": "genre.raw", "size": 10}},
            "captions": {"terms": {"field": "captions"}},
            "decade": {
                "terms": {
                    "field": "decade.raw",
                    "order": {"_term": "asc"},
                    "size": 20,
                }
            },
            "bbfc": {"terms": {"field": "bbfc_rating", "size": 10}},
            "english": {"terms": {"field": "english"}},
            "audio_desc": {"terms": {"field": "audio_desc"}},
            "colour": {"terms": {"field": "colour"}},
            "mono": {"terms": {"field": "mono"}},
            "fiction": {"terms": {"field": "fiction"}},
        },
        "min_score": 0.5,
        "query": {
            "bool": {
                "must": {"match_all": {}},
                "must_not": [],
                "should": [],
                "filter": {"term": {"pillar.raw": "rentals"}},
            }
        },
    }

    with requests.Session() as s:
        r = search(s, data)
        total_results = total_hits(r)
        add_results(r)

        # Only when the first page did not cover everything: ask for the rest
        # in a single follow-up request starting where the first one stopped.
        # NOTE(review): Elasticsearch by default rejects from + size beyond
        # index.max_result_window (10000); if this follow-up fails, switch to
        # search_after/scroll - confirm against the server's settings.
        if total_results > initial_count:
            data['size'] = total_results - initial_count
            data['from'] = initial_count
            r = search(s, data)
            add_results(r)

    df = pd.DataFrame(results, columns=['Title', 'Link'])
    print(df.head())
    return df


if __name__ == '__main__':
    df = main()
我想使用 Python 3.6,从 DIV 中抓取 H3 标题 - 来自页面:
https://player.bfi.org.uk/search/rentals?q=&sort=title&page=1
注意页码变化,增量为1。
我一直没能成功返回或定位到这些标题。
# Asker's attempt: download the listing page and look for the film cards in the returned HTML.
from requests import get
url = 'https://player.bfi.org.uk/search/rentals?q=&sort=title&page=1'
# Plain HTTP GET of the search page (page=1 in the query string).
response = get(url)
from bs4 import BeautifulSoup
# Parse the response body with the lxml parser.
html_soup = BeautifulSoup(response.text, 'lxml')
# Bare expression: it only displays the type in an interactive session and has no effect in a script.
type(html_soup)
# Look up the <div> elements carrying the rentals card classes.
movie_containers = html_soup.find_all('div', class_ = 'card card--rentals')
# Report the container type and how many divs were matched.
print(type(movie_containers))
print(len(movie_containers))
我也试过遍历它们:
# NOTE(review): `page` is not defined anywhere in this snippet - presumably the
# parsed document object; confirm what it is before relying on this call.
# The loop variable must be `div`, because that is the name the body uses.
for div in page("div.card__content"):
    # Print the stripped text of the card's <h3 class="card__title"> element.
    print(div.select_one("h3.card__title").text.strip())
任何帮助都会很棒。
谢谢,
我期望得到每一页上每部电影的标题,以及该电影的链接,例如:https://player.bfi.org.uk/rentals/film/watch-akenfield-1975-online
您遇到的问题其实并不在于查找 div —— 我认为这一步您做得没错。但是,当您尝试使用以下代码访问该网站时:
# Same request as in the question: a plain HTTP GET of the search page.
from requests import get
url = 'https://player.bfi.org.uk/search/rentals?q=&sort=title&page=1'
response = get(url)
响应实际上并不包含您在浏览器中看到的全部内容。您可以通过检查 'card' in response.text 的结果是否为 False 来确认这一点。
这很可能是因为所有卡片都是在页面加载之后才通过 JavaScript 加载的,因此仅靠 requests
库获取到的基础内容不足以得到您想要抓取的全部信息。
我建议您可以尝试查看该网站如何加载所有卡片 - 浏览器开发工具中的“网络”选项卡可能会有所帮助。
该页面是通过向另一个 url 发起 xhr 请求来加载内容的,所以您漏掉了这部分内容。您可以模仿页面所使用的 xhr POST 请求,并修改所发送的 post json。如果您修改 size,
就会得到更多的结果。
import requests
# Query body copied from the XHR request the search page sends; "size" controls
# how many hits come back in one response and "from" is the offset of the first hit.
data = {
    "size": 1480,
    "from": 0,
    "sort": "sort_title",
    "aggregations": {
        "genre": {"terms": {"field": "genre.raw", "size": 10}},
        "captions": {"terms": {"field": "captions"}},
        "decade": {
            "terms": {
                "field": "decade.raw",
                "order": {"_term": "asc"},
                "size": 20,
            }
        },
        "bbfc": {"terms": {"field": "bbfc_rating", "size": 10}},
        "english": {"terms": {"field": "english"}},
        "audio_desc": {"terms": {"field": "audio_desc"}},
        "colour": {"terms": {"field": "colour"}},
        "mono": {"terms": {"field": "mono"}},
        "fiction": {"terms": {"field": "fiction"}},
    },
    "min_score": 0.5,
    "query": {
        "bool": {
            "must": {"match_all": {}},
            "must_not": [],
            "should": [],
            "filter": {"term": {"pillar.raw": "rentals"}},
        }
    },
}

# POST the query as JSON and fail loudly on an HTTP error instead of trying to
# decode an error page as JSON.
response = requests.post('https://search-es.player.bfi.org.uk/prod-films/_search', json=data)
response.raise_for_status()
r = response.json()

# Each hit keeps the film record under '_source'; its 'url' is site-relative.
for film in r['hits']['hits']:
    print(film['_source']['title'], 'https://player.bfi.org.uk' + film['_source']['url'])
rentals 的实际结果总数就在返回的 json 里,即 r['hits']['total'],
所以您可以先发一次初始请求,把 size 设成一个远大于预期的数字,再检查是否还需要另一次请求,
然后通过修改 from 和 size 来取回所有剩余的结果。
import requests
import pandas as pd
SEARCH_URL = 'https://search-es.player.bfi.org.uk/prod-films/_search'
SITE_ROOT = 'https://player.bfi.org.uk'

# Page size of the first request; deliberately larger than the expected total.
initial_count = 10000
# Accumulates one [title, absolute link] pair per film.
results = []


def add_results(r):
    """Append a ``[title, link]`` pair to ``results`` for every hit in ``r``.

    ``r`` is the decoded JSON search response; each hit stores the film record
    under ``'_source'`` and its ``'url'`` is relative to the site root.
    """
    for film in r['hits']['hits']:
        source = film['_source']
        results.append([source['title'], SITE_ROOT + source['url']])


def total_hits(r):
    """Return the total hit count reported by the search response ``r``.

    Accepts both a plain number and the object form ``{"value": n, ...}`` that
    newer Elasticsearch versions return for ``hits.total``.
    """
    total = r['hits']['total']
    if isinstance(total, dict):
        total = total['value']
    return int(total)


def search(session, payload):
    """POST ``payload`` as JSON to the search endpoint and return the decoded reply.

    Raises ``requests.HTTPError`` on a non-2xx status instead of trying to
    decode an error page as JSON.
    """
    response = session.post(SEARCH_URL, json=payload)
    response.raise_for_status()
    return response.json()


def main():
    """Collect every rentals title/link pair and return them as a DataFrame."""
    data = {
        "size": initial_count,
        "from": 0,
        "sort": "sort_title",
        "aggregations": {
            "genre": {"terms": {"field": "genre.raw", "size": 10}},
            "captions": {"terms": {"field": "captions"}},
            "decade": {
                "terms": {
                    "field": "decade.raw",
                    "order": {"_term": "asc"},
                    "size": 20,
                }
            },
            "bbfc": {"terms": {"field": "bbfc_rating", "size": 10}},
            "english": {"terms": {"field": "english"}},
            "audio_desc": {"terms": {"field": "audio_desc"}},
            "colour": {"terms": {"field": "colour"}},
            "mono": {"terms": {"field": "mono"}},
            "fiction": {"terms": {"field": "fiction"}},
        },
        "min_score": 0.5,
        "query": {
            "bool": {
                "must": {"match_all": {}},
                "must_not": [],
                "should": [],
                "filter": {"term": {"pillar.raw": "rentals"}},
            }
        },
    }

    with requests.Session() as s:
        r = search(s, data)
        total_results = total_hits(r)
        add_results(r)

        # Only when the first page did not cover everything: ask for the rest
        # in a single follow-up request starting where the first one stopped.
        # NOTE(review): Elasticsearch by default rejects from + size beyond
        # index.max_result_window (10000); if this follow-up fails, switch to
        # search_after/scroll - confirm against the server's settings.
        if total_results > initial_count:
            data['size'] = total_results - initial_count
            data['from'] = initial_count
            r = search(s, data)
            add_results(r)

    df = pd.DataFrame(results, columns=['Title', 'Link'])
    print(df.head())
    return df


if __name__ == '__main__':
    df = main()