Python BeautifulSoup 无法从具有特定 class 的 div 获取数据
Python BeautifulSoup failure to get data from a div with a certain class
我正在开发一个程序,它会根据我的电影库到 Metacritic 上抓取相关电影的信息并显示出来。但在某些部分(例如获取评级时)总是什么都没有返回,我做错了什么?
from bs4 import BeautifulSoup
import requests
import os
def ratingsGet(headers, movie):
    """Look up the rating text for *movie* on its Metacritic details page.

    The title is lower-cased and its spaces are replaced with hyphens to
    build the details URL, which is requested with *headers* as the HTTP
    headers. Returns the text of the first ``<div class="movie_rating">``
    element, or the string "Failed" when no such element is found.
    """
    slug = movie.lower().replace(" ","-")
    response = requests.get(
        "https://www.metacritic.com/movie/" + slug + "/details",
        headers=headers,
    )
    soup = BeautifulSoup(response.content, "html.parser")
    # Earlier attempt, left disabled: soup.select('tr.movie_rating td.data span')
    # NOTE: the page markup quoted with this question puts the
    # ``movie_rating`` class on a <tr>, so this <div> lookup comes back
    # empty and the function reports "Failed".
    matches = soup.find_all("div", {"class": "movie_rating"})
    print(matches)
    if matches == []:
        return "Failed"
    return matches[0].text
def getMovieInfo():
    """Print the rating and home release year for an entry of D:/Movies/.

    Each directory entry name is lower-cased and has ".mp4" removed to get
    the movie title, which is passed to ``ratingsGet`` and ``rYearGet``
    (the latter is defined outside this snippet) together with a fixed
    browser User-Agent header.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_0) AppleWebKit/536.1 (KHTML, like Gecko) Chrome/58.0.849.0 Safari/536.1'}
    for filename in os.listdir("D:/Movies/"):
        title = filename.lower().replace(".mp4","")
        print(title)
        rating_line = "Rating: " + ratingsGet(request_headers, title)
        print(rating_line)
        release_line = "Home release year: " + rYearGet(request_headers, title)
        print(release_line)
        # Only the first directory entry is processed.
        break
html 片段:
<table class="details" summary="13 Going on 30 Details and Credits">
<tr class="runtime">
<td class="label">Runtime:</td>
<td class="data">98 min</td>
</tr>
<tr class="movie_rating">
<td class="label">Rating:</td>
<td class="data">
Rated PG-13 for some sexual content and brief drug references.
</td>
</tr>
<tr class="company">
<td class="label">Production:</td>
<td class="data">Revolution Studios</td>
</tr>
我只是在搜索错误的元素....
def ratingsGet(headers, movie):
    """Return the text of the rating row on *movie*'s Metacritic details page.

    The title is lower-cased and its spaces are replaced with hyphens to
    build the details URL, which is requested with *headers* as the HTTP
    headers. Returns the text of the first ``<tr class="movie_rating">``
    element, or the string "Failed" when the page has no such row.
    """
    movie = movie.lower().replace(" ","-")
    detail_link = "https://www.metacritic.com/movie/" + movie + "/details"
    detail_page = requests.get(detail_link, headers=headers)
    soup = BeautifulSoup(detail_page.content, "html.parser")
    g_data = soup.find_all("tr", {"class": "movie_rating"})
    # Check for a match before indexing: printing g_data[0] ahead of this
    # test raised IndexError whenever the row was missing, so the "Failed"
    # fallback could never be reached.
    if not g_data:
        return "Failed"
    print(g_data[0].text.strip(" "))
    return g_data[0].text
正如您所说,需要查找的是 `tr`(而不是 `div`)。我再补充两点:
- 尽量只使用 `find`(不需要用 `find_all` 查找全部)
- 如果 `find` 的结果不是 None,就在该结果中再次查找,只取出文本,如下所示:
g_data.find("td", { "class": "data" }).text
完整代码如下:
def ratingsGet(headers, movie):
    """Return the rating text from *movie*'s Metacritic details page.

    The title is lower-cased and its spaces are replaced with hyphens to
    build the details URL, which is requested with *headers* as the HTTP
    headers. The rating is the stripped text of the ``<td class="data">``
    cell inside the first ``<tr class="movie_rating">`` row; "Failed" is
    returned when either the row or the cell is missing.
    """
    slug = movie.lower().replace(" ","-")
    url = "https://www.metacritic.com/movie/" + slug + "/details"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    # Bail out as soon as the row is not on the page.
    row = soup.find("tr", {"class": "movie_rating"})
    if row is None:
        return "Failed"

    # Same for the data cell inside that row.
    cell = row.find("td", { "class": "data" })
    if cell is None:
        return "Failed"

    return cell.text.strip()
我正在开发一个程序,它会根据我的电影库到 Metacritic 上抓取相关电影的信息并显示出来。但在某些部分(例如获取评级时)总是什么都没有返回,我做错了什么?
from bs4 import BeautifulSoup
import requests
import os
def ratingsGet(headers, movie):
    """Look up the rating text for *movie* on its Metacritic details page.

    The title is lower-cased and its spaces are replaced with hyphens to
    build the details URL, which is requested with *headers* as the HTTP
    headers. Returns the text of the first ``<div class="movie_rating">``
    element, or the string "Failed" when no such element is found.
    """
    slug = movie.lower().replace(" ","-")
    response = requests.get(
        "https://www.metacritic.com/movie/" + slug + "/details",
        headers=headers,
    )
    soup = BeautifulSoup(response.content, "html.parser")
    # Earlier attempt, left disabled: soup.select('tr.movie_rating td.data span')
    # NOTE: the page markup quoted with this question puts the
    # ``movie_rating`` class on a <tr>, so this <div> lookup comes back
    # empty and the function reports "Failed".
    matches = soup.find_all("div", {"class": "movie_rating"})
    print(matches)
    if matches == []:
        return "Failed"
    return matches[0].text
def getMovieInfo():
    """Print the rating and home release year for an entry of D:/Movies/.

    Each directory entry name is lower-cased and has ".mp4" removed to get
    the movie title, which is passed to ``ratingsGet`` and ``rYearGet``
    (the latter is defined outside this snippet) together with a fixed
    browser User-Agent header.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_0) AppleWebKit/536.1 (KHTML, like Gecko) Chrome/58.0.849.0 Safari/536.1'}
    for filename in os.listdir("D:/Movies/"):
        title = filename.lower().replace(".mp4","")
        print(title)
        rating_line = "Rating: " + ratingsGet(request_headers, title)
        print(rating_line)
        release_line = "Home release year: " + rYearGet(request_headers, title)
        print(release_line)
        # Only the first directory entry is processed.
        break
html 片段:
<table class="details" summary="13 Going on 30 Details and Credits">
<tr class="runtime">
<td class="label">Runtime:</td>
<td class="data">98 min</td>
</tr>
<tr class="movie_rating">
<td class="label">Rating:</td>
<td class="data">
Rated PG-13 for some sexual content and brief drug references.
</td>
</tr>
<tr class="company">
<td class="label">Production:</td>
<td class="data">Revolution Studios</td>
</tr>
我只是在搜索错误的元素....
def ratingsGet(headers, movie):
    """Return the text of the rating row on *movie*'s Metacritic details page.

    The title is lower-cased and its spaces are replaced with hyphens to
    build the details URL, which is requested with *headers* as the HTTP
    headers. Returns the text of the first ``<tr class="movie_rating">``
    element, or the string "Failed" when the page has no such row.
    """
    movie = movie.lower().replace(" ","-")
    detail_link = "https://www.metacritic.com/movie/" + movie + "/details"
    detail_page = requests.get(detail_link, headers=headers)
    soup = BeautifulSoup(detail_page.content, "html.parser")
    g_data = soup.find_all("tr", {"class": "movie_rating"})
    # Check for a match before indexing: printing g_data[0] ahead of this
    # test raised IndexError whenever the row was missing, so the "Failed"
    # fallback could never be reached.
    if not g_data:
        return "Failed"
    print(g_data[0].text.strip(" "))
    return g_data[0].text
正如您所说,需要查找的是 `tr`(而不是 `div`)。我再补充两点:
- 尽量只使用 `find`(不需要用 `find_all` 查找全部)
- 如果 `find` 的结果不是 None,就在该结果中再次查找,只取出文本,如下所示:
g_data.find("td", { "class": "data" }).text
完整代码如下:
def ratingsGet(headers, movie):
    """Return the rating text from *movie*'s Metacritic details page.

    The title is lower-cased and its spaces are replaced with hyphens to
    build the details URL, which is requested with *headers* as the HTTP
    headers. The rating is the stripped text of the ``<td class="data">``
    cell inside the first ``<tr class="movie_rating">`` row; "Failed" is
    returned when either the row or the cell is missing.
    """
    slug = movie.lower().replace(" ","-")
    url = "https://www.metacritic.com/movie/" + slug + "/details"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    # Bail out as soon as the row is not on the page.
    row = soup.find("tr", {"class": "movie_rating"})
    if row is None:
        return "Failed"

    # Same for the data cell inside that row.
    cell = row.find("td", { "class": "data" })
    if cell is None:
        return "Failed"

    return cell.text.strip()