使用 Python 和 Beautiful Soup 抓取 HTML(来自 IMDb)
HTML scraping using Python and Beautiful Soup (from IMDb)
例如,我想从 this page 获取电影评级并逐行打印评级,
我用 BS4 提取了名称和发行年份,但不知道如何处理评级...
import requests
from bs4 import BeautifulSoup
import urllib.request
# Download the IMDb list page and print "<title> <year>" for each entry.
# Using the response as a context manager closes the connection once the
# body has been read.
with urllib.request.urlopen('http://imdb.com/list/ls097228983/') as response:
    content = response.read()

soup = BeautifulSoup(content, 'lxml')

# Title link and year span are both looked up inside each
# <h3 class="lister-item-header"> element.
for header in soup.find_all('h3', class_='lister-item-header'):
    link = header.find('a')
    if link is None:
        # No title link in this header: nothing meaningful to print.
        continue
    year_tag = header.find('span', class_='lister-item-year')
    name = link.get_text(strip=True)
    # Read the tag's text instead of stripping markup from str(tag); a
    # missing span yields an empty string rather than the literal "None".
    year = year_tag.get_text(strip=True) if year_tag is not None else ''
    print(name + ' ' + year)
>> I want: Solaris (1972) 8.1
您需要把循环中查找的元素从 'h3' 的 'class':'lister-item-header'
更改为其父级 'div' 的 'class':'lister-item-content',
这样才能在同一个父级元素中同时取到名称、年份和评分。
import requests
from bs4 import BeautifulSoup
import urllib.request
# Download the IMDb list page and print "<title> <year> <rating>" once per
# entry. Using the response as a context manager closes the connection once
# the body has been read.
with urllib.request.urlopen('http://imdb.com/list/ls097228983/') as response:
    content = response.read()

soup = BeautifulSoup(content, 'lxml')

# Iterate over the <div class="lister-item-content"> containers so that the
# title link, the year span and the rating span are all searched within the
# same element.
for item in soup.find_all('div', class_='lister-item-content'):
    link = item.find('a')
    if link is None:
        # No title link in this container: nothing meaningful to print.
        continue
    year_tag = item.find('span', class_='lister-item-year')
    rating_tag = item.find('span', class_='ipl-rating-star__rating')

    name = link.get_text(strip=True)
    # Read each tag's text directly; a missing span yields an empty string
    # instead of the literal "None" (year) or an AttributeError (rating).
    year = year_tag.get_text(strip=True) if year_tag is not None else ''
    rating = rating_tag.get_text(strip=True) if rating_tag is not None else ''

    # One line per movie. An f-string needs no .format() call; plain
    # concatenation with ' ' separators would produce the same text.
    print(f'{name} {year} {rating}')
例如,我想从 this page 获取电影评级并逐行打印评级, 我用 BS4 提取了名称和发行年份,但不知道如何处理评级...
import requests
from bs4 import BeautifulSoup
import urllib.request
# Download the IMDb list page and print "<title> <year>" for each entry.
# Using the response as a context manager closes the connection once the
# body has been read.
with urllib.request.urlopen('http://imdb.com/list/ls097228983/') as response:
    content = response.read()

soup = BeautifulSoup(content, 'lxml')

# Title link and year span are both looked up inside each
# <h3 class="lister-item-header"> element.
for header in soup.find_all('h3', class_='lister-item-header'):
    link = header.find('a')
    if link is None:
        # No title link in this header: nothing meaningful to print.
        continue
    year_tag = header.find('span', class_='lister-item-year')
    name = link.get_text(strip=True)
    # Read the tag's text instead of stripping markup from str(tag); a
    # missing span yields an empty string rather than the literal "None".
    year = year_tag.get_text(strip=True) if year_tag is not None else ''
    print(name + ' ' + year)
>> I want: Solaris (1972) 8.1
您需要把循环中查找的元素从 'h3' 的 'class':'lister-item-header'
更改为其父级 'div' 的 'class':'lister-item-content',
这样才能在同一个父级元素中同时取到名称、年份和评分。
import requests
from bs4 import BeautifulSoup
import urllib.request
# Download the IMDb list page and print "<title> <year> <rating>" once per
# entry. Using the response as a context manager closes the connection once
# the body has been read.
with urllib.request.urlopen('http://imdb.com/list/ls097228983/') as response:
    content = response.read()

soup = BeautifulSoup(content, 'lxml')

# Iterate over the <div class="lister-item-content"> containers so that the
# title link, the year span and the rating span are all searched within the
# same element.
for item in soup.find_all('div', class_='lister-item-content'):
    link = item.find('a')
    if link is None:
        # No title link in this container: nothing meaningful to print.
        continue
    year_tag = item.find('span', class_='lister-item-year')
    rating_tag = item.find('span', class_='ipl-rating-star__rating')

    name = link.get_text(strip=True)
    # Read each tag's text directly; a missing span yields an empty string
    # instead of the literal "None" (year) or an AttributeError (rating).
    year = year_tag.get_text(strip=True) if year_tag is not None else ''
    rating = rating_tag.get_text(strip=True) if rating_tag is not None else ''

    # One line per movie. An f-string needs no .format() call; plain
    # concatenation with ' ' separators would produce the same text.
    print(f'{name} {year} {rating}')