YouTube 视频标题和观看次数未打印
YouTube video Title and View are not printed
我正在尝试打印每个 YouTube 频道(我放在 urls 中)上每个视频的标题和观看次数。
但是,它只显示了其中一个频道 (https://www.youtube.com/c/TuckerBudzyn/videos) 的结果,而没有显示另一个频道 (https://www.youtube.com/c/LUCKIESTBTS/videos) 的结果。
我无法分辨这两个频道之间的区别,这就是我无法解决这个问题的原因。如果有人知道,请帮助我。
from selenium import webdriver
from bs4 import BeautifulSoup
# provide the url of the channel whose data you want to fetch
urls = ['https://www.youtube.com/c/TuckerBudzyn/videos','https://www.youtube.com/c/LUCKIESTBTS/videos']
def main():
    """Print title, views, time and link for up to ten videos per channel.

    For every channel URL in the module-level ``urls`` list, the channel's
    video grid is opened in Chrome, the page source is parsed with
    BeautifulSoup, and one tab-separated line is printed per video.
    """
    # Raw string so the backslashes in the Windows path are never read as
    # escape sequences.
    driver = webdriver.Chrome(r'D:\chromedrive\chromedriver.exe')
    try:
        for url in urls:
            # The configured URLs may already end in "/videos"; strip it so
            # the request does not go to ".../videos/videos?...".
            base_url = url.rstrip('/')
            if base_url.endswith('/videos'):
                base_url = base_url[:-len('/videos')]
            driver.get('{}/videos?view=0&sort=p&flow=grid'.format(base_url))
            content = driver.page_source.encode('utf-8').strip()
            soup = BeautifulSoup(content, 'lxml')
            # The same anchors carry both the title text and the video link.
            titles = soup.findAll('a', id='video-title')
            views = soup.findAll('span', class_='style-scope ytd-grid-video-renderer')
            print('Channel: {}'.format(url))
            for index, title in enumerate(titles[:10]):
                # Metadata spans are consumed two per title (views and time);
                # fall back to an empty string when a span is missing instead
                # of raising IndexError.
                meta = [span.text for span in views[2 * index:2 * index + 2]]
                meta += [''] * (2 - len(meta))
                print('\n{}\t{}\t{}\thttps://www.youtube.com{}'.format(
                    title.text, meta[0], meta[1], title.get('href')))
    finally:
        # Always close the browser, even when a page fails to load or parse.
        driver.quit()
main()
我强烈推荐使用 Python 包 pytube,它是为提取 YouTube 信息而设计的。
from pytube import YouTube
from pytube import Channel
# Channel pages whose videos should be listed.
videos = ['https://www.youtube.com/c/TuckerBudzyn/videos','https://www.youtube.com/c/LUCKIESTBTS/videos']
for video in videos:
    channel_info = Channel(video)
    # Each generated URL is a watch link for one video of the channel.
    for url in channel_info.url_generator():
        # Build a YouTube object per video to read its metadata.
        video_details = YouTube(url)
        print(f'Video Author: {video_details.author}')
        print(f'Video URL: {url}')
        print(f'Video Title: {video_details.title}')
        print(f'Video Number of Views: {video_details.views}')
# output
Video Author: Tucker Budzyn
Video URL: https://www.youtube.com/watch?v=j8yQO6gsfVI
Video Title: This Is What My Dog Does When I Hug My Husband
Video Number of Views: 1106181
Video Author: Tucker Budzyn
Video URL: https://www.youtube.com/watch?v=2ovyQ1V13BQ
Video Title: My Dog Stole My Camera!
Video Number of Views: 805448
Video Author: Tucker Budzyn
Video URL: https://www.youtube.com/watch?v=UmVpw2TisOA
Video Title: My Dog Rents a Log Cabin
Video Number of Views: 4864017
truncated....
这个答案和你的问题一样使用 selenium,但不使用 BeautifulSoup。使用 selenium,您可以通过 xpath 访问标题信息和观看次数。
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# By-based locators replace the find_elements_by_* helpers, which newer
# Selenium releases no longer provide.
from selenium.webdriver.common.by import By

chrome_options = Options()
# The user agent must be passed through the "user-agent=" switch; a bare
# string is not a Chrome switch and does not change the user agent.
chrome_options.add_argument(
    "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36")
chrome_options.add_argument("start-maximized")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver', options=chrome_options)
try:
    driver.get('https://www.youtube.com/c/TuckerBudzyn/videos')
    titles = driver.find_elements(By.XPATH, '//*[@id="video-title"]')
    # The first span of each metadata line holds the view count.
    video_views = driver.find_elements(By.XPATH, '//*[@id="metadata-line"]/span[1]')
    # https://www.w3schools.com/python/ref_func_zip.asp
    for title, view in zip(titles, video_views):
        # https://realpython.com/python-f-strings/
        print(f'Video Title: {title.text}')
        print(f'Video Number of Views: {view.text}')
finally:
    # Close the browser even if loading or locating elements fails.
    driver.quit()
# output
Video Title: This Is What My Dog Does When I Hug My Husband
Video Number of Views: 1.1M views
Video Title: My Dog Stole My Camera!
Video Number of Views: 805K views
Video Title: My Dog Rents a Log Cabin
Video Number of Views: 4.8M views
Video Title: My Dog Tries the Obstacle Course Challenge
Video Number of Views: 1.7M views
Video Title: Hugging My Dog's Brother for Too Long | Jealous Dog Reaction
Video Number of Views: 2.7M views
truncated....
这个答案使用 Python 的 requests 和 BeautifulSoup 来提取您正在寻找的视频信息。
import json
import requests
import re as regex
from bs4 import BeautifulSoup
# this function iteration through nested json for specific key
def item_generator(json_input, lookup_key):
    """Yield every value stored under ``lookup_key`` in nested JSON data.

    Dicts and lists are walked recursively. When a dict key equals
    ``lookup_key`` its value is yielded as-is and is not searched further;
    all other values are descended into. Anything that is neither a dict
    nor a list yields nothing.

    Args:
        json_input: Decoded JSON data (dicts, lists and scalars).
        lookup_key: The dict key whose values should be collected.

    Yields:
        The value of each matching key, in traversal order.
    """
    if isinstance(json_input, dict):
        for k, v in json_input.items():
            if k == lookup_key:
                yield v
            else:
                yield from item_generator(v, lookup_key)
    elif isinstance(json_input, list):
        for item in json_input:
            yield from item_generator(item, lookup_key)
response = requests.get('https://www.youtube.com/c/TuckerBudzyn/videos')
soup = BeautifulSoup(response.text, 'html.parser')
# the video information is located in a script tag
for script_tags in soup.find_all('script'):
    script_text = str(script_tags)
    # the script contains the string "var ytInitialData"
    script_match = bool(regex.findall('var ytInitialData', script_text))
    if script_match:
        # regex is used to extract the content that we want to harvest
        extract_content = regex.match(r'.*(var ytInitialData =)(.*)(;<\/script>)', script_text)
        if extract_content is None:
            # the marker is present but the tag does not have the expected
            # shape; skip it instead of failing on .group()
            continue
        # load the extract content into JSON
        content = json.loads(extract_content.group(2))
        # use the generator to extract the dictionary with the
        # video details
        for video_dict in item_generator(content, 'gridVideoRenderer'):
            for key, value in video_dict.items():
                # here are the keys
                # -----------------
                # videoId
                # thumbnail
                # title
                # publishedTimeText
                # viewCountText
                # navigationEndpoint
                # ownerBadges
                # trackingParams
                # shortViewCountText
                # menu
                # thumbnailOverlays
                # -----------------
                if key == 'title':
                    video_title = value['runs'][0]['text']
                    # print the extracted value so the script produces output
                    print(f'Video Title: {video_title}')
                elif key == 'viewCountText':
                    video_views = value['simpleText']
                    print(f'Video Number of Views: {video_views}')
这是输出:
Video Title: This Is What My Dog Does When I Hug My Husband
Video Number of Views: 1,125,048 views
Video Title: My Dog Stole My Camera!
Video Number of Views: 808,420 views
Video Title: My Dog Rents a Log Cabin
Video Number of Views: 4,879,426 views
Video Title: My Dog Tries the Obstacle Course Challenge
Video Number of Views: 1,793,264 views
Video Title: Hugging My Dog's Brother for Too Long | Jealous Dog Reaction
Video Number of Views: 2,775,208 views
Video Title: My Dog Rents a Swimming Pool
Video Number of Views: 12,459,980 views
Video Title: Feeding My Dog Invisible Treats
Video Number of Views: 1,378,288 views
Video Title: Leaving My Dog Alone with a Juicy Steak
Video Number of Views: 4,155,106 views
truncated....
我正在尝试打印每个 YouTube 频道(我放在 urls 中)上每个视频的标题和观看次数。
但是,它只显示了其中一个频道 (https://www.youtube.com/c/TuckerBudzyn/videos) 的结果,而没有显示另一个频道 (https://www.youtube.com/c/LUCKIESTBTS/videos) 的结果。
我无法分辨这两个频道之间的区别,这就是我无法解决这个问题的原因。如果有人知道,请帮助我。
from selenium import webdriver
from bs4 import BeautifulSoup
# provide the url of the channel whose data you want to fetch
urls = ['https://www.youtube.com/c/TuckerBudzyn/videos','https://www.youtube.com/c/LUCKIESTBTS/videos']
def main():
    """Print title, views, time and link for up to ten videos per channel.

    For every channel URL in the module-level ``urls`` list, the channel's
    video grid is opened in Chrome, the page source is parsed with
    BeautifulSoup, and one tab-separated line is printed per video.
    """
    # Raw string so the backslashes in the Windows path are never read as
    # escape sequences.
    driver = webdriver.Chrome(r'D:\chromedrive\chromedriver.exe')
    try:
        for url in urls:
            # The configured URLs may already end in "/videos"; strip it so
            # the request does not go to ".../videos/videos?...".
            base_url = url.rstrip('/')
            if base_url.endswith('/videos'):
                base_url = base_url[:-len('/videos')]
            driver.get('{}/videos?view=0&sort=p&flow=grid'.format(base_url))
            content = driver.page_source.encode('utf-8').strip()
            soup = BeautifulSoup(content, 'lxml')
            # The same anchors carry both the title text and the video link.
            titles = soup.findAll('a', id='video-title')
            views = soup.findAll('span', class_='style-scope ytd-grid-video-renderer')
            print('Channel: {}'.format(url))
            for index, title in enumerate(titles[:10]):
                # Metadata spans are consumed two per title (views and time);
                # fall back to an empty string when a span is missing instead
                # of raising IndexError.
                meta = [span.text for span in views[2 * index:2 * index + 2]]
                meta += [''] * (2 - len(meta))
                print('\n{}\t{}\t{}\thttps://www.youtube.com{}'.format(
                    title.text, meta[0], meta[1], title.get('href')))
    finally:
        # Always close the browser, even when a page fails to load or parse.
        driver.quit()
main()
我强烈推荐使用 Python 包 pytube,它是为提取 YouTube 信息而设计的。
from pytube import YouTube
from pytube import Channel
# Channel pages whose videos should be listed.
videos = ['https://www.youtube.com/c/TuckerBudzyn/videos','https://www.youtube.com/c/LUCKIESTBTS/videos']
for video in videos:
    channel_info = Channel(video)
    # Each generated URL is a watch link for one video of the channel.
    for url in channel_info.url_generator():
        # Build a YouTube object per video to read its metadata.
        video_details = YouTube(url)
        print(f'Video Author: {video_details.author}')
        print(f'Video URL: {url}')
        print(f'Video Title: {video_details.title}')
        print(f'Video Number of Views: {video_details.views}')
# output
Video Author: Tucker Budzyn
Video URL: https://www.youtube.com/watch?v=j8yQO6gsfVI
Video Title: This Is What My Dog Does When I Hug My Husband
Video Number of Views: 1106181
Video Author: Tucker Budzyn
Video URL: https://www.youtube.com/watch?v=2ovyQ1V13BQ
Video Title: My Dog Stole My Camera!
Video Number of Views: 805448
Video Author: Tucker Budzyn
Video URL: https://www.youtube.com/watch?v=UmVpw2TisOA
Video Title: My Dog Rents a Log Cabin
Video Number of Views: 4864017
truncated....
这个答案和你的问题一样使用 selenium,但不使用 BeautifulSoup。使用 selenium,您可以通过 xpath 访问标题信息和观看次数。
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# By-based locators replace the find_elements_by_* helpers, which newer
# Selenium releases no longer provide.
from selenium.webdriver.common.by import By

chrome_options = Options()
# The user agent must be passed through the "user-agent=" switch; a bare
# string is not a Chrome switch and does not change the user agent.
chrome_options.add_argument(
    "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36")
chrome_options.add_argument("start-maximized")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver', options=chrome_options)
try:
    driver.get('https://www.youtube.com/c/TuckerBudzyn/videos')
    titles = driver.find_elements(By.XPATH, '//*[@id="video-title"]')
    # The first span of each metadata line holds the view count.
    video_views = driver.find_elements(By.XPATH, '//*[@id="metadata-line"]/span[1]')
    # https://www.w3schools.com/python/ref_func_zip.asp
    for title, view in zip(titles, video_views):
        # https://realpython.com/python-f-strings/
        print(f'Video Title: {title.text}')
        print(f'Video Number of Views: {view.text}')
finally:
    # Close the browser even if loading or locating elements fails.
    driver.quit()
# output
Video Title: This Is What My Dog Does When I Hug My Husband
Video Number of Views: 1.1M views
Video Title: My Dog Stole My Camera!
Video Number of Views: 805K views
Video Title: My Dog Rents a Log Cabin
Video Number of Views: 4.8M views
Video Title: My Dog Tries the Obstacle Course Challenge
Video Number of Views: 1.7M views
Video Title: Hugging My Dog's Brother for Too Long | Jealous Dog Reaction
Video Number of Views: 2.7M views
truncated....
这个答案使用 Python 的 requests 和 BeautifulSoup 来提取您正在寻找的视频信息。
import json
import requests
import re as regex
from bs4 import BeautifulSoup
# this function iteration through nested json for specific key
def item_generator(json_input, lookup_key):
    """Yield every value stored under ``lookup_key`` in nested JSON data.

    Dicts and lists are walked recursively. When a dict key equals
    ``lookup_key`` its value is yielded as-is and is not searched further;
    all other values are descended into. Anything that is neither a dict
    nor a list yields nothing.

    Args:
        json_input: Decoded JSON data (dicts, lists and scalars).
        lookup_key: The dict key whose values should be collected.

    Yields:
        The value of each matching key, in traversal order.
    """
    if isinstance(json_input, dict):
        for k, v in json_input.items():
            if k == lookup_key:
                yield v
            else:
                yield from item_generator(v, lookup_key)
    elif isinstance(json_input, list):
        for item in json_input:
            yield from item_generator(item, lookup_key)
response = requests.get('https://www.youtube.com/c/TuckerBudzyn/videos')
soup = BeautifulSoup(response.text, 'html.parser')
# the video information is located in a script tag
for script_tags in soup.find_all('script'):
    script_text = str(script_tags)
    # the script contains the string "var ytInitialData"
    script_match = bool(regex.findall('var ytInitialData', script_text))
    if script_match:
        # regex is used to extract the content that we want to harvest
        extract_content = regex.match(r'.*(var ytInitialData =)(.*)(;<\/script>)', script_text)
        if extract_content is None:
            # the marker is present but the tag does not have the expected
            # shape; skip it instead of failing on .group()
            continue
        # load the extract content into JSON
        content = json.loads(extract_content.group(2))
        # use the generator to extract the dictionary with the
        # video details
        for video_dict in item_generator(content, 'gridVideoRenderer'):
            for key, value in video_dict.items():
                # here are the keys
                # -----------------
                # videoId
                # thumbnail
                # title
                # publishedTimeText
                # viewCountText
                # navigationEndpoint
                # ownerBadges
                # trackingParams
                # shortViewCountText
                # menu
                # thumbnailOverlays
                # -----------------
                if key == 'title':
                    video_title = value['runs'][0]['text']
                    # print the extracted value so the script produces output
                    print(f'Video Title: {video_title}')
                elif key == 'viewCountText':
                    video_views = value['simpleText']
                    print(f'Video Number of Views: {video_views}')
这是输出:
Video Title: This Is What My Dog Does When I Hug My Husband
Video Number of Views: 1,125,048 views
Video Title: My Dog Stole My Camera!
Video Number of Views: 808,420 views
Video Title: My Dog Rents a Log Cabin
Video Number of Views: 4,879,426 views
Video Title: My Dog Tries the Obstacle Course Challenge
Video Number of Views: 1,793,264 views
Video Title: Hugging My Dog's Brother for Too Long | Jealous Dog Reaction
Video Number of Views: 2,775,208 views
Video Title: My Dog Rents a Swimming Pool
Video Number of Views: 12,459,980 views
Video Title: Feeding My Dog Invisible Treats
Video Number of Views: 1,378,288 views
Video Title: Leaving My Dog Alone with a Juicy Steak
Video Number of Views: 4,155,106 views
truncated....