NamedTuple 到 Dataframe
NamedTuple to Dataframe
我正在尝试获取某个 YouTube 频道及其视频的元数据。
整体进展顺利,但目前卡在如何把所有信息放进我需要的 DataFrame 中。
下面是我使用的代码,取自 GitHub:
https://gist.github.com/andkamau/0d4e312c97f41a975440a05fd76b1d29
import urllib.request
import json
from bs4 import BeautifulSoup
from collections import namedtuple
import pafy
from pandas import *
import pandas as pd
# Record type holding the metadata scraped for a single video.
Video = namedtuple(
    typename="Video",
    field_names="video_id title duration views thumbnail Description",
)
# Module-level frame, created empty. Assigning to ``df`` inside a function
# without a ``global`` statement binds a separate local name, not this one.
df = pd.DataFrame()
def parse_video_div(div):
    """Build a ``Video`` record from one video tile of a channel page.

    The id, title, duration, view count and thumbnail are read from ``div``
    and its child tags; the description is fetched through ``pafy`` using the
    video id.
    """
    video_id = div.get("data-context-item-id", "")

    title_link = div.find("a", "yt-uix-tile-link")
    title = title_link.text

    time_span = div.find("span", "video-time")
    duration = time_span.contents[0].text

    meta_list = div.find("ul", "yt-lockup-meta-info")
    views_text = meta_list.contents[0].text
    # rstrip() drops any trailing run of the characters in " views".
    views = str(views_text.rstrip(" views").replace(",", ""))

    img = div.find("img")
    videoDescription = pafy.new("https://www.youtube.com/watch?v=" + video_id)
    if img:
        thumbnail = "http:" + img.get("src", "")
    else:
        thumbnail = ""
    Description = videoDescription.description

    record = Video(video_id, title, duration, views, thumbnail, Description)
    # NOTE: without a ``global`` statement this binds a function-local ``df``;
    # the frame is dropped on return and only ``record`` reaches the caller.
    df = pd.DataFrame(list(record))
    return record
def parse_videos_page(page):
    """Return one parsed record per video tile found in ``page``."""
    records = []
    for tile in page.find_all("div", "yt-lockup-video"):
        records.append(parse_video_div(tile))
    return records
def find_load_more_url(page, base_url="http://www.youtube.com"):
    """Return the absolute URL of the next batch of videos, or ``None``.

    Scans every ``<button>`` in ``page`` for a ``data-uix-load-more-href``
    attribute and prefixes the first non-empty value with ``base_url``.

    Args:
        page: Parsed document (or fragment) exposing ``find_all``.
        base_url: Scheme and host prepended to the relative link.

    Returns:
        The absolute URL as a string, or ``None`` when no button carries a
        load-more link.
    """
    for button in page.find_all("button"):
        relative_url = button.get("data-uix-load-more-href")
        if relative_url:
            return base_url + relative_url
    # Spell out the "no further page" outcome instead of falling off the end.
    return None
def download_page(url):
    """Fetch ``url`` and return the raw response body as ``bytes``.

    A progress line naming the URL is printed before the request is made.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The undecoded response body.
    """
    print("Downloading {0}".format(url))
    # Close the response as soon as the body is read rather than leaving the
    # connection open until the object happens to be garbage-collected.
    with urllib.request.urlopen(url) as response:
        return response.read()
def get_videos(username):
    """Collect the parsed records for every video listed on a channel.

    Downloads the channel's ``/videos`` page and parses its tiles, then keeps
    following the "load more" link, whose JSON response carries further tiles
    and the next link, until no link is offered.

    Args:
        username: Value substituted into the ``/channel/{0}/videos`` URL.

    Returns:
        A list of the records produced by ``parse_videos_page``, in page order.
    """
    page_url = "http://www.youtube.com/channel/{0}/videos".format(username)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed (and warns), so results can differ between machines.
    page = BeautifulSoup(download_page(page_url), "html.parser")
    videos = parse_videos_page(page)
    page_url = find_load_more_url(page)
    while page_url:
        # decode() already yields str, so no extra str() wrapper is needed.
        json_data = json.loads(download_page(page_url).decode("utf-8"))
        page = BeautifulSoup(json_data.get("content_html", ""), "html.parser")
        videos.extend(parse_videos_page(page))
        more_widget = BeautifulSoup(
            json_data.get("load_more_widget_html", ""), "html.parser"
        )
        page_url = find_load_more_url(more_widget)
    return videos
if __name__ == "__main__":
    # Scrape one channel, print each record, then report how many were found.
    videos = get_videos("UC-M9eLhclbe16sDaxLzc0ng")
    for entry in videos:
        print(entry)
    print("{0} videos".format(len(videos)))
函数 parse_video_div(div) 已经拿到了所有信息,我也在其中构建了 DataFrame。
但遗憾的是,这个 DataFrame 里什么都没有。也许我需要以某种方式遍历 namedtuple。
有谁能提示一下,怎样才能让 DataFrame 里包含我的数据?
pd.DataFrame 可以直接接受 namedtuple,并且会根据字段名自动生成列。
示例:
In [21]: Video = namedtuple("Video", "video_id title duration views thumbnail Description")
In [22]: pd.DataFrame(data=[Video(1, 'Vid Title', 5, 10, 'Thumb', ' Des')])
Out[22]:
video_id title duration views thumbnail Description
0 1 Vid Title 5 10 Thumb Des
既然您的函数实际上并没有返回 df,代码的其他地方也没有用到它,您是如何确定它是空的呢?
更新
您只需把 parse_video_div 的返回值改为一个 pd.DataFrame,
然后在 get_videos 函数中把这些 DataFrame 组成的列表连接成一个 pd.DataFrame。
下面只列出需要修改的部分。
def parse_video_div(div):
    # ... field extraction unchanged (elided in this excerpt) ...
    record = Video(video_id, title, duration, views, thumbnail, Description)
    # Wrap the record in a one-element list so the frame has a single row.
    return pd.DataFrame(data=[record])
    # shorter version, reusing the ``l`` record the original function builds:
    # return pd.DataFrame(data=[l])
def get_videos(username):
    # ... page download / parsing loop unchanged (elided in this excerpt) ...
    # pd.concat raises ValueError on an empty sequence, so when no videos were
    # found return an empty frame that still carries the expected columns.
    if not videos:
        return pd.DataFrame(columns=list(Video._fields))
    videos_df = pd.concat(videos, ignore_index=True)
    return videos_df  # return the DataFrame
归根结底,您需要一个连接(concat)步骤。在 parse_video_div 中,您可以返回任何能作为 pd.DataFrame 输入的对象:dict、pd.Series、namedtuple,甚至是列表。
在这个例子中,我为了简单起见选择了 pd.DataFrame,但就性能而言,这可能会给处理过程增加几毫秒。
我正在尝试获取某个 YouTube 频道及其视频的元数据。
整体进展顺利,但目前卡在如何把所有信息放进我需要的 DataFrame 中。
下面是我使用的代码,取自 GitHub:
https://gist.github.com/andkamau/0d4e312c97f41a975440a05fd76b1d29
import urllib.request
import json
from bs4 import BeautifulSoup
from collections import namedtuple
import pafy
from pandas import *
import pandas as pd
# Record type holding the metadata scraped for a single video.
Video = namedtuple(
    typename="Video",
    field_names="video_id title duration views thumbnail Description",
)
# Module-level frame, created empty. Assigning to ``df`` inside a function
# without a ``global`` statement binds a separate local name, not this one.
df = pd.DataFrame()
def parse_video_div(div):
    """Build a ``Video`` record from one video tile of a channel page.

    The id, title, duration, view count and thumbnail are read from ``div``
    and its child tags; the description is fetched through ``pafy`` using the
    video id.
    """
    video_id = div.get("data-context-item-id", "")

    title_link = div.find("a", "yt-uix-tile-link")
    title = title_link.text

    time_span = div.find("span", "video-time")
    duration = time_span.contents[0].text

    meta_list = div.find("ul", "yt-lockup-meta-info")
    views_text = meta_list.contents[0].text
    # rstrip() drops any trailing run of the characters in " views".
    views = str(views_text.rstrip(" views").replace(",", ""))

    img = div.find("img")
    videoDescription = pafy.new("https://www.youtube.com/watch?v=" + video_id)
    if img:
        thumbnail = "http:" + img.get("src", "")
    else:
        thumbnail = ""
    Description = videoDescription.description

    record = Video(video_id, title, duration, views, thumbnail, Description)
    # NOTE: without a ``global`` statement this binds a function-local ``df``;
    # the frame is dropped on return and only ``record`` reaches the caller.
    df = pd.DataFrame(list(record))
    return record
def parse_videos_page(page):
    """Return one parsed record per video tile found in ``page``."""
    records = []
    for tile in page.find_all("div", "yt-lockup-video"):
        records.append(parse_video_div(tile))
    return records
def find_load_more_url(page, base_url="http://www.youtube.com"):
    """Return the absolute URL of the next batch of videos, or ``None``.

    Scans every ``<button>`` in ``page`` for a ``data-uix-load-more-href``
    attribute and prefixes the first non-empty value with ``base_url``.

    Args:
        page: Parsed document (or fragment) exposing ``find_all``.
        base_url: Scheme and host prepended to the relative link.

    Returns:
        The absolute URL as a string, or ``None`` when no button carries a
        load-more link.
    """
    for button in page.find_all("button"):
        relative_url = button.get("data-uix-load-more-href")
        if relative_url:
            return base_url + relative_url
    # Spell out the "no further page" outcome instead of falling off the end.
    return None
def download_page(url):
    """Fetch ``url`` and return the raw response body as ``bytes``.

    A progress line naming the URL is printed before the request is made.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The undecoded response body.
    """
    print("Downloading {0}".format(url))
    # Close the response as soon as the body is read rather than leaving the
    # connection open until the object happens to be garbage-collected.
    with urllib.request.urlopen(url) as response:
        return response.read()
def get_videos(username):
    """Collect the parsed records for every video listed on a channel.

    Downloads the channel's ``/videos`` page and parses its tiles, then keeps
    following the "load more" link, whose JSON response carries further tiles
    and the next link, until no link is offered.

    Args:
        username: Value substituted into the ``/channel/{0}/videos`` URL.

    Returns:
        A list of the records produced by ``parse_videos_page``, in page order.
    """
    page_url = "http://www.youtube.com/channel/{0}/videos".format(username)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed (and warns), so results can differ between machines.
    page = BeautifulSoup(download_page(page_url), "html.parser")
    videos = parse_videos_page(page)
    page_url = find_load_more_url(page)
    while page_url:
        # decode() already yields str, so no extra str() wrapper is needed.
        json_data = json.loads(download_page(page_url).decode("utf-8"))
        page = BeautifulSoup(json_data.get("content_html", ""), "html.parser")
        videos.extend(parse_videos_page(page))
        more_widget = BeautifulSoup(
            json_data.get("load_more_widget_html", ""), "html.parser"
        )
        page_url = find_load_more_url(more_widget)
    return videos
if __name__ == "__main__":
    # Scrape one channel, print each record, then report how many were found.
    videos = get_videos("UC-M9eLhclbe16sDaxLzc0ng")
    for entry in videos:
        print(entry)
    print("{0} videos".format(len(videos)))
函数 parse_video_div(div) 已经拿到了所有信息,我也在其中构建了 DataFrame。
但遗憾的是,这个 DataFrame 里什么都没有。也许我需要以某种方式遍历 namedtuple。
有谁能提示一下,怎样才能让 DataFrame 里包含我的数据?
pd.DataFrame 可以直接接受 namedtuple,并且会根据字段名自动生成列。
示例:
In [21]: Video = namedtuple("Video", "video_id title duration views thumbnail Description")
In [22]: pd.DataFrame(data=[Video(1, 'Vid Title', 5, 10, 'Thumb', ' Des')])
Out[22]:
video_id title duration views thumbnail Description
0 1 Vid Title 5 10 Thumb Des
既然您的函数实际上并没有返回 df,代码的其他地方也没有用到它,您是如何确定它是空的呢?
更新
您只需把 parse_video_div 的返回值改为一个 pd.DataFrame,
然后在 get_videos 函数中把这些 DataFrame 组成的列表连接成一个 pd.DataFrame。
下面只列出需要修改的部分。
def parse_video_div(div):
    # ... field extraction unchanged (elided in this excerpt) ...
    record = Video(video_id, title, duration, views, thumbnail, Description)
    # Wrap the record in a one-element list so the frame has a single row.
    return pd.DataFrame(data=[record])
    # shorter version, reusing the ``l`` record the original function builds:
    # return pd.DataFrame(data=[l])
def get_videos(username):
    # ... page download / parsing loop unchanged (elided in this excerpt) ...
    # pd.concat raises ValueError on an empty sequence, so when no videos were
    # found return an empty frame that still carries the expected columns.
    if not videos:
        return pd.DataFrame(columns=list(Video._fields))
    videos_df = pd.concat(videos, ignore_index=True)
    return videos_df  # return the DataFrame
归根结底,您需要一个连接(concat)步骤。在 parse_video_div 中,您可以返回任何能作为 pd.DataFrame 输入的对象:dict、pd.Series、namedtuple,甚至是列表。
在这个例子中,我为了简单起见选择了 pd.DataFrame,但就性能而言,这可能会给处理过程增加几毫秒。