R:使用 qplot 绘制数据
R: Plotting data using qplot
movies_df 是一个包含 100 条记录的数据框,其结构见下文“重现数据的代码”。
下面使用 qplot 绘制各类型(Genre)的运行时间(Runtime)分布,以找出运行时间最长的类型:
# Plot the distribution of Runtime in 30 bins, with the fill colour mapped
# to Genre.
qplot(Runtime, data = movies_df, fill = Genre, bins = 30)
从上图可以看出,运行时间最长(160 分钟)的是 4 部动作片。
如何针对 Genre 为 Action(运行时间最长的类型)的电影,绘制按 Title 区分的图?
重现数据的代码:
# Load rvest and download the IMDb search-results page requested by the URL
# (count = 100, release_date = 2016, title_type = feature). The parsed page
# is kept in `webpage` for the scraping steps that follow.
library(rvest)
url <- "https://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
webpage <- read_html(url)
## ---- PRE-PROCESSING ---- ##
# Every field follows the same recipe: select nodes with a CSS selector,
# take their text, strip unwanted characters, convert to the final type.

# Rank: node text converted to numbers.
rank_data_html <- html_nodes(webpage, ".text-primary")
rank_data <- as.numeric(html_text(rank_data_html))

# Title: text of the links inside each list-item header.
title_data_html <- html_nodes(webpage, ".lister-item-header a")
title_data <- html_text(title_data_html)

# Description: node text with newline characters removed.
desc_nodes <- html_nodes(webpage, ".ratings-bar+.text-muted")
desc_data <- gsub("\n", "", html_text(desc_nodes))

# Runtime: drop the " min" suffix and convert to numbers.
runtime_data_html <- html_nodes(webpage, ".text-muted .runtime")
runtime_data <- as.numeric(gsub(" min", "", html_text(runtime_data_html)))

# Genre: remove newlines and spaces, then cut everything from the first
# comma onwards so only the first listed genre remains; stored as a factor.
genre_data_html <- html_nodes(webpage, ".genre")
genre_data <- gsub(" ", "", gsub("\n", "", html_text(genre_data_html)))
genre_data <- as.factor(gsub(",.*", "", genre_data))

# Rating: node text converted to numbers.
rating_data_html <- html_nodes(webpage, ".ratings-imdb-rating strong")
rating_data <- as.numeric(html_text(rating_data_html))

# Votes: remove the thousands separators before converting to numbers.
votes_data_html <- html_nodes(
  webpage, ".sort-num_votes-visible span:nth-child(2)"
)
votes_data <- as.numeric(gsub(",", "", html_text(votes_data_html)))

# Director: node text stored as a factor.
directors_data_html <- html_nodes(webpage, ".text-muted+ p a:nth-child(1)")
directors_data <- as.factor(html_text(directors_data_html))

# Actor: node text stored as a factor.
actors_data_html <- html_nodes(webpage, ".lister-item-content .ghost+ a")
actors_data <- as.factor(html_text(actors_data_html))
# Metascore: scrape the ".metascore" nodes, strip spaces and convert the
# text to numbers.
metascore_data_html <- html_nodes(webpage, ".metascore")
metascore_data <- html_text(metascore_data_html)
metascore_data <- as.numeric(gsub(" ", "", metascore_data))

# Insert a missing value at each listed position so the vector lines up with
# the other columns.
# NOTE(review): presumably these are the ranks of movies without a Metascore
# in one snapshot of the page -- confirm against the page actually fetched.
for (i in c(39, 73, 80)) {
  # `after = i - 1` places the NA at index i and shifts the rest right.
  # A real NA is used instead of the string "NA", so no coercion warning is
  # raised and the vector stays numeric throughout.
  metascore_data <- append(metascore_data, NA_real_, after = i - 1)
}
# Gross earnings: scrape the text, remove the "M" suffix, keep characters
# 2-6 (which drops the leading character) and convert to numbers.
# NOTE(review): a fixed 2-6 window may truncate longer amounts -- confirm
# against the scraped strings.
gross_data_html <- html_nodes(webpage, ".ghost~ .text-muted+ span")
gross_data <- html_text(gross_data_html)
gross_data <- gsub("M", "", gross_data)
gross_data <- as.numeric(substring(gross_data, 2, 6))

# Insert a missing value at each of the positions 1..10.
# The previous slicing `gross_data[1:(i - 1)]` evaluated `1:0` (= c(1, 0))
# for i = 1, which copied the first value in front of the placeholders and
# left the vector one element too long. `append(after = i - 1)` handles the
# i = 1 case (after = 0 means "prepend") correctly.
for (i in 1:10) {
  gross_data <- append(gross_data, NA_real_, after = i - 1)
}
# Combine the scraped vectors into a single data frame, one column per
# field. data.frame() requires the columns to have compatible lengths.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = desc_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
首先需要安装 dplyr 包并加载它,以便筛选数据(qplot 来自 ggplot2 包,同样需要先加载)。
然后把数据筛选为 Genre 等于 "Action" 的所有电影,再按 Title 着色重新绘图即可:
# Keep only the rows whose Genre equals "Action", then draw the Runtime
# plot again with the fill colour mapped to Title instead of Genre.
library(dplyr)
newDataset <- filter(movies_df, Genre == "Action")
qplot(Runtime, data = newDataset, fill = Title, bins = 30)
如果对您有帮助,请给个好评。我希望现在一切都清楚了
movies_df 是一个包含 100 条记录的数据框,其结构见下文“重现数据的代码”。
下面使用 qplot 绘制各类型(Genre)的运行时间(Runtime)分布,以找出运行时间最长的类型:
# Plot the distribution of Runtime in 30 bins, with the fill colour mapped
# to Genre.
qplot(Runtime, data = movies_df, fill = Genre, bins = 30)
从上图可以看出,运行时间最长(160 分钟)的是 4 部动作片。
如何针对 Genre 为 Action(运行时间最长的类型)的电影,绘制按 Title 区分的图?
重现数据的代码:
# Load rvest and download the IMDb search-results page requested by the URL
# (count = 100, release_date = 2016, title_type = feature). The parsed page
# is kept in `webpage` for the scraping steps that follow.
library(rvest)
url <- "https://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
webpage <- read_html(url)
## ---- PRE-PROCESSING ---- ##
# Every field follows the same recipe: select nodes with a CSS selector,
# take their text, strip unwanted characters, convert to the final type.

# Rank: node text converted to numbers.
rank_data_html <- html_nodes(webpage, ".text-primary")
rank_data <- as.numeric(html_text(rank_data_html))

# Title: text of the links inside each list-item header.
title_data_html <- html_nodes(webpage, ".lister-item-header a")
title_data <- html_text(title_data_html)

# Description: node text with newline characters removed.
desc_nodes <- html_nodes(webpage, ".ratings-bar+.text-muted")
desc_data <- gsub("\n", "", html_text(desc_nodes))

# Runtime: drop the " min" suffix and convert to numbers.
runtime_data_html <- html_nodes(webpage, ".text-muted .runtime")
runtime_data <- as.numeric(gsub(" min", "", html_text(runtime_data_html)))

# Genre: remove newlines and spaces, then cut everything from the first
# comma onwards so only the first listed genre remains; stored as a factor.
genre_data_html <- html_nodes(webpage, ".genre")
genre_data <- gsub(" ", "", gsub("\n", "", html_text(genre_data_html)))
genre_data <- as.factor(gsub(",.*", "", genre_data))

# Rating: node text converted to numbers.
rating_data_html <- html_nodes(webpage, ".ratings-imdb-rating strong")
rating_data <- as.numeric(html_text(rating_data_html))

# Votes: remove the thousands separators before converting to numbers.
votes_data_html <- html_nodes(
  webpage, ".sort-num_votes-visible span:nth-child(2)"
)
votes_data <- as.numeric(gsub(",", "", html_text(votes_data_html)))

# Director: node text stored as a factor.
directors_data_html <- html_nodes(webpage, ".text-muted+ p a:nth-child(1)")
directors_data <- as.factor(html_text(directors_data_html))

# Actor: node text stored as a factor.
actors_data_html <- html_nodes(webpage, ".lister-item-content .ghost+ a")
actors_data <- as.factor(html_text(actors_data_html))
# Metascore: scrape the ".metascore" nodes, strip spaces and convert the
# text to numbers.
metascore_data_html <- html_nodes(webpage, ".metascore")
metascore_data <- html_text(metascore_data_html)
metascore_data <- as.numeric(gsub(" ", "", metascore_data))

# Insert a missing value at each listed position so the vector lines up with
# the other columns.
# NOTE(review): presumably these are the ranks of movies without a Metascore
# in one snapshot of the page -- confirm against the page actually fetched.
for (i in c(39, 73, 80)) {
  # `after = i - 1` places the NA at index i and shifts the rest right.
  # A real NA is used instead of the string "NA", so no coercion warning is
  # raised and the vector stays numeric throughout.
  metascore_data <- append(metascore_data, NA_real_, after = i - 1)
}
# Gross earnings: scrape the text, remove the "M" suffix, keep characters
# 2-6 (which drops the leading character) and convert to numbers.
# NOTE(review): a fixed 2-6 window may truncate longer amounts -- confirm
# against the scraped strings.
gross_data_html <- html_nodes(webpage, ".ghost~ .text-muted+ span")
gross_data <- html_text(gross_data_html)
gross_data <- gsub("M", "", gross_data)
gross_data <- as.numeric(substring(gross_data, 2, 6))

# Insert a missing value at each of the positions 1..10.
# The previous slicing `gross_data[1:(i - 1)]` evaluated `1:0` (= c(1, 0))
# for i = 1, which copied the first value in front of the placeholders and
# left the vector one element too long. `append(after = i - 1)` handles the
# i = 1 case (after = 0 means "prepend") correctly.
for (i in 1:10) {
  gross_data <- append(gross_data, NA_real_, after = i - 1)
}
# Combine the scraped vectors into a single data frame, one column per
# field. data.frame() requires the columns to have compatible lengths.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = desc_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
首先需要安装 dplyr 包并加载它,以便筛选数据(qplot 来自 ggplot2 包,同样需要先加载)。然后把数据筛选为 Genre 等于 "Action" 的所有电影,再按 Title 着色重新绘图即可:
# Keep only the rows whose Genre equals "Action", then draw the Runtime
# plot again with the fill colour mapped to Title instead of Genre.
library(dplyr)
newDataset <- filter(movies_df, Genre == "Action")
qplot(Runtime, data = newDataset, fill = Title, bins = 30)
如果对您有帮助,请给个好评。我希望现在一切都清楚了