R:使用 qplot 绘制数据

R: Plotting data using qplot

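movies_df 是一个包含 100 条记录的数据框,共有 Rank、Title、Description、Runtime、Genre、Rating、Metascore、Votes、Gross_Earning_in_Mil、Director 和 Actor 这 11 列。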

使用 qplot 绘制各类型(Genre)的运行时间分布,以找出运行时间最长的类型:

# Distribution of Runtime across all movies in 30 bins, filled by Genre.
qplot(x = Runtime, data = movies_df, fill = Genre, bins = 30)



从上图可以看出,有 4 部动作片的运行时间最长(160 分钟)。

如何只针对 Genre = Action(运行时间最长的类型)的电影,绘制按 Title 区分的运行时间图?

重现数据的代码:

library("rvest")


# IMDb search URL: up to 100 feature titles with a 2016 release date.
url <- "https://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Download and parse the page; the selectors further down query this document.
webpage <- read_html(url)

## ---- PRE-PROCESSING ---- ##

# Rank: text of every ".text-primary" node, converted to a number.
rank_data_html <- webpage %>% html_nodes(".text-primary")
rank_data <- rank_data_html %>%
  html_text() %>%
  as.numeric()

# Title: text of the links found under ".lister-item-header".
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
title_data <- title_data_html %>% html_text()

# Description: text of the ".text-muted" node that directly follows
# ".ratings-bar", with newline characters removed.
desc_nodes <- webpage %>% html_nodes(".ratings-bar+.text-muted")
desc_data <- gsub("\n", "", html_text(desc_nodes))


# Runtime: strip the " min" text so the remainder converts to a number.
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")
runtime_data <- runtime_data_html %>%
  html_text() %>%
  gsub(" min", "", .) %>%
  as.numeric()

# Genre: remove newlines and spaces, then drop everything from the first
# comma onwards so only the first listed genre remains; stored as a factor.
genre_data_html <- webpage %>% html_nodes(".genre")
genre_data <- genre_data_html %>%
  html_text() %>%
  gsub("\n", "", .) %>%
  gsub(" ", "", .) %>%
  gsub(",.*", "", .) %>%
  as.factor()

# Rating: text of the <strong> element inside ".ratings-imdb-rating",
# converted to a number.
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
rating_data <- rating_data_html %>%
  html_text() %>%
  as.numeric()

# Votes: remove the commas from the text, then convert it to a number.
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")
votes_data <- votes_data_html %>%
  html_text() %>%
  gsub(",", "", .) %>%
  as.numeric()

# Director: text of the links matched by the selector below, stored as a
# factor.
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")
directors_data <- directors_data_html %>%
  html_text() %>%
  as.factor()

# Actor: text of the link that directly follows the ".ghost" element inside
# each ".lister-item-content", stored as a factor.
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")
actors_data <- actors_data_html %>%
  html_text() %>%
  as.factor()

# Metascore: read the ".metascore" text, remove spaces and convert to a
# number once, up front.
metascore_data_html <- webpage %>% html_nodes(".metascore")
metascore_data <- metascore_data_html %>%
  html_text() %>%
  gsub(" ", "", .) %>%
  as.numeric()

# Insert an NA at positions 39, 73 and 80, one after another; each insertion
# shifts the remaining values one place to the right. append(after = ) does
# the split-and-rejoin in one step and keeps the vector numeric throughout.
# NOTE(review): the positions are hard-coded - presumably the entries that
# had no metascore when the page was scraped; confirm against the live page.
for (gap in c(39, 73, 80)) {
  metascore_data <- append(metascore_data, NA, after = gap - 1)
}


# Gross earnings: read the text, remove every "M", then keep characters 2-6
# (which drops the leading character) and convert to a number.
# NOTE(review): substring(x, 2, 6) also cuts off anything after the sixth
# character - confirm no value on the page is longer than that.
gross_data_html <- webpage %>% html_nodes(".ghost~ .text-muted+ span")
gross_data <- gross_data_html %>%
  html_text() %>%
  gsub("M", "", .) %>%
  substring(2, 6) %>%
  as.numeric()

# Insert an NA at each of positions 1 to 10, one after another.
# append(after = gap - 1) is used instead of slicing with 1:(gap - 1): that
# slice is c(1, 0) when gap is 1, which kept the first value in front of the
# inserted NA and then repeated it after the NA.
# NOTE(review): the positions are hard-coded for one snapshot of the page;
# check that the final length matches the other columns before building the
# data frame.
for (gap in seq_len(10)) {
  gross_data <- append(gross_data, NA, after = gap - 1)
}



# Combine the scraped vectors into one data frame, one column per vector.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = desc_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

首先,您需要安装 dplyr 包并加载它,以便筛选数据。然后,把数据筛选为所有 Genre 等于 "Action" 的电影,再以 Title 作为填充色重新绘图即可。

library(dplyr)
# Keep only the rows whose Genre is "Action", then draw the same Runtime plot
# with the bars filled by Title instead of Genre.
newDataset <- dplyr::filter(movies_df, Genre == "Action")
qplot(x = Runtime, data = newDataset, fill = Title, bins = 30)

如果这对您有帮助,请点个赞。希望现在一切都清楚了。