如何在R中可视化hashtags,并查看hashtags的趋势?

How to visualize hashtags in R, and see the trends of the hashtags?

我正在做趋势分析,并尝试使用条形图来可视化不同年份中主题标签的频率。这样我就可以看到最常见的前 3 个主题标签,并了解这些标签的频率在这些年中是如何演变的。我有一个这样的数据集:

    terms          year
1   #A;#B;#C       2017
2   #B;#C;#D       2016
3   #C;#D;#E       2021
4   #D;#E;#F       2020
5   #E;#F;#G       2020
6   #F;#G;#H       2020
7   #G;#H;#I       2019
8   #H;#I;#J       2018
9   #I;#J;#K       2020
10  #J;#K;#L       2020

谢谢!

基本上,我们需要统计每年各个主题标签出现的次数。由于同一行的多个主题标签都放在同一列里,我们需要先把它们拆开,再把 df 转换为长格式,然后按年份和主题标签分组来计数。

library(tidyverse)

# Example data from the question: one row per post, with the post's hashtags
# packed into a single ";"-separated string plus the year of the post.
df <- structure(
  list(
    terms = c(
      "#A;#B;#C", "#B;#C;#D", "#C;#D;#E", "#D;#E;#F", "#E;#F;#G",
      "#F;#G;#H", "#G;#H;#I", "#H;#I;#J", "#I;#J;#K", "#J;#K;#L"
    ),
    year = c(2017, 2016, 2021, 2020, 2020, 2020, 2019, 2018, 2020, 2020)
  ),
  row.names = c(NA, -10L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Count how often every hashtag occurs in every year and draw the counts as
# dodged bars (one bar per hashtag within a year), each labelled with its count.
df %>%
   # One row per hashtag. Unlike separate() into fixed columns t1..t3, this
   # works for any number of hashtags in a row.
   separate_rows(terms, sep = ";") %>%
   # count() groups by the given columns itself, so no group_by() is needed.
   count(year, hashtag = terms) %>%
   ggplot(aes(x = year, y = n, fill = hashtag, label = n)) +
   # Bars are 0.9 wide by default; the text layer has no width of its own, so
   # both layers get the same explicit dodge width to keep labels on their bars.
   geom_col(position = position_dodge(width = 0.9)) +
   geom_text(position = position_dodge(width = 0.9))

由 reprex 包 (v0.3.0) 于 2021-02-05 创建

您可以创建一个新的数据框,让每个主题标签各占一行,并带上它所对应的年份信息。

之后,您可以使用 geom_bar 来处理数据。

我无法将图上传到此帖子,因为这是一个新帐户。

library(tidyverse)
library(data.table)

# Your data:
#
#      terms          year
# 1    #A;#B;#C       2017
# 2    #B;#C;#D       2016
# 3    #C;#D;#E       2021
# 4    #D;#E;#F       2020
# 5    #E;#F;#G       2020
# 6    #F;#G;#H       2020
# 7    #G;#H;#I       2019
# 8    #H;#I;#J       2018
# 9    #I;#J;#K       2020
# 10   #J;#K;#L       2020

# Make a data frame that looks like your data. All ten rows are included,
# "#F;#G;#H" / 2020 among them, so the counts match the table above.
df <- data.frame(
  terms = c(
    "#A;#B;#C",
    "#B;#C;#D",
    "#C;#D;#E",
    "#D;#E;#F",
    "#E;#F;#G",
    "#F;#G;#H",
    "#G;#H;#I",
    "#H;#I;#J",
    "#I;#J;#K",
    "#J;#K;#L"
  ),
  year = c(2017, 2016, 2021, 2020, 2020, 2020, 2019, 2018, 2020, 2020),
  stringsAsFactors = FALSE
)

# Read the columns back out of what I assume is your data frame.
# as.character() keeps the split below working even if `terms` is a factor.
terms <- as.character(df$terms)
year.list <- df$year

loopcount <- length(terms)

# Empty placeholders that are filled in later.
year <- c()
hashtags <- c()

# Empty result frame that already has the two columns it will hold.
all.years <- data.frame(
  hashtags = character(),
  year = numeric(),
  stringsAsFactors = FALSE
)

# Split every row's hashtag string on the literal ";" separator; the result is
# a list with one character vector of hashtags per row of `df`.
hashtag.list <- strsplit(terms, ";", fixed = TRUE)

通过这个循环你创建了一个新的 DF

# Make a new data frame in which every hashtag gets its own row together with
# the year of the row it came from.
#
# `hashtag.list` holds one character vector of hashtags per original row and
# `year.list` the matching years. Flattening the list and repeating each year
# once per hashtag of its row gives the long table in one step. This avoids
# growing a data frame with rbind() inside a loop, and the `1:n` loops that run
# backwards (1, 0) when there is nothing to iterate over.
all.years <- data.frame(
  hashtags = as.character(unlist(hashtag.list, use.names = FALSE)),
  year = rep(year.list, times = lengths(hashtag.list)),
  stringsAsFactors = FALSE
)

hashtagDF <- all.years


head(hashtagDF)

然后可以使用新的 DF 来绘制您想要的内容

或者,如果我没有理解错你的意思:

如果要显示每年主题标签的频率,您可以创建一个新的 df,其中只包含每年最常用的前 3 个主题标签。

# Only include the three most used hashtags per year.
#
# Result: `top.3`, with one row per (hashtag, year) kept and the columns
#   hashtags - the hashtag
#   year     - the year, as character
#   Freq     - how often the hashtag occurs in that year
#   yearF    - the year as a factor (used for the x axis of the plot)

years.in.study <- unique(hashtagDF$year)

top.3.per.year <- lapply(years.in.study, function(what.year) {
  # Subset per year.
  one.subset <- subset(hashtagDF, year == what.year)

  # Calculate how often a hashtag is present in that year. table() on the
  # two-column subset yields the columns hashtags, year and Freq.
  frequency.per.year <- as.data.frame(table(one.subset))
  frequency.per.year <-
    frequency.per.year[order(-frequency.per.year$Freq), ]

  # Only keep the 3 most occurring terms. head() simply returns fewer rows
  # when a year has fewer than three distinct hashtags, whereas dropping rows
  # `4:n` would delete real rows whenever n < 3.
  head(frequency.per.year, 3)
})

# Make one data frame with all years.
top.3 <- do.call(rbind, top.3.per.year)

top.3

# Order by year and keep the sorted result.
top.3$year <- as.character(top.3$year)
top.3 <- top.3[order(top.3$year), ]
rownames(top.3) <- NULL
top.3

# Year should be a factor.
top.3$yearF <- as.factor(top.3$year)

然后就可以画图了

# Plot as a stacked bar chart showing
#   - the frequencies of the hashtags in the different years,
#   - restricted to the top 3 most frequent hashtags per year.
#
# `top.3` supplies the pre-computed counts (Freq), so geom_col() is used: it
# takes the bar heights straight from the data instead of counting rows.
#
# NOTE: the plot object is named `barplot`, the same name as
# graphics::barplot(). Calls like barplot(...) still reach the function, but
# the shared name is easy to misread.
barplot <- ggplot(data = top.3, aes(x = yearF, y = Freq, fill = hashtags)) +
  geom_col() +
  labs(
    title = "",
    subtitle = "",
    caption = "",
    x = "",
    y = ""
  )


barplot


# Save the plot as an A4-portrait-sized image (210 mm x 297 mm).
ggsave(
  filename = "hashtag.png",
  plot = barplot,
  width = 210,
  height = 297,
  units = "mm"
)

要为 Base R 中的每一年生成一个易于阅读的图表,我们可以执行以下操作:

代码

# First create a list with one entry per year that we can use for plotting.

# Split the data by year; the list names are the years.
listdf <- split(df, df$year)

# Keep only the hashtag strings of each year (selected by column name rather
# than by position, so the column order of `df` does not matter).
listdf <- lapply(listdf, "[[", "terms")

# Split the strings on ";" to get all hashtags of a year as one vector.
listdfplot <- lapply(listdf, function(x) {
  unlist(strsplit(x, ";"))
})


# Plot
# Lay the plots out in a grid of 3 columns with as many rows as are needed
# (2 x 3 for the six years in the example data). par() returns the previous
# settings, which are restored after plotting.
n.rows <- max(1, ceiling(length(listdfplot) / 3))
old.par <- par(mfrow = c(n.rows, 3))

# One barplot per year. graphics::barplot is called explicitly so the call is
# unaffected by any object named `barplot` in the workspace; invisible() keeps
# the bar midpoints returned by barplot() from being printed.
invisible(Map(
  function(x, y) {
    graphics::barplot(table(x), main = paste("Trends in", y), las = 2)
  },
  listdfplot,
  names(listdfplot)
))

par(old.par)

数据

# Example data: `terms` holds the ";"-separated hashtags of one post and
# `year` the year of that post (ten posts in total).
df <- structure(
  list(
    terms = c(
      "#A;#B;#C", "#B;#C;#D", "#C;#D;#E", "#D;#E;#F", "#E;#F;#G",
      "#F;#G;#H", "#G;#H;#I", "#H;#I;#J", "#I;#J;#K", "#J;#K;#L"
    ),
    year = c(2017, 2016, 2021, 2020, 2020, 2020, 2019, 2018, 2020, 2020)
  ),
  row.names = c(NA, -10L),
  class = c("tbl_df", "tbl", "data.frame")
)