我们如何使用 R 删除特定用户(拥有大量推文的用户)的推文以进行情绪分析?
How can we remove tweets from a specific user (user with high number of tweets) for sentiment analysis using R?
目的:对美国法院对同性婚姻的历史判决进行情感分析。
# 由于某些用户的推文数量非常高,因此可能会引入偏差。我们怎样才能删除它们?
# 另外,为什么 usafull 和 total 中的唯一推文数量不同?
# Session setup: load packages, fetch a CA bundle and authenticate with Twitter.
# The workspace is deliberately NOT wiped with rm(list = ls()): that call
# destroys unrelated objects whenever the script is sourced, and it does not
# reset loaded packages or options anyway. Restart R for a clean session.
library(twitteR)
library(wordcloud)
library(tm)
# Fetch the CA bundle over HTTPS so the certificate file itself cannot be
# altered in transit.
download.file(url = "https://curl.se/ca/cacert.pem", destfile = "cacert.pem")
# Placeholder credentials: substitute the values of your own Twitter app and
# keep real secrets out of version control.
consumer_key <- "key"
consumer_secret <- "secret"
access_token <- "key"
access_secret <- "secret"
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
# Request up to 1500 English tweets for each hashtag of interest.
usa <- searchTwitter("#LoveWins", n = 1500, lang = "en")
usa2 <- searchTwitter("#LGBT", n = 1500, lang = "en")
usa3 <- searchTwitter("#gay", n = 1500, lang = "en")
# Extract the text of every tweet. vapply() always yields a character vector,
# whereas sapply() returns an empty list when a search has no results, which
# would silently turn the combined vector below into a list.
tusa <- vapply(usa, function(x) x$getText(), character(1))
tusa2 <- vapply(usa2, function(x) x$getText(), character(1))
tusa3 <- vapply(usa3, function(x) x$getText(), character(1))
# Join the texts of the three searches.
total <- c(tusa, tusa2, tusa3)
# Drop every text that has already been seen; entries with identical text
# collapse into a single element here.
total <- total[!duplicated(total)]
# Number of unique tweet texts.
uni <- length(total)
# Concatenate the three search results, convert the combined list of tweets
# into a data frame and keep only distinct rows.
usafull <- unique(twListToDF(c(usa, usa2, usa3)))
# Derive a calendar-day column (YYYY-MM-DD) from the creation timestamp and
# tabulate the number of tweets per day.
usafull[["date"]] <- format(usafull[["created"]], format = "%Y-%m-%d")
table(usafull[["date"]])
# Count tweets per user and sort the users from most to least active.
tdata <- as.data.frame(table(usafull$screenName))
names(tdata) <- c("User", "Tweets")
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
tdata <- tdata[order(tdata$Tweets, decreasing = TRUE), ]
head(tdata)
# Plot the frequency of tweets over time in one-hour bins.
library(ggplot2)
minutes <- 60
# The x axis is a date-time scale, so the bin width is given in seconds:
# 60 seconds * 60 minutes = one hour per bar.
# geom_histogram() is used because geom_bar() no longer accepts `binwidth`.
ggplot(data = usafull, aes(x = created)) +
  geom_histogram(aes(fill = ..count..), binwidth = 60 * minutes) +
  scale_x_datetime("Date") +
  scale_y_continuous("Frequency")
# Horizontal bar chart of the most active users (at most 30), used to spot
# accounts with an unusually high number of tweets.
par(mar = c(5, 10, 2, 2))  # wide left margin so the user names fit
# head() copes with fewer than 30 users, whereas tdata[rev(1:30), ] would pad
# the chart with NA rows. rev() places the most active user at the top.
with(
  head(tdata, 30),
  barplot(
    rev(Tweets),
    names.arg = rev(as.character(User)),
    horiz = TRUE,
    las = 1,
    main = "Top 30: Tweets per user",
    col = 1
  )
)
# Users with more than 20 tweets: the accounts to drop in order to reduce bias.
# Selecting the first column directly yields a plain vector of user names.
userid <- tdata[tdata$Tweets > 20, 1]
根据你的代码,我了解到你想删除 userid 中所列用户的推文,一种方法如下:
# Drop every tweet written by one of the high-volume users.
# `userid` is a plain vector once its first column has been extracted, so
# `userid$User` would fail with "$ operator is invalid for atomic vectors";
# both the vector form and the original data-frame form are accepted here.
heavy_users <- if (is.data.frame(userid)) userid$User else userid
usafull_nobias <- usafull[!(usafull$screenName %in% as.character(heavy_users)), ]
至于为什么你在 total 和 usafull 中得到的推文数量不同,这可能是因为:在 total 中你是按推文的文本查找重复项,而在 usafull 中你比较的是完整的推文记录;例如,转推可能具有相同的文本,但来自不同的用户、具有不同的 ID 等,因此在 usafull 中不会被视为重复。
希望对您有所帮助。
目的:对美国法院对同性婚姻的历史判决进行情感分析。由于某些用户的推文数量非常高,因此可能会引入偏差,我们怎样才能删除这些推文?另外,为什么 usafull 和 total 中的唯一推文数量不同?
# Session setup: load packages, fetch a CA bundle and authenticate with Twitter.
# The workspace is deliberately NOT wiped with rm(list = ls()): that call
# destroys unrelated objects whenever the script is sourced, and it does not
# reset loaded packages or options anyway. Restart R for a clean session.
library(twitteR)
library(wordcloud)
library(tm)
# Fetch the CA bundle over HTTPS so the certificate file itself cannot be
# altered in transit.
download.file(url = "https://curl.se/ca/cacert.pem", destfile = "cacert.pem")
# Placeholder credentials: substitute the values of your own Twitter app and
# keep real secrets out of version control.
consumer_key <- "key"
consumer_secret <- "secret"
access_token <- "key"
access_secret <- "secret"
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
# Request up to 1500 English tweets for each hashtag of interest.
usa <- searchTwitter("#LoveWins", n = 1500, lang = "en")
usa2 <- searchTwitter("#LGBT", n = 1500, lang = "en")
usa3 <- searchTwitter("#gay", n = 1500, lang = "en")
# Extract the text of every tweet. vapply() always yields a character vector,
# whereas sapply() returns an empty list when a search has no results, which
# would silently turn the combined vector below into a list.
tusa <- vapply(usa, function(x) x$getText(), character(1))
tusa2 <- vapply(usa2, function(x) x$getText(), character(1))
tusa3 <- vapply(usa3, function(x) x$getText(), character(1))
# Join the texts of the three searches.
total <- c(tusa, tusa2, tusa3)
# Drop every text that has already been seen; entries with identical text
# collapse into a single element here.
total <- total[!duplicated(total)]
# Number of unique tweet texts.
uni <- length(total)
# Concatenate the three search results, convert the combined list of tweets
# into a data frame and keep only distinct rows.
usafull <- unique(twListToDF(c(usa, usa2, usa3)))
# Derive a calendar-day column (YYYY-MM-DD) from the creation timestamp and
# tabulate the number of tweets per day.
usafull[["date"]] <- format(usafull[["created"]], format = "%Y-%m-%d")
table(usafull[["date"]])
# Count tweets per user and sort the users from most to least active.
tdata <- as.data.frame(table(usafull$screenName))
names(tdata) <- c("User", "Tweets")
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
tdata <- tdata[order(tdata$Tweets, decreasing = TRUE), ]
head(tdata)
# Plot the frequency of tweets over time in one-hour bins.
library(ggplot2)
minutes <- 60
# The x axis is a date-time scale, so the bin width is given in seconds:
# 60 seconds * 60 minutes = one hour per bar.
# geom_histogram() is used because geom_bar() no longer accepts `binwidth`.
ggplot(data = usafull, aes(x = created)) +
  geom_histogram(aes(fill = ..count..), binwidth = 60 * minutes) +
  scale_x_datetime("Date") +
  scale_y_continuous("Frequency")
# Horizontal bar chart of the most active users (at most 30), used to spot
# accounts with an unusually high number of tweets.
par(mar = c(5, 10, 2, 2))  # wide left margin so the user names fit
# head() copes with fewer than 30 users, whereas tdata[rev(1:30), ] would pad
# the chart with NA rows. rev() places the most active user at the top.
with(
  head(tdata, 30),
  barplot(
    rev(Tweets),
    names.arg = rev(as.character(User)),
    horiz = TRUE,
    las = 1,
    main = "Top 30: Tweets per user",
    col = 1
  )
)
# Users with more than 20 tweets: the accounts to drop in order to reduce bias.
# Selecting the first column directly yields a plain vector of user names.
userid <- tdata[tdata$Tweets > 20, 1]
根据你的代码,我了解到你想删除 userid 中所列用户的推文,一种方法如下:
# Drop every tweet written by one of the high-volume users.
# `userid` is a plain vector once its first column has been extracted, so
# `userid$User` would fail with "$ operator is invalid for atomic vectors";
# both the vector form and the original data-frame form are accepted here.
heavy_users <- if (is.data.frame(userid)) userid$User else userid
usafull_nobias <- usafull[!(usafull$screenName %in% as.character(heavy_users)), ]
至于为什么你在 total 和 usafull 中得到的推文数量不同,这可能是因为:在 total 中你是按推文的文本查找重复项,而在 usafull 中你比较的是完整的推文记录;例如,转推可能具有相同的文本,但来自不同的用户、具有不同的 ID 等,因此在 usafull 中不会被视为重复。
希望对您有所帮助。