Count keywords and word stems in tweets
I have a large data frame containing tweets, plus keyword dictionaries loaded as character vectors of words related to morality (kw_Moral) and emotion (kw_Emo). In the past I have used these keyword dictionaries to subset the data frame so that it keeps only the tweets containing one or more of the keywords.
For example, to create a subset with only those tweets that contain emotional keywords, I loaded my keyword dictionary...
kw_Emo <- c("abusi*", "accept", "accepta*", "accepted",
"accepting", "accepts", "ache*", "aching", "active*", "admir*",
"ador*", "advantag*", "adventur*", "advers*", "affection*", "afraid",
"aggravat*", "aggress*", "agoniz*", "agony", "agree", "agreeab*",
"agreed", "agreeing", "agreement*", "agrees", "alarm*", "alone",
"alright*", "amaz*", "amor*", "amus*", "anger*", "angr*", "anguish*",
"annoy*", "antagoni*", "anxi*", "aok", "apath*", "appall*", "appreciat*",
"apprehens*", "argh*", "argu*", "arrogan*", "asham*", "assault*",
"asshole*", "assur*", "attachment*", "attract*", "aversi*", "avoid*",
"award*", "awesome", "awful", "awkward*", "bashful*", "bastard*",
"battl*", "beaten", "beaut*", "beloved", "benefic*", "benevolen*",
"benign*", "best", "better", "bitch*", "bitter*", "blam*", "bless*",
"bold*", "bonus*", "bore*", "boring", "bother*", "brave*", "bright*",
"brillian*", "broke", "burden*", "calm*", "cared", "carefree",
"careful*", "careless*", "cares", "casual", "casually", "certain*",
"challeng*", "champ*", "charit*", "charm*", "cheer*", "cherish*",
"chuckl*", "clever*", "comed*", "comfort*", "commitment*", "complain*",
"compliment*", "concerned", "confidence", "confident", "confidently",
"confront*", "confus*", "considerate", "contempt*", "contented*",
"contentment", "contradic*", "convinc*", "cool", "courag*", "crap",
"crappy", "craz*", "create*", "creati*", "credit*", "cried",
"cries", "critical", "critici*", "crude*", "cry", "crying", "cunt*",
"cut", "cute*", "cutie*", "cynic", "danger*", "daring", "darlin*",
"daze*", "dear*", "decay*", "defeat*", "defect*", "definite",
"definitely", "degrad*", "delectabl*", "delicate*", "delicious*",
"deligh*", "depress*", "depriv*", "despair*", "desperat*", "despis*",
"destruct*", "determina*", "determined", "devastat*", "difficult*",
"digni*", "disadvantage*", "disagree*", "disappoint*", "disaster*",
"discomfort*", "discourag*", "dishearten*", "disillusion*", "dislike",
"disliked", "dislikes", "disliking", "dismay*", "dissatisf*",
"distract*", "distraught", "distress*", "distrust*", "disturb*",
"divin*", "domina*", "doom*", "dork*", "doubt*", "dread*", "dull*",
"dumb*", "dump*", "dwell*", "dynam*", "eager*", "ease*", "easie*",
"easily", "easiness", "easing", "easy*", "ecsta*", "efficien*",
"egotis*", "elegan*", "embarrass*", "emotion", "emotional", "empt*",
"encourag*", "energ*", "engag*", "enjoy*", "enrag*", "entertain*",
"enthus*", "envie*", "envious", "excel*", "excit*", "excruciat*",
"exhaust*", "fab", "fabulous*", "fail*", "fake", "fantastic*",
"fatal*", "fatigu*", "favor*", "favour*", "fear", "feared", "fearful*",
"fearing", "fearless*", "fears", "feroc*", "festiv*", "feud*",
"fiery", "fiesta*", "fine", "fired", "flatter*", "flawless*",
"flexib*", "flirt*", "flunk*", "foe*", "fond", "fondly", "fondness",
"fool*", "forgave", "forgiv*", "fought", "frantic*", "freak*",
"free", "freeb*", "freed*", "freeing", "freely", "freeness",
"freer", "frees*", "friend*", "fright*", "frustrat*", "fuck",
"fucked*", "fucker*", "fuckin*", "fucks", "fume*", "fuming",
"fun", "funn*", "furious*", "fury", "geek*", "genero*", "gentle",
"gentler", "gentlest", "gently", "giggl*", "giver*", "giving",
"glad", "gladly", "glamor*", "glamour*", "gloom*", "glori*",
"glory", "goddam*", "gorgeous*", "gossip*", "grace", "graced",
"graceful*", "graces", "graci*", "grand", "grande*", "gratef*",
"grati*", "grave*", "great", "grief", "griev*", "grim*", "grin",
"grinn*", "grins", "grouch*", "grr*", "guilt*", "ha", "haha*",
"handsom*", "happi*", "happy", "harass*", "hated", "hateful*",
"hater*", "hates", "hating", "hatred", "hazy", "heartbreak*",
"heartbroke*", "heartfelt", "heartless*", "heartwarm*", "heh*",
"hellish", "helper*", "helpful*", "helping", "helpless*", "helps",
"hesita*", "hilarious", "hoho*", "homesick*", "honour*", "hope",
"hoped", "hopeful", "hopefully", "hopefulness", "hopeless*",
"hopes", "hoping", "horr*", "hostil*", "hug", "hugg*", "hugs",
"humiliat*", "humor*", "humour*", "hurra*", "idiot", "ignor*",
"impatien*", "impersonal", "impolite*", "importan*", "impress*",
"improve*", "improving", "inadequa*", "incentive*", "indecis*",
"ineffect*", "inferior*", "inhib*", "innocen*", "insecur*", "insincer*",
"inspir*", "insult*", "intell*", "interest*", "interrup*", "intimidat*",
"invigor*", "irrational*", "irrita*", "isolat*", "jaded", "jealous*",
"jerk", "jerked", "jerks", "joke*", "joking", "joll*", "joy*",
"keen*", "kidding", "kind", "kindly", "kindn*", "kiss*", "laidback",
"lame*", "laugh*", "lazie*", "lazy", "liabilit*", "libert*",
"lied", "lies", "like", "likeab*", "liked", "likes", "liking",
"livel*", "LMAO", "LOL", "lone*", "longing*", "lose", "loser*",
"loses", "losing", "loss*", "lost", "lous*", "love", "loved",
"lovely", "lover*", "loves", "loving*", "low*", "luck", "lucked",
"lucki*", "luckless*", "lucks", "lucky", "ludicrous*", "lying",
"mad", "maddening", "madder", "maddest", "madly", "magnific*",
"maniac*", "masochis*", "melanchol*", "merit*", "merr*", "mess",
"messy", "miser*", "miss", "missed", "misses", "missing", "mistak*",
"mock", "mocked", "mocker*", "mocking", "mocks", "molest*", "mooch*",
"mood", "moodi*", "moods", "moody", "moron*", "mourn*", "nag*",
"nast*", "neat*", "needy", "neglect*", "nerd*", "nervous*", "neurotic*",
"nice*", "numb*", "nurtur*", "obnoxious*", "obsess*", "offence*",
"offens*", "ok", "okay", "okays", "oks", "openminded*", "openness",
"opportun*", "optimal*", "optimi*", "original", "outgoing", "outrag*",
"overwhelm*", "pained", "painf*", "paining", "painl*", "pains",
"palatabl*", "panic*", "paradise", "paranoi*", "partie*", "party*",
"passion*", "pathetic*", "peculiar*", "perfect*", "personal",
"perver*", "pessimis*", "petrif*", "pettie*", "petty*", "phobi*",
"piss*", "piti*", "pity*", "play", "played", "playful*", "playing",
"plays", "pleasant*", "please*", "pleasing", "pleasur*", "poison*",
"popular*", "positiv*", "prais*", "precious*", "pressur*", "prettie*",
"pretty", "prick*", "pride", "privileg*", "prize*", "problem*",
"profit*", "promis*", "protested", "protesting", "proud*", "puk*",
"radian*", "rage*", "raging", "rancid*", "rape*", "raping", "rapist*",
"readiness", "ready", "reassur*", "reek*", "regret*", "reject*",
"relax*", "relief", "reliev*", "reluctan*", "remorse*", "repress*",
"resent*", "resign*", "resolv*", "restless*", "revigor*", "reward*",
"rich*", "ridicul*", "rigid*", "risk*", "ROFL", "romanc*", "romantic*",
"rotten", "rude*", "sad", "sadde*", "sadly", "sadness", "sarcas*",
"satisf*", "savage*", "scare*", "scaring", "scary", "sceptic*",
"scream*", "screw*", "selfish*", "sentimental*", "serious", "seriously",
"seriousness", "severe*", "shake*", "shaki*", "shaky", "share",
"shared", "shares", "sharing", "shit*", "shock*", "shook", "shy*",
"sigh", "sighed", "sighing", "sighs", "silli*", "silly", "sincer*",
"skeptic*", "smart*", "smil*", "smother*", "smug*", "snob*",
"sob", "sobbed", "sobbing", "sobs", "sociab*", "solemn*", "sorrow*",
"sorry", "soulmate*", "special", "splend*", "stammer*", "stank",
"startl*", "stink*", "strain*", "strange", "strength*", "stress*",
"strong*", "struggl*", "stubborn*", "stunk", "stunned", "stuns",
"stupid*", "stutter*", "succeed*", "success*", "suck", "sucked",
"sucker*", "sucks", "sucky", "sunnier", "sunniest", "sunny",
"sunshin*", "super", "superior*", "support", "supported", "supporter*",
"supporting", "supportive*", "supports", "suprem*", "sure*",
"surpris*", "suspicio*", "sweet", "sweetheart*", "sweetie*",
"sweetly", "sweetness*", "sweets", "talent*", "tantrum*", "tears",
"teas*", "tehe", "temper", "tempers", "tender*", "tense*", "tensing",
"tension*", "terribl*", "terrific*", "terrified", "terrifies",
"terrify", "terrifying", "terror*", "thank", "thanked", "thankf*",
"thanks", "thief", "thieve*", "thoughtful*", "threat*", "thrill*",
"ticked", "timid*", "toleran*", "tortur*", "tough*", "traged*",
"tragic*", "tranquil*", "trauma*", "treasur*", "treat", "trembl*",
"trick*", "trite", "triumph*", "trivi*", "troubl*", "TRUE", "trueness",
"truer", "truest", "truly", "trust*", "truth*", "turmoil", "ugh",
"ugl*", "unattractive", "uncertain*", "uncomfortabl*", "uncontrol*",
"uneas*", "unfortunate*", "unfriendly", "ungrateful*", "unhapp*",
"unimportant", "unimpress*", "unkind", "unlov*", "unpleasant",
"unprotected", "unsavo*", "unsuccessful*", "unsure*", "unwelcom*",
"upset*", "uptight*", "useful*", "useless*", "vain", "valuabl*",
"valuing", "vanity", "vicious*", "vigor*", "vigour*", "villain*",
"violat*", "virtuo*", "vital*", "vulnerab*", "vulture*", "warfare*",
"warm*", "warred", "weak*", "wealth*", "weapon*", "weep*", "weird*",
"welcom*", "well*", "wept", "whine*", "whining", "willing", "wimp*",
"win", "winn*", "wins", "wisdom", "wise*", "witch", "woe*", "won",
"wonderf*", "worr*", "worse*", "worship*", "worst", "wow*", "yay",
"yays","yearn*","stench*") %>% paste0(collapse="|")and then filtered my dataframe with the keywords...
tweets_E <- tweets[with(tweets, grepl(paste0("\\b(?:", paste(kw_Emo, collapse="|"), ")\\b"), text, perl = TRUE)), ]
How can I extend this process so that it also gives an accurate count of how many dictionary words appear in each tweet? In other words, I want to add a column to the data frame, say EmoWordCount, that shows the number of emotional words occurring in each tweet.
Here is a reproducible sample of my data:
dput(droplevels(head(TestTweets, 20)))
structure(list(Time = c("24/06/2016 10:55:04", "24/06/2016 10:55:04",
"24/06/2016 10:55:04", "24/06/2016 10:55:04", "24/06/2016 10:55:04",
"24/06/2016 10:55:04", "24/06/2016 10:55:04", "24/06/2016 10:55:04",
"24/06/2016 10:55:04", "24/06/2016 10:55:04", "24/06/2016 10:55:04",
"24/06/2016 10:55:04", "24/06/2016 10:55:04", "24/06/2016 10:55:04",
"24/06/2016 10:55:04", "24/06/2016 10:55:04", "24/06/2016 10:55:04",
"24/06/2016 10:55:04", "24/06/2016 10:55:03", "24/06/2016 10:55:03"
), clean_text = c("mayagoodfellow as always making sense of it all for us ive never felt less welcome in this country brexit httpstcoiai5xa9ywv",
"never underestimate power of stupid people in a democracy brexit",
"a quick guide to brexit and beyond after britain votes to quit eu httpstcos1xkzrumvg httpstcocniutojkt0",
"this selfinflicted wound will be his legacy cameron falls on sword after brexit euref httpstcoegph3qonbj httpstcohbyhxodeda",
"so the uk is out cameron resigned scotland wants to leave great britain sinn fein plans to unify ireland and its o",
"this is a very good summary no biasspinagenda of the legal ramifications of the leave result brexit httpstcolobtyo48ng",
"you cant make this up cornwall votes out immediately pleads to keep eu cash this was never a rehearsal httpstco",
"no matter the outcome brexit polls demonstrate how quickly half of any population can be convinced to vote against itself q",
"i wouldnt mind so much but the result is based on a pack of lies and unaccountable promises democracy didnt win brexit pro",
"so the uk is out cameron resigned scotland wants to leave great britain sinn fein plans to unify ireland and its o",
"absolutely brilliant poll on brexit by yougov httpstcoepevg1moaw",
"retweeted mikhail golub golub\r\n\r\nbrexit to be followed by grexit departugal italeave fruckoff czechout httpstcoavkpfesddz",
"think the brexit campaign relies on the same sort of logic that drpepper does whats the worst that can happen thingsthatarewellbrexit",
"am baffled by nigel farages claim that brexit is a victory for real people as if the 47 voting remain are fucking smu",
"not one of the uks problems has been solved by brexit vote migration inequality the uks centurylong decline as",
"scotland should never leave eu calls for new independence vote grow httpstcorudiyvthia brexit",
"the most articulate take on brexit is actually this ft reader comment today httpstco98b4dwsrtv",
"65 million refugees half of them are children maybe instead of fighting each other we should be working hand in hand ",
"im laughing at people who voted for brexit but are complaining about the exchange rate affecting their holiday\r\nremain",
"life is too short to wear boring shoes brexit")), .Names = c("Time",
"clean_text"), row.names = c(NA, 20L), class = c("tbl_df", "tbl",
"data.frame"))
Here is the code I used, from Francisco:
library(stringr)

# Strip the trailing wildcard from each keyword ("[*]" matches a literal asterisk)
for (x in 1:length(kw_Emo)) {
  if (grepl("[*]", kw_Emo[x])) {
    kw_Emo[x] <- substr(kw_Emo[x], 1, nchar(kw_Emo[x]) - 1)
  }
}

# Add one zero-initialised counter column per keyword
for (x in 1:length(kw_Emo)) {
  TestTweets[, kw_Emo[x]] <- 0
}

# Split each tweet into words and tally exact matches against the keywords
for (x in 1:nrow(TestTweets)) {
  partials <- data.frame(str_split(TestTweets[x, 2], " "), stringsAsFactors = FALSE)
  partials <- partials[partials[] != ""]
  for (y in 1:length(partials)) {
    for (z in 1:length(kw_Emo)) {
      if (kw_Emo[z] == partials[y]) {
        TestTweets[x, kw_Emo[z]] <- TestTweets[x, kw_Emo[z]] + 1
      }
    }
  }
}
Below is the output of Francisco's solution (I renamed the resulting column EmoWordCount):
structure(list(Time = c("24/06/2016 10:55:04", "24/06/2016 10:55:04",
"24/06/2016 10:55:04", "24/06/2016 10:55:04", "24/06/2016 10:55:04",
"24/06/2016 10:55:04", "24/06/2016 10:55:04", "24/06/2016 10:55:04",
"24/06/2016 10:55:04", "24/06/2016 10:55:04", "24/06/2016 10:55:04",
"24/06/2016 10:55:04", "24/06/2016 10:55:04", "24/06/2016 10:55:04",
"24/06/2016 10:55:04", "24/06/2016 10:55:04", "24/06/2016 10:55:04",
"24/06/2016 10:55:04", "24/06/2016 10:55:03", "24/06/2016 10:55:03"
), clean_text = c("mayagoodfellow as always making sense of it all for us ive never felt less welcome in this country brexit httpstcoiai5xa9ywv",
"never underestimate power of stupid people in a democracy brexit",
"a quick guide to brexit and beyond after britain votes to quit eu httpstcos1xkzrumvg httpstcocniutojkt0",
"this selfinflicted wound will be his legacy cameron falls on sword after brexit euref httpstcoegph3qonbj httpstcohbyhxodeda",
"so the uk is out cameron resigned scotland wants to leave great britain sinn fein plans to unify ireland and its o",
"this is a very good summary no biasspinagenda of the legal ramifications of the leave result brexit httpstcolobtyo48ng",
"you cant make this up cornwall votes out immediately pleads to keep eu cash this was never a rehearsal httpstco",
"no matter the outcome brexit polls demonstrate how quickly half of any population can be convinced to vote against itself q",
"i wouldnt mind so much but the result is based on a pack of lies and unaccountable promises democracy didnt win brexit pro",
"so the uk is out cameron resigned scotland wants to leave great britain sinn fein plans to unify ireland and its o",
"absolutely brilliant poll on brexit by yougov httpstcoepevg1moaw",
"retweeted mikhail golub golub\r\n\r\n\r\n\r\nbrexit to be followed by grexit departugal italeave fruckoff czechout httpstcoavkpfesddz",
"think the brexit campaign relies on the same sort of logic that drpepper does whats the worst that can happen thingsthatarewellbrexit",
"am baffled by nigel farages claim that brexit is a victory for real people as if the 47 voting remain are fucking smu",
"not one of the uks problems has been solved by brexit vote migration inequality the uks centurylong decline as",
"scotland should never leave eu calls for new independence vote grow httpstcorudiyvthia brexit",
"the most articulate take on brexit is actually this ft reader comment today httpstco98b4dwsrtv",
"65 million refugees half of them are children maybe instead of fighting each other we should be working hand in hand ",
"im laughing at people who voted for brexit but are complaining about the exchange rate affecting their holiday\r\n\r\nremain",
"life is too short to wear boring shoes brexit"), EmoWordCount = c(3,
2, 0, 3, 5, 4, 3, 5, 7, 5, 2, 5, 11, 6, 6, 5, 1, 7, 6, 4)), .Names = c("Time",
"clean_text", "EmoWordCount"), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
Your requirement seems well suited to a matrix-style output: tweets as rows, each term as a column, and the cell values giving the number of occurrences. Here is a base R solution using gsub:
terms <- c("cat", "hat", "bat")
tweets <- c("The cat in a hat met the man with the hat and a bat",
            "That cat was a fast cat!",
            "I bought a baseball bat while wearing a hat")

# For each term, remove all whole-word occurrences from each tweet and
# divide the drop in length by the term's length to get the count
output <- sapply(terms, function(x) {
  sapply(tweets, function(y) {
    (nchar(y) - nchar(gsub(paste0("\\b", x, "\\b"), "", y))) / nchar(x)
  })
})
cat hat bat
The cat in a hat met the man with the hat and a bat 1 2 1
That cat was a fast cat! 2 0 0
I bought a baseball bat while wearing a hat 0 1 1
This approach first uses sapply to iterate over each keyword in terms, and then over each tweet. For each keyword/tweet combination it counts the occurrences. The trick is to compare the length of the original tweet with the length of the same tweet after all occurrences of the keyword are removed, and then normalise the difference by the length of that keyword.
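To apply the same idea to the question's data, the * stems first need to become valid regular expressions, and because stem matches vary in length, it is easier to count matches directly than to normalise a length difference. A minimal base R sketch, assuming kw_Emo is still the raw keyword vector (before the paste0(collapse = "|") step) and TestTweets is the sample data from the question:
stems    <- sub("\\*$", "\\\\w*", kw_Emo)   # trailing "*" becomes "\w*"
patterns <- paste0("\\b", stems, "\\b")     # whole-word / whole-stem match
# (assumes dictionary entries contain no regex metacharacters other than the trailing *)

# Count occurrences of every pattern in every tweet; gregexpr() returns all
# match positions and regmatches() extracts them so they can be counted.
emo_counts <- sapply(patterns, function(p) {
  sapply(TestTweets$clean_text, function(tw) {
    lengths(regmatches(tw, gregexpr(p, tw, perl = TRUE)))
  })
})
Note that a word matching more than one dictionary entry is counted once per matching entry.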
EDIT:
If what you want is the total number of keyword occurrences per tweet, we can call rowSums on the matrix above:
rowSums(output)
The cat in a hat met the man with the hat and a bat
4
That cat was a fast cat!
2
I bought a baseball bat while wearing a hat
2
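Applied to the counting sketch above, the per-tweet total the question asks for would then be:
TestTweets$EmoWordCount <- rowSums(emo_counts)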
I don't know whether this is the optimal solution, but it works well. You should use the "stringr" package.
library(stringr)

for (x in 1:length(keywords)) {
  if (grepl("[*]", keywords[x])) {
    keywords[x] <- substr(keywords[x], 1, nchar(keywords[x]) - 1)
  }
}
Here I remove the "*" symbol from the keywords that have one (as I understand it, you want to match these as partial stems within strings).
IMPORTANT:
You must use the regular expression [*] to catch the * symbol; a bare * is not a valid pattern.
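A quick illustration of the difference (a sketch; the exact error message can vary across R versions):
# grepl("*", "abusi*")                # error: a bare "*" is an invalid regex
grepl("[*]", "abusi*")                # TRUE: the character class matches a literal *
grepl("*", "abusi*", fixed = TRUE)    # TRUE: fixed = TRUE is an equivalent alternative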
for (x in 1:length(keywords)) {
  dataframe[, keywords[x]] <- 0
}
This simply creates the new columns with a default value of 0.
for (x in 1:nrow(dataframe)) {
  partials <- data.frame(str_split(dataframe[x, 2], " "), stringsAsFactors = FALSE)
  partials <- partials[partials[] != ""]
  for (y in 1:length(partials)) {
    for (z in 1:length(keywords)) {
      if (keywords[z] == partials[y]) {
        dataframe[x, keywords[z]] <- dataframe[x, keywords[z]] + 1
      }
    }
  }
}
You split each tweet into a vector of its words, check whether each word equals one of the keywords, add 1 to the matching keyword's column when it does, and end up with the same data frame plus a new count column for every keyword.
I tested it with your keywords and it gave the correct answers.
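If you then want the single EmoWordCount column shown in the question rather than one column per keyword, a small follow-up sketch (assuming, as in the sample data, that the first two columns are Time and clean_text):
# Sum the per-keyword columns into one total per tweet.
dataframe$EmoWordCount <- rowSums(dataframe[, -(1:2)])

# Optionally keep only the original columns plus the total.
dataframe <- dataframe[, c(1, 2, ncol(dataframe))]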