如何用 Python 从 Twitter 流式 JSON 响应中提取文本,并对文本运行分析?

How to separate text from twitter streaming JSON responses and run analysis on text with python?

我正在尝试使用 Twitter API 对文本运行情感分析。我遇到的问题是:我不知道如何把文本从每条推文中分离出来,并对其运行 TextBlob 库提供的情感极性分析。此外,我希望只获取英文推文。输出是 JSON 格式。

这是根据关键字(本例中为 "usd"、"euro"、"loonie")获取推文的代码,以及我尝试把文本存入变量并使用结果的拙劣写法:

from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json
import re
import pandas as pd
import matplotlib.pyplot as plt


# Twitter API credentials; the "xxxx" values are placeholders to be replaced
# with the keys and tokens of your own Twitter application.
consumer_key = "xxxx"
consumer_secret = "xxxx"
access_token = "xxxx"
access_token_secret = "xxxx"


#This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):
    """Stream listener that echoes every received message to stdout."""

    def on_data(self, data):
        """Print the raw payload of one stream message.

        Args:
            data: The raw message as delivered by the stream.

        Returns:
            True in all cases.
        """
        # print() with a single argument behaves the same on Python 2 and 3,
        # whereas the bare print statement is a SyntaxError on Python 3.
        print(data)
        return True

    def on_error(self, status):
        """Print the error status reported by the stream.

        Args:
            status: The status value passed in by the stream on error.
        """
        print(status)


# Script entry point: authenticate, start the keyword-filtered stream, then
# attempt to load the received tweets and score the sentiment of their text.
if __name__ == '__main__':

    # Build the listener, authenticate with the credentials defined at the
    # top of the file, and create the stream that feeds the listener.
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, l)

    # Filter the Twitter stream by the keywords 'euro', 'dollar', 'loonie'.
    # NOTE(review): presumably this call blocks for as long as the stream is
    # open, in which case nothing below runs until it ends - confirm against
    # the tweepy documentation.
    stream.filter(track=['euro', 'dollar', 'loonie', ] )

    # NOTE(review): this binds the filter method itself (it is not called),
    # so the value handed to open() below is a method object, not a file path.
    tweets_data_path = stream.filter

    # Parse the input line by line, one JSON document per line.
    tweets_data = []
    # NOTE(review): the file object is never closed in the visible code.
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except:
            # The bare except silently skips any line that fails, whatever
            # the reason.
            continue
# NOTE(review): from here on the code sits at module level, outside the
# __main__ guard, yet it uses tweets_data, which is only assigned inside it.
print len(tweets_data)

# NOTE(review): neither `tweets` nor `TextBlob` is defined or imported in the
# visible code. The map() result holds the texts of all tweets; presumably
# TextBlob expects a single string - confirm against the TextBlob docs.
tweets['text'] = map(lambda tweet: tweet['text'], tweets_data)
wiki = TextBlob(tweets['text'])
r = wiki.sentiment.polarity

print r

这是输出的样子:

{"created_at":"Sun Jun 14 23:43:31 +0000 2015","id":610231121016524801,"id_str":"610231121016524801","text":"RT @amirulimannn: RM6 diperlukan utk tukar kpd 1Pound.\nRM3 diperlukan utk tukar kpd 1S'pore Dollar.\n\nGraf matawang jatuh. Tak sedih ke? htt\u2026","source":"\u003ca href=\"http://twitter.com/download/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":42642877,"id_str":"42642877","name":"Wny","screen_name":"waaannnyyy","location":"Dirgahayu Darul Makmur","url":null,"description":"Aku serba tiada, aku kekurangan.","protected":false,"verified":false,"followers_count":320,"friends_count":239,"listed_count":1,"favourites_count":4344,"statuses_count":34408,"created_at":"Tue May 26 15:10:28 +0000 2009","utc_offset":28800,"time_zone":"Kuala Lumpur","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http://pbs.twimg.com/profile_background_images/433201191825047553/PM76m-v2.jpeg","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/433201191825047553/PM76m-v2.jpeg","profile_background_tile":true,"profile_link_color":"DD2E44","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/609402965795835904/mm6jjRRO_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/609402965795835904/mm6jjRRO_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/42642877/1415486321","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"c
reated_at":"Sat Jun 13 03:33:29 +0000 2015","id":609564219495706624,"id_str":"609564219495706624","text":"RM6 diperlukan utk tukar kpd 1Pound.\nRM3 diperlukan utk tukar kpd 1S'pore Dollar.\n\nGraf matawang jatuh. Tak sedih ke? http://t.co/dum4skb6uK","source":"\u003ca href=\"http://twitter.com/download/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":481856658,"id_str":"481856658","name":"seorang iman","screen_name":"amirulimannn","location":"+06MY","url":"http://instagram.com/amirulimannn","description":"I wanna drown myself in a bottle of her perfume","protected":false,"verified":false,"followers_count":723,"friends_count":834,"listed_count":2,"favourites_count":4810,"statuses_count":50981,"created_at":"Fri Feb 03 07:49:55 +0000 2012","utc_offset":28800,"time_zone":"Kuala Lumpur","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"AD0A20","profile_background_image_url":"http://pbs.twimg.com/profile_background_images/378800000139426816/61DHBnYy.jpeg","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/378800000139426816/61DHBnYy.jpeg","profile_background_tile":false,"profile_link_color":"E36009","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"24210E","profile_text_color":"89B5A2","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/592744790283911169/dW7S73WA_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/592744790283911169/dW7S73WA_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/481856658/1428379855","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"con
tributors":null,"retweet_count":1321,"favorite_count":229,"entities":{"hashtags":[],"trends":[],"urls":[],"user_mentions":[],"symbols":[],"media":[{"id":609564142886760448,"id_str":"609564142886760448","indices":[118,140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":"pic.twitter.com/dum4skb6uK","expanded_url":"http://twitter.com/amirulimannn/status/609564219495706624/photo/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"},"large":{"w":1024,"h":1024,"resize":"fit"}}}]},"extended_entities":{"media":[{"id":609564142886760448,"id_str":"609564142886760448","indices":[118,140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":"pic.twitter.com/dum4skb6uK","expanded_url":"http://twitter.com/amirulimannn/status/609564219495706624/photo/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"},"large":{"w":1024,"h":1024,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"in"},"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"trends":[],"urls":[],"user_mentions":[{"screen_name":"amirulimannn","name":"seorang 
iman","id":481856658,"id_str":"481856658","indices":[3,16]}],"symbols":[],"media":[{"id":609564142886760448,"id_str":"609564142886760448","indices":[139,140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":"pic.twitter.com/dum4skb6uK","expanded_url":"http://twitter.com/amirulimannn/status/609564219495706624/photo/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"},"large":{"w":1024,"h":1024,"resize":"fit"}},"source_status_id":609564219495706624,"source_status_id_str":"609564219495706624"}]},"extended_entities":{"media":[{"id":609564142886760448,"id_str":"609564142886760448","indices":[139,140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":"pic.twitter.com/dum4skb6uK","expanded_url":"http://twitter.com/amirulimannn/status/609564219495706624/photo/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"},"large":{"w":1024,"h":1024,"resize":"fit"}},"source_status_id":609564219495706624,"source_status_id_str":"609564219495706624"}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"in","timestamp_ms":"1434325411453"}

from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json

# Twitter API credentials; fill the empty strings in with the keys and tokens
# of your own Twitter application before running.
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''


# This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):
    """Stream listener that prints the text of each received tweet."""

    def on_data(self, data):
        """Decode one raw stream message and print its tweet text.

        Messages that carry no 'text' field are skipped instead of raising
        KeyError.

        Args:
            data: Raw JSON string for a single stream message.

        Returns:
            True in all cases.
        """
        message = json.loads(data)
        # Not every stream message is a tweet; presumably notices such as
        # delete or limit messages arrive without 'text' - see the Twitter
        # streaming message-type docs.
        text = message.get('text') if isinstance(message, dict) else None
        if text is None:
            return True
        # Encoding to UTF-8 bytes and printing their repr keeps the output
        # ASCII-only, which avoids charmap errors on consoles that cannot
        # display every character.
        coded = text.encode('utf-8')
        s = str(coded)
        # On Python 3, str() of a bytes object looks like b'...'; the slice
        # drops the leading b' and the trailing quote.
        print(s[2:-1])
        return True

    def on_error(self, status):
        """Print the error status reported by the stream.

        Args:
            status: The status value passed in by the stream on error.
        """
        print(status)

# Authenticate with the credentials defined above and hand a fresh listener
# instance to the stream.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, StdOutListener())

# Start streaming tweets filtered by the keywords 'euro', 'dollar' and
# 'loonie', with the language filter set to English.
stream.filter(
    track=['euro', 'dollar', 'loonie'],
    languages=['en'],
)

对于您最初关于 JSON 的问题:您可以使用 json.loads() 解析数据流中的每条消息。其余代码的作用是避免把数据从 Twitter 拉取到 Python 时出现 charmap 错误。之所以使用 s[2:-1],是为了去掉编码为 UTF-8 后多出来的字符(即开头的 b' 和结尾的 ')。

对于只有英文的推文,您还可以使用 languages=['en'] 直接从流中过滤。

我不熟悉 TextBlob 库,但您可以用多种方式存储文本,例如把信息写入文件,然后在运行 TextBlob 时直接从该文件读取。您可以用下面的代码替换 print(s[2:-1]),或者把它加在后面:

# Append the tweet text to text.csv, one tweet per line. The with-block
# closes the file automatically, even if a write fails.
with open('text.csv', 'a') as myfile:
    myfile.write(s[2:-1])
    myfile.write('\n')  # end the line so each tweet sits on its own row

您可以使用 file = open('text.csv', 'r') 读取该文件来进行情感分析。每次打开文件后,都不要忘记调用 file.close()。