将推文字典保存到 JSON 文件中会导致字典为空

Saving dictionary of tweets into JSON file results in an empty dictionary

我正在尝试收集一些带定位信息的推文,并把它们作为推文字典存储在硬盘上。在 fetchsamples 函数的某些迭代中,尽管 for 循环期间确实向字典添加了数据,保存下来的字典却是空的(见下面的输出)。

我尝试了不同的编码,也尝试把 "w" 和 "wb" 标志传给我的保存函数,但都没有帮助。

我尝试使用随机字符串重现此内容(以便人们更容易检查我的代码),但我无法做到。我不确定推文结构或我的代码中的什么导致了这种行为。

注意:为了便于调试,我添加了一段代码来捕捉字典被置为空状态的时刻。

import oauth2 as oauth
import urllib2 as urllib
import json
import pickle
import os

# Twitter API credentials -- replace the placeholders with your own
# application's keys before running.
api_key = "Insert api_key here"
api_secret = "Insert api_secret here"
access_token_key = "Insert access_token_key"
access_token_secret = "Insert access_token_secret"

# Debug level passed to the urllib2 handlers (0 = silent, 1 = dump HTTP traffic).
_debug = 0

# OAuth 1.0a token/consumer pair used to sign every request in twitterreq().
oauth_token    = oauth.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = oauth.Consumer(key=api_key, secret=api_secret)

# HMAC-SHA1 is the signature method Twitter's OAuth 1.0a endpoints expect.
signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()

# Module-level default HTTP verb consulted by twitterreq() when signing.
http_method = "GET"

# Handlers shared by every opener built in twitterreq().
http_handler  = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)

def twitterreq(url, method, parameters):
    """Build an OAuth-signed request for `url` and open it.

    Fixes:
      * the `method` argument was previously ignored in favour of the
        module-level `http_method` global; it is now honoured.  All
        existing callers pass "GET" while the global is also "GET",
        so their behaviour is unchanged.
      * `req.to_header()` was computed and discarded; removed.

    Parameters:
        url        -- endpoint URL to request
        method     -- HTTP verb, "GET" or "POST"
        parameters -- extra OAuth request parameters (list of pairs)

    Returns:
        The response object produced by the urllib2 opener (iterable
        over response lines, as used by fetchsamples).
    """
    req = oauth.Request.from_consumer_and_token(oauth_consumer,
                                                token=oauth_token,
                                                http_method=method,
                                                http_url=url,
                                                parameters=parameters)

    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)

    if method == "POST":
        # POST: signed parameters travel in the request body.
        encoded_post_data = req.to_postdata()
    else:
        # GET: post data stays None so the opener issues a GET against
        # the fully signed URL.
        encoded_post_data = None
        url = req.to_url()

    opener = urllib.OpenerDirector()
    opener.add_handler(http_handler)
    opener.add_handler(https_handler)

    return opener.open(url, encoded_post_data)

def fetchsamples():

    url = "https://stream.twitter.com/1/statuses/sample.json"
    url = "https://stream.twitter.com/1/statuses/filter.json?locations=-0.489,51.28,0.236,51.686"
    parameters = []
    response = twitterreq(url, "GET", parameters)

    data = {}
    count = 1
    for line in response:        
        try:
            strip = json.loads(line.strip())
            if strip['coordinates'] != None:
                data[count] = strip

                count += 1

                if count % 10 == 0: 
                    print count, len(data.keys())

        except Exception as e:
            # Print error and store in a log file
            print e            
            with open("/Temp/Data/error.log","w") as log:
                log.write(str(e))

        # If 100 tweets have passed save the file
        if count % 100 == 0:
            print "Before saving: ", len(data.keys())
            fp =  open("/Temp/Data/"+str(count/100)+".json","w")
            json.dump(data,fp,encoding="latin-1")
            fp.close()

            # This code is for debug purposes to catch when dictionary
            # when dictionary is forced into empty state
            if os.path.getsize("/Temp/Data/"+str(count/100)+".json") < 10:
                print "After saving: ", len(data.keys())
                return data
            else:
                data = {}

# Kick off collection; `data` receives whatever fetchsamples returns on exit.
data = fetchsamples()

这会产生以下没有错误的输出。 data 字典为空。

100 99
Before saving:  99
110 10
120 20
130 30
140 40
150 50
160 60
170 70
180 80
190 90
200 100
Before saving:  100
Before saving:  0
After saving:  0

字典为空,因为在每 100 次迭代后,您要么设置 data={},要么字典已经为空。如果我理解正确,您将需要另一本永远不会清空的字典,并将项目也推送到该字典。

import oauth2 as oauth
import urllib2 as urllib
import json
import pickle
import os

# Twitter API credentials -- replace the placeholders with your own
# application's keys before running.
api_key = "Insert api_key here"
api_secret = "Insert api_secret here"
access_token_key = "Insert access_token_key"
access_token_secret = "Insert access_token_secret"

# Debug level passed to the urllib2 handlers (0 = silent, 1 = dump HTTP traffic).
_debug = 0

# OAuth 1.0a token/consumer pair used to sign every request in twitterreq().
oauth_token    = oauth.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = oauth.Consumer(key=api_key, secret=api_secret)

# HMAC-SHA1 is the signature method Twitter's OAuth 1.0a endpoints expect.
signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()

# Module-level default HTTP verb consulted by twitterreq() when signing.
http_method = "GET"

# Handlers shared by every opener built in twitterreq().
http_handler  = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)

def twitterreq(url, method, parameters):
    """Create an OAuth-signed request for `url` and open it.

    NOTE: signing consults the module-level `http_method` global rather
    than the `method` argument, exactly as in the original flow.
    Returns the response object produced by the urllib2 opener.
    """
    signed_request = oauth.Request.from_consumer_and_token(
        oauth_consumer,
        token=oauth_token,
        http_method=http_method,
        http_url=url,
        parameters=parameters,
    )
    signed_request.sign_request(signature_method_hmac_sha1,
                                oauth_consumer,
                                oauth_token)
    headers = signed_request.to_header()

    # For POST the signed parameters go into the body; for GET they are
    # folded into the URL and no body is sent.
    post_body = None
    if http_method == "POST":
        post_body = signed_request.to_postdata()
    else:
        url = signed_request.to_url()

    opener = urllib.OpenerDirector()
    opener.add_handler(http_handler)
    opener.add_handler(https_handler)

    return opener.open(url, post_body)

def fetchsamples():

    url = "https://stream.twitter.com/1/statuses/sample.json"
    url = "https://stream.twitter.com/1/statuses/filter.json?locations=-0.489,51.28,0.236,51.686"
    parameters = []
    response = twitterreq(url, "GET", parameters)

    data = {}
    allData = {}
    count = 1
    for line in response:        
        try:
            strip = json.loads(line.strip())
            if strip['coordinates'] != None:
                data[count] = strip
                allData[count] = strip

                count += 1

                if count % 10 == 0: 
                    print count, len(data.keys())

        except Exception as e:
            # Print error and store in a log file
            print e            
            with open("/Temp/Data/error.log","w") as log:
                log.write(str(e))

        # If 100 tweets have passed save the file
        if count % 100 == 0:
            print "Before saving: ", len(data.keys())
            fp =  open("/Temp/Data/"+str(count/100)+".json","w")
            json.dump(data,fp,encoding="latin-1")
            fp.close()

            # Return data if the file is empty and stop
            if os.path.getsize("/Temp/Data/"+str(count/100)+".json") < 10:
                print "After saving: ", len(data.keys())
                return allData
            else:
                data = {}

# Kick off collection; `data` receives whatever fetchsamples returns on exit.
data = fetchsamples()

问题出在我递增 count 值的方式上。由于 count 仅在 strip["coordinates"] != None 时递增,如果我收到一条 strip["coordinates"] == None 的推文,count 值不会递增;但此时 data 已被重置为 {},而 count % 100 == 0 仍然为 True,这意味着原来的非空文件会被一个空文件覆盖。

解决方案是在保存后递增 count,例如此处:

    if count % 100 == 0:
        print "Before saving: ", len(data.keys())
        fp =  open("/Temp/Data/"+str(count/100)+".json","w")
        json.dump(data,fp,encoding="latin-1")
        fp.close()

        count += 1