将文本文件中的 JSON 对象读入 pandas

Reading Json objects from text file into pandas

我从 api 库中提取了一些 json 对象并将它们写入文本文件。我现在卡在了如何获取保存在 .txt 文件中的 json 结构并将其读回 python pandas 库的问题上。

有许多资源介绍了如何将 json 文件导入 pandas,但由于这是一个文本文件,而且我是编程和使用 json 结构的新手,我不确定如何有效地执行此任务。

文本文件中有许多 json 对象,我想分享一个示例,但它包含一堆 url 短链接,这使我无法发布它,所以除非有人确实需要看到这个结构,否则我先不贴出来。我已经尝试过 pd.read_csv() 和 pd.read_json(),但由于这是 .txt 文件中的 json 结构,到目前为止它们都不能正常工作。

到目前为止,这是我将数据返回 python 的最佳猜测:

# Attempt: treat every line of the file as one complete JSON document and
# collect the parsed objects in a list.
data = []
with open('tweet_json.txt') as f:
    for line in f:
        # json.loads raises JSONDecodeError if the line is not a single
        # valid JSON document.
        data.append(json.loads(line))

但是当我尝试这样做时收到以下错误消息:JSONDecodeError: Extra data: line 1 column 4626 (char 4625)

这里有两条推文,您可以复制并保存到 .txt 文件中,以便复现该问题:

{'contributors': None,
 'coordinates': None,
 'created_at': 'Tue Aug 01 16:23:56 +0000 2017',
 'display_text_range': [0, 85],
 'entities': {'hashtags': [],
  'media': [{'display_url': 'pic.twitter.com/MgUWQ76dJU',
    'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
    'id': 892420639486877696,
    'id_str': '892420639486877696',
    'indices': [86, 109],
    'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
    'sizes': {'large': {'h': 528, 'resize': 'fit', 'w': 540},
     'medium': {'h': 528, 'resize': 'fit', 'w': 540},
     'small': {'h': 528, 'resize': 'fit', 'w': 540},
     'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
    'type': 'photo',
    'url': na}],
  'symbols': [],
  'urls': [],
  'user_mentions': []},
 'extended_entities': {'media': [{'display_url': 'pic.twitter.com/MgUWQ76dJU',
    'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
    'id': 892420639486877696,
    'id_str': '892420639486877696',
    'indices': [86, 109],
    'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
    'sizes': {'large': {'h': 528, 'resize': 'fit', 'w': 540},
     'medium': {'h': 528, 'resize': 'fit', 'w': 540},
     'small': {'h': 528, 'resize': 'fit', 'w': 540},
     'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
    'type': 'photo',
    'url': na}]},
 'favorite_count': 39311,
 'favorited': False,
 'full_text': "This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 na ",
 'geo': None,
 'id': 892420643555336193,
 'id_str': '892420643555336193',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'is_quote_status': False,
 'lang': 'en',
 'place': None,
 'possibly_sensitive': False,
 'possibly_sensitive_appealable': False,
 'retweet_count': 8778,
 'retweeted': False,
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'truncated': False,
 'user': {'contributors_enabled': False,
  'created_at': 'Sun Nov 15 21:41:29 +0000 2015',
  'default_profile': False,
  'default_profile_image': False,
  'description': 'Only Legit Source for Professional Dog Ratings STORE: @ShopWeRateDogs | IG, FB & SC: WeRateDogs | MOBILE APP: @GoodDogsGame Business: dogratingtwitter@gmail.com',
  'entities': {'description': {'urls': []},
   'url': {'urls': [{'display_url': 'weratedogs.com',
      'expanded_url': 'http://weratedogs.com',
      'indices': [0, 23],
      'url': na }]}},
  'favourites_count': 126135,
  'follow_request_sent': False,
  'followers_count': 4730764,
  'following': False,
  'friends_count': 109,
  'geo_enabled': True,
  'has_extended_profile': True,
  'id': 4196983835,
  'id_str': '4196983835',
  'is_translation_enabled': False,
  'is_translator': False,
  'lang': 'en',
  'listed_count': 3700,
  'location': 'DM YOUR DOGS. WE WILL RATE',
  'name': 'WeRateDogs™',
  'notifications': False,
  'profile_background_color': '000000',
  'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png',
  'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png',
  'profile_background_tile': False,
  'profile_banner_url': 'https://pbs.twimg.com/profile_banners/4196983835/1510812288',
  'profile_image_url': 'http://pbs.twimg.com/profile_images/936608706107772929/GwbLQRxf_normal.jpg',
  'profile_image_url_https': 'https://pbs.twimg.com/profile_images/936608706107772929/GwbLQRxf_normal.jpg',
  'profile_link_color': 'F5ABB5',
  'profile_sidebar_border_color': '000000',
  'profile_sidebar_fill_color': '000000',
  'profile_text_color': '000000',
  'profile_use_background_image': False,
  'protected': False,
  'screen_name': 'dog_rates',
  'statuses_count': 6301,
  'time_zone': None,
  'translator_type': 'none',
  'url': n/a,
  'utc_offset': None,
  'verified': True}}

{'contributors': None,
 'coordinates': None,
 'created_at': 'Tue Aug 01 00:17:27 +0000 2017',
 'display_text_range': [0, 138],
 'entities': {'hashtags': [],
  'media': [{'display_url': 'pic.twitter.com/0Xxu71qeIV',
    'expanded_url': 'https://twitter.com/dog_rates/status/892177421306343426/photo/1',
    'id': 892177413194625024,
    'id_str': '892177413194625024',
    'indices': [139, 162],
    'media_url': 'http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg',
    'sizes': {'large': {'h': 1600, 'resize': 'fit', 'w': 1407},
     'medium': {'h': 1200, 'resize': 'fit', 'w': 1055},
     'small': {'h': 680, 'resize': 'fit', 'w': 598},
     'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
    'type': 'photo',
    'url': na}],
  'symbols': [],
  'urls': [],
  'user_mentions': []},
 'extended_entities': {'media': [{'display_url': 'pic.twitter.com/0Xxu71qeIV',
    'expanded_url': 'https://twitter.com/dog_rates/status/892177421306343426/photo/1',
    'id': 892177413194625024,
    'id_str': '892177413194625024',
    'indices': [139, 162],
    'media_url': 'http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg',
    'sizes': {'large': {'h': 1600, 'resize': 'fit', 'w': 1407},
     'medium': {'h': 1200, 'resize': 'fit', 'w': 1055},
     'small': {'h': 680, 'resize': 'fit', 'w': 598},
     'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
    'type': 'photo',
    'url': na}]},
 'favorite_count': 33662,
 'favorited': False,
 'full_text': "This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 na",
 'geo': None,
 'id': 892177421306343426,
 'id_str': '892177421306343426',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'is_quote_status': False,
 'lang': 'en',
 'place': None,
 'possibly_sensitive': False,
 'possibly_sensitive_appealable': False,
 'retweet_count': 6431,
 'retweeted': False,
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'truncated': False,
 'user': {'contributors_enabled': False,
  'created_at': 'Sun Nov 15 21:41:29 +0000 2015',
  'default_profile': False,
  'default_profile_image': False,
  'description': 'Only Legit Source for Professional Dog Ratings STORE: @ShopWeRateDogs | IG, FB & SC: WeRateDogs | MOBILE APP: @GoodDogsGame Business: dogratingtwitter@gmail.com',
  'entities': {'description': {'urls': []},
   'url': {'urls': [{'display_url': 'weratedogs.com',
      'expanded_url': 'http://weratedogs.com',
      'indices': [0, 23],
      'url': na}]}},
  'favourites_count': 126135,
  'follow_request_sent': False,
  'followers_count': 4730865,
  'following': False,
  'friends_count': 109,
  'geo_enabled': True,
  'has_extended_profile': True,
  'id': 4196983835,
  'id_str': '4196983835',
  'is_translation_enabled': False,
  'is_translator': False,
  'lang': 'en',
  'listed_count': 3728,
  'location': 'DM YOUR DOGS. WE WILL RATE',
  'name': 'WeRateDogs™',
  'notifications': False,
  'profile_background_color': '000000',
  'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png',
  'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png',
  'profile_background_tile': False,
  'profile_banner_url': 'https://pbs.twimg.com/profile_banners/4196983835/1510812288',
  'profile_image_url': 'http://pbs.twimg.com/profile_images/936608706107772929/GwbLQRxf_normal.jpg',
  'profile_image_url_https': 'https://pbs.twimg.com/profile_images/936608706107772929/GwbLQRxf_normal.jpg',
  'profile_link_color': 'F5ABB5',
  'profile_sidebar_border_color': '000000',
  'profile_sidebar_fill_color': '000000',
  'profile_text_color': '000000',
  'profile_use_background_image': False,
  'protected': False,
  'screen_name': 'dog_rates',
  'statuses_count': 6301,
  'time_zone': None,
  'translator_type': 'none',
  'url': na,
  'utc_offset': None,
  'verified': True}}

更新

以下代码产生此错误:JSONDecodeError: Expecting ',' delimiter: line 1 column 4627 (char 4626)

# Attempt: parse the entire file content as a single JSON document.
with open('tweet_json.txt', 'r') as f:
    datastore = json.load(f)

这个帖子是迄今为止我找到的最接近解决问题的方法:

Python json.loads shows ValueError: Expecting , delimiter: line 1

感谢大家的反馈。我不得不调整有关如何从 API 中提取数据的代码,然后非常直接地将数据放入字典列表中。

# Fetch every tweet listed in the archive and append it to tweet_json.txt as
# one JSON document per line, so the file can later be parsed line by line.
# The with statement closes the file on exit; no explicit close() is needed.
with open('tweet_json.txt', 'a+', encoding='utf-8') as file:
    for tweet_id in twitter_archive_df['tweet_id']:
        try:
            tweet = api.get_status(id=tweet_id, tweet_mode='extended')
            record = json.dumps(tweet)
        except Exception as err:
            # Best effort: skip tweets that cannot be fetched or serialized,
            # but report them instead of failing silently. Unlike a bare
            # except, this still lets KeyboardInterrupt/SystemExit propagate.
            print(f'Skipping tweet {tweet_id}: {err}')
            continue
        # Write the record and its newline together so a line is never
        # left half-written between two separate calls.
        file.write(record + '\n')

然后我运行以下代码,将 json 对象从 .txt 文件导入字典列表:

# Load the file back into a list of dicts: each line holds exactly one JSON
# document, so every line is decoded independently and kept in file order.
with open('tweet_json.txt') as file:
    status = []
    status.extend(map(json.loads, file))