Row values turn into NaNs after using pandas append function
I am trying to pull all comments from subreddit X that mention X, starting from date X, and add their date, comment (body) and score/upvotes to my dataframe.
So far I have (with the help of the lovely internet) managed to come up with this code:
import pandas as pd
import requests
from datetime import datetime
import traceback
import time
import json
import sys
import numpy as np

username = ""  # put the username you want to download in the quotes
subreddit = "GME"  # put the subreddit you want to download in the quotes
# leave either one blank to download an entire user's or subreddit's history
# or fill in both to download a specific user's history from a specific subreddit
filter_string = None
if username == "" and subreddit == "":
    print("Fill in either username or subreddit")
    sys.exit(0)
elif username == "" and subreddit != "":
    filter_string = f"subreddit={subreddit}"
elif username != "" and subreddit == "":
    filter_string = f"author={username}"
else:
    filter_string = f"author={username}&subreddit={subreddit}"

url = "https://api.pushshift.io/reddit/search/{}/?q=gamestop&size=500&subreddit=gme&sort=desc&{}&before="

start_time = datetime.utcnow()

# Dataframe: comments
df_comments = pd.DataFrame()
df_comments["date"] = ""
df_comments["comment"] = ""
df_comments["score"] = ""

# Dataframe: posts
df_posts = pd.DataFrame()


def redditAPI(object_type):
    print(f"\nLooping through {object_type}s and appending to dataframe...")
    count = 0
    previous_epoch = int(start_time.timestamp())
    while True:
        # Ensures that the loop breaks at March 12 2021, for testing purposes
        if previous_epoch <= 1615503600:
            break
        new_url = url.format(object_type, filter_string) + str(previous_epoch)
        response = requests.get(new_url)
        time.sleep(1)  # pushshift has a rate limit; if we send requests too fast it starts returning error messages
        try:
            json_data = json.loads(response.text)
        except json.decoder.JSONDecodeError:
            time.sleep(1)
            continue

        if 'data' not in json_data:
            break
        objects = json_data['data']
        df2 = pd.DataFrame.from_dict(objects)
        if len(objects) == 0:
            break

        for obj in objects:
            previous_epoch = obj['created_utc'] - 1
            count += 1

        if object_type == "comment":
            df_comments["date"] = df_comments["date"].append(df2["created_utc"], ignore_index=True)
            df_comments["comment"] = df_comments["comment"].append(df2["body"], ignore_index=True)
            df_comments["score"] = df_comments["score"].append(df2["score"], ignore_index=True)
        elif object_type == "submission":
            df_posts["date"] = df2["created_utc"]
            df_posts["post"] = df2["selftext"]  # include condition to skip empty selftext
            df_posts["score"] = df2["score"]

    # Convert UNIX to datetime
    # df_comments["date"] = pd.to_datetime(df_comments["date"], unit='s')
    # df_posts["date"] = pd.to_datetime(df_posts["date"], unit='s')

    print("\nDone. Saved to dataframe.")


redditAPI("comment")
# redditAPI("submission")
Please ignore the "submission" part of the code for now.
When I check the first 5 rows of the df_comments dataframe, every value shows up as NaN.
Since the API is limited to 100 results per request, I loop until a certain UNIX timestamp is reached. On each iteration, the code should append the new data to the prepared columns.
Any idea how these values could turn into NaN, and/or how to solve it?
The problem is in:
if object_type == "comment":
    df_comments["date"] = df_comments["date"].append(df2["created_utc"], ignore_index=True)
    df_comments["comment"] = df_comments["comment"].append(df2["body"], ignore_index=True)
    df_comments["score"] = df_comments["score"].append(df2["score"], ignore_index=True)
In short, when a Series is assigned to a DataFrame column, the Series is conformed to the DataFrame's index. The result of append() has more elements than the index of df_comments, so the original dataframe columns do not change. For more details, see the minimal example below.
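A minimal sketch of that alignment behaviour (assuming a pandas version before 2.0, where Series.append still exists):

import pandas as pd

df = pd.DataFrame({"a": [10, 20]})  # index is 0..1

# append() returns a longer Series with a fresh index 0..3
longer = df["a"].append(pd.Series([30, 40]), ignore_index=True)
print(list(longer))  # [10, 20, 30, 40]

# The assignment aligns `longer` to df's index (0..1), so only the first
# two elements survive and the column is effectively unchanged.
df["a"] = longer
print(list(df["a"]))  # [10, 20]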
You can append whole dataframes instead to avoid it:
if object_type == "comment":
    df2.rename(columns={'created_utc': 'date', 'body': 'comment'}, inplace=True)
    df_comments = df_comments.append(df2[['date', 'comment', 'score']])
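Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0; on current pandas versions the equivalent is pd.concat:

df_comments = pd.concat([df_comments, df2[['date', 'comment', 'score']]])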
In this case, df_comments is assigned a new value inside the function body, so Python treats it as a local variable. You therefore need to add global df_comments inside the redditAPI() function.
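A minimal sketch of this local-versus-global pitfall (the function name here is just for illustration, and append again assumes pandas < 2.0):

import pandas as pd

df_comments = pd.DataFrame()

def append_row():
    # Without this statement, the assignment below would create a *local*
    # df_comments, and reading it on the right-hand side would raise
    # UnboundLocalError.
    global df_comments
    df_comments = df_comments.append(pd.DataFrame({"score": [1]}))

append_row()
print(len(df_comments))  # 1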