如何通过遍历 has_more 得到所有结果
How to get all results by iterating through has_more
我正在使用 stackexchange api 来获取 2000 年至 2019 年 8 月的评论。看起来我只遍历了 2 页。我不确定我的错误是在 api 参数还是在迭代过程中。
这是我的代码。
import requests
from datetime import datetime
import json
import csv
import os
import pprint
pp = pprint.PrettyPrinter(indent=4)
def write_to_json(data):
curr_dir = os.getcwd()
output_file_path = os.path.join(curr_dir, 'so_comment1.json')
with open(output_file_path, 'w') as outfile:
json.dump(data, outfile)
def get_comments(fromdate, todate):
so_url = 'https://api.stackexchange.com/2.2/comments?site=Whosebug&filter=!1zSn*g7xPU9g6(VDTS7_c&fromdate=' \
+str(fromdate)+'&todate='+str(todate)+'&pagesize=100'
headers = {"Content-type": "application/json"}
resp = requests.get(so_url, headers = headers)
if resp.status_code != 200:
print('error: ' + str(resp.status_code))
else:
print('Success')
data = resp.json()
data1 = resp.json()
page_num = 1
if data1['has_more']:
page_num += 1
so_url = 'https://api.stackexchange.com/2.2/comments?site=Whosebug&filter=!1zSn*g7xPU9g6(VDTS7_c&fromdate=' \
+str(fromdate)+'&todate='+str(todate)+'&pagesize=100&page='+str(page_num)
resp = requests.get(so_url, headers = headers)
if resp.status_code != 200:
print('error: ' + str(resp.status_code))
else:
print('Success')
data1 = resp.json()
for item in data1['items']:
data['items'].append(item)
write_to_json(data)
def filter_comment_body():
with open('so_comment1.json') as json_file_so:
comments = json.load(json_file_so)
with open('comments1.csv', 'w', encoding='utf-8') as comments_file:
comments_writer = csv.writer(comments_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for item in comments['items']:
comments_writer.writerow([item['body']])
if __name__ == '__main__':
# once comments are written to json file(s) stop calling to get_comments
fromdate = datetime.strptime('Jan 1 2000', '%b %d %Y')
todate = datetime.strptime('Aug 1 2019', '%b %d %Y')
# print(datetime.timestamp(fromdate), ' ', datetime.timestamp(todate))
get_comments(fromdate, todate)
filter_comment_body()
考虑到日期范围,我假设我会收到 1000 条评论。
但是我只收到了200条评论(2页)
您请求了两页 - 您收到了两页。
- 您获得了第一页
- ...然后设置
page_num = 1
- 然后你检查是否
data1['has_more']
- 如果是这种情况,您增加
page_num
,从 get_comments
下载第二页和 return。
- 如果不是,代码只是 returns
这是你打算做的吗?我认为您打算继续下载新页面,直到 data1['has_more']
变为 False
。
所以,算法可能是这样的:
create an empty list where you want to hold the data
set page_num=1
begin_loop:
download page number page_num
if data['has_more'] is False:
goto return_from_function
append the elements from `data` to the list you created earlier
increment page_num
goto begin_loop
return_from_function:
process the data in the list created on step 1 and return
我正在使用 stackexchange api 来获取 2000 年至 2019 年 8 月的评论。看起来我只遍历了 2 页。我不确定我的错误是在 api 参数还是在迭代过程中。
这是我的代码。
import requests
from datetime import datetime
import json
import csv
import os
import pprint
pp = pprint.PrettyPrinter(indent=4)
def write_to_json(data):
curr_dir = os.getcwd()
output_file_path = os.path.join(curr_dir, 'so_comment1.json')
with open(output_file_path, 'w') as outfile:
json.dump(data, outfile)
def get_comments(fromdate, todate):
so_url = 'https://api.stackexchange.com/2.2/comments?site=Whosebug&filter=!1zSn*g7xPU9g6(VDTS7_c&fromdate=' \
+str(fromdate)+'&todate='+str(todate)+'&pagesize=100'
headers = {"Content-type": "application/json"}
resp = requests.get(so_url, headers = headers)
if resp.status_code != 200:
print('error: ' + str(resp.status_code))
else:
print('Success')
data = resp.json()
data1 = resp.json()
page_num = 1
if data1['has_more']:
page_num += 1
so_url = 'https://api.stackexchange.com/2.2/comments?site=Whosebug&filter=!1zSn*g7xPU9g6(VDTS7_c&fromdate=' \
+str(fromdate)+'&todate='+str(todate)+'&pagesize=100&page='+str(page_num)
resp = requests.get(so_url, headers = headers)
if resp.status_code != 200:
print('error: ' + str(resp.status_code))
else:
print('Success')
data1 = resp.json()
for item in data1['items']:
data['items'].append(item)
write_to_json(data)
def filter_comment_body():
with open('so_comment1.json') as json_file_so:
comments = json.load(json_file_so)
with open('comments1.csv', 'w', encoding='utf-8') as comments_file:
comments_writer = csv.writer(comments_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for item in comments['items']:
comments_writer.writerow([item['body']])
if __name__ == '__main__':
# once comments are written to json file(s) stop calling to get_comments
fromdate = datetime.strptime('Jan 1 2000', '%b %d %Y')
todate = datetime.strptime('Aug 1 2019', '%b %d %Y')
# print(datetime.timestamp(fromdate), ' ', datetime.timestamp(todate))
get_comments(fromdate, todate)
filter_comment_body()
考虑到日期范围,我假设我会收到 1000 条评论。 但是我只收到了200条评论(2页)
您请求了两页 - 您收到了两页。
- 您获得了第一页
- ...然后设置
page_num = 1
- 然后你检查是否
data1['has_more']
- 如果是这种情况,您增加
page_num
,从get_comments
下载第二页和 return。 - 如果不是,代码只是 returns
- 如果是这种情况,您增加
这是你打算做的吗?我认为您打算继续下载新页面,直到 data1['has_more']
变为 False
。
所以,算法可能是这样的:
create an empty list where you want to hold the data
set page_num=1
begin_loop:
download page number page_num
if data['has_more'] is False:
goto return_from_function
append the elements from `data` to the list you created earlier
increment page_num
goto begin_loop
return_from_function:
process the data in the list created on step 1 and return