从 google analytics API v4 下载批量报告
Downloading batch reports from google analytics API v4
我试图获得 3 个月的报告,为此,我需要发出多个请求并将结果附加到列表中,因为 API 只有 returns 100,000
行每个请求。 API 返回一个名为 nextPageToken
的变量,我需要将其传递到下一个查询以获取报告的下一个 100,000
行。我很难做到这一点。
这是我的代码:
def initialize_analyticsreporting():
'''Initializes an Analytics Reporting API V4 service object.
Returns:
An authorized Analytics Reporting API V4 service object.
'''
credentials = ServiceAccountCredentials.from_json_keyfile_name(
KEY_FILE_LOCATION, SCOPES)
# Build the service object.
analytics = build('analyticsreporting', 'v4', credentials=credentials)
return analytics
list = []
def get_report(analytics, pageTokenVariable):
return analytics.reports().batchGet(
body={
'reportRequests': [
{
'viewId': VIEW_ID,
'pageSize': 100000,
'dateRanges': [{'startDate': '90daysAgo', 'endDate': 'yesterday'}],
'metrics': [{'expression': 'ga:adClicks'}, {'expression': 'ga:impressions'}, {'expression': 'ga:adCost'}, {'expression': 'ga:CTR'}, {'expression': 'ga:CPC'}, {'expression': 'ga:costPerTransaction'}, {'expression': 'ga:transactions'}, {'expression': 'ga:transactionsPerSession'}, {'expression': 'ga:pageviews'}, {'expression': 'ga:timeOnPage'}],
"pageToken": pageTokenVariable,
'dimensions': [{'name': 'ga:adMatchedQuery'}, {'name': 'ga:campaign'}, {'name': 'ga:adGroup'}, {'name': 'ga:adwordsCustomerID'}, {'name': 'ga:date'}],
'orderBys': [{'fieldName': 'ga:impressions', 'sortOrder': 'DESCENDING'}],
'dimensionFilterClauses': [{
'filters': [{
'dimension_name': 'ga:adwordsCustomerID',
'operator': 'EXACT',
'expressions': 'abc',
'not': 'True'
}]
}],
'dimensionFilterClauses': [{
'filters': [{
'dimension_name': 'ga:adMatchedQuery',
'operator': 'EXACT',
'expressions': '(not set)',
'not': 'True'
}]
}]
}]
}
).execute()
analytics = initialize_analyticsreporting()
response = get_report(analytics, "0")
for report in response.get('reports', []):
pagetoken = report.get('nextPageToken', None)
print(pagetoken)
#------printing the pagetoken here returns `100,000` which is expected
columnHeader = report.get('columnHeader', {})
dimensionHeaders = columnHeader.get('dimensions', [])
metricHeaders = columnHeader.get(
'metricHeader', {}).get('metricHeaderEntries', [])
rows = report.get('data', {}).get('rows', [])
for row in rows:
# create dict for each row
dict = {}
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
# fill dict with dimension header (key) and dimension value (value)
for header, dimension in zip(dimensionHeaders, dimensions):
dict[header] = dimension
# fill dict with metric header (key) and metric value (value)
for i, values in enumerate(dateRangeValues):
for metric, value in zip(metricHeaders, values.get('values')):
# set int as int, float a float
if ',' in value or ',' in value:
dict[metric.get('name')] = float(value)
else:
dict[metric.get('name')] = float(value)
list.append(dict)
# Append that data to a list as a dictionary
# pagination function
while pagetoken: # This says while there is info in the nextPageToken get the data, process it and add to the list
response = get_report(analytics, pagetoken)
pagetoken = response['reports'][0]['nextPageToken']
print(pagetoken)
#------printing the pagetoken here returns `200,000` as is expected but the data being pulled is the same as for the first batch and so on. While in the loop the pagetoken is being incremented but it does not retrieve new data
for row in rows:
# create dict for each row
dict = {}
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
# fill dict with dimension header (key) and dimension value (value)
for header, dimension in zip(dimensionHeaders, dimensions):
dict[header] = dimension
# fill dict with metric header (key) and metric value (value)
for i, values in enumerate(dateRangeValues):
for metric, value in zip(metricHeaders, values.get('values')):
# set int as int, float a float
if ',' in value or ',' in value:
dict[metric.get('name')] = float(value)
else:
dict[metric.get('name')] = float(value)
list.append(dict)
df = pd.DataFrame(list)
print(df) # Append that data to a list as a dictionary
df.to_csv('full_dataset.csv', encoding="utf-8", index=False)
我尝试传递 pagetoken 的错误在哪里?
所以您正在 pagetoken = response['reports'][0]['nextPageToken']
中更新 pagetoken,但您不应该在 while 循环中用新数据更新 rows
吗?
像这样。
while pagetoken:
response = get_report(analytics, pagetoken)
pagetoken = response['reports'][0].get('nextPageToken')
for report in reponse.get('reports', []):
rows = report.get('data', {}).get('rows', [])
for row in rows:
我试图获得 3 个月的报告,为此,我需要发出多个请求并将结果附加到列表中,因为 API 只有 returns 100,000
行每个请求。 API 返回一个名为 nextPageToken
的变量,我需要将其传递到下一个查询以获取报告的下一个 100,000
行。我很难做到这一点。
这是我的代码:
def initialize_analyticsreporting():
'''Initializes an Analytics Reporting API V4 service object.
Returns:
An authorized Analytics Reporting API V4 service object.
'''
credentials = ServiceAccountCredentials.from_json_keyfile_name(
KEY_FILE_LOCATION, SCOPES)
# Build the service object.
analytics = build('analyticsreporting', 'v4', credentials=credentials)
return analytics
list = []
def get_report(analytics, pageTokenVariable):
return analytics.reports().batchGet(
body={
'reportRequests': [
{
'viewId': VIEW_ID,
'pageSize': 100000,
'dateRanges': [{'startDate': '90daysAgo', 'endDate': 'yesterday'}],
'metrics': [{'expression': 'ga:adClicks'}, {'expression': 'ga:impressions'}, {'expression': 'ga:adCost'}, {'expression': 'ga:CTR'}, {'expression': 'ga:CPC'}, {'expression': 'ga:costPerTransaction'}, {'expression': 'ga:transactions'}, {'expression': 'ga:transactionsPerSession'}, {'expression': 'ga:pageviews'}, {'expression': 'ga:timeOnPage'}],
"pageToken": pageTokenVariable,
'dimensions': [{'name': 'ga:adMatchedQuery'}, {'name': 'ga:campaign'}, {'name': 'ga:adGroup'}, {'name': 'ga:adwordsCustomerID'}, {'name': 'ga:date'}],
'orderBys': [{'fieldName': 'ga:impressions', 'sortOrder': 'DESCENDING'}],
'dimensionFilterClauses': [{
'filters': [{
'dimension_name': 'ga:adwordsCustomerID',
'operator': 'EXACT',
'expressions': 'abc',
'not': 'True'
}]
}],
'dimensionFilterClauses': [{
'filters': [{
'dimension_name': 'ga:adMatchedQuery',
'operator': 'EXACT',
'expressions': '(not set)',
'not': 'True'
}]
}]
}]
}
).execute()
analytics = initialize_analyticsreporting()
response = get_report(analytics, "0")
for report in response.get('reports', []):
pagetoken = report.get('nextPageToken', None)
print(pagetoken)
#------printing the pagetoken here returns `100,000` which is expected
columnHeader = report.get('columnHeader', {})
dimensionHeaders = columnHeader.get('dimensions', [])
metricHeaders = columnHeader.get(
'metricHeader', {}).get('metricHeaderEntries', [])
rows = report.get('data', {}).get('rows', [])
for row in rows:
# create dict for each row
dict = {}
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
# fill dict with dimension header (key) and dimension value (value)
for header, dimension in zip(dimensionHeaders, dimensions):
dict[header] = dimension
# fill dict with metric header (key) and metric value (value)
for i, values in enumerate(dateRangeValues):
for metric, value in zip(metricHeaders, values.get('values')):
# set int as int, float a float
if ',' in value or ',' in value:
dict[metric.get('name')] = float(value)
else:
dict[metric.get('name')] = float(value)
list.append(dict)
# Append that data to a list as a dictionary
# pagination function
while pagetoken: # This says while there is info in the nextPageToken get the data, process it and add to the list
response = get_report(analytics, pagetoken)
pagetoken = response['reports'][0]['nextPageToken']
print(pagetoken)
#------printing the pagetoken here returns `200,000` as is expected but the data being pulled is the same as for the first batch and so on. While in the loop the pagetoken is being incremented but it does not retrieve new data
for row in rows:
# create dict for each row
dict = {}
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
# fill dict with dimension header (key) and dimension value (value)
for header, dimension in zip(dimensionHeaders, dimensions):
dict[header] = dimension
# fill dict with metric header (key) and metric value (value)
for i, values in enumerate(dateRangeValues):
for metric, value in zip(metricHeaders, values.get('values')):
# set int as int, float a float
if ',' in value or ',' in value:
dict[metric.get('name')] = float(value)
else:
dict[metric.get('name')] = float(value)
list.append(dict)
df = pd.DataFrame(list)
print(df) # Append that data to a list as a dictionary
df.to_csv('full_dataset.csv', encoding="utf-8", index=False)
我尝试传递 pagetoken 的错误在哪里?
所以您正在 pagetoken = response['reports'][0]['nextPageToken']
中更新 pagetoken,但您不应该在 while 循环中用新数据更新 rows
吗?
像这样。
while pagetoken:
response = get_report(analytics, pagetoken)
pagetoken = response['reports'][0].get('nextPageToken')
for report in reponse.get('reports', []):
rows = report.get('data', {}).get('rows', [])
for row in rows: