在 datetime64 dtype 上使用 Google DataLab 时出现 KeyError
KeyError Using Google DataLab on datetime64 dtype
我正在使用 Google 数据实验室查询 BigQuery 并将该数据用于 RNN。我正在尝试将大部分数据转换为数字格式,但我收到 "Date" 列的 KeyError。如果我将它保留为字符串,我的代码可以工作,但我需要日期字段保留 RNN 的日期时间数据类型。我的代码和错误贴在下面。
import datalab.bigquery as bq
import seaborn as sns
import pandas as pd
import numpy as np
import tensorflow as tf
import shutil
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')
%%sql --m log_data
SELECT resource, place_id, CAST(end_time AS TIMESTAMP) AS Date, device, device_os, device_os_version, CAST(latency AS FLOAT) AS Latency,
CAST(megacycles AS FLOAT) AS Megacycles, method, CAST(response_size AS FLOAT) AS Response_Size, CAST(cost AS FLOAT) AS Cost, status,
device_brand, device_family, browser_version, app, ua_parse
FROM [project:app_logs_data]
WHERE DATE(CAST(end_time AS DATE)) >='2018-03-09' AND DATE(CAST(end_time AS DATE)) <='2018-03-10'
GROUP BY resource, place_id, Date, device, device_os, device_os_version, Latency, Megacycles, method,
Response_size, Cost, status, device_brand, device_family, browser_version, app, ua_parse
logs = bq.Query(log_data).to_dataframe()
logs['Date'] = pd.to_datetime(logs['Date'])
logs['resource'].apply(str)
logs.dtypes
resource object
place_id object
Date datetime64[ns]
device object
device_os object
device_os_version object
Latency float64
Megacycles float64
method object
Response_Size float64
Cost float64
status object
device_brand object
device_family object
browser_version object
app object
ua_parse object
dtype: object
def handle_non_numerical_data(logs):
columns = logs.columns.values
for column in columns:
text_digit_vals = {}
def convert_to_int(val):
return text_digit_vals[val]
if logs[column].dtype != np.int64 and \
logs[column].dtype != np.float64 and logs[column].dtype != np.datetime64:
column_contents = logs[column].values.tolist()
unique_elements = set(column_contents)
x = 0
for unique in unique_elements:
if unique not in text_digit_vals:
text_digit_vals[unique] = x
x+=1
logs[column] = list(map(convert_to_int, logs[column]))
return logs
logs = handle_non_numerical_data(logs)
KeyErrorTraceback (most recent call last)
<ipython-input-8-27437e47a873> in <module>()
17 return logs
18
---> 19 logs = handle_non_numerical_data(logs)
<ipython-input-8-27437e47a873> in handle_non_numerical_data(logs)
14 text_digit_vals[unique] = x
15 x+=1
---> 16 logs[column] = list(map(convert_to_int, logs[column]))
17 return logs
18
<ipython-input-8-27437e47a873> in convert_to_int(val)
5 text_digit_vals = {}
6 def convert_to_int(val):
----> 7 return text_digit_vals[val]
8 if logs[column].dtype != np.int64 and logs[column].dtype != np.float64 and logs[column].dtype != np.datetime64:
9 column_contents = logs[column].values.tolist()
KeyError: Timestamp('2018-03-09 00:00:00')
datalab.*
是一个已弃用的库,您应该改用 google.datalab.*
,并使用 %%bq
而不是 %%sql
。看看这个问题是否已经在新库中得到修复。
我正在使用 Google 数据实验室查询 BigQuery 并将该数据用于 RNN。我正在尝试将大部分数据转换为数字格式,但我收到 "Date" 列的 KeyError。如果我将它保留为字符串,我的代码可以工作,但我需要日期字段保留 RNN 的日期时间数据类型。我的代码和错误贴在下面。
import datalab.bigquery as bq
import seaborn as sns
import pandas as pd
import numpy as np
import tensorflow as tf
import shutil
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')
%%sql --m log_data
SELECT resource, place_id, CAST(end_time AS TIMESTAMP) AS Date, device, device_os, device_os_version, CAST(latency AS FLOAT) AS Latency,
CAST(megacycles AS FLOAT) AS Megacycles, method, CAST(response_size AS FLOAT) AS Response_Size, CAST(cost AS FLOAT) AS Cost, status,
device_brand, device_family, browser_version, app, ua_parse
FROM [project:app_logs_data]
WHERE DATE(CAST(end_time AS DATE)) >='2018-03-09' AND DATE(CAST(end_time AS DATE)) <='2018-03-10'
GROUP BY resource, place_id, Date, device, device_os, device_os_version, Latency, Megacycles, method,
Response_size, Cost, status, device_brand, device_family, browser_version, app, ua_parse
logs = bq.Query(log_data).to_dataframe()
logs['Date'] = pd.to_datetime(logs['Date'])
logs['resource'].apply(str)
logs.dtypes
resource object
place_id object
Date datetime64[ns]
device object
device_os object
device_os_version object
Latency float64
Megacycles float64
method object
Response_Size float64
Cost float64
status object
device_brand object
device_family object
browser_version object
app object
ua_parse object
dtype: object
def handle_non_numerical_data(logs):
columns = logs.columns.values
for column in columns:
text_digit_vals = {}
def convert_to_int(val):
return text_digit_vals[val]
if logs[column].dtype != np.int64 and \
logs[column].dtype != np.float64 and logs[column].dtype != np.datetime64:
column_contents = logs[column].values.tolist()
unique_elements = set(column_contents)
x = 0
for unique in unique_elements:
if unique not in text_digit_vals:
text_digit_vals[unique] = x
x+=1
logs[column] = list(map(convert_to_int, logs[column]))
return logs
logs = handle_non_numerical_data(logs)
KeyErrorTraceback (most recent call last)
<ipython-input-8-27437e47a873> in <module>()
17 return logs
18
---> 19 logs = handle_non_numerical_data(logs)
<ipython-input-8-27437e47a873> in handle_non_numerical_data(logs)
14 text_digit_vals[unique] = x
15 x+=1
---> 16 logs[column] = list(map(convert_to_int, logs[column]))
17 return logs
18
<ipython-input-8-27437e47a873> in convert_to_int(val)
5 text_digit_vals = {}
6 def convert_to_int(val):
----> 7 return text_digit_vals[val]
8 if logs[column].dtype != np.int64 and logs[column].dtype != np.float64 and logs[column].dtype != np.datetime64:
9 column_contents = logs[column].values.tolist()
KeyError: Timestamp('2018-03-09 00:00:00')
datalab.*
是一个已弃用的库,您应该改用 google.datalab.*
,并使用 %%bq
而不是 %%sql
。看看这个问题是否已经在新库中得到修复。