PyTables + Pandas Select 问题
PyTables + Pandas Select Problems
我有一个结构如下的 HDF5 (PyTables) 文件:
/<User>/<API Key>
ex:
/Dan/A4N5
/Dan/B8P0
/Dave/D3Y7
每个 table 的结构都是这样的,包含一个 sessionID 列和一个以 Unix 纪元秒(epoch 时间戳)存储的 time 列:
sessionID time
0 3ODE3Nzll 1467590400
1 lMGVkMDc4 1467590400
2 jNzIzNmY1 1467590400
...
我希望 Pandas 遍历每个 table 并获取指定日期和指定日期前一天之间的所有行。目前我有这个代码:
# Directory that contains this script; the HDF5 file is opened from the same directory.
scriptPath = os.path.dirname(os.path.abspath(__file__))
argdate = "2016/07/14"
# Midnight (UTC) of the requested day, built by slicing the "YYYY/MM/DD" string.
dayTimestamp = datetime.datetime(int(argdate[0:4]), int(argdate[5:7]), int(argdate[8:10]), tzinfo=pytz.utc)
# Midnight (UTC) of the day before the requested day.
yesterdayTimestamp = dayTimestamp - datetime.timedelta(days=1)
with pd.HDFStore(os.path.join(scriptPath, "userdatabase.h5")) as db:
for table in db.keys():
print(table)
# Keys are described as '/<User>/<API Key>'; splitting on '/' presumably yields
# an empty first element, so index 1 is taken as the user and index 2 as the API key.
tableSplit = table.split('/')
client = tableSplit[1]
apiKey = tableSplit[2]
# NOTE(review): this is the line reported in the traceback
# ("TypeError: 'str' object is not callable"). The where string is parsed by
# pandas, and the failure occurs while it visits the call expressions
# (int(...) / .timestamp()) embedded in the string.
df = db.select('{}/{}'.format(client, apiKey), where='time<=int(dayTimestamp.timestamp()) & time>=int(yesterdayTimestamp.timestamp())')
print(df)
但是在 select 所在的行上抛出错误。
File "tester.py", line 570, in database
df = db.select('{}/{}'.format(client, apiKey), where='time<=int(dayTimestamp.timestamp()) & time>=int(yesterdayTimestamp.timestamp())')
File "/usr/local/lib/python3.4/dist-packages/pandas/io/pytables.py", line 680, in select
return it.get_result()
File "/usr/local/lib/python3.4/dist-packages/pandas/io/pytables.py", line 1364, in get_result
results = self.func(self.start, self.stop, where)
File "/usr/local/lib/python3.4/dist-packages/pandas/io/pytables.py", line 673, in func
columns=columns, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/io/pytables.py", line 4021, in read
if not self.read_axes(where=where, **kwargs):
File "/usr/local/lib/python3.4/dist-packages/pandas/io/pytables.py", line 3222, in read_axes
self.selection = Selection(self, where=where, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/io/pytables.py", line 4580, in __init__
self.terms = self.generate(where)
File "/usr/local/lib/python3.4/dist-packages/pandas/io/pytables.py", line 4593, in generate
return Expr(where, queryables=q, encoding=self.table.encoding)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/pytables.py", line 517, in __init__
self.terms = self.parse()
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 727, in parse
return self._visitor.visit(self.expr)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 311, in visit
return visitor(node, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 317, in visit_Module
return self.visit(expr, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 311, in visit
return visitor(node, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 320, in visit_Expr
return self.visit(node.value, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 311, in visit
return visitor(node, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 655, in visit_BoolOp
return reduce(visitor, operands)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 647, in visitor
lhs = self._try_visit_binop(x)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 643, in _try_visit_binop
return self.visit(bop)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 311, in visit
return visitor(node, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 628, in visit_Compare
return self.visit(binop)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 311, in visit
return visitor(node, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 401, in visit_BinOp
op, op_class, left, right = self._possibly_transform_eq_ne(node)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 354, in _possibly_transform_eq_ne
right = self.visit(node.right, side='right')
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 311, in visit
return visitor(node, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 615, in visit_Call_legacy
return self.const_type(res(*args, **keywords), self.env)
TypeError: 'str' object is not callable
1) 如何修复此错误?
2) 是否可以遍历 HDF5 文件的层次结构,以便把属于同一用户的 Pandas table 归为一组?
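下面是一个可运行的演示。要点:先把时间戳计算成普通的整数变量(ts_from、ts_to),再在 where 表达式中按变量名引用它们,而不是在表达式字符串里调用 .timestamp() 之类的方法——上面的 TypeError 正是在解析这类调用时抛出的。演示中还通过 key 的前缀(client_flt)筛选出同一用户下的所有 table: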
import io
import pandas as pd

# Sample table: the same three session IDs on two consecutive days
# (1467590400 = 2016-07-04 00:00:00 UTC, 1467676800 = 2016-07-05 00:00:00 UTC).
df = pd.read_csv(io.StringIO("""
sessionID time
3ODE3Nzll 1467590400
lMGVkMDc4 1467590400
jNzIzNmY1 1467590400
3ODE3Nzll 1467676800
lMGVkMDc4 1467676800
jNzIzNmY1 1467676800
"""), sep=r'\s+')  # raw string: '\s' is not a valid escape in a plain string literal

filename = 'c:/temp/aaa.h5'

argdate = "2016/07/04"
# Compute the bounds as plain integers *before* building the query; the where
# string below only refers to them by name, it does not call anything itself.
# utc=True makes the result independent of the local timezone.
ts_from = int(pd.to_datetime(argdate, utc=True).timestamp())
# Last second of the same day, so the inclusive "<=" in the query does not
# also match rows stamped exactly at the next day's midnight.
ts_to = ts_from + 24*60*60 - 1
# Key prefix used to pick every table that belongs to one client/group.
client_flt = '/aaa/'
# Alternative: embed the values in the string instead of referencing variables:
#   qry = '(time >= {0}) & (time <= {1})'.format(ts_from, ts_to)
qry = 'time >= ts_from & time <= ts_to'

# mode='w' starts from an empty file so that re-running the demo does not
# append the same rows again; the context manager closes the store on exit.
with pd.HDFStore(filename, mode='w') as store:
    # data_columns=True makes every column (including "time") queryable in where=.
    store.append('/aaa/df1', df, data_columns=True)
    store.append('/bbb/df1', df, data_columns=True)
    # let's double # of rows
    df = pd.concat([df] * 2, ignore_index=True)
    # and write it to HDFStore
    store.append('/aaa/df2', df, data_columns=True)
    print(store)

    print('WHERE:\t%s' % qry)
    # Iterating the store yields its keys ('/aaa/df1', '/aaa/df2', '/bbb/df1').
    for k in store:
        if k.startswith(client_flt):
            x = store.select(k, where=qry)
            print(k)
            print(x)
输出:
<class 'pandas.io.pytables.HDFStore'>
File path: c:/temp/aaa.h5
/aaa/df1 frame_table (typ->appendable,nrows->6,ncols->2,indexers->[index],dc->[sessionID,time])
/aaa/df2 frame_table (typ->appendable,nrows->12,ncols->2,indexers->[index],dc->[sessionID,time])
/bbb/df1 frame_table (typ->appendable,nrows->6,ncols->2,indexers->[index],dc->[sessionID,time])
WHERE: time >= ts_from & time <= ts_to
/aaa/df1
sessionID time
0 3ODE3Nzll 1467590400
1 lMGVkMDc4 1467590400
2 jNzIzNmY1 1467590400
/aaa/df2
sessionID time
0 3ODE3Nzll 1467590400
1 lMGVkMDc4 1467590400
2 jNzIzNmY1 1467590400
6 3ODE3Nzll 1467590400
7 lMGVkMDc4 1467590400
8 jNzIzNmY1 1467590400
我有一个结构如下的 HDF5 (PyTables) 文件:
/<User>/<API Key>
ex:
/Dan/A4N5
/Dan/B8P0
/Dave/D3Y7
每个 table 的结构都是这样的,包含一个 sessionID 列和一个以 Unix 纪元秒(epoch 时间戳)存储的 time 列:
sessionID time
0 3ODE3Nzll 1467590400
1 lMGVkMDc4 1467590400
2 jNzIzNmY1 1467590400
...
我希望 Pandas 遍历每个 table 并获取指定日期和指定日期前一天之间的所有行。目前我有这个代码:
# Directory that contains this script; the HDF5 file is opened from the same directory.
scriptPath = os.path.dirname(os.path.abspath(__file__))
argdate = "2016/07/14"
# Midnight (UTC) of the requested day, built by slicing the "YYYY/MM/DD" string.
dayTimestamp = datetime.datetime(int(argdate[0:4]), int(argdate[5:7]), int(argdate[8:10]), tzinfo=pytz.utc)
# Midnight (UTC) of the day before the requested day.
yesterdayTimestamp = dayTimestamp - datetime.timedelta(days=1)
with pd.HDFStore(os.path.join(scriptPath, "userdatabase.h5")) as db:
for table in db.keys():
print(table)
# Keys are described as '/<User>/<API Key>'; splitting on '/' presumably yields
# an empty first element, so index 1 is taken as the user and index 2 as the API key.
tableSplit = table.split('/')
client = tableSplit[1]
apiKey = tableSplit[2]
# NOTE(review): this is the line reported in the traceback
# ("TypeError: 'str' object is not callable"). The where string is parsed by
# pandas, and the failure occurs while it visits the call expressions
# (int(...) / .timestamp()) embedded in the string.
df = db.select('{}/{}'.format(client, apiKey), where='time<=int(dayTimestamp.timestamp()) & time>=int(yesterdayTimestamp.timestamp())')
print(df)
但是在 select 所在的行上抛出错误。
File "tester.py", line 570, in database
df = db.select('{}/{}'.format(client, apiKey), where='time<=int(dayTimestamp.timestamp()) & time>=int(yesterdayTimestamp.timestamp())')
File "/usr/local/lib/python3.4/dist-packages/pandas/io/pytables.py", line 680, in select
return it.get_result()
File "/usr/local/lib/python3.4/dist-packages/pandas/io/pytables.py", line 1364, in get_result
results = self.func(self.start, self.stop, where)
File "/usr/local/lib/python3.4/dist-packages/pandas/io/pytables.py", line 673, in func
columns=columns, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/io/pytables.py", line 4021, in read
if not self.read_axes(where=where, **kwargs):
File "/usr/local/lib/python3.4/dist-packages/pandas/io/pytables.py", line 3222, in read_axes
self.selection = Selection(self, where=where, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/io/pytables.py", line 4580, in __init__
self.terms = self.generate(where)
File "/usr/local/lib/python3.4/dist-packages/pandas/io/pytables.py", line 4593, in generate
return Expr(where, queryables=q, encoding=self.table.encoding)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/pytables.py", line 517, in __init__
self.terms = self.parse()
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 727, in parse
return self._visitor.visit(self.expr)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 311, in visit
return visitor(node, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 317, in visit_Module
return self.visit(expr, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 311, in visit
return visitor(node, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 320, in visit_Expr
return self.visit(node.value, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 311, in visit
return visitor(node, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 655, in visit_BoolOp
return reduce(visitor, operands)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 647, in visitor
lhs = self._try_visit_binop(x)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 643, in _try_visit_binop
return self.visit(bop)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 311, in visit
return visitor(node, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 628, in visit_Compare
return self.visit(binop)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 311, in visit
return visitor(node, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 401, in visit_BinOp
op, op_class, left, right = self._possibly_transform_eq_ne(node)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 354, in _possibly_transform_eq_ne
right = self.visit(node.right, side='right')
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 311, in visit
return visitor(node, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pandas/computation/expr.py", line 615, in visit_Call_legacy
return self.const_type(res(*args, **keywords), self.env)
TypeError: 'str' object is not callable
1) 如何修复此错误?
2) 是否可以遍历 HDF5 文件的层次结构,以便把属于同一用户的 Pandas table 归为一组?
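下面是一个可运行的演示。要点:先把时间戳计算成普通的整数变量(ts_from、ts_to),再在 where 表达式中按变量名引用它们,而不是在表达式字符串里调用 .timestamp() 之类的方法——上面的 TypeError 正是在解析这类调用时抛出的。演示中还通过 key 的前缀(client_flt)筛选出同一用户下的所有 table: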
import io
import pandas as pd

# Sample table: the same three session IDs on two consecutive days
# (1467590400 = 2016-07-04 00:00:00 UTC, 1467676800 = 2016-07-05 00:00:00 UTC).
df = pd.read_csv(io.StringIO("""
sessionID time
3ODE3Nzll 1467590400
lMGVkMDc4 1467590400
jNzIzNmY1 1467590400
3ODE3Nzll 1467676800
lMGVkMDc4 1467676800
jNzIzNmY1 1467676800
"""), sep=r'\s+')  # raw string: '\s' is not a valid escape in a plain string literal

filename = 'c:/temp/aaa.h5'

argdate = "2016/07/04"
# Compute the bounds as plain integers *before* building the query; the where
# string below only refers to them by name, it does not call anything itself.
# utc=True makes the result independent of the local timezone.
ts_from = int(pd.to_datetime(argdate, utc=True).timestamp())
# Last second of the same day, so the inclusive "<=" in the query does not
# also match rows stamped exactly at the next day's midnight.
ts_to = ts_from + 24*60*60 - 1
# Key prefix used to pick every table that belongs to one client/group.
client_flt = '/aaa/'
# Alternative: embed the values in the string instead of referencing variables:
#   qry = '(time >= {0}) & (time <= {1})'.format(ts_from, ts_to)
qry = 'time >= ts_from & time <= ts_to'

# mode='w' starts from an empty file so that re-running the demo does not
# append the same rows again; the context manager closes the store on exit.
with pd.HDFStore(filename, mode='w') as store:
    # data_columns=True makes every column (including "time") queryable in where=.
    store.append('/aaa/df1', df, data_columns=True)
    store.append('/bbb/df1', df, data_columns=True)
    # let's double # of rows
    df = pd.concat([df] * 2, ignore_index=True)
    # and write it to HDFStore
    store.append('/aaa/df2', df, data_columns=True)
    print(store)

    print('WHERE:\t%s' % qry)
    # Iterating the store yields its keys ('/aaa/df1', '/aaa/df2', '/bbb/df1').
    for k in store:
        if k.startswith(client_flt):
            x = store.select(k, where=qry)
            print(k)
            print(x)
输出:
<class 'pandas.io.pytables.HDFStore'>
File path: c:/temp/aaa.h5
/aaa/df1 frame_table (typ->appendable,nrows->6,ncols->2,indexers->[index],dc->[sessionID,time])
/aaa/df2 frame_table (typ->appendable,nrows->12,ncols->2,indexers->[index],dc->[sessionID,time])
/bbb/df1 frame_table (typ->appendable,nrows->6,ncols->2,indexers->[index],dc->[sessionID,time])
WHERE: time >= ts_from & time <= ts_to
/aaa/df1
sessionID time
0 3ODE3Nzll 1467590400
1 lMGVkMDc4 1467590400
2 jNzIzNmY1 1467590400
/aaa/df2
sessionID time
0 3ODE3Nzll 1467590400
1 lMGVkMDc4 1467590400
2 jNzIzNmY1 1467590400
6 3ODE3Nzll 1467590400
7 lMGVkMDc4 1467590400
8 jNzIzNmY1 1467590400