Writing a pandas DataFrame with timedeltas to Parquet
I can't seem to write a pandas DataFrame containing timedeltas to a Parquet file via pyarrow.
The pyarrow documentation states that it can handle numpy timedelta64 with ms precision.
However, when I build the DataFrame from a numpy timedelta64[ms] array, the column's dtype ends up as timedelta64[ns], and pyarrow throws an error because of that.
Is this a bug in pandas or in pyarrow? Is there a simple workaround?
The following code:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'timedelta': np.arange(start=0, stop=1000,
                           step=10,
                           dtype='timedelta64[ms]')
})
print(df.timedelta.dtypes)  # pandas coerces the column to timedelta64[ns]
df.to_parquet('test.parquet', engine='pyarrow', compression='gzip')
produces the output timedelta64[ns]
and the following error:
---------------------------------------------------------------------------
ArrowNotImplementedError Traceback (most recent call last)
<ipython-input-41-7df28b306c1e> in <module>()
3 step=10,
4 dtype='timedelta64[ms]')
----> 5 }).to_parquet('test.parquet', engine='pyarrow', compression='gzip')
~/miniconda3/envs/myenv/lib/python3.6/site-packages/pandas/core/frame.py in to_parquet(self, fname, engine, compression, **kwargs)
1940 from pandas.io.parquet import to_parquet
1941 to_parquet(self, fname, engine,
-> 1942 compression=compression, **kwargs)
1943
1944 @Substitution(header='Write out the column names. If a list of strings '
~/miniconda3/envs/myenv/lib/python3.6/site-packages/pandas/io/parquet.py in to_parquet(df, path, engine, compression, **kwargs)
255 """
256 impl = get_engine(engine)
--> 257 return impl.write(df, path, compression=compression, **kwargs)
258
259
~/miniconda3/envs/myenv/lib/python3.6/site-packages/pandas/io/parquet.py in write(self, df, path, compression, coerce_timestamps, **kwargs)
116
117 else:
--> 118 table = self.api.Table.from_pandas(df)
119 self.api.parquet.write_table(
120 table, path, compression=compression,
table.pxi in pyarrow.lib.Table.from_pandas()
~/miniconda3/envs/myenv/lib/python3.6/site-packages/pyarrow/pandas_compat.py in dataframe_to_arrays(df, schema, preserve_index, nthreads)
369 arrays = [convert_column(c, t)
370 for c, t in zip(columns_to_convert,
--> 371 convert_types)]
372 else:
373 from concurrent import futures
~/miniconda3/envs/myenv/lib/python3.6/site-packages/pyarrow/pandas_compat.py in <listcomp>(.0)
368 if nthreads == 1:
369 arrays = [convert_column(c, t)
--> 370 for c, t in zip(columns_to_convert,
371 convert_types)]
372 else:
~/miniconda3/envs/myenv/lib/python3.6/site-packages/pyarrow/pandas_compat.py in convert_column(col, ty)
364
365 def convert_column(col, ty):
--> 366 return pa.array(col, from_pandas=True, type=ty)
367
368 if nthreads == 1:
array.pxi in pyarrow.lib.array()
array.pxi in pyarrow.lib._ndarray_to_array()
error.pxi in pyarrow.lib.check_status()
ArrowNotImplementedError: Unsupported numpy type 22
fastparquet supports the timedelta type.
First install fastparquet, for example:
pip install fastparquet
Then you can write the DataFrame with:
df.to_parquet('test.parquet.gzip', engine='fastparquet', compression='gzip')
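To check that the column survives the round trip, you can read the file back with pandas. This is just a minimal sketch, assuming the file written above and that your fastparquet version restores the column as a pandas timedelta dtype:

import pandas as pd

# Read back the file written above (path and engine taken from the answer)
df_back = pd.read_parquet('test.parquet.gzip', engine='fastparquet')

# The 'timedelta' column is expected to come back as timedelta64[ns]
print(df_back.dtypes)
print(df_back.head())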