python fastparquet 模块可以读取压缩的parquet 文件吗?
Can python fastparquet module read in compressed parquet file?
我们的 parquet 文件存储在 aws S3 存储桶中,并由 SNAPPY 压缩。
我能够使用 python fastparquet 模块读取未压缩版本的 parquet 文件,但不能读取压缩版本。
这是我用于读取未压缩文件的代码:
s3 = s3fs.S3FileSystem(key='XESF', secret='dsfkljsf')
myopen = s3.open
pf = ParquetFile('sample/py_test_snappy/part-r-12423423942834.parquet', open_with=myopen)
df=pf.to_pandas()
这段代码运行时没有报错,但是当我尝试读取文件的 snappy 压缩版本时:
pf = ParquetFile('sample/py_test_snappy/part-r-12423423942834.snappy.parquet', open_with=myopen)
我在调用 to_pandas() 时出错:
df=pf.to_pandas()
错误信息
KeyErrorTraceback (most recent call last)
in ()
----> 1 df=pf.to_pandas()
/opt/conda/lib/python3.5/site-packages/fastparquet/api.py in
to_pandas(self, columns, categories, filters, index)
293 for (name, v) in views.items()}
294 self.read_row_group(rg, columns, categories, infile=f,
--> 295 index=index, assign=parts)
296 start += rg.num_rows
297 else:
/opt/conda/lib/python3.5/site-packages/fastparquet/api.py in
read_row_group(self, rg, columns, categories, infile, index, assign)
151 core.read_row_group(
152 infile, rg, columns, categories, self.helper, self.cats,
--> 153 self.selfmade, index=index, assign=assign)
154 if ret:
155 return df
/opt/conda/lib/python3.5/site-packages/fastparquet/core.py in
read_row_group(file, rg, columns, categories, schema_helper, cats,
selfmade, index, assign)
300 raise RuntimeError('Going with pre-allocation!')
301 read_row_group_arrays(file, rg, columns, categories, schema_helper,
--> 302 cats, selfmade, assign=assign)
303
304 for cat in cats:
/opt/conda/lib/python3.5/site-packages/fastparquet/core.py in
read_row_group_arrays(file, rg, columns, categories, schema_helper,
cats, selfmade, assign)
289 read_col(column, schema_helper, file, use_cat=use,
290 selfmade=selfmade, assign=out[name],
--> 291 catdef=out[name+'-catdef'] if use else None)
292
293
/opt/conda/lib/python3.5/site-packages/fastparquet/core.py in
read_col(column, schema_helper, infile, use_cat, grab_dict, selfmade,
assign, catdef)
196 dic = None
197 if ph.type == parquet_thrift.PageType.DICTIONARY_PAGE:
--> 198 dic = np.array(read_dictionary_page(infile, schema_helper, ph, cmd))
199 ph = read_thrift(infile, parquet_thrift.PageHeader)
200 dic = convert(dic, se)
/opt/conda/lib/python3.5/site-packages/fastparquet/core.py in
read_dictionary_page(file_obj, schema_helper, page_header,
column_metadata)
152 Consumes data using the plain encoding and returns an array of values.
153 """
--> 154 raw_bytes = _read_page(file_obj, page_header, column_metadata)
155 if column_metadata.type == parquet_thrift.Type.BYTE_ARRAY:
156 # no faster way to read variable-length-strings?
/opt/conda/lib/python3.5/site-packages/fastparquet/core.py in
_read_page(file_obj, page_header, column_metadata)
28 """Read the data page from the given file-object and convert it to raw, uncompressed bytes (if necessary)."""
29 raw_bytes = file_obj.read(page_header.compressed_page_size)
---> 30 raw_bytes = decompress_data(raw_bytes, column_metadata.codec)
31
32 assert len(raw_bytes) == page_header.uncompressed_page_size, \
/opt/conda/lib/python3.5/site-packages/fastparquet/compression.py in
decompress_data(data, algorithm)
48 def decompress_data(data, algorithm='gzip'):
49 if isinstance(algorithm, int):
---> 50 algorithm = rev_map[algorithm]
51 if algorithm.upper() not in decompressions:
52 raise RuntimeError("Decompression '%s' not available. Options: %s" %
KeyError: 1
该错误可能表明在您的系统上未找到用于解压 SNAPPY 的库 - 尽管错误消息显然可以更清楚!
根据您的系统,以下行可能会为您解决此问题:
conda install python-snappy
或
pip install python-snappy
如果您使用的是 Windows,构建工具链可能无法工作,您可能需要从此处(原文为超链接,链接已丢失)安装。
我们的 parquet 文件存储在 aws S3 存储桶中,并由 SNAPPY 压缩。 我能够使用 python fastparquet 模块读取未压缩版本的 parquet 文件,但不能读取压缩版本。
这是我用于读取未压缩文件的代码:
s3 = s3fs.S3FileSystem(key='XESF', secret='dsfkljsf')
myopen = s3.open
pf = ParquetFile('sample/py_test_snappy/part-r-12423423942834.parquet', open_with=myopen)
df=pf.to_pandas()
这段代码运行时没有报错,但是当我尝试读取文件的 snappy 压缩版本时:
pf = ParquetFile('sample/py_test_snappy/part-r-12423423942834.snappy.parquet', open_with=myopen)
我在调用 to_pandas() 时出错:
df=pf.to_pandas()
错误信息
KeyErrorTraceback (most recent call last) in () ----> 1 df=pf.to_pandas()
/opt/conda/lib/python3.5/site-packages/fastparquet/api.py in to_pandas(self, columns, categories, filters, index) 293 for (name, v) in views.items()} 294 self.read_row_group(rg, columns, categories, infile=f, --> 295 index=index, assign=parts) 296 start += rg.num_rows 297 else:
/opt/conda/lib/python3.5/site-packages/fastparquet/api.py in read_row_group(self, rg, columns, categories, infile, index, assign) 151 core.read_row_group( 152 infile, rg, columns, categories, self.helper, self.cats, --> 153 self.selfmade, index=index, assign=assign) 154 if ret: 155 return df
/opt/conda/lib/python3.5/site-packages/fastparquet/core.py in read_row_group(file, rg, columns, categories, schema_helper, cats, selfmade, index, assign) 300 raise RuntimeError('Going with pre-allocation!') 301 read_row_group_arrays(file, rg, columns, categories, schema_helper, --> 302 cats, selfmade, assign=assign) 303 304 for cat in cats:
/opt/conda/lib/python3.5/site-packages/fastparquet/core.py in read_row_group_arrays(file, rg, columns, categories, schema_helper, cats, selfmade, assign) 289 read_col(column, schema_helper, file, use_cat=use, 290 selfmade=selfmade, assign=out[name], --> 291 catdef=out[name+'-catdef'] if use else None) 292 293
/opt/conda/lib/python3.5/site-packages/fastparquet/core.py in read_col(column, schema_helper, infile, use_cat, grab_dict, selfmade, assign, catdef) 196 dic = None 197 if ph.type == parquet_thrift.PageType.DICTIONARY_PAGE: --> 198 dic = np.array(read_dictionary_page(infile, schema_helper, ph, cmd)) 199 ph = read_thrift(infile, parquet_thrift.PageHeader) 200 dic = convert(dic, se)
/opt/conda/lib/python3.5/site-packages/fastparquet/core.py in read_dictionary_page(file_obj, schema_helper, page_header, column_metadata) 152 Consumes data using the plain encoding and returns an array of values. 153 """ --> 154 raw_bytes = _read_page(file_obj, page_header, column_metadata) 155 if column_metadata.type == parquet_thrift.Type.BYTE_ARRAY: 156 # no faster way to read variable-length-strings?
/opt/conda/lib/python3.5/site-packages/fastparquet/core.py in _read_page(file_obj, page_header, column_metadata) 28 """Read the data page from the given file-object and convert it to raw, uncompressed bytes (if necessary).""" 29 raw_bytes = file_obj.read(page_header.compressed_page_size) ---> 30 raw_bytes = decompress_data(raw_bytes, column_metadata.codec) 31 32 assert len(raw_bytes) == page_header.uncompressed_page_size, \
/opt/conda/lib/python3.5/site-packages/fastparquet/compression.py in decompress_data(data, algorithm) 48 def decompress_data(data, algorithm='gzip'): 49 if isinstance(algorithm, int): ---> 50 algorithm = rev_map[algorithm] 51 if algorithm.upper() not in decompressions: 52 raise RuntimeError("Decompression '%s' not available. Options: %s" %
KeyError: 1
该错误可能表明在您的系统上未找到用于解压 SNAPPY 的库 - 尽管错误消息显然可以更清楚!
根据您的系统,以下行可能会为您解决此问题:
conda install python-snappy
或
pip install python-snappy
如果您使用的是 Windows,构建工具链可能无法工作,您可能需要从此处(原文为超链接,链接已丢失)安装。