Apache Arrow file converted from a data frame gives null when read with arrow.js
I converted a sample data frame to a .arrow file using pyarrow:
import numpy as np
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"a": [10, 2, 3]})
df['a'] = pd.to_numeric(df['a'], errors='coerce')

# Convert to an Arrow table and write it out in the Arrow IPC file format
table = pa.Table.from_pandas(df)
writer = pa.RecordBatchFileWriter('test.arrow', table.schema)
writer.write_table(table)
writer.close()
This creates a file test.arrow.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 1 columns):
a 3 non-null int64
dtypes: int64(1)
memory usage: 104.0 bytes
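For what it's worth, reading the file back with pyarrow itself returns the data; a minimal check (RecordBatchFileReader accepts a file-like object):

with open('test.arrow', 'rb') as f:
    reader = pa.RecordBatchFileReader(f)
    print(reader.read_all().to_pandas())  # expected: the original 3-row frame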
Then in Node.js I load the file with ArrowJS (https://arrow.apache.org/docs/js/):
const fs = require('fs');
const arrow = require('apache-arrow');

// Read the IPC file and construct an Arrow table from the bytes
const data = fs.readFileSync('test.arrow');
const table = arrow.Table.from(data);

console.log(table.schema.fields.map(f => f.name)); // column names
console.log(table.count());                        // row count
console.log(table.get(0));                         // first row
This prints:
[ 'a' ]
0
null
I expected the table to have length 3, and table.get(0) to return the first row rather than null.
Here is what the table schema looks like, via console.log(table._schema):
[ Int_ [Int] { isSigned: true, bitWidth: 16 } ]
Schema {
fields:
[ Field { name: 'a', type: [Int_], nullable: true, metadata: Map {} } ],
metadata:
Map {
'pandas' => '{"index_columns": [{"kind": "range", "name": null, "start": 0, "stop": 5, "step": 1}], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "a", "field_name": "a", "pandas_type": "int16", "numpy_type": "int16", "metadata": null}], "creator": {"library": "pyarrow", "version": "0.15.0"}, "pandas_version": "0.22.0"}' },
dictionaries: Map {} }
Any idea why it is not reading the data as expected?
This is due to a format change in Arrow 0.15, as mentioned by Wes on the Apache JIRA. It means that all Arrow libraries, not just PyArrow, hit this problem when sending IPC files to readers on an older Arrow version. The fix is to upgrade ArrowJS to 0.15.0, so that you can round-trip files between the JS library and the other Arrow libraries.
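Since the question already uses the apache-arrow npm package, the upgrade is a one-liner (version spec shown for illustration):

$ npm install apache-arrow@0.15.0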
If for some reason you can't upgrade, you can use one of the following workarounds instead:

Pass use_legacy_format=True as a kwarg to RecordBatchFileWriter:
with pa.RecordBatchFileWriter('file.arrow', table.schema, use_legacy_format=True) as writer:
    writer.write_table(table)
Set the environment variable ARROW_PRE_0_15_IPC_FORMAT to 1:
$ export ARROW_PRE_0_15_IPC_FORMAT=1
$ python
>>> import pyarrow as pa
>>> table = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
>>> with pa.RecordBatchFileWriter('file.arrow', table.schema) as writer:
...     writer.write_table(table)
...
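The same workaround can also be applied from inside Python rather than the shell; a minimal sketch, assuming pyarrow picks the variable up when the writer is created:

import os
os.environ['ARROW_PRE_0_15_IPC_FORMAT'] = '1'  # assumption: must be set before the writer is created

import pyarrow as pa
table = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
with pa.RecordBatchFileWriter('file.arrow', table.schema) as writer:
    writer.write_table(table)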
Or downgrade PyArrow to 0.14.x:
$ conda install -c conda-forge pyarrow=0.14.1
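If you use pip rather than conda, the equivalent downgrade (pyarrow 0.14.1 is on PyPI) is:

$ pip install pyarrow==0.14.1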