在 Erlang 中读取大型 JSON 文件的最佳方式是什么?
What is the best way of reading a large JSON file in Erlang?
有一个很大的 .json 文件(无法完整载入内存),内容如下:
[{
"doc_number": "xxx",
"other": "data"
}, {
"doc_number": "yyy",
"other": "data"
}, {
"doc_number": "zzz",
"other": "data"
}]
我想用尽可能少的内存、尽可能快地读取它。在其他语言中,我通常会为文件创建一个惰性序列,只在需要时才读取。我想知道 Erlang 中是否有实现这一目标的惯用方法。
jsx 可以用作增量解析器,但对于您的数据格式,您必须编写自己的回调模块:
-module(jsx_increment).
-export([parse_file/1]).
-export([init/1, handle_event/2]).
%% @doc Streams the JSON file `FN' through an incremental jsx decoder that uses
%% this module as its event handler (init/1 and handle_event/2).
%%
%% The file is opened in raw binary read mode and passed to read/2 together
%% with a decoder created with the `stream' and `return_tail' options.
%% Crashes with a badmatch if the file cannot be opened.
%%
%% Returns `ok'. Any exception raised while reading or decoding propagates to
%% the caller unchanged.
parse_file(FN) ->
    {ok, File} = file:open(FN, [read, raw, binary]),
    try
        read(File, jsx:decoder(?MODULE, [], [stream, return_tail]))
    after
        %% Runs on normal return and on exceptions alike, so the handle is
        %% released even when read/2 fails (for example on a truncated file).
        file:close(File)
    end,
    ok.
%% Feeds `File' to the decoder continuation `JSX' 8 bytes at a time until the
%% decoder reports a complete value.
%%
%% `JSX' is a fun that takes one binary chunk and returns either
%% `{incomplete, NextFun}' (keep reading) or `{with_tail, _, Tail}' (done).
%% Returns `false' when `Tail' is empty; otherwise prints the surplus bytes and
%% returns the result of io:format/2.
%%
%% Any file:read/2 result other than `{ok, Data}' (including `eof') fails the
%% match below and raises.
%% NOTE(review): 8-byte reads look like a demonstration value; a reader for
%% genuinely large files would presumably want a much larger chunk - confirm.
read(File, JSX) ->
    {ok, Chunk} = file:read(File, 8), %% eof should raise error
    case JSX(Chunk) of
        {with_tail, _Result, <<>>} ->
            false;
        {with_tail, _Result, Surplus} ->
            io:format("Surplus content: ~s~n", [Surplus]);
        {incomplete, Next} ->
            read(File, Next)
    end.
%% Handler callback: ignores its argument and returns the initial handler
%% state, the atom `start'.
init(_Args) ->
    start.
%% Handler callback: folds one decoder event into the handler state.
%%
%% The state is the atom `start' until the opening array has been seen. After
%% that it is a stack (a list, newest item first) holding `start_object' /
%% `start_array' markers and the values collected so far for the element
%% currently being built. Each finished value is passed through check_out/1.
%%
%% Raises `expect_array' when the very first event is not `start_array'.
%% Returns `stop' when the outermost array closes with an empty stack.
handle_event(start_array, start) ->
    [];
handle_event(_Other, start) ->
    error(expect_array);
%% Opening a nested object or array just pushes a marker onto the stack.
handle_event(Open, Stack) when Open =:= start_object; Open =:= start_array ->
    [Open | Stack];
handle_event(end_object, Stack) ->
    check_out(collect_object(Stack));
handle_event(end_array, []) ->
    stop;
handle_event(end_array, Stack) ->
    check_out(collect_array(Stack));
%% Everything else is pushed as a plain value after event/1 removes its tag.
%% NOTE(review): presumably these are jsx's tagged tuples such as {key, K} or
%% {string, S}, plus the final end_json atom - confirm against the jsx docs.
handle_event(Event, Stack) ->
    check_out([event(Event) | Stack]).
%% Emits a finished top-level element.
%%
%% When the stack holds exactly one item, that item is printed with
%% io:format/2 and the stack is reset to `[]'. Any other argument is returned
%% unchanged.
check_out([Item]) ->
    io:format("Collected object: ~p~n", [Item]),
    [];
check_out(Stack) ->
    Stack.
%% Strips the tag from a two-element tuple event and returns its payload;
%% any other term is returned as is.
event({_Tag, Payload}) ->
    Payload;
event(Other) ->
    Other.
%% Pops the stack down to the nearest `start_object' marker and replaces the
%% popped items plus the marker with a single map.
%%
%% Items above the marker are consumed two at a time as value-then-key (the
%% value sits on top because it was pushed last). If a key occurs more than
%% once, the occurrence deepest in the stack is inserted last and wins.
collect_object(Stack) ->
    collect_object(Stack, #{}).

collect_object([start_object | Rest], Acc) ->
    [Acc | Rest];
collect_object([Value, Key | Rest], Acc) ->
    collect_object(Rest, Acc#{Key => Value}).
%% Pops the stack down to the nearest `start_array' marker and replaces the
%% popped items plus the marker with a single list.
%%
%% The stack is newest-first, so consing each popped item onto the accumulator
%% restores the order in which the items were pushed.
collect_array(Stack) ->
    collect_array(Stack, []).

collect_array([start_array | Rest], Items) ->
    [Items | Rest];
collect_array([Top | Rest], Items) ->
    collect_array(Rest, [Top | Items]).
你的例子:
1> io:put_chars(element(2, file:read_file("data.json"))).
[{
"doc_number": "xxx",
"other": "data"
}, {
"doc_number": "yyy",
"other": "data"
}, {
"doc_number": "zzz",
"other": "data"
}]
ok
2> jsx_increment:parse_file("data.json").
Collected object: #{<<"doc_number">> => <<"xxx">>,<<"other">> => <<"data">>}
Collected object: #{<<"doc_number">> => <<"yyy">>,<<"other">> => <<"data">>}
Collected object: #{<<"doc_number">> => <<"zzz">>,<<"other">> => <<"data">>}
ok
这只是概念验证代码,您仍需根据自己的用例进行调整、补充错误处理等。(此处使用的 map 语法仅适用于 R18;在 R17 中请改用 maps:put(K, V, M),在 R17 之前的版本中请改用 proplist。)
有一个很大的 .json 文件(无法完整载入内存),内容如下:
[{
"doc_number": "xxx",
"other": "data"
}, {
"doc_number": "yyy",
"other": "data"
}, {
"doc_number": "zzz",
"other": "data"
}]
我想用尽可能少的内存、尽可能快地读取它。在其他语言中,我通常会为文件创建一个惰性序列,只在需要时才读取。我想知道 Erlang 中是否有实现这一目标的惯用方法。
jsx 可以用作增量解析器,但对于您的数据格式,您必须编写自己的回调模块:
-module(jsx_increment).
-export([parse_file/1]).
-export([init/1, handle_event/2]).
%% @doc Streams the JSON file `FN' through an incremental jsx decoder that uses
%% this module as its event handler (init/1 and handle_event/2).
%%
%% The file is opened in raw binary read mode and passed to read/2 together
%% with a decoder created with the `stream' and `return_tail' options.
%% Crashes with a badmatch if the file cannot be opened.
%%
%% Returns `ok'. Any exception raised while reading or decoding propagates to
%% the caller unchanged.
parse_file(FN) ->
    {ok, File} = file:open(FN, [read, raw, binary]),
    try
        read(File, jsx:decoder(?MODULE, [], [stream, return_tail]))
    after
        %% Runs on normal return and on exceptions alike, so the handle is
        %% released even when read/2 fails (for example on a truncated file).
        file:close(File)
    end,
    ok.
%% Feeds `File' to the decoder continuation `JSX' 8 bytes at a time until the
%% decoder reports a complete value.
%%
%% `JSX' is a fun that takes one binary chunk and returns either
%% `{incomplete, NextFun}' (keep reading) or `{with_tail, _, Tail}' (done).
%% Returns `false' when `Tail' is empty; otherwise prints the surplus bytes and
%% returns the result of io:format/2.
%%
%% Any file:read/2 result other than `{ok, Data}' (including `eof') fails the
%% match below and raises.
%% NOTE(review): 8-byte reads look like a demonstration value; a reader for
%% genuinely large files would presumably want a much larger chunk - confirm.
read(File, JSX) ->
    {ok, Chunk} = file:read(File, 8), %% eof should raise error
    case JSX(Chunk) of
        {with_tail, _Result, <<>>} ->
            false;
        {with_tail, _Result, Surplus} ->
            io:format("Surplus content: ~s~n", [Surplus]);
        {incomplete, Next} ->
            read(File, Next)
    end.
%% Handler callback: ignores its argument and returns the initial handler
%% state, the atom `start'.
init(_Args) ->
    start.
%% Handler callback: folds one decoder event into the handler state.
%%
%% The state is the atom `start' until the opening array has been seen. After
%% that it is a stack (a list, newest item first) holding `start_object' /
%% `start_array' markers and the values collected so far for the element
%% currently being built. Each finished value is passed through check_out/1.
%%
%% Raises `expect_array' when the very first event is not `start_array'.
%% Returns `stop' when the outermost array closes with an empty stack.
handle_event(start_array, start) ->
    [];
handle_event(_Other, start) ->
    error(expect_array);
%% Opening a nested object or array just pushes a marker onto the stack.
handle_event(Open, Stack) when Open =:= start_object; Open =:= start_array ->
    [Open | Stack];
handle_event(end_object, Stack) ->
    check_out(collect_object(Stack));
handle_event(end_array, []) ->
    stop;
handle_event(end_array, Stack) ->
    check_out(collect_array(Stack));
%% Everything else is pushed as a plain value after event/1 removes its tag.
%% NOTE(review): presumably these are jsx's tagged tuples such as {key, K} or
%% {string, S}, plus the final end_json atom - confirm against the jsx docs.
handle_event(Event, Stack) ->
    check_out([event(Event) | Stack]).
%% Emits a finished top-level element.
%%
%% When the stack holds exactly one item, that item is printed with
%% io:format/2 and the stack is reset to `[]'. Any other argument is returned
%% unchanged.
check_out([Item]) ->
    io:format("Collected object: ~p~n", [Item]),
    [];
check_out(Stack) ->
    Stack.
%% Strips the tag from a two-element tuple event and returns its payload;
%% any other term is returned as is.
event({_Tag, Payload}) ->
    Payload;
event(Other) ->
    Other.
%% Pops the stack down to the nearest `start_object' marker and replaces the
%% popped items plus the marker with a single map.
%%
%% Items above the marker are consumed two at a time as value-then-key (the
%% value sits on top because it was pushed last). If a key occurs more than
%% once, the occurrence deepest in the stack is inserted last and wins.
collect_object(Stack) ->
    collect_object(Stack, #{}).

collect_object([start_object | Rest], Acc) ->
    [Acc | Rest];
collect_object([Value, Key | Rest], Acc) ->
    collect_object(Rest, Acc#{Key => Value}).
%% Pops the stack down to the nearest `start_array' marker and replaces the
%% popped items plus the marker with a single list.
%%
%% The stack is newest-first, so consing each popped item onto the accumulator
%% restores the order in which the items were pushed.
collect_array(Stack) ->
    collect_array(Stack, []).

collect_array([start_array | Rest], Items) ->
    [Items | Rest];
collect_array([Top | Rest], Items) ->
    collect_array(Rest, [Top | Items]).
你的例子:
1> io:put_chars(element(2, file:read_file("data.json"))).
[{
"doc_number": "xxx",
"other": "data"
}, {
"doc_number": "yyy",
"other": "data"
}, {
"doc_number": "zzz",
"other": "data"
}]
ok
2> jsx_increment:parse_file("data.json").
Collected object: #{<<"doc_number">> => <<"xxx">>,<<"other">> => <<"data">>}
Collected object: #{<<"doc_number">> => <<"yyy">>,<<"other">> => <<"data">>}
Collected object: #{<<"doc_number">> => <<"zzz">>,<<"other">> => <<"data">>}
ok
这只是概念验证代码,您仍需根据自己的用例进行调整、补充错误处理等。(此处使用的 map 语法仅适用于 R18;在 R17 中请改用 maps:put(K, V, M),在 R17 之前的版本中请改用 proplist。)