使用 jq 将 JSON 行转换为 JSON 数组
Convert JSON lines to JSON array using jq
首先,我是 jq 的新手,刚接触 1 天;我也是 JSON 的新手。我是做 SQL 出身的,所以学得很快,但这个问题我解决不了……所以请多包涵。
我在 Windows 上运行,在 PowerShell 中使用 jq v1.5。
我下载了多个 JSON 文件,它们看起来像这样:
{"Header":{"AssetClass":"Commodities","InstrumentType":"Forward","UseCase":"Forward","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"EUR","ExpiryDate":"2018-01-01","ReturnorPayoutTrigger":"Contract for Difference (CFD)","DeliveryType":"CASH","BaseProduct":"AGRI","TransactionType":"FUTR","FinalPriceType":"ARGM","ReferenceRate":"10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN","SubProduct":"GROS","AdditionalSubProduct":"FWHT"},"ISIN":{"ISIN":"EZX27M86B860","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","UnderlyingAssetType":"Agriculture","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"FullName":"Commodities Forward AGRI GROS FWHT EUR 20180101","ShortName":"NA/Fwd AGRI FWHT EUR 20180101","ClassificationType":"JTAXCC"}}
{"Header":{"AssetClass":"Commodities","InstrumentType":"Swap","UseCase":"Basis_Swap","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"SOS","ExpiryDate":"2208-12-10","ReturnorPayoutTrigger":"Total Return","DeliveryType":"OPTL","TransactionType":"ORIT","FinalPriceType":"IHSM","ReferenceRate":"NATURAL GAS-MONTHLY INDEX S. TEXAS (TETCO)-GAS DAILY PRICE GUIDE","OtherReferenceRate":"NATURAL GAS-MONTHLY INDEX W. LOUISIANA (TETCO)-GAS DAILY PRICE GUIDE","BaseProduct":"OTHR","OtherBaseProduct":"OTHR","SubProduct":"","AdditionalSubProduct":"","OtherSubProduct":"","OtherAdditionalSubProduct":""},"ISIN":{"ISIN":"EZBBH1XR9GV6","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","UnderlyingAssetType":"Multi Commodity","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"FullName":"Commodities Swap Basis_Swap OTHR OTHR SOS 22081210","ShortName":"NA/Swap OTHR SOS 22081210","ClassificationType":"STQTXE"}}
{"Header":{"AssetClass":"Commodities","InstrumentType":"Swap","UseCase":"Multi_Exotic_Swap","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"LRD","ExpiryDate":"2200-01-31","ReturnorPayoutTrigger":"Contract for Difference (CFD)","DeliveryType":"CASH","TransactionType":"TAPO","FinalPriceType":"EXOF","UnderlyingInstrumentIndex":["BCOMF6","BCOMNG3"]},"ISIN":{"ISIN":"EZ286HJVY4Q2","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"UnderlyingAssetType":"Multi Commodity","BaseProduct":"MCEX","SubProduct":"","AdditionalSubProduct":"","FullName":"Commodities Multi_Exotic_Swap MCEX LRD 22000131","ShortName":"NA/Swap MCEX LRD 22000131","ClassificationType":"STQCXC"}}
{"Header":{"AssetClass":"Commodities","InstrumentType":"Option","UseCase":"Option","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"TND","ExpiryDate":"2209-10-18","OptionType":"OPTL","OptionExerciseStyle":"AMER","ValuationMethodorTrigger":"Asian","DeliveryType":"CASH","TransactionType":"OTHR","FinalPriceType":"IHSM","ReferenceRate":"NATURAL GAS-NGPL (NICOR, NIPSCO, PGLC CITYGATE), NBPL-NICOR-ICE/10X MONTHLY","BaseProduct":"OTHR","SubProduct":"","AdditionalSubProduct":""},"ISIN":{"ISIN":"EZ2TK5CWL9Y4","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","UnderlyingAssetType":"Other","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"FullName":"Commodities Option OTHR TND 22091018","ShortName":"NA/O OTHR OPTL TND 22091018","ClassificationType":"HTMHAC"}}
{"Header":{"AssetClass":"Commodities","InstrumentType":"Option","UseCase":"Multi_Exotic_Option","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"SOS","ExpiryDate":"2209-10-18","UnderlyingInstrumentIndex":["BCOMSI2","BCOMPR3T"],"OptionType":"CALL","OptionExerciseStyle":"AMER","ValuationMethodorTrigger":"Other Path Dependent","DeliveryType":"CASH","TransactionType":"ORIT","FinalPriceType":"BLTC"},"ISIN":{"ISIN":"EZ82L36B6225","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"UnderlyingAssetType":"Multi Commodity","BaseProduct":"MCEX","SubProduct":"","AdditionalSubProduct":"","FullName":"Commodities Multi_Exotic_Option MCEX SOS 22091018","ShortName":"NA/O MCEX Call SOS 22091018","ClassificationType":"HTQBPC"}}
文件的大小可以超过 1 GB。
为了有效地使用这些文件,我需要将 JSON 行转换为 JSON 数组:在文件开头添加“[”,在末尾追加“]”,并用逗号 (,) 分隔每一行。
使文件看起来像这样(理论上):
[
{
"Header": {
"AssetClass": "Commodities",
"InstrumentType": "Swap",
"UseCase": "Basis_Swap",
"Level": "InstRefDataReporting"
},
"Attributes": {
"NotionalCurrency": "EUR",
"ExpiryDate": "2017-08-31",
"ReturnorPayoutTrigger": "Contract for Difference (CFD)",
"DeliveryType": "CASH",
"BaseProduct": "AGRI",
"OtherBaseProduct": "AGRI",
"TransactionType": "FUTR",
"FinalPriceType": "ARGM",
"ReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"OtherReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"SubProduct": "GROS",
"AdditionalSubProduct": "FWHT",
"OtherSubProduct": "GROS",
"OtherAdditionalSubProduct": "FWHT"
},
"ISIN": {
"ISIN": "EZ68CZDRFYY7",
"Status": "New"
},
"TemplateVersion": 1,
"Derived": {
"CommodityDerivativeIndicator": "TRUE",
"UnderlyingAssetType": "Multi Commodity",
"IssuerorOperatoroftheTradingVenueIdentifier": "NA",
"PriceMultiplier": 1,
"FullName": "Commodities Swap Basis_Swap AGRI GROS FWHT AGRI GROS FWHT EUR 20170831",
"ShortName": "NA/Swap AGRI FWHT FWHT EUR 20170831",
"ClassificationType": "STQCXC"
}
},
{
"Header": {
"AssetClass": "Commodities",
"InstrumentType": "Swap",
"UseCase": "Basis_Swap",
"Level": "InstRefDataReporting"
},
"Attributes": {
"NotionalCurrency": "EUR",
"ExpiryDate": "2017-08-31",
"ReturnorPayoutTrigger": "Contract for Difference (CFD)",
"DeliveryType": "CASH",
"BaseProduct": "AGRI",
"OtherBaseProduct": "AGRI",
"TransactionType": "FUTR",
"FinalPriceType": "ARGM",
"ReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"OtherReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"SubProduct": "GROS",
"AdditionalSubProduct": "FWHT",
"OtherSubProduct": "GROS",
"OtherAdditionalSubProduct": "FWHT"
},
"ISIN": {
"ISIN": "EZ68CZDRFYY7",
"Status": "New"
},
"TemplateVersion": 1,
"Derived": {
"CommodityDerivativeIndicator": "TRUE",
"UnderlyingAssetType": "Multi Commodity",
"IssuerorOperatoroftheTradingVenueIdentifier": "NA",
"PriceMultiplier": 1,
"FullName": "Commodities Swap Basis_Swap AGRI GROS FWHT AGRI GROS FWHT EUR 20170831",
"ShortName": "NA/Swap AGRI FWHT FWHT EUR 20170831",
"ClassificationType": "STQCXC"
}
}
]
所以我找到了 jq,根据我的理解,我可以运行这个命令:
jq --slurp 'map(select(. >= 2))' Inputfile.json > OutputFile.json
这有效,但是当文件大于 200 MB 时,如果我使用 ISE,会得到 "system out out of memory" 错误;如果我使用标准 PowerShell 或 CMD,则需要很长时间(5 分钟以上)。
如果我从命令中去掉 --slurp,它可以工作,速度也更快,但结果如下所示:
[
{
"Header": {
"AssetClass": "Commodities",
"InstrumentType": "Swap",
"UseCase": "Basis_Swap",
"Level": "InstRefDataReporting"
},
"Attributes": {
"NotionalCurrency": "EUR",
"ExpiryDate": "2017-08-31",
"ReturnorPayoutTrigger": "Contract for Difference (CFD)",
"DeliveryType": "CASH",
"BaseProduct": "AGRI",
"OtherBaseProduct": "AGRI",
"TransactionType": "FUTR",
"FinalPriceType": "ARGM",
"ReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"OtherReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"SubProduct": "GROS",
"AdditionalSubProduct": "FWHT",
"OtherSubProduct": "GROS",
"OtherAdditionalSubProduct": "FWHT"
},
"ISIN": {
"ISIN": "EZ68CZDRFYY7",
"Status": "New"
},
"TemplateVersion": 1,
"Derived": {
"CommodityDerivativeIndicator": "TRUE",
"UnderlyingAssetType": "Multi Commodity",
"IssuerorOperatoroftheTradingVenueIdentifier": "NA",
"PriceMultiplier": 1,
"FullName": "Commodities Swap Basis_Swap AGRI GROS FWHT AGRI GROS FWHT EUR 20170831",
"ShortName": "NA/Swap AGRI FWHT FWHT EUR 20170831",
"ClassificationType": "STQCXC"
}
}]
[{
"Header": {
"AssetClass": "Commodities",
"InstrumentType": "Swap",
"UseCase": "Basis_Swap",
"Level": "InstRefDataReporting"
},
"Attributes": {
"NotionalCurrency": "EUR",
"ExpiryDate": "2017-08-31",
"ReturnorPayoutTrigger": "Contract for Difference (CFD)",
"DeliveryType": "CASH",
"BaseProduct": "AGRI",
"OtherBaseProduct": "AGRI",
"TransactionType": "FUTR",
"FinalPriceType": "ARGM",
"ReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"OtherReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"SubProduct": "GROS",
"AdditionalSubProduct": "FWHT",
"OtherSubProduct": "GROS",
"OtherAdditionalSubProduct": "FWHT"
},
"ISIN": {
"ISIN": "EZ68CZDRFYY7",
"Status": "New"
},
"TemplateVersion": 1,
"Derived": {
"CommodityDerivativeIndicator": "TRUE",
"UnderlyingAssetType": "Multi Commodity",
"IssuerorOperatoroftheTradingVenueIdentifier": "NA",
"PriceMultiplier": 1,
"FullName": "Commodities Swap Basis_Swap AGRI GROS FWHT AGRI GROS FWHT EUR 20170831",
"ShortName": "NA/Swap AGRI FWHT FWHT EUR 20170831",
"ClassificationType": "STQCXC"
}
}
]
它为每一行创建一个数组,但数组之间没有用逗号分隔,这不是我想要的。
那么,如何在不使用 slurp 的情况下处理包含多行 JSON 的大文件,并为输入文件生成一个以逗号分隔各元素的单一数组文件?
我读过有关 inputs 的资料,但不确定它是否与我要做的事情相关?
看起来 inputs 遇到了与 slurp 相同的问题。我不知道如何用 jq 在大文件上完成此操作,但 sed 可以做到:
sed '1s/^/[/; $!s/$/,/; $s/$/]/' in.json > out.json
输出:
[{"Header":{"AssetClass":"Commodities","InstrumentType":"Forward","UseCase":"Forward","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"EUR","ExpiryDate":"2018-01-01","ReturnorPayoutTrigger":"Contract for Difference (CFD)","DeliveryType":"CASH","BaseProduct":"AGRI","TransactionType":"FUTR","FinalPriceType":"ARGM","ReferenceRate":"10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN","SubProduct":"GROS","AdditionalSubProduct":"FWHT"},"ISIN":{"ISIN":"EZX27M86B860","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","UnderlyingAssetType":"Agriculture","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"FullName":"Commodities Forward AGRI GROS FWHT EUR 20180101","ShortName":"NA/Fwd AGRI FWHT EUR 20180101","ClassificationType":"JTAXCC"}},
{"Header":{"AssetClass":"Commodities","InstrumentType":"Swap","UseCase":"Basis_Swap","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"SOS","ExpiryDate":"2208-12-10","ReturnorPayoutTrigger":"Total Return","DeliveryType":"OPTL","TransactionType":"ORIT","FinalPriceType":"IHSM","ReferenceRate":"NATURAL GAS-MONTHLY INDEX S. TEXAS (TETCO)-GAS DAILY PRICE GUIDE","OtherReferenceRate":"NATURAL GAS-MONTHLY INDEX W. LOUISIANA (TETCO)-GAS DAILY PRICE GUIDE","BaseProduct":"OTHR","OtherBaseProduct":"OTHR","SubProduct":"","AdditionalSubProduct":"","OtherSubProduct":"","OtherAdditionalSubProduct":""},"ISIN":{"ISIN":"EZBBH1XR9GV6","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","UnderlyingAssetType":"Multi Commodity","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"FullName":"Commodities Swap Basis_Swap OTHR OTHR SOS 22081210","ShortName":"NA/Swap OTHR SOS 22081210","ClassificationType":"STQTXE"}},
{"Header":{"AssetClass":"Commodities","InstrumentType":"Swap","UseCase":"Multi_Exotic_Swap","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"LRD","ExpiryDate":"2200-01-31","ReturnorPayoutTrigger":"Contract for Difference (CFD)","DeliveryType":"CASH","TransactionType":"TAPO","FinalPriceType":"EXOF","UnderlyingInstrumentIndex":["BCOMF6","BCOMNG3"]},"ISIN":{"ISIN":"EZ286HJVY4Q2","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"UnderlyingAssetType":"Multi Commodity","BaseProduct":"MCEX","SubProduct":"","AdditionalSubProduct":"","FullName":"Commodities Multi_Exotic_Swap MCEX LRD 22000131","ShortName":"NA/Swap MCEX LRD 22000131","ClassificationType":"STQCXC"}},
{"Header":{"AssetClass":"Commodities","InstrumentType":"Option","UseCase":"Option","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"TND","ExpiryDate":"2209-10-18","OptionType":"OPTL","OptionExerciseStyle":"AMER","ValuationMethodorTrigger":"Asian","DeliveryType":"CASH","TransactionType":"OTHR","FinalPriceType":"IHSM","ReferenceRate":"NATURAL GAS-NGPL (NICOR, NIPSCO, PGLC CITYGATE), NBPL-NICOR-ICE/10X MONTHLY","BaseProduct":"OTHR","SubProduct":"","AdditionalSubProduct":""},"ISIN":{"ISIN":"EZ2TK5CWL9Y4","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","UnderlyingAssetType":"Other","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"FullName":"Commodities Option OTHR TND 22091018","ShortName":"NA/O OTHR OPTL TND 22091018","ClassificationType":"HTMHAC"}},
{"Header":{"AssetClass":"Commodities","InstrumentType":"Option","UseCase":"Multi_Exotic_Option","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"SOS","ExpiryDate":"2209-10-18","UnderlyingInstrumentIndex":["BCOMSI2","BCOMPR3T"],"OptionType":"CALL","OptionExerciseStyle":"AMER","ValuationMethodorTrigger":"Other Path Dependent","DeliveryType":"CASH","TransactionType":"ORIT","FinalPriceType":"BLTC"},"ISIN":{"ISIN":"EZ82L36B6225","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"UnderlyingAssetType":"Multi Commodity","BaseProduct":"MCEX","SubProduct":"","AdditionalSubProduct":"","FullName":"Commodities Multi_Exotic_Option MCEX SOS 22091018","ShortName":"NA/O MCEX Call SOS 22091018","ClassificationType":"HTQBPC"}}]
说明
sed 脚本由三个独立的替换命令组成。下面把它们分行列出:
1 s/^/[/ # Insert a left bracket at the beginning of the first line
$! s/$/,/ # On all but the last line append a comma
$ s/$/]/ # Append a right bracket to the last line
也许 awk 看起来更直观:
awk 'BEGIN{print "["}
length(last)>0 {print last ","} {last=$0}
END {print last, "]"}'
郑重声明,下面是一个借助支持 input 的 jq 版本实现的无 slurp 解决方案:
jq -nr '"[", try (input|tojson, repeat(",\n\(input|tojson)")), "]"'
也许,我来晚了,但这就是你要找的!
jq -s '.' in.json > out.json
我使用了一个 node 单行命令(one-liner):
$ cat input.ndjson | node -e 'const rl = readline.createInterface({ input: process.stdin }); !async function () { let idx = 0; for await (const line of rl) { process.stdout.write((++idx === 1 ? "[" : "\n,") + JSON.stringify(JSON.parse(line))); } process.stdout.write("]"); }()' | tee output.json | jq 'length'
16814
解释:
$ cat input.ndjson | # pipe the ndjson input, can be a file or any stream
node -e '
const rl = readline.createInterface({ input: process.stdin });
!async function () { // to be able to use await, this function has to be async, and then `!` is to be able call it right away;
let idx = 0;
for await (const line of rl) {
process.stdout.write((++idx === 1 ? "[" : "\n,") + JSON.stringify(JSON.parse(line)));
}
process.stdout.write("]");
}()
' | tee output.json # save the output json file
| jq 'length' # call jq to calculate a length, also validate it, to make sure it's a valid single json file, this optional
Node 的 readline 是按行读取流的好方法,它是 asyncIterable,可以使用 (for await ... of) 获取每一行,然后调用 JSON.parse 以验证每一行都是有效的 JSON,再用 JSON.stringify 将每一行转换回压缩(minified)的 JSON;
当然你也可以改用一个本地的 const arr = [];,把每行的 obj 推入其中,最后只调用一次 JSON.stringify,得到完全压缩的 JSON,
但我喜欢这种近似压缩的格式:每一行的 obj 都被压缩了,但外层数组仍然逐行排列,这样我就可以轻松地通过 wc -l 统计它的总行数
[{...minified line1obj with no spaces...}
,{...minified line2obj}
,{...minified line3obj}
,...
,{...minified lineNobj}]
首先,我是 jq 的新手,刚接触 1 天;我也是 JSON 的新手。我是做 SQL 出身的,所以学得很快,但这个问题我解决不了……所以请多包涵。
我在 Windows 上运行,在 PowerShell 中使用 jq v1.5。
我下载了多个 JSON 文件,它们看起来像这样:
{"Header":{"AssetClass":"Commodities","InstrumentType":"Forward","UseCase":"Forward","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"EUR","ExpiryDate":"2018-01-01","ReturnorPayoutTrigger":"Contract for Difference (CFD)","DeliveryType":"CASH","BaseProduct":"AGRI","TransactionType":"FUTR","FinalPriceType":"ARGM","ReferenceRate":"10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN","SubProduct":"GROS","AdditionalSubProduct":"FWHT"},"ISIN":{"ISIN":"EZX27M86B860","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","UnderlyingAssetType":"Agriculture","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"FullName":"Commodities Forward AGRI GROS FWHT EUR 20180101","ShortName":"NA/Fwd AGRI FWHT EUR 20180101","ClassificationType":"JTAXCC"}}
{"Header":{"AssetClass":"Commodities","InstrumentType":"Swap","UseCase":"Basis_Swap","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"SOS","ExpiryDate":"2208-12-10","ReturnorPayoutTrigger":"Total Return","DeliveryType":"OPTL","TransactionType":"ORIT","FinalPriceType":"IHSM","ReferenceRate":"NATURAL GAS-MONTHLY INDEX S. TEXAS (TETCO)-GAS DAILY PRICE GUIDE","OtherReferenceRate":"NATURAL GAS-MONTHLY INDEX W. LOUISIANA (TETCO)-GAS DAILY PRICE GUIDE","BaseProduct":"OTHR","OtherBaseProduct":"OTHR","SubProduct":"","AdditionalSubProduct":"","OtherSubProduct":"","OtherAdditionalSubProduct":""},"ISIN":{"ISIN":"EZBBH1XR9GV6","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","UnderlyingAssetType":"Multi Commodity","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"FullName":"Commodities Swap Basis_Swap OTHR OTHR SOS 22081210","ShortName":"NA/Swap OTHR SOS 22081210","ClassificationType":"STQTXE"}}
{"Header":{"AssetClass":"Commodities","InstrumentType":"Swap","UseCase":"Multi_Exotic_Swap","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"LRD","ExpiryDate":"2200-01-31","ReturnorPayoutTrigger":"Contract for Difference (CFD)","DeliveryType":"CASH","TransactionType":"TAPO","FinalPriceType":"EXOF","UnderlyingInstrumentIndex":["BCOMF6","BCOMNG3"]},"ISIN":{"ISIN":"EZ286HJVY4Q2","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"UnderlyingAssetType":"Multi Commodity","BaseProduct":"MCEX","SubProduct":"","AdditionalSubProduct":"","FullName":"Commodities Multi_Exotic_Swap MCEX LRD 22000131","ShortName":"NA/Swap MCEX LRD 22000131","ClassificationType":"STQCXC"}}
{"Header":{"AssetClass":"Commodities","InstrumentType":"Option","UseCase":"Option","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"TND","ExpiryDate":"2209-10-18","OptionType":"OPTL","OptionExerciseStyle":"AMER","ValuationMethodorTrigger":"Asian","DeliveryType":"CASH","TransactionType":"OTHR","FinalPriceType":"IHSM","ReferenceRate":"NATURAL GAS-NGPL (NICOR, NIPSCO, PGLC CITYGATE), NBPL-NICOR-ICE/10X MONTHLY","BaseProduct":"OTHR","SubProduct":"","AdditionalSubProduct":""},"ISIN":{"ISIN":"EZ2TK5CWL9Y4","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","UnderlyingAssetType":"Other","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"FullName":"Commodities Option OTHR TND 22091018","ShortName":"NA/O OTHR OPTL TND 22091018","ClassificationType":"HTMHAC"}}
{"Header":{"AssetClass":"Commodities","InstrumentType":"Option","UseCase":"Multi_Exotic_Option","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"SOS","ExpiryDate":"2209-10-18","UnderlyingInstrumentIndex":["BCOMSI2","BCOMPR3T"],"OptionType":"CALL","OptionExerciseStyle":"AMER","ValuationMethodorTrigger":"Other Path Dependent","DeliveryType":"CASH","TransactionType":"ORIT","FinalPriceType":"BLTC"},"ISIN":{"ISIN":"EZ82L36B6225","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"UnderlyingAssetType":"Multi Commodity","BaseProduct":"MCEX","SubProduct":"","AdditionalSubProduct":"","FullName":"Commodities Multi_Exotic_Option MCEX SOS 22091018","ShortName":"NA/O MCEX Call SOS 22091018","ClassificationType":"HTQBPC"}}
文件的大小可以超过 1 GB。
为了有效地使用这些文件,我需要将 JSON 行转换为 JSON 数组:在文件开头添加“[”,在末尾追加“]”,并用逗号 (,) 分隔每一行。
使文件看起来像这样(理论上):
[
{
"Header": {
"AssetClass": "Commodities",
"InstrumentType": "Swap",
"UseCase": "Basis_Swap",
"Level": "InstRefDataReporting"
},
"Attributes": {
"NotionalCurrency": "EUR",
"ExpiryDate": "2017-08-31",
"ReturnorPayoutTrigger": "Contract for Difference (CFD)",
"DeliveryType": "CASH",
"BaseProduct": "AGRI",
"OtherBaseProduct": "AGRI",
"TransactionType": "FUTR",
"FinalPriceType": "ARGM",
"ReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"OtherReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"SubProduct": "GROS",
"AdditionalSubProduct": "FWHT",
"OtherSubProduct": "GROS",
"OtherAdditionalSubProduct": "FWHT"
},
"ISIN": {
"ISIN": "EZ68CZDRFYY7",
"Status": "New"
},
"TemplateVersion": 1,
"Derived": {
"CommodityDerivativeIndicator": "TRUE",
"UnderlyingAssetType": "Multi Commodity",
"IssuerorOperatoroftheTradingVenueIdentifier": "NA",
"PriceMultiplier": 1,
"FullName": "Commodities Swap Basis_Swap AGRI GROS FWHT AGRI GROS FWHT EUR 20170831",
"ShortName": "NA/Swap AGRI FWHT FWHT EUR 20170831",
"ClassificationType": "STQCXC"
}
},
{
"Header": {
"AssetClass": "Commodities",
"InstrumentType": "Swap",
"UseCase": "Basis_Swap",
"Level": "InstRefDataReporting"
},
"Attributes": {
"NotionalCurrency": "EUR",
"ExpiryDate": "2017-08-31",
"ReturnorPayoutTrigger": "Contract for Difference (CFD)",
"DeliveryType": "CASH",
"BaseProduct": "AGRI",
"OtherBaseProduct": "AGRI",
"TransactionType": "FUTR",
"FinalPriceType": "ARGM",
"ReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"OtherReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"SubProduct": "GROS",
"AdditionalSubProduct": "FWHT",
"OtherSubProduct": "GROS",
"OtherAdditionalSubProduct": "FWHT"
},
"ISIN": {
"ISIN": "EZ68CZDRFYY7",
"Status": "New"
},
"TemplateVersion": 1,
"Derived": {
"CommodityDerivativeIndicator": "TRUE",
"UnderlyingAssetType": "Multi Commodity",
"IssuerorOperatoroftheTradingVenueIdentifier": "NA",
"PriceMultiplier": 1,
"FullName": "Commodities Swap Basis_Swap AGRI GROS FWHT AGRI GROS FWHT EUR 20170831",
"ShortName": "NA/Swap AGRI FWHT FWHT EUR 20170831",
"ClassificationType": "STQCXC"
}
}
]
所以我找到了 jq,根据我的理解,我可以运行这个命令:
jq --slurp 'map(select(. >= 2))' Inputfile.json > OutputFile.json
这有效,但是当文件大于 200 MB 时,如果我使用 ISE,会得到 "system out out of memory" 错误;如果我使用标准 PowerShell 或 CMD,则需要很长时间(5 分钟以上)。
如果我从命令中去掉 --slurp,它可以工作,速度也更快,但结果如下所示:
[
{
"Header": {
"AssetClass": "Commodities",
"InstrumentType": "Swap",
"UseCase": "Basis_Swap",
"Level": "InstRefDataReporting"
},
"Attributes": {
"NotionalCurrency": "EUR",
"ExpiryDate": "2017-08-31",
"ReturnorPayoutTrigger": "Contract for Difference (CFD)",
"DeliveryType": "CASH",
"BaseProduct": "AGRI",
"OtherBaseProduct": "AGRI",
"TransactionType": "FUTR",
"FinalPriceType": "ARGM",
"ReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"OtherReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"SubProduct": "GROS",
"AdditionalSubProduct": "FWHT",
"OtherSubProduct": "GROS",
"OtherAdditionalSubProduct": "FWHT"
},
"ISIN": {
"ISIN": "EZ68CZDRFYY7",
"Status": "New"
},
"TemplateVersion": 1,
"Derived": {
"CommodityDerivativeIndicator": "TRUE",
"UnderlyingAssetType": "Multi Commodity",
"IssuerorOperatoroftheTradingVenueIdentifier": "NA",
"PriceMultiplier": 1,
"FullName": "Commodities Swap Basis_Swap AGRI GROS FWHT AGRI GROS FWHT EUR 20170831",
"ShortName": "NA/Swap AGRI FWHT FWHT EUR 20170831",
"ClassificationType": "STQCXC"
}
}]
[{
"Header": {
"AssetClass": "Commodities",
"InstrumentType": "Swap",
"UseCase": "Basis_Swap",
"Level": "InstRefDataReporting"
},
"Attributes": {
"NotionalCurrency": "EUR",
"ExpiryDate": "2017-08-31",
"ReturnorPayoutTrigger": "Contract for Difference (CFD)",
"DeliveryType": "CASH",
"BaseProduct": "AGRI",
"OtherBaseProduct": "AGRI",
"TransactionType": "FUTR",
"FinalPriceType": "ARGM",
"ReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"OtherReferenceRate": "10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN",
"SubProduct": "GROS",
"AdditionalSubProduct": "FWHT",
"OtherSubProduct": "GROS",
"OtherAdditionalSubProduct": "FWHT"
},
"ISIN": {
"ISIN": "EZ68CZDRFYY7",
"Status": "New"
},
"TemplateVersion": 1,
"Derived": {
"CommodityDerivativeIndicator": "TRUE",
"UnderlyingAssetType": "Multi Commodity",
"IssuerorOperatoroftheTradingVenueIdentifier": "NA",
"PriceMultiplier": 1,
"FullName": "Commodities Swap Basis_Swap AGRI GROS FWHT AGRI GROS FWHT EUR 20170831",
"ShortName": "NA/Swap AGRI FWHT FWHT EUR 20170831",
"ClassificationType": "STQCXC"
}
}
]
它为每一行创建一个数组,但数组之间没有用逗号分隔,这不是我想要的。
那么,如何在不使用 slurp 的情况下处理包含多行 JSON 的大文件,并为输入文件生成一个以逗号分隔各元素的单一数组文件?
我读过有关 inputs 的资料,但不确定它是否与我要做的事情相关?
看起来 inputs 遇到了与 slurp 相同的问题。我不知道如何用 jq 在大文件上完成此操作,但 sed 可以做到:
sed '1s/^/[/; $!s/$/,/; $s/$/]/' in.json > out.json
输出:
[{"Header":{"AssetClass":"Commodities","InstrumentType":"Forward","UseCase":"Forward","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"EUR","ExpiryDate":"2018-01-01","ReturnorPayoutTrigger":"Contract for Difference (CFD)","DeliveryType":"CASH","BaseProduct":"AGRI","TransactionType":"FUTR","FinalPriceType":"ARGM","ReferenceRate":"10PPM ULTRA LOW SULPHUR DIESEL-CARGOES CIF NWE/BASIS ARA-PLATTS EUROPEAN","SubProduct":"GROS","AdditionalSubProduct":"FWHT"},"ISIN":{"ISIN":"EZX27M86B860","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","UnderlyingAssetType":"Agriculture","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"FullName":"Commodities Forward AGRI GROS FWHT EUR 20180101","ShortName":"NA/Fwd AGRI FWHT EUR 20180101","ClassificationType":"JTAXCC"}},
{"Header":{"AssetClass":"Commodities","InstrumentType":"Swap","UseCase":"Basis_Swap","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"SOS","ExpiryDate":"2208-12-10","ReturnorPayoutTrigger":"Total Return","DeliveryType":"OPTL","TransactionType":"ORIT","FinalPriceType":"IHSM","ReferenceRate":"NATURAL GAS-MONTHLY INDEX S. TEXAS (TETCO)-GAS DAILY PRICE GUIDE","OtherReferenceRate":"NATURAL GAS-MONTHLY INDEX W. LOUISIANA (TETCO)-GAS DAILY PRICE GUIDE","BaseProduct":"OTHR","OtherBaseProduct":"OTHR","SubProduct":"","AdditionalSubProduct":"","OtherSubProduct":"","OtherAdditionalSubProduct":""},"ISIN":{"ISIN":"EZBBH1XR9GV6","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","UnderlyingAssetType":"Multi Commodity","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"FullName":"Commodities Swap Basis_Swap OTHR OTHR SOS 22081210","ShortName":"NA/Swap OTHR SOS 22081210","ClassificationType":"STQTXE"}},
{"Header":{"AssetClass":"Commodities","InstrumentType":"Swap","UseCase":"Multi_Exotic_Swap","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"LRD","ExpiryDate":"2200-01-31","ReturnorPayoutTrigger":"Contract for Difference (CFD)","DeliveryType":"CASH","TransactionType":"TAPO","FinalPriceType":"EXOF","UnderlyingInstrumentIndex":["BCOMF6","BCOMNG3"]},"ISIN":{"ISIN":"EZ286HJVY4Q2","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"UnderlyingAssetType":"Multi Commodity","BaseProduct":"MCEX","SubProduct":"","AdditionalSubProduct":"","FullName":"Commodities Multi_Exotic_Swap MCEX LRD 22000131","ShortName":"NA/Swap MCEX LRD 22000131","ClassificationType":"STQCXC"}},
{"Header":{"AssetClass":"Commodities","InstrumentType":"Option","UseCase":"Option","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"TND","ExpiryDate":"2209-10-18","OptionType":"OPTL","OptionExerciseStyle":"AMER","ValuationMethodorTrigger":"Asian","DeliveryType":"CASH","TransactionType":"OTHR","FinalPriceType":"IHSM","ReferenceRate":"NATURAL GAS-NGPL (NICOR, NIPSCO, PGLC CITYGATE), NBPL-NICOR-ICE/10X MONTHLY","BaseProduct":"OTHR","SubProduct":"","AdditionalSubProduct":""},"ISIN":{"ISIN":"EZ2TK5CWL9Y4","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","UnderlyingAssetType":"Other","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"FullName":"Commodities Option OTHR TND 22091018","ShortName":"NA/O OTHR OPTL TND 22091018","ClassificationType":"HTMHAC"}},
{"Header":{"AssetClass":"Commodities","InstrumentType":"Option","UseCase":"Multi_Exotic_Option","Level":"InstRefDataReporting"},"Attributes":{"NotionalCurrency":"SOS","ExpiryDate":"2209-10-18","UnderlyingInstrumentIndex":["BCOMSI2","BCOMPR3T"],"OptionType":"CALL","OptionExerciseStyle":"AMER","ValuationMethodorTrigger":"Other Path Dependent","DeliveryType":"CASH","TransactionType":"ORIT","FinalPriceType":"BLTC"},"ISIN":{"ISIN":"EZ82L36B6225","Status":"New"},"TemplateVersion":1,"Derived":{"CommodityDerivativeIndicator":"TRUE","IssuerorOperatoroftheTradingVenueIdentifier":"NA","PriceMultiplier":1,"UnderlyingAssetType":"Multi Commodity","BaseProduct":"MCEX","SubProduct":"","AdditionalSubProduct":"","FullName":"Commodities Multi_Exotic_Option MCEX SOS 22091018","ShortName":"NA/O MCEX Call SOS 22091018","ClassificationType":"HTQBPC"}}]
说明
sed 脚本由三个独立的替换命令组成。下面把它们分行列出:
1 s/^/[/ # Insert a left bracket at the beginning of the first line
$! s/$/,/ # On all but the last line append a comma
$ s/$/]/ # Append a right bracket to the last line
也许 awk 看起来更直观:
awk 'BEGIN{print "["}
length(last)>0 {print last ","} {last=$0}
END {print last, "]"}'
郑重声明,下面是一个借助支持 input 的 jq 版本实现的无 slurp 解决方案:
jq -nr '"[", try (input|tojson, repeat(",\n\(input|tojson)")), "]"'
也许,我来晚了,但这就是你要找的!
jq -s '.' in.json > out.json
我使用了一个 node 单行命令(one-liner):
$ cat input.ndjson | node -e 'const rl = readline.createInterface({ input: process.stdin }); !async function () { let idx = 0; for await (const line of rl) { process.stdout.write((++idx === 1 ? "[" : "\n,") + JSON.stringify(JSON.parse(line))); } process.stdout.write("]"); }()' | tee output.json | jq 'length'
16814
解释:
$ cat input.ndjson | # pipe the ndjson input, can be a file or any stream
node -e '
const rl = readline.createInterface({ input: process.stdin });
!async function () { // to be able to use await, this function has to be async, and then `!` is to be able call it right away;
let idx = 0;
for await (const line of rl) {
process.stdout.write((++idx === 1 ? "[" : "\n,") + JSON.stringify(JSON.parse(line)));
}
process.stdout.write("]");
}()
' | tee output.json # save the output json file
| jq 'length' # call jq to calculate a length, also validate it, to make sure it's a valid single json file, this optional
Node 的 readline 是按行读取流的好方法,它是 asyncIterable,可以使用 (for await ... of) 获取每一行,然后调用 JSON.parse 以验证每一行都是有效的 JSON,再用 JSON.stringify 将每一行转换回压缩(minified)的 JSON;
当然你也可以改用一个本地的 const arr = [];,把每行的 obj 推入其中,最后只调用一次 JSON.stringify,得到完全压缩的 JSON,
但我喜欢这种近似压缩的格式:每一行的 obj 都被压缩了,但外层数组仍然逐行排列,这样我就可以轻松地通过 wc -l 统计它的总行数
[{...minified line1obj with no spaces...}
,{...minified line2obj}
,{...minified line3obj}
,...
,{...minified lineNobj}]