使用 OrientDB ETL 将简单的 csv 文件导入图的最简单方法
Easiest way to import a simple csv file to a graph with OrientDB ETL
我想将一个非常简单的 csv 有向图文件导入 OrientDB。具体来说，该文件是来自 SNAP 集合 https://snap.stanford.edu/data/roadNet-PA.html 的 roadNet-PA 数据集。该文件的前几行如下：
# Directed graph (each unordered pair of nodes is saved once)
# Pennsylvania road network
# Nodes: 1088092 Edges: 3083796
# FromNodeId ToNodeId
0 1
0 6309
0 6353
1 0
6353 0
6353 6354
只有一种类型的顶点（道路交叉口），并且边不携带任何信息（我想 OrientDB 的轻量级边是最好的选择）。另请注意，每行中的两个顶点以制表符分隔。
我尝试创建一个简单的 etl 来导入文件,但没有成功。这是 etl:
{
  "config": {
    "log": "debug"
  },
  "source": {
    "file": {
      "path": "/tmp/roadNet-PA.csv"
    }
  },
  "extractor": {
    "row": {}
  },
  "transformers": [
    {
      "csv": {
        "separator": " ",
        "skipFrom": 1,
        "skipTo": 4
      }
    },
    {
      "vertex": {
        "class": "Intersection"
      }
    },
    {
      "edge": {
        "class": "Road"
      }
    }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "remote:localhost/roads",
      "dbType": "graph",
      "classes": [
        {
          "name": "Intersection",
          "extends": "V"
        },
        {
          "name": "Road",
          "extends": "E"
        }
      ],
      "indexes": [
        {
          "class": "Intersection",
          "fields": ["id:integer"],
          "type": "UNIQUE"
        }
      ]
    }
  }
}
这个 etl 可以运行，但它没有像我预期的那样导入文件。我猜问题出在转换器（transformers）上。我的想法是逐行读取 csv，并为每一行创建一条边来连接两个顶点，但我不确定如何在 etl 文件中表达这一点。有什么想法吗？
试试这个:
{
  "config": {
    "log": "debug"
  },
  "source": {
    "file": { "path": "/tmp/roadNet-PA.csv" }
  },
  "extractor": { "row": {} },
  "transformers": [
    {
      "csv": {
        "separator": "\t",
        "skipFrom": 1,
        "skipTo": 4,
        "columnsOnFirstLine": false,
        "columns": ["id", "to"]
      }
    },
    { "vertex": { "class": "Intersection" } },
    { "merge": { "joinFieldName": "id", "lookup": "Intersection.id" } },
    {
      "edge": {
        "class": "Road",
        "joinFieldName": "to",
        "lookup": "Intersection.id",
        "unresolvedLinkAction": "CREATE"
      }
    }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "remote:localhost/roads",
      "dbType": "graph",
      "wal": false,
      "batchCommit": 1000,
      "tx": true,
      "txUseLog": false,
      "useLightweightEdges": true,
      "classes": [
        { "name": "Intersection", "extends": "V" },
        { "name": "Road", "extends": "E" }
      ],
      "indexes": [
        { "class": "Intersection", "fields": ["id:integer"], "type": "UNIQUE" }
      ]
    }
  }
}
为了加快加载速度，我建议您关闭服务器，并使用 "plocal:" 而不是 "remote:" 来运行 ETL 导入。例如，将现有的 dbURL 替换为：
"dbURL": "plocal:/orientdb/databases/roads",
终于成功了。我按照 Luca 的建议，把 merge 转换器移到了 vertex 转换器之前。我还将 'id' 字段更名为 'from'，以避免错误 "property key is reserved for all elements id"。以下是配置片段：
{
  "config": {
    "log": "debug"
  },
  "source": {
    "file": { "path": "/tmp/roads.csv" }
  },
  "extractor": { "row": {} },
  "transformers": [
    {
      "csv": {
        "separator": "\t",
        "columnsOnFirstLine": false,
        "columns": ["from", "to"]
      }
    },
    { "merge": { "joinFieldName": "from", "lookup": "Intersection.from" } },
    { "vertex": { "class": "Intersection" } },
    {
      "edge": {
        "class": "Road",
        "joinFieldName": "to",
        "lookup": "Intersection.from",
        "unresolvedLinkAction": "CREATE"
      }
    }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "remote:localhost/roads",
      "dbType": "graph",
      "wal": false,
      "batchCommit": 1000,
      "tx": true,
      "txUseLog": false,
      "useLightweightEdges": true,
      "classes": [
        { "name": "Intersection", "extends": "V" },
        { "name": "Road", "extends": "E" }
      ],
      "indexes": [
        { "class": "Intersection", "fields": ["from:integer"], "type": "UNIQUE" }
      ]
    }
  }
}
我想将一个非常简单的 csv 有向图文件导入 OrientDB。具体来说，该文件是来自 SNAP 集合 https://snap.stanford.edu/data/roadNet-PA.html 的 roadNet-PA 数据集。该文件的前几行如下：
# Directed graph (each unordered pair of nodes is saved once)
# Pennsylvania road network
# Nodes: 1088092 Edges: 3083796
# FromNodeId ToNodeId
0 1
0 6309
0 6353
1 0
6353 0
6353 6354
只有一种类型的顶点（道路交叉口），并且边不携带任何信息（我想 OrientDB 的轻量级边是最好的选择）。另请注意，每行中的两个顶点以制表符分隔。
我尝试创建一个简单的 etl 来导入文件,但没有成功。这是 etl:
{
  "config": {
    "log": "debug"
  },
  "source": {
    "file": {
      "path": "/tmp/roadNet-PA.csv"
    }
  },
  "extractor": {
    "row": {}
  },
  "transformers": [
    {
      "csv": {
        "separator": " ",
        "skipFrom": 1,
        "skipTo": 4
      }
    },
    {
      "vertex": {
        "class": "Intersection"
      }
    },
    {
      "edge": {
        "class": "Road"
      }
    }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "remote:localhost/roads",
      "dbType": "graph",
      "classes": [
        {
          "name": "Intersection",
          "extends": "V"
        },
        {
          "name": "Road",
          "extends": "E"
        }
      ],
      "indexes": [
        {
          "class": "Intersection",
          "fields": ["id:integer"],
          "type": "UNIQUE"
        }
      ]
    }
  }
}
这个 etl 可以运行，但它没有像我预期的那样导入文件。我猜问题出在转换器（transformers）上。我的想法是逐行读取 csv，并为每一行创建一条边来连接两个顶点，但我不确定如何在 etl 文件中表达这一点。有什么想法吗？
试试这个:
{
  "config": {
    "log": "debug"
  },
  "source": {
    "file": { "path": "/tmp/roadNet-PA.csv" }
  },
  "extractor": { "row": {} },
  "transformers": [
    {
      "csv": {
        "separator": "\t",
        "skipFrom": 1,
        "skipTo": 4,
        "columnsOnFirstLine": false,
        "columns": ["id", "to"]
      }
    },
    { "vertex": { "class": "Intersection" } },
    { "merge": { "joinFieldName": "id", "lookup": "Intersection.id" } },
    {
      "edge": {
        "class": "Road",
        "joinFieldName": "to",
        "lookup": "Intersection.id",
        "unresolvedLinkAction": "CREATE"
      }
    }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "remote:localhost/roads",
      "dbType": "graph",
      "wal": false,
      "batchCommit": 1000,
      "tx": true,
      "txUseLog": false,
      "useLightweightEdges": true,
      "classes": [
        { "name": "Intersection", "extends": "V" },
        { "name": "Road", "extends": "E" }
      ],
      "indexes": [
        { "class": "Intersection", "fields": ["id:integer"], "type": "UNIQUE" }
      ]
    }
  }
}
为了加快加载速度，我建议您关闭服务器，并使用 "plocal:" 而不是 "remote:" 来运行 ETL 导入。例如，将现有的 dbURL 替换为：
"dbURL": "plocal:/orientdb/databases/roads",
终于成功了。我按照 Luca 的建议，把 merge 转换器移到了 vertex 转换器之前。我还将 'id' 字段更名为 'from'，以避免错误 "property key is reserved for all elements id"。以下是配置片段：
{
  "config": {
    "log": "debug"
  },
  "source": {
    "file": { "path": "/tmp/roads.csv" }
  },
  "extractor": { "row": {} },
  "transformers": [
    {
      "csv": {
        "separator": "\t",
        "columnsOnFirstLine": false,
        "columns": ["from", "to"]
      }
    },
    { "merge": { "joinFieldName": "from", "lookup": "Intersection.from" } },
    { "vertex": { "class": "Intersection" } },
    {
      "edge": {
        "class": "Road",
        "joinFieldName": "to",
        "lookup": "Intersection.from",
        "unresolvedLinkAction": "CREATE"
      }
    }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "remote:localhost/roads",
      "dbType": "graph",
      "wal": false,
      "batchCommit": 1000,
      "tx": true,
      "txUseLog": false,
      "useLightweightEdges": true,
      "classes": [
        { "name": "Intersection", "extends": "V" },
        { "name": "Road", "extends": "E" }
      ],
      "indexes": [
        { "class": "Intersection", "fields": ["from:integer"], "type": "UNIQUE" }
      ]
    }
  }
}