Python : 从 CSV 构建分层 JSON

Python : build hierarchical JSON from CSV

我想从 CSV 构建一个 JSON 文件来表示我的数据的层次关系。关系是 parents 和 children:一个 child 可以有一个或多个 parents,一个 parent 也可以有一个或多个 children。一个 child 自己也可以有 children,因此可能存在多个层级。我认为 D3 中的 dendrogram 可能是一个很好的可视化方式。

我的 CSV 源文件包含数千行,如下所示:

parent         | children       | date
---------------------------------------------
830010000C0419 | 830010000C1205 | 1993/09/15
830010000C0947 | 830010000C1205 | 1993/09/15
830010000C0948 | 830010000C1205 | 1993/09/15
830010000B0854 | 830010000B1196 | 1994/03/11
830010000B0854 | 830010000B1197 | 1994/03/11
830010000B0721 | 830010000B1343 | 1988/12/05
830010000B1343 | 830010000B1344 | 1988/12/05
830010000B0721 | 830010000B1345 | 1988/12/05
830010000B1345 | 830010000B1344 | 1986/12/05
...

我想生成具有以下结构的 JSON 文件:

// Target structure: one synthetic root whose "children" are the top-level
// parents. Every node carries its own "name", the "parent" name it hangs
// under and, when it has any, a nested "children" array. A child with several
// parents (830010000C1205) is repeated once under each of them.
// NOTE(review): the root is named "Root" while its children reference
// "Top Level" as their parent — confirm which label is intended.
var treeData = [
  {
    "name": "Root",
    "parent": "null",
    "children": [
      {
        "name": "830010000B0854",
        "parent": "Top Level",
        "children": [
          {
            "name": "830010000B1196",
            "parent": "830010000B0854"
          },
          {
            "name": "830010000B1197",
            "parent": "830010000B0854"
          }
        ]
      },
      {
        "name": "830010000B0721",
        "parent": "Top Level",
        "children": [
          {
            "name": "830010000B1343",
            "parent": "830010000B0721",
            "children": [
                {
                "name": "830010000B1344",
                "parent": "830010000B1343"
                }
            ]
          }
        ]
      },
      {
        "name": "830010000C0419",
        "parent": "Top Level",
        "children": [
          {
            "name": "830010000C1205",
            "parent": "830010000C0419"
          }
        ]
      },
      {
        "name": "830010000C0947",
        "parent": "Top Level",
        "children": [
          {
            "name": "830010000C1205",
            "parent": "830010000C0947"
          }
        ]
      },
      {
        "name": "830010000C0948",
        "parent": "Top Level",
        "children": [
          {
            "name": "830010000C1205",
            "parent": "830010000C0948"
          }
        ]
      }
    ]
  }
];

请注意,在这个例子中,我无法建立一个 child 有很多 parent 的关系,也许需要更复杂的树状图。

如何使用 Python 构建这种结构?

我首先想到的是下面这个。请注意,这还没有完成,您需要添加某种形式的 recursion/iteration 以深入到子节点,但我认为逻辑应该非常相似。

# Keep the parent names as a set of values: `in` on a pandas Series tests the
# index labels rather than the values, so `name in df.parent` would not match
# node names. A set also makes each membership test O(1).
all_parents = set(df.parent)

def get_children(parent_name):
    """Return one ``{"name", "parent"}`` dict per row whose parent is ``parent_name``.

    Reads the module-level DataFrame ``df`` and selects the ``children`` column
    of the rows where ``df.parent`` equals ``parent_name``. The result is a flat
    list: the children's own children are not expanded here.
    """
    matching_rows = df[df.parent == parent_name]
    child_nodes = []
    for child_name in matching_rows.children:
        child_nodes.append({"name": child_name, "parent": parent_name})
    return child_nodes

def get_node_representation(parent_name):
    """Build the dict for one node: its name, a parent label and its direct children.

    The parent label is "Top Level" when ``parent_name in all_parents`` holds and
    "null" otherwise; the "null" branch is a placeholder where custom logic is
    expected to be filled in. Children come from ``get_children``.
    """
    # Placeholder rule; replace the fallback with your own logic.
    parent = "Top Level" if parent_name in all_parents else "null"
    return {
        "name": parent_name,
        "parent": parent,
        "children": get_children(parent_name),
    }

# Build one representation per distinct parent. `df.parent` repeats a name once
# per CSV row, so iterate over the unique values to avoid duplicate nodes.
# This only expands nodes that appear in the parent column; a child that is
# itself a parent is not nested under its own parent here, so add some form of
# recursion/iteration that calls get_node_representation on the child nodes.
all_nodes = [get_node_representation(node) for node in df.parent.unique()]

我找到了这种方法,它允许父子之间存在多重关系。

这是我的数据演示:

// Size given to the <svg> element.
var width = 800;
var height = 800;

// Size of each node rectangle.
var boxWidth = 150;
var boxHeight = 20;

// Spacing between neighbouring boxes: horizontal between levels, vertical within a level.
var gap = { width: 150, height: 12 };

// Offset of the first box from the drawing origin (only top/left are read by the layout).
var margin = { top: 16, right: 16, bottom: 16, left: 16 };

// Root group all shapes are appended to; assigned once the <svg> is created.
var svg;
    
// Input for renderRelationshipGraph.
// "Nodes": one entry per box; "lvl" selects the column the box is drawn in
//          and "name" is both the label and the key used by the links.
// "links": one entry per parent -> child relation, referring to nodes by name.
// Note the key casing: "Nodes" is capitalised, "links" is not — the renderer
// reads exactly these two keys.
var data = {
    "Nodes": [
    
    // Level 0
    {
            "lvl": 0,
            "name": "830010000C0419"
        },
        {
            "lvl": 0,
            "name": "830010000C0947"
        },
        {
            "lvl": 0,
            "name": "830010000C0948"
        },
        {
            "lvl": 0,
            "name": "830010000B0854"
        },
        {
            "lvl": 0,
            "name": "830010000B0721"
        },
        
    // Level 1
        
        {
            "lvl": 1,
            "name": "830010000C1205"
        },
        {
            "lvl": 1,
            "name": "830010000B1196"
        },
        {
            "lvl": 1,
            "name": "830010000B1197"
        },
        {
            "lvl": 1,
            "name": "830010000B1343"
        },
        {
            "lvl": 1,
            "name": "830010000B1345"
        },
        
    // Level 2
        {
            "lvl": 2,
            "name": "830010000B1344"
        }
        
    ],
    "links": [
        {
            "source": "830010000C0419",
            "target": "830010000C1205"
        },
        {
            "source": "830010000C0947",
            "target": "830010000C1205"
        },
        {
            "source": "830010000C0948",
            "target": "830010000C1205"
        },
        {
            "source": "830010000B0854",
            "target": "830010000B1196"
        },
        {
            "source": "830010000B0854",
            "target": "830010000B1197"
        },
        {
            "source": "830010000B0721",
            "target": "830010000B1343"
        },
        {
            "source": "830010000B1343",
            "target": "830010000B1344"
        },
        {
            "source": "830010000B0721",
            "target": "830010000B1345"
        },      
        {
        
            "source": "830010000B1345",
            "target": "830010000B1344"
        }
    ]
};

// Shared layout state, filled in by renderRelationshipGraph.
var Nodes = [],      // node objects, each extended with x, y and id
    links = [],      // links whose source/target are node objects, not names
    lvlCount = 0;    // length of the per-level counter array (highest lvl + 1)

// Path generator for the connecting curves. The projection swaps the two
// coordinates of every point it is given.
var diagonal = d3.svg.diagonal().projection(function (point) {
    "use strict";
    return [point.y, point.x];
});

/**
 * Looks up a node by name in the global `Nodes` array.
 *
 * @param {string} text - Node name to search for (compared with ===).
 * @returns {Object|null} The first node whose `name` equals `text`, or null
 *     when no node matches.
 */
function find(text) {
    "use strict";
    var match = null;
    // `some` stops at the first callback that returns true, so only the
    // earliest matching node is kept.
    Nodes.some(function (candidate) {
        if (candidate.name === text) {
            match = candidate;
            return true;
        }
        return false;
    });
    return match;
}

/**
 * Switches one link path between its normal and highlighted look.
 *
 * @param {Object} link - Link object; its `id` is the id of the <path> element.
 * @param {boolean} stat - true to highlight (class "activelink"), false to
 *     restore the default class "link".
 */
function set_link_active(link, stat) {
    "use strict";
    var path = d3.select("#" + link.id);
    path.classed("activelink", stat);
    path.classed("link", !stat);
}

/**
 * Highlights (or un-highlights) a node together with the links and nodes
 * connected to it.
 *
 * Starting from the hovered node (`direction` "root") every link touching the
 * node is toggled and the walk continues into the neighbour: "left" when the
 * neighbour has a lower `lvl`, "right" when it has a higher one. A "left" or
 * "right" walk only follows links whose other end lies further in that same
 * direction, so the recursion stops once no such neighbour is left.
 *
 * @param {Object} val - Node to toggle; reads `val.id` and `val.lvl`.
 * @param {boolean} stat - true to add the highlight classes, false to remove them.
 * @param {string} direction - "root", "left" or "right"; any other value only
 *     toggles the node itself.
 */
function mouse_action(val, stat, direction) {
    "use strict";

    // Handles one link that touches `val`; `other` is the node at its far end.
    function visit(link, other) {
        var side = null;
        if (other.lvl < val.lvl) {
            side = "left";
        } else if (other.lvl > val.lvl) {
            side = "right";
        }

        if (direction === "root") {
            // From the hovered node every attached link lights up, and the walk
            // fans out on both sides (same-level neighbours are not followed).
            set_link_active(link, stat);
            if (side !== null) {
                mouse_action(other, stat, side);
            }
        } else if (side !== null && side === direction) {
            set_link_active(link, stat);
            mouse_action(other, stat, direction);
        }
    }

    d3.select("#" + val.id).classed("active", stat);

    links.forEach(function (d) {
        if (d.source.id === val.id) {
            visit(d, d.target);
        }
        if (d.target.id === val.id) {
            visit(d, d.source);
        }
    });
}

/**
 * Resets the `visited` flag of every entry in the global `links` array to false.
 * The flag is only written here; nothing else in this file reads it.
 */
function unvisite_links() {
    "use strict";
    var idx;
    for (idx = 0; idx < links.length; idx += 1) {
        links[idx].visited = false;
    }
}

/**
 * Positions the nodes, resolves the links and draws both into the global
 * `svg` group.
 *
 * Every node of `data.Nodes` is given `x`, `y` and `id` in place and pushed
 * onto the global `Nodes` array; every entry of `data.links` is turned into a
 * link object (node references plus an element id) and pushed onto the global
 * `links` array. Boxes of the same `lvl` are stacked in one column.
 *
 * @param {Object} data - Graph description.
 * @param {Array} data.Nodes - Objects with a numeric `lvl` and a `name`.
 * @param {Array} data.links - Objects with `source` and `target` node names.
 * @throws {Error} When a link names a node that is not in `Nodes`.
 */
function renderRelationshipGraph(data) {
    "use strict";
    var count = [];

    // One counter per level: how many boxes are already stacked in that column.
    data.Nodes.forEach(function (d) {
        count[d.lvl] = 0;
    });
    lvlCount = count.length;

    data.Nodes.forEach(function (d, i) {
        d.x = margin.left + d.lvl * (boxWidth + gap.width);
        d.y = margin.top + (boxHeight + gap.height) * count[d.lvl];
        d.id = "n" + i;
        count[d.lvl] += 1;
        Nodes.push(d);
    });

    data.links.forEach(function (d) {
        // Resolve each endpoint once; `find` is a linear scan over Nodes.
        var source = find(d.source),
            target = find(d.target);
        if (source === null || target === null) {
            throw new Error("Link references unknown node: " + d.source + " -> " + d.target);
        }
        links.push({
            source: source,
            target: target,
            id: "l" + source.id + target.id
        });
    });
    unvisite_links();

    svg.append("g")
        .attr("class", "nodes");

    var node = svg.select(".nodes")
        .selectAll("g")
        .data(Nodes)
        .enter()
        .append("g")
        .attr("class", "unit");

    node.append("rect")
        .attr("x", function (d) { return d.x; })
        .attr("y", function (d) { return d.y; })
        .attr("id", function (d) { return d.id; })
        .attr("width", boxWidth)
        .attr("height", boxHeight)
        .attr("class", "node")
        .attr("rx", 6)
        .attr("ry", 6)
        .on("mouseover", function () {
            mouse_action(d3.select(this).datum(), true, "root");
            unvisite_links();
        })
        .on("mouseout", function () {
            mouse_action(d3.select(this).datum(), false, "root");
            unvisite_links();
        });

    node.append("text")
        .attr("class", "label")
        .attr("x", function (d) { return d.x + 14; })
        .attr("y", function (d) { return d.y + 15; })
        .text(function (d) { return d.name; });

    links.forEach(function (li) {
        // NOTE(review): in d3 v3 `append` takes only the element name, so the
        // "g" argument looks unused; `insert("path", "g")` may have been
        // intended to draw links beneath the nodes — confirm before changing.
        svg.append("path", "g")
            .attr("class", "link")
            .attr("id", li.id)
            .attr("d", function () {
                // Coordinates are swapped here (x <- node.y, y <- node.x)
                // because the diagonal's projection swaps them back.
                var oTarget = {
                    x: li.target.y + 0.5 * boxHeight,
                    y: li.target.x
                };
                var oSource = {
                    x: li.source.y + 0.5 * boxHeight,
                    y: li.source.x
                };

                // Start the curve at the right edge of whichever box is further left.
                if (oSource.y < oTarget.y) {
                    oSource.y += boxWidth;
                } else {
                    oTarget.y += boxWidth;
                }
                return diagonal({
                    source: oSource,
                    target: oTarget
                });
            });
    });
}

// Create the <svg> inside the #tree element and keep its inner <g> in the
// global `svg`; renderRelationshipGraph appends everything to that group.
svg = d3.select("#tree").append("svg")
    .attr("width", width)
    .attr("height", height)
    .append("g");
    
    // Draw the graph; this must run after `svg` has been assigned above.
    renderRelationshipGraph(data);
/* Node boxes: grey fill, pointer cursor to signal they react to hovering. */
rect {
  fill: #CCC;
  cursor: pointer;
}
/* Toggled on a node's rect by mouse_action while the node is highlighted. */
.active {
  fill: orange;
  stroke: orange;
}
/* Highlighted link path; mouse_action swaps this class with .link. */
.activelink {
  fill: none;
  stroke: orange;
  stroke-width: 2.5px;
}
/* Node captions; pointer-events is off so hovering the text still reaches the rect. */
.label {
  fill: white;
  font-family: sans-serif;
  pointer-events: none;
}
/* Default look of a link path. */
.link {
  fill: none;
  stroke: #ccc;
  stroke-width: 2.5px;
}
<script src="https://d3js.org/d3.v3.min.js"></script>
<div id="tree"></div>

我需要一个脚本来生成这种节点(Nodes)和链接(links)结构。

我会首先构建一个节点字典,其中键是节点名称,值是一个包含父列表和子列表的元组。为了有一个更简单的方法来构建树,我还会保留所有顶级节点的集合(没有父节点)。

从那个字典,然后可以递归地构建一个 json 类数据,可以用来构建一个真正的 json 字符串。

但是你展示的并不是真正的 csv 格式,所以我使用 re.split 来解析输入:

import io
import pprint
import re

# First the data
t = '''parent         | children       | date
---------------------------------------------
830010000C0419 | 830010000C1205 | 1993/09/15
830010000C0947 | 830010000C1205 | 1993/09/15
830010000C0948 | 830010000C1205 | 1993/09/15
830010000B0854 | 830010000B1196 | 1994/03/11
830010000B0854 | 830010000B1197 | 1994/03/11
830010000B0721 | 830010000B1343 | 1988/12/05
830010000B1343 | 830010000B1344 | 1988/12/05
'''

# Column separator: a pipe with optional whitespace on either side.
rx = re.compile(r'\s*\|\s*')

# nodes maps a node name to a (parents, children) pair of lists.
# The extra key None holds the set of top-level names, i.e. the names seen as
# a parent but never as a child.
nodes = {None: set()}
with io.StringIO(t) as fd:
    _ = next(fd)              # skip the header line
    _ = next(fd)              # skip the dashed separator line
    for linenum, line in enumerate(fd, 1):
        line = line.strip()
        if not line:          # tolerate blank lines instead of failing to unpack
            continue
        fields = rx.split(line)
        if len(fields) < 2:   # report malformed rows with their position
            raise ValueError(
                f'Expected "parent | child" at line {linenum}, got {line!r}')
        p, c = fields[:2]     # the date column is ignored
        if p == c:            # a node cannot be its parent
            raise ValueError(f'Same node as parent and child {p} at line {linenum}')
        # process the nodes
        if c not in nodes:
            nodes[c] = ([], [])
        elif c in nodes[None]:
            # c was believed to be top-level, but it now has a parent
            nodes[None].remove(c)
        if p not in nodes:
            # first sight of p, and not as a child: top-level until proven otherwise
            nodes[p] = ([], [c])
            nodes[None].add(p)
        else:
            nodes[p][1].append(c)
        nodes[c][0].append(p)


def subtree(node, nodes, parent=None, seen=None):
    """Build a nested dict describing the subtree rooted at a node.

    node is a node name, nodes the dict mapping a name to its
    (parents, children) lists, parent the name of the node this subtree hangs
    under (None for a root).

    seen is the list of ancestor names on the path from the root down to this
    call. It is used to break cycles: a node that is its own ancestor is
    emitted once more with 'children' set to the string '...' and not expanded.
    Only the current path is tracked, so a node that is merely reachable
    through several parents (a diamond, not a cycle) is fully expanded under
    each of them. The list passed in is not modified.

    Returns a dict with the keys 'name', 'parent' and 'children'.
    """
    if seen is None:
        seen = []
    if node in seen:    # special processing to break possible cycles
        return {'name': node, 'parent': parent, 'children': '...'}
    # Extend a copy so that sibling branches do not see each other's nodes.
    path = seen + [node]
    return {'name': node, 'parent': parent, 'children':
            [subtree(c, nodes, node, path) for c in nodes[node][1]]}

# We can now build the json data: one subtree per top-level node, keyed by name.
# nodes[None] is a set, whose iteration order can differ between runs, so sort
# the names to get a reproducible key order (e.g. if js is later serialised).
js = {node: subtree(node, nodes) for node in sorted(nodes[None])}

pprint.pprint(js)

它给出:

{'830010000B0721': {'children': [{'children': [{'children': [],
                                                'name': '830010000B1344',
                                                'parent': '830010000B1343'}],
                                  'name': '830010000B1343',
                                  'parent': '830010000B0721'}],
                    'name': '830010000B0721',
                    'parent': None},
 '830010000B0854': {'children': [{'children': [],
                                  'name': '830010000B1196',
                                  'parent': '830010000B0854'},
                                 {'children': [],
                                  'name': '830010000B1197',
                                  'parent': '830010000B0854'}],
                    'name': '830010000B0854',
                    'parent': None},
 '830010000C0419': {'children': [{'children': [],
                                  'name': '830010000C1205',
                                  'parent': '830010000C0419'}],
                    'name': '830010000C0419',
                    'parent': None},
 '830010000C0947': {'children': [{'children': [],
                                  'name': '830010000C1205',
                                  'parent': '830010000C0947'}],
                    'name': '830010000C0947',
                    'parent': None},
 '830010000C0948': {'children': [{'children': [],
                                  'name': '830010000C1205',
                                  'parent': '830010000C0948'}],
                    'name': '830010000C0948',
                    'parent': None}}