将 2 列数据框转换为多级分层数据框

Convert 2 column dataframe into multi-level hierarchical dataframe

我有一个 pandas 数据框

From To
A B
A C
D E
F F
B G
B H
B I
G J
G K
L L
M M
N N

我想将其转换为多列层次结构。预期的层次结构看起来像

Level_1 Level_2 Level_3 Level_4
A B G J
A B G K
A B H
A B I
A C
D E
F F
L L
M M
N N

pandas 中是否有内置方法来实现此目的?我知道我可以使用递归,还有其他简化的方法吗?

您可以使用 networkx 轻松获得您期望的结果:
# Python env: pip install networkx
# Anaconda env: conda install networkx

import networkx as nx
import pandas as pd

# Edge list: each row is one directed edge From -> To.
df = pd.DataFrame(
    {
        'From': ['A', 'A', 'D', 'F', 'B', 'B', 'B', 'G', 'G', 'L', 'M', 'N'],
        'To': ['B', 'C', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N'],
    }
)

G = nx.from_pandas_edgelist(df, source='From', target='To', create_using=nx.DiGraph)

# Roots have no incoming edge; leaves have no outgoing edge.
roots = [node for node, degree in G.in_degree() if degree == 0]
leaves = [node for node, degree in G.out_degree() if degree == 0]

# Every simple path from any root to any leaf becomes one output row.
all_paths = [
    route
    for root in roots
    for leaf in leaves
    for route in nx.all_simple_paths(G, root, leaf)
]

# Self-looping nodes are neither roots nor leaves (their own edge counts in
# both degrees), so they are added explicitly as two-element paths.
all_paths += [[node, node] for node in nx.nodes_with_selfloops(G)]

输出:

>>> pd.DataFrame(sorted(all_paths)).add_prefix('Level_').fillna('')
  Level_0 Level_1 Level_2 Level_3
0       A       B       G       J
1       A       B       G       K
2       A       B       H
3       A       B       I
4       A       C
5       D       E
6       F       F
7       L       L
8       M       M
9       N       N

文档:networkx.algorithms.simple_paths.all_simple_paths

没有networkx的解决方案:

def path(df, parent, cur_path=None):
    """Yield every downward path that starts at ``parent``.

    Each yielded path is a new list made of ``cur_path`` followed by the
    ``To`` values reached by repeatedly following rows whose ``From`` equals
    the current node.  A branch ends when the current node has no outgoing
    rows, or when the next node would repeat a node already on the branch
    (a self-loop such as ``F -> F``, or a longer cycle).  In the latter case
    the repeated node is appended once, so the loop stays visible in the
    result, and the branch is not followed any further.

    Args:
        df: DataFrame with ``From`` and ``To`` columns, one edge per row.
        parent: Node whose outgoing edges are expanded.
        cur_path: Nodes accumulated so far.  Pass ``[parent]`` to have the
            start node included in each path; when omitted, paths begin at
            the children of ``parent``.

    Yields:
        list: One path per branch end, in the row order of ``df``.
    """
    if cur_path is None:
        cur_path = []

    children = df.loc[df["From"].eq(parent), "To"].to_list()

    if not children:
        yield cur_path
        return

    for child in children:
        if child == parent or child in cur_path:
            # Revisiting a node would recurse forever; record the closing
            # edge and stop this branch here.
            yield cur_path + [child]
        else:
            # Keep descending even when there is only one child, so linear
            # chains (A -> B -> C) are followed to their end.
            yield from path(df, child, cur_path + [child])


def is_sublist(l1, l2):
    """Return True if ``l1`` appears in ``l2`` as a contiguous run.

    The comparison is ``l1 == l2[start:start + len(l1)]`` for each start
    offset at which a full-length window fits inside ``l2``.  An empty
    ``l1`` is treated as a sublist of every ``l2``, including an empty one.

    Args:
        l1: Candidate sublist.
        l2: Sequence searched for ``l1``.

    Returns:
        bool: True when some window of ``l2`` equals ``l1``, else False.
    """
    width = len(l1)
    # When l1 is longer than l2 the range is empty and the result is False.
    # The "+ 1" admits the last full window, and also gives a single offset
    # for the empty-in-empty case.
    return any(
        l1 == l2[start:start + width]
        for start in range(len(l2) - width + 1)
    )


# Collect the paths from every distinct "From" node, skipping any path that
# is already contained in a previously kept one.
unique_paths = []
for start in df["From"].unique():
    for candidate in path(df, start, [start]):
        already_covered = any(is_sublist(candidate, kept) for kept in unique_paths)
        if already_covered:
            continue
        unique_paths.append(candidate)


# One row per path; columns are level_1, level_2, ... and positions past the
# end of a shorter path are filled with empty strings.
records = [
    {f"level_{depth}": label for depth, label in enumerate(route, 1)}
    for route in unique_paths
]
df = pd.DataFrame(records).fillna("")
print(df)

打印:

  level_1 level_2 level_3 level_4
0       A       B       G       J
1       A       B       G       K
2       A       B       H        
3       A       B       I        
4       A       C                
5       D       E                
6       F       F                
7       L       L                
8       M       M                
9       N       N