在 Python 中按文件名对两个列表中的路径进行分组
Group paths by filename from two lists in Python
给定两个包含多个文件完整路径的列表
list1=['fow/fol/TWO_BB_P3.xml', 'fow/fol/N0_AG_ES.xml','fow/fol/TWO_AG_GH.xml', 'fow/fol/TWO_AG_EY.xml']
和
list2 =['gt/so/TWO_AG_EY.txt','gt/so/TWO_AG_GH.txt','gt/so/TWO_BB_P3.txt']
如果两个列表中的路径具有相同的文件名(filename),我想把这些路径同步/分组到一起。
例如,我想得到如下分组:
'xml_path' = {str} 'fow/fol/TWO_AG_EY.xml'
'txt_path' = {str} 'gt/so/TWO_AG_EY.txt'
或
'xml_path' = {str} 'fow/fol/TWO_AG_GH.xml'
'txt_path' = {str} 'gt/so/TWO_AG_GH.txt'
为了实现这一点,我使用了以下代码:先拆分路径,再利用 pandas 的合并(merge)功能。
但是,我想知道是否有更简洁高效的方法来做到这一点。
这是我到目前为止尝试过的方法
import os
import re
import pandas as pd
def sort_path(path_all):
    """Sort ``path_all`` in place in natural (human) order and return it.

    Each path is split into alternating text and digit runs; digit runs are
    compared as integers so that ``'a2'`` sorts before ``'a10'``.

    Args:
        path_all: Mutable list of path strings. It is sorted in place.

    Returns:
        The same list object, now sorted.
    """
    def natural_key(path):
        # isdecimal() is true only for the decimal digits that \d splits out;
        # isdigit() is also true for characters such as '²', which stay in
        # the text chunks and cannot be parsed by int().
        return [int(chunk) if chunk.isdecimal() else chunk
                for chunk in re.split(r'(\d+)', path)]

    path_all.sort(key=natural_key)
    return path_all
def split_sbj_id(s, idx):
    """Build a one-file record keyed by the path's base name without extension.

    Args:
        s: File path.
        idx: ``1`` stores the path under ``'xml_path'``; any other value
            stores it under ``'txt_path'``.

    Returns:
        A dict with ``'fname'`` and either ``'xml_path'`` or ``'txt_path'``.
    """
    # splitext strips only the final extension, so 'a.v1.xml' keeps the stem
    # 'a.v1' instead of being truncated at the first dot.
    fname = os.path.splitext(os.path.basename(s))[0]
    path_key = 'xml_path' if idx == 1 else 'txt_path'
    return {'fname': fname, path_key: s}
def merge_path(ls_eb, ls_mff):
    """Left-join two path lists on the file name produced by ``split_sbj_id``.

    Args:
        ls_eb: Paths stored under ``'xml_path'``; these form the left side
            of the join.
        ls_mff: Paths stored under ``'txt_path'``; attached where the
            ``'fname'`` matches.

    Returns:
        A list of dicts with keys ``'fname'``, ``'xml_path'`` and
        ``'txt_path'``. ``'txt_path'`` is NaN when ``ls_mff`` has no match.
    """
    # Naming the columns keeps the join key present when a list is empty;
    # a DataFrame built from [] has no 'fname' column and the merge would fail.
    left = pd.DataFrame([split_sbj_id(fname, 1) for fname in ls_eb],
                        columns=['fname', 'xml_path'])
    right = pd.DataFrame([split_sbj_id(fname, 2) for fname in ls_mff],
                         columns=['fname', 'txt_path'])
    df = pd.merge(left, right, on='fname', how='left')
    return df.to_dict('records')
# Sample inputs: XML paths and the TXT paths to pair with them.
list1 = [
    'fow/fol/TWO_BB_P3.xml',
    'fow/fol/N0_AG_ES.xml',
    'fow/fol/TWO_AG_GH.xml',
    'fow/fol/TWO_AG_EY.xml',
]
list2 = [
    'gt/so/TWO_AG_EY.txt',
    'gt/so/TWO_AG_GH.txt',
    'gt/so/TWO_BB_P3.txt',
]
# Pair each list1 path with the list2 path that shares its file name.
path_sbj = merge_path(list1, list2)
产生
[{'fname': 'TWO_BB_P3', 'xml_path': 'fow/fol/TWO_BB_P3.xml', 'txt_path': 'gt/so/TWO_BB_P3.txt'}, {'fname': 'N0_AG_ES', 'xml_path': 'fow/fol/N0_AG_ES.xml', 'txt_path': nan}, {'fname': 'TWO_AG_GH', 'xml_path': 'fow/fol/TWO_AG_GH.xml', 'txt_path': 'gt/so/TWO_AG_GH.txt'}, {'fname': 'TWO_AG_EY', 'xml_path': 'fow/fol/TWO_AG_EY.xml', 'txt_path': 'gt/so/TWO_AG_EY.txt'}]
Pandas 方法
# Put each list in its own single-column frame.
s1 = pd.DataFrame({'xml_path': list1})
s2 = pd.DataFrame({'txt_path': list2})
# Capture the file name without its directory or the .xml/.txt extension.
regex = r'([^/]+)\.(?:xml|txt)$'
# expand=False returns a Series for the single capture group.
s1['fname'] = s1['xml_path'].str.extract(regex, expand=False)
s2['fname'] = s2['txt_path'].str.extract(regex, expand=False)
# 'records' must be spelled out: the short alias 'r' is rejected by pandas >= 2.0.
s1.merge(s2, on='fname', how='outer').to_dict('records')
[{'fname': 'TWO_BB_P3',
'txt_path': 'gt/so/TWO_BB_P3.txt',
'xml_path': 'fow/fol/TWO_BB_P3.xml'},
{'fname': 'N0_AG_ES', 'txt_path': nan, 'xml_path': 'fow/fol/N0_AG_ES.xml'},
{'fname': 'TWO_AG_GH',
'txt_path': 'gt/so/TWO_AG_GH.txt',
'xml_path': 'fow/fol/TWO_AG_GH.xml'},
{'fname': 'TWO_AG_EY',
'txt_path': 'gt/so/TWO_AG_EY.txt',
'xml_path': 'fow/fol/TWO_AG_EY.xml'}]
给定两个包含多个文件完整路径的列表
list1=['fow/fol/TWO_BB_P3.xml', 'fow/fol/N0_AG_ES.xml','fow/fol/TWO_AG_GH.xml', 'fow/fol/TWO_AG_EY.xml']
和
list2 =['gt/so/TWO_AG_EY.txt','gt/so/TWO_AG_GH.txt','gt/so/TWO_BB_P3.txt']
如果两个列表中的路径具有相同的文件名(filename),我想把这些路径同步/分组到一起。
例如,我想得到如下分组:
'xml_path' = {str} 'fow/fol/TWO_AG_EY.xml'
'txt_path' = {str} 'gt/so/TWO_AG_EY.txt'
或
'xml_path' = {str} 'fow/fol/TWO_AG_GH.xml'
'txt_path' = {str} 'gt/so/TWO_AG_GH.txt'
为了实现这一点,我使用了以下代码:先拆分路径,再利用 pandas 的合并(merge)功能。
但是,我想知道是否有更简洁高效的方法来做到这一点。
这是我到目前为止尝试过的方法
import os
import re
import pandas as pd
def sort_path(path_all):
    """Sort ``path_all`` in place in natural (human) order and return it.

    Each path is split into alternating text and digit runs; digit runs are
    compared as integers so that ``'a2'`` sorts before ``'a10'``.

    Args:
        path_all: Mutable list of path strings. It is sorted in place.

    Returns:
        The same list object, now sorted.
    """
    def natural_key(path):
        # isdecimal() is true only for the decimal digits that \d splits out;
        # isdigit() is also true for characters such as '²', which stay in
        # the text chunks and cannot be parsed by int().
        return [int(chunk) if chunk.isdecimal() else chunk
                for chunk in re.split(r'(\d+)', path)]

    path_all.sort(key=natural_key)
    return path_all
def split_sbj_id(s, idx):
    """Build a one-file record keyed by the path's base name without extension.

    Args:
        s: File path.
        idx: ``1`` stores the path under ``'xml_path'``; any other value
            stores it under ``'txt_path'``.

    Returns:
        A dict with ``'fname'`` and either ``'xml_path'`` or ``'txt_path'``.
    """
    # splitext strips only the final extension, so 'a.v1.xml' keeps the stem
    # 'a.v1' instead of being truncated at the first dot.
    fname = os.path.splitext(os.path.basename(s))[0]
    path_key = 'xml_path' if idx == 1 else 'txt_path'
    return {'fname': fname, path_key: s}
def merge_path(ls_eb, ls_mff):
    """Left-join two path lists on the file name produced by ``split_sbj_id``.

    Args:
        ls_eb: Paths stored under ``'xml_path'``; these form the left side
            of the join.
        ls_mff: Paths stored under ``'txt_path'``; attached where the
            ``'fname'`` matches.

    Returns:
        A list of dicts with keys ``'fname'``, ``'xml_path'`` and
        ``'txt_path'``. ``'txt_path'`` is NaN when ``ls_mff`` has no match.
    """
    # Naming the columns keeps the join key present when a list is empty;
    # a DataFrame built from [] has no 'fname' column and the merge would fail.
    left = pd.DataFrame([split_sbj_id(fname, 1) for fname in ls_eb],
                        columns=['fname', 'xml_path'])
    right = pd.DataFrame([split_sbj_id(fname, 2) for fname in ls_mff],
                         columns=['fname', 'txt_path'])
    df = pd.merge(left, right, on='fname', how='left')
    return df.to_dict('records')
# Sample inputs: XML paths and the TXT paths to pair with them.
list1 = [
    'fow/fol/TWO_BB_P3.xml',
    'fow/fol/N0_AG_ES.xml',
    'fow/fol/TWO_AG_GH.xml',
    'fow/fol/TWO_AG_EY.xml',
]
list2 = [
    'gt/so/TWO_AG_EY.txt',
    'gt/so/TWO_AG_GH.txt',
    'gt/so/TWO_BB_P3.txt',
]
# Pair each list1 path with the list2 path that shares its file name.
path_sbj = merge_path(list1, list2)
产生
[{'fname': 'TWO_BB_P3', 'xml_path': 'fow/fol/TWO_BB_P3.xml', 'txt_path': 'gt/so/TWO_BB_P3.txt'}, {'fname': 'N0_AG_ES', 'xml_path': 'fow/fol/N0_AG_ES.xml', 'txt_path': nan}, {'fname': 'TWO_AG_GH', 'xml_path': 'fow/fol/TWO_AG_GH.xml', 'txt_path': 'gt/so/TWO_AG_GH.txt'}, {'fname': 'TWO_AG_EY', 'xml_path': 'fow/fol/TWO_AG_EY.xml', 'txt_path': 'gt/so/TWO_AG_EY.txt'}]
Pandas 方法
# Put each list in its own single-column frame.
s1 = pd.DataFrame({'xml_path': list1})
s2 = pd.DataFrame({'txt_path': list2})
# Capture the file name without its directory or the .xml/.txt extension.
regex = r'([^/]+)\.(?:xml|txt)$'
# expand=False returns a Series for the single capture group.
s1['fname'] = s1['xml_path'].str.extract(regex, expand=False)
s2['fname'] = s2['txt_path'].str.extract(regex, expand=False)
# 'records' must be spelled out: the short alias 'r' is rejected by pandas >= 2.0.
s1.merge(s2, on='fname', how='outer').to_dict('records')
[{'fname': 'TWO_BB_P3',
'txt_path': 'gt/so/TWO_BB_P3.txt',
'xml_path': 'fow/fol/TWO_BB_P3.xml'},
{'fname': 'N0_AG_ES', 'txt_path': nan, 'xml_path': 'fow/fol/N0_AG_ES.xml'},
{'fname': 'TWO_AG_GH',
'txt_path': 'gt/so/TWO_AG_GH.txt',
'xml_path': 'fow/fol/TWO_AG_GH.xml'},
{'fname': 'TWO_AG_EY',
'txt_path': 'gt/so/TWO_AG_EY.txt',
'xml_path': 'fow/fol/TWO_AG_EY.xml'}]