使用 Python 配对相似的文件名
Pairing similar filenames using Python
我的文件夹中包含如下文件:
file1text_part1_morefile1text.ext
file1text_part2_morefile1text.ext
file2text_part1_morefile2text.ext
file2text_part2_morefile2text.ext
...
输出应该是这样的元组列表:
[(file1text_part1_morefile1text.ext, file1text_part2_morefile1text.ext), (...), ...]
我正在尝试使用循环和 os.split()
创建解决方案,但我无法获得正确的列表输出。您有什么建议或提示吗?
编辑:现在我意识到文件名中可能包含更多下划线,例如 file_1_text_part1_morefile_1text.ext。不过,需要配对的两个文件之间唯一的区别仍然是 part1/part2。
取每个文件的主干(不带扩展名的名称),把 _partX_ 两侧的内容(即除 _partX_ 之外的所有内容)作为公共标识符,并按该标识符对文件进行分组:
import re
from collections import defaultdict
from pathlib import Path
# Matches the "_partN_" marker, the only piece that differs between paired files.
PART_MARKER = re.compile(r'_part\d+_')


def group_key(path):
    """Return the pairing key for *path*.

    The key is the file's stem (name without the final extension) split on
    the ``_partN_`` marker, returned as a tuple so it can be used as a dict
    key. Two files that differ only in their part number get the same key.
    """
    # re.split() needs a string; handing it the Path object raises TypeError.
    return tuple(PART_MARKER.split(Path(path).stem))


# glob() yields entries in arbitrary order; sorting puts part1 before part2
# inside each group.
filenames = sorted(Path('/base/folder/').glob('*'))
grouped = defaultdict(list)
for file in filenames:
    grouped[group_key(file)].append(file)

for files in grouped.values():
    print(files)
如果您需要区分 file_1_text 和 file1text(即把它们视为不同的组),试试这个:
from collections import defaultdict
import re
fl_nm = [
    'file1text_part1_morefile1text.ext', 'file1text_part2_morefile1text.ext',
    'file2text_part1_morefile2text.ext', 'file2text_part2_morefile2text.ext',
    'file_1_text_part1_morefile_1text.ext', 'file_1_text_part2_morefile_1text.ext',
]

# Group the names by everything that precedes the "_partN_" marker.
# Splitting on the full marker (underscore, "part", digits, underscore) rather
# than on the bare substring "_part" keeps names such as "my_partial_part1_x"
# from being cut at the wrong place.
dct = defaultdict(list)
for fn in fl_nm:
    dct[re.split(r'_part\d+_', fn, maxsplit=1)[0]].append(fn)

# dct now holds:
# {'file1text': ['file1text_part1_morefile1text.ext',
#                'file1text_part2_morefile1text.ext'],
#  'file2text': ['file2text_part1_morefile2text.ext',
#                'file2text_part2_morefile2text.ext'],
#  'file_1_text': ['file_1_text_part1_morefile_1text.ext',
#                  'file_1_text_part2_morefile_1text.ext']}

# Keep the result in a name and print it; a bare expression is only echoed
# in an interactive session and is silently discarded in a script.
pairs = [tuple(names) for names in dct.values()]
print(pairs)
输出:
[('file1text_part1_morefile1text.ext', 'file1text_part2_morefile1text.ext'),
('file2text_part1_morefile2text.ext', 'file2text_part2_morefile2text.ext'),
('file_1_text_part1_morefile_1text.ext','file_1_text_part2_morefile_1text.ext')]
或者,如果您不需要区分 file_1_text 和 file1text(即只按其中的数字分组),试试这个:
from collections import defaultdict
import re
fl_nm = [
    'file1text_part1_morefile1text.ext', 'file1text_part2_morefile1text.ext',
    'file2text_part1_morefile2text.ext', 'file2text_part2_morefile2text.ext',
    'file_1_text_part1_morefile_1text.ext', 'file3text_part1_morefile2text.ext',
]

# Group the names by the number inside the leading "file<N>text" token,
# tolerating optional underscores around the number so that "file_1_text"
# and "file1text" land in the same group.
dct = defaultdict(list)
for fn in fl_nm:
    found = re.search(r'file_?(\d+)_?text', fn)
    if found is None:
        # Names that do not follow the scheme are skipped instead of raising
        # IndexError.
        continue
    dct[found.group(1)].append(fn)

# dct now holds:
# {'1': ['file1text_part1_morefile1text.ext',
#        'file1text_part2_morefile1text.ext',
#        'file_1_text_part1_morefile_1text.ext'],
#  '2': ['file2text_part1_morefile2text.ext',
#        'file2text_part2_morefile2text.ext'],
#  '3': ['file3text_part1_morefile2text.ext']}

# Keep the result in a name and print it; a bare expression is only echoed
# in an interactive session and is silently discarded in a script.
pairs = [tuple(names) for names in dct.values()]
print(pairs)
输出:
[('file1text_part1_morefile1text.ext',
'file1text_part2_morefile1text.ext',
'file_1_text_part1_morefile_1text.ext'),
('file2text_part1_morefile2text.ext', 'file2text_part2_morefile2text.ext'),
('file3text_part1_morefile2text.ext',)]
使用 os.listdir(path) 获取所有文件名,然后遍历每个文件名并用正则表达式进行匹配。
import os, re
def get_matched_files(path, pattern):
    """Return the sorted names of the entries in *path* that match *pattern*.

    Args:
        path: Directory whose entries are listed with ``os.listdir``.
        pattern: Regular expression (string or compiled pattern) applied with
            ``match``, i.e. anchored at the start of the name only.

    Returns:
        A list of matching entry names (names only, not joined with *path*),
        in sorted order.
    """
    regex = re.compile(pattern)  # compile once instead of once per entry
    # os.listdir() returns names in arbitrary order; sorting makes the result
    # deterministic.
    return sorted(name for name in os.listdir(path) if regex.match(name))
if __name__ == '__main__':
    path = "./"
    # Raw string so "\d" reaches the regex engine without an invalid-escape
    # warning. The dot before "ext" is escaped and "$" anchors the end, so
    # names such as "...textXext" or "...text.ext.bak" are not accepted.
    pattern = r"file(\d+)text_part(\d+)_morefile(\d+)text\.ext$"
    print(get_matched_files(path, pattern))
输出:
['file1text_part1_morefile1text.ext', 'file1text_part2_morefile1text.ext', 'file2text_part1_morefile2text.ext', 'file2text_part2_morefile2text.ext']
Process finished with exit code 0