如何使用 list dir 和 path walk 优化搜索?
How to optimize search with list dir and path walk?
Python2.7.5Win/Mac.
我正在尝试找到在多个存储(大约 128Tio)上搜索文件(超过 10000 个)的最佳方法。这些文件有特定的扩展名,我可以忽略一些文件夹。
这是我使用 os.listdir
和递归的第一个函数:
count = 0
def SearchFiles1(path):
global count
pathList = os.listdir(path)
for i in pathList:
subPath = path+os.path.sep+i
if os.path.isfile(subPath) == True :
fileName = os.path.basename(subPath)
extension = fileName[fileName.rfind("."):]
if ".ext1" in extension or ".ext2" in extension or ".ext3" in extension:
count += 1
#do stuff . . .
else :
if os.path.isdir(subPath) == True:
if not "UselessFolder1" in subPath and not "UselessFolder1" in subPath:
SearchFiles1(subPath)
它有效,但我认为它可能会更好(更快更合适)还是我错了?
所以我尝试了 os.path.walk
:
def SearchFiles2(path):
count = 0
for dirpath, subdirs, files in os.walk(path):
for i in dirpath:
if not "UselessFolder1" in i and not "UselessFolder1" in i:
for y in files:
fileName = os.path.basename(y)
extension = fileName[fileName.rfind("."):]
if ".ext2" in extension or ".ext2" in extension or ".ext3" in extension:
count += 1
# do stuff . . .
return count
"count" 是错误的,而且速度慢了一点。而且我想我真的不明白 path.walk
是如何工作的。
我的问题是:我可以做些什么来优化这项研究?
你的第一个解决方案是合理的,除了你可以使用 os.path.splitext
。在第二个解决方案中,它是不正确的,因为您重新访问每个子目录的文件列表,而不是只处理一次。使用 os.path.walk
的诀窍是从 subdirs
中删除的目录不是下一轮枚举的一部分。
def SearchFiles2(path):
useless_dirs = set(("UselessFolder1", "UselessFolder2"))
useless_files = set((".ext1", ".ext2"))
count = 0
for dirpath, subdirs, files in os.walk(path):
# remove unwanted subdirs from future enumeration
for name in set(subdirs) & useless_dir:
subdirs.remove(name)
# list of interesting files
myfiles = [os.path.join(dirpath, name) for name in files
if os.path.splitext(name)[1] not in useless_files]
count += len(myfiles)
for filepath in myfiles:
# example shows file stats
print(filepath, os.stat(filepath)
return count
枚举单个存储单元的文件系统只能这么快。加快速度的最佳方法是 运行 枚举不同线程中的不同存储单元。
所以在与 tdelaney 进行测试和讨论之后,我对这两个解决方案进行了如下优化:
import os
count = 0
target_files = set((".ext1", ".ext2", ".ext3")) # etc
useless_dirs = set(("UselessFolder2", "UselessFolder2")) # etc
# it could be target_dirs, just change `in` with `not in` when compared.
def SearchFiles1(path):
global count
pathList = os.listdir(path)
for content in pathList:
fullPath = os.path.join(path,content)
if os.path.isfile(fullPath):
if os.path.splitext(fullPath)[1] in target_files:
count += 1
#do stuff with 'fullPath' . . .
else :
if os.path.isdir(fullPath):
if fullPath not in useless_dirs:
SearchFiles1(fullPath)
def SearchFiles2(path):
count = 0
for dirpath, subdirs, files in os.walk(path):
for name in set(subdirs) & useless_dirs:
subdirs.remove(name)
for filename in [name for name in files if os.path.splitext(name)[1] in target_files]:
count += 1
fullPath = os.path.join(dirpath, filename)
#do stuff with 'fullPath' . . .
return count
在 Mac/PC v2.7.5
上运行良好
关于速度完全均匀。
Python2.7.5Win/Mac.
我正在尝试找到在多个存储(大约 128Tio)上搜索文件(超过 10000 个)的最佳方法。这些文件有特定的扩展名,我可以忽略一些文件夹。
这是我使用 os.listdir
和递归的第一个函数:
count = 0
def SearchFiles1(path):
global count
pathList = os.listdir(path)
for i in pathList:
subPath = path+os.path.sep+i
if os.path.isfile(subPath) == True :
fileName = os.path.basename(subPath)
extension = fileName[fileName.rfind("."):]
if ".ext1" in extension or ".ext2" in extension or ".ext3" in extension:
count += 1
#do stuff . . .
else :
if os.path.isdir(subPath) == True:
if not "UselessFolder1" in subPath and not "UselessFolder1" in subPath:
SearchFiles1(subPath)
它有效,但我认为它可能会更好(更快更合适)还是我错了?
所以我尝试了 os.path.walk
:
def SearchFiles2(path):
count = 0
for dirpath, subdirs, files in os.walk(path):
for i in dirpath:
if not "UselessFolder1" in i and not "UselessFolder1" in i:
for y in files:
fileName = os.path.basename(y)
extension = fileName[fileName.rfind("."):]
if ".ext2" in extension or ".ext2" in extension or ".ext3" in extension:
count += 1
# do stuff . . .
return count
"count" 是错误的,而且速度慢了一点。而且我想我真的不明白 path.walk
是如何工作的。
我的问题是:我可以做些什么来优化这项研究?
你的第一个解决方案是合理的,除了你可以使用 os.path.splitext
。在第二个解决方案中,它是不正确的,因为您重新访问每个子目录的文件列表,而不是只处理一次。使用 os.path.walk
的诀窍是从 subdirs
中删除的目录不是下一轮枚举的一部分。
def SearchFiles2(path):
useless_dirs = set(("UselessFolder1", "UselessFolder2"))
useless_files = set((".ext1", ".ext2"))
count = 0
for dirpath, subdirs, files in os.walk(path):
# remove unwanted subdirs from future enumeration
for name in set(subdirs) & useless_dir:
subdirs.remove(name)
# list of interesting files
myfiles = [os.path.join(dirpath, name) for name in files
if os.path.splitext(name)[1] not in useless_files]
count += len(myfiles)
for filepath in myfiles:
# example shows file stats
print(filepath, os.stat(filepath)
return count
枚举单个存储单元的文件系统只能这么快。加快速度的最佳方法是 运行 枚举不同线程中的不同存储单元。
所以在与 tdelaney 进行测试和讨论之后,我对这两个解决方案进行了如下优化:
import os
count = 0
target_files = set((".ext1", ".ext2", ".ext3")) # etc
useless_dirs = set(("UselessFolder2", "UselessFolder2")) # etc
# it could be target_dirs, just change `in` with `not in` when compared.
def SearchFiles1(path):
global count
pathList = os.listdir(path)
for content in pathList:
fullPath = os.path.join(path,content)
if os.path.isfile(fullPath):
if os.path.splitext(fullPath)[1] in target_files:
count += 1
#do stuff with 'fullPath' . . .
else :
if os.path.isdir(fullPath):
if fullPath not in useless_dirs:
SearchFiles1(fullPath)
def SearchFiles2(path):
count = 0
for dirpath, subdirs, files in os.walk(path):
for name in set(subdirs) & useless_dirs:
subdirs.remove(name)
for filename in [name for name in files if os.path.splitext(name)[1] in target_files]:
count += 1
fullPath = os.path.join(dirpath, filename)
#do stuff with 'fullPath' . . .
return count
在 Mac/PC v2.7.5
上运行良好关于速度完全均匀。