如何列出所有不包含文件类型的目录?
How to list all directories that do not contain a file type?
我正在尝试 return 所有目录的唯一列表 (set
),如果它们不包含某些文件类型。如果未找到该文件类型,请将该目录名称添加到列表中以供进一步审核。
下面的函数将找到所有有效的文件夹并将其添加到一个集合中以供进一步比较。我想将其扩展到仅 return 那些不包含 out_list
中的文件的目录。这些目录可以包含 out_list
中带有文件的子目录。如果那是真的,我只想要有效目录的文件夹名称的路径。
# directory = r'w:\workorder'
#
# Example:
# w:\workorder\region1\12345678\hi.pdf
# w:\workorder\region2\23456789\test\bye.pdf
# w:\workorder\region3\34567891\<empty>
# w:\workorder\region4\45678912\Final.doc
#
# Results:
# ['34567891', '45678912']
job_folders = set()  # Job IDs collected by get_filepaths; a set keeps them unique.
out_list = [".pdf", ".ppt", ".txt"]


def get_filepaths(directory):
    """Return the full paths of the immediate subdirectories of *directory*.

    Only the first level below *directory* is listed (``os.listdir``);
    nested directories are not visited and plain files are ignored.

    Side effect: for every subdirectory whose name starts with an ASCII
    digit, the first eight characters of the name are added to the
    module-level ``job_folders`` set.

    ``out_list`` is not consulted here, so this function performs no
    filtering by file type.
    """
    folder_paths = []
    for item in os.listdir(directory):
        folderpath = os.path.join(directory, item)
        if not os.path.isdir(folderpath):
            continue
        if re.search(r'^[0-9]', item):
            job_folders.add(item[:8])
        # Every subdirectory is returned, whether or not its name is numeric.
        folder_paths.append(folderpath)
    return folder_paths
获取文件扩展名:
# Split off the extension of the entry and record the first eight characters
# of its name only when that extension is not one of those in out_list.
name, ext = os.path.splitext(os.path.join(directory, item))
if ext not in out_list:
    job_folders.add(item[:8])
这是你想要的吗?
import os
def main():
    """Print each directory reported by get_directories_without_exts."""
    exts = {'.pdf', '.ppt', '.txt'}
    # Raw string keeps the backslash literal: '\w' is not a valid escape
    # sequence and recent Python versions warn about it.
    for directory in get_directories_without_exts(r'W:\workorder', exts):
        print(directory)
def get_directories_without_exts(root, exts):
    """Yield each directory under *root* that directly holds no matching file.

    The tree is traversed with ``os.walk`` (so *root* itself is included).
    A directory is yielded when none of the files directly inside it has an
    extension contained in *exts*; files in its subdirectories are not taken
    into account. Extensions are compared exactly as ``os.path.splitext``
    returns them (leading dot, case-sensitive).
    """
    # Distinct loop names: the original rebound the ``root`` parameter.
    for dirpath, _dirnames, filenames in os.walk(root):
        if not any(os.path.splitext(name)[1] in exts for name in filenames):
            yield dirpath
# Run main() only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
编辑: 看了你的要求后,我决定创建一个树对象来分析你的目录结构。创建后,很容易通过缓存进行递归查询以查找目录是否为 "is okay." 从那里创建一个仅查找 "not okay" 的顶级目录的生成器非常简单。可能有更好的方法来执行此操作,但代码至少应该有效。
import os
def main():
    """Print every path produced by the ``not_okay`` generator of the Tree."""
    exts = {'.pdf', '.ppt', '.txt'}
    # Raw string keeps the backslash literal: '\w' is not a valid escape
    # sequence and recent Python versions warn about it.
    for directory in Tree(r'W:\workorder', exts).not_okay:
        print(directory)
class Tree:
    """In-memory picture of a directory tree and the file extensions in it.

    The whole subtree below *root* is scanned eagerly while the object is
    constructed; ``is_okay`` and ``not_okay`` then answer from that snapshot.
    """

    def __init__(self, root, exts):
        """Scan *root* recursively.

        root -- path of the directory to analyse.
        exts -- collection of extensions (as returned by
                ``os.path.splitext``, e.g. '.pdf') that make a directory
                "okay".

        Raises ValueError when *root* is not a directory.
        """
        if not os.path.isdir(root):
            raise ValueError('root must be a directory')
        self.name = root
        self.exts = exts
        # Despite the name, this holds the *extensions* of the files that sit
        # directly in this directory, not the file names.
        self.files = set()
        # One Tree per immediate subdirectory.
        self.directories = []
        try:
            names = os.listdir(root)
        except OSError:
            # Directory could not be listed: treat it as empty.
            pass
        else:
            for child in names:
                path = os.path.join(root, child)
                if os.path.isfile(path):
                    self.files.add(os.path.splitext(child)[1])
                elif os.path.isdir(path):
                    self.directories.append(self.__class__(path, exts))
        # Cache for is_okay; None means "not computed yet".
        self._is_okay = None

    @property
    def is_okay(self):
        """True when this directory or any descendant holds a file in exts."""
        if self._is_okay is None:
            self._is_okay = (
                any(c.is_okay for c in self.directories)
                or any(c in self.exts for c in self.files)
            )
        return self._is_okay

    @property
    def not_okay(self):
        """Yield the topmost directories whose whole subtree has no match.

        An okay directory is never yielded itself; its children are searched
        instead. A directory that is not okay is yielded by name and its
        children are not examined further.
        """
        if self.is_okay:
            for child in self.directories:
                for not_okay in child.not_okay:
                    yield not_okay
        else:
            yield self.name
# Run main() only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
您是否从其他地方复制并粘贴了现有代码?因为文档字符串似乎是 os.walk
...
你的问题有几点不清楚:
- 您声明代码的目标是 "return a unique list (set) of all directories if they do not contain certain file types"。
- 首先
list
和set
是不同的数据结构
- 其次,您的代码创建了每个之一:
job_folders
是一个包含数字的文件夹名称set
,而folder_paths
是一个list
个文件夹的完整路径,无论它们是否包含数字。
- 你实际上想要什么作为这里的输出?
- "those directories that DO NOT contain files in the out_list"应该递归定义,还是只包含那些目录的一级内容? 我的解决方案假设是后者
- 您的示例在这一点上是矛盾的:结果中列出了
34567891
,却没有列出 region3
。无论定义是否递归,region3
都应该包含在结果中,因为 region3
下没有任何带所列扩展名的文件。
-
job_folders
应该只填充满足其内容标准的目录,还是 所有 包含数字的文件夹名称? 我的解决方案假设是后者
我要强调的是,您的代码中有一个不良做法:使用了全局变量 out_list
和 job_folders
。我已将前者改为 get_filepaths
的第二个参数,将后者改为第二个返回值。
不管怎样,解决方法来了...
import os, re
ext_list = [".pdf", ".ppt", ".txt"]


def get_filepaths(directory, ext_list):
    """Walk *directory* and return ``(folder_paths, job_folders)``.

    folder_paths -- list of directories, relative to *directory* ('.' for
        the top itself), that do not directly contain any file whose
        extension is in *ext_list*. Files in subdirectories are not
        considered for the parent.
    job_folders -- set holding the first eight characters of the name of
        every visited directory whose name starts with an ASCII digit,
        regardless of what the directory contains.
    """
    folder_paths = []
    job_folders = set()

    # ``dirpath`` rather than ``dir`` so the builtin is not shadowed.
    for dirpath, _subdirs, filenames in os.walk(directory):
        _, lastlevel = os.path.split(dirpath)
        if re.search(r'^[0-9]', lastlevel):
            job_folders.add(lastlevel[:8])

        # No file here carries a listed extension -> report the directory.
        if not any(os.path.splitext(name)[1] in ext_list for name in filenames):
            folder_paths.append(os.path.relpath(dirpath, directory))

    return folder_paths, job_folders
我在 /tmp
下创建了一个与您的目录结构相同的目录结构,并运行了以下代码:
folder_paths, job_folders = get_filepaths(os.path.expandvars(r"%TEMP%\workorder"), ext_list)
# Single-argument print(...) with %-formatting produces the same text under
# Python 2 and Python 3; the original print statements were Python 2 only.
print("folder_paths = %s" % (folder_paths,))
print("job_folders = %s" % (job_folders,))
这是输出:
folder_paths = ['.', 'region1', 'region2', 'region2\23456789', 'region3', 'region3\34567891', 'region4', 'region4\456789123']
job_folders = set(['12345678', '23456789', '34567891', '45678912'])
如您所见,region1\12345678
和 region2\23456789\test
未包含在输出 folder_paths
中,因为它们直接包含指定扩展名的文件;所有其他子目录都包含在输出中,因为它们不直接包含指定扩展名的文件。
多亏了@DanLenski 和@NoctisSkytower,我才得以解决这个问题。
我的 WorkOrder
目录在遍历 in_path
时总是位于第 7 层文件夹,这是我通过按 os.sep
拆分路径找到的。
我借鉴了你们的两种解决方案并得出了以下结论:
import os, re
ext_list = [".pdf"]
# NOTE(review): a UNC share path normally begins with two backslashes, and the
# number of leading separators changes which component index 7 refers to --
# confirm this value against the real share.
in_path = r'\server\E\Data\WorkOrder'


def get_filepaths(directory, ext_list, job_depth=7):
    """Return the sorted job folders under *directory* that hold no listed file.

    Every directory visited by ``os.walk`` is split on ``os.sep``; the
    component at index *job_depth* is taken as the job folder that the
    directory belongs to. Directories whose path has no such component
    (they sit above the job level) are ignored.

    directory -- root of the tree to walk.
    ext_list  -- extensions (with leading dot) that mark a job folder as fine.
    job_depth -- index of the job-folder component in the split path;
                 defaults to 7, the value that was previously hard-coded.

    A job folder is reported when no file anywhere below it has an extension
    in *ext_list*. This includes job folders that contain no files at all,
    which the earlier per-file loop never saw.
    """
    seen = set()  # every job folder encountered
    okay = set()  # job folders with at least one listed file

    for dirpath, _subdirs, filenames in os.walk(directory):
        parts = dirpath.split(os.sep)
        if len(parts) <= job_depth:
            continue  # Path does not reach the job-folder level.
        job_folder = parts[job_depth]
        seen.add(job_folder)
        if any(os.path.splitext(name)[1] in ext_list for name in filenames):
            okay.add(job_folder)

    return sorted(seen - okay)


bad_list = get_filepaths(os.path.expandvars(in_path), ext_list)
我正在尝试 return 所有目录的唯一列表 (set
),如果它们不包含某些文件类型。如果未找到该文件类型,请将该目录名称添加到列表中以供进一步审核。
下面的函数将找到所有有效的文件夹并将其添加到一个集合中以供进一步比较。我想将其扩展到仅 return 那些不包含 out_list
中的文件的目录。这些目录可以包含 out_list
中带有文件的子目录。如果那是真的,我只想要有效目录的文件夹名称的路径。
# directory = r'w:\workorder'
#
# Example:
# w:\workorder\region1\12345678\hi.pdf
# w:\workorder\region2\23456789\test\bye.pdf
# w:\workorder\region3\34567891\<empty>
# w:\workorder\region4\45678912\Final.doc
#
# Results:
# ['34567891', '45678912']
job_folders = set()  # Job IDs collected by get_filepaths; a set keeps them unique.
out_list = [".pdf", ".ppt", ".txt"]


def get_filepaths(directory):
    """Return the full paths of the immediate subdirectories of *directory*.

    Only the first level below *directory* is listed (``os.listdir``);
    nested directories are not visited and plain files are ignored.

    Side effect: for every subdirectory whose name starts with an ASCII
    digit, the first eight characters of the name are added to the
    module-level ``job_folders`` set.

    ``out_list`` is not consulted here, so this function performs no
    filtering by file type.
    """
    folder_paths = []
    for item in os.listdir(directory):
        folderpath = os.path.join(directory, item)
        if not os.path.isdir(folderpath):
            continue
        if re.search(r'^[0-9]', item):
            job_folders.add(item[:8])
        # Every subdirectory is returned, whether or not its name is numeric.
        folder_paths.append(folderpath)
    return folder_paths
获取文件扩展名:
# Split off the extension of the entry and record the first eight characters
# of its name only when that extension is not one of those in out_list.
name, ext = os.path.splitext(os.path.join(directory, item))
if ext not in out_list:
    job_folders.add(item[:8])
这是你想要的吗?
import os
def main():
    """Print each directory reported by get_directories_without_exts."""
    exts = {'.pdf', '.ppt', '.txt'}
    # Raw string keeps the backslash literal: '\w' is not a valid escape
    # sequence and recent Python versions warn about it.
    for directory in get_directories_without_exts(r'W:\workorder', exts):
        print(directory)
def get_directories_without_exts(root, exts):
    """Yield each directory under *root* that directly holds no matching file.

    The tree is traversed with ``os.walk`` (so *root* itself is included).
    A directory is yielded when none of the files directly inside it has an
    extension contained in *exts*; files in its subdirectories are not taken
    into account. Extensions are compared exactly as ``os.path.splitext``
    returns them (leading dot, case-sensitive).
    """
    # Distinct loop names: the original rebound the ``root`` parameter.
    for dirpath, _dirnames, filenames in os.walk(root):
        if not any(os.path.splitext(name)[1] in exts for name in filenames):
            yield dirpath
# Run main() only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
编辑: 看了你的要求后,我决定创建一个树对象来分析你的目录结构。创建后,很容易通过缓存进行递归查询以查找目录是否为 "is okay." 从那里创建一个仅查找 "not okay" 的顶级目录的生成器非常简单。可能有更好的方法来执行此操作,但代码至少应该有效。
import os
def main():
    """Print every path produced by the ``not_okay`` generator of the Tree."""
    exts = {'.pdf', '.ppt', '.txt'}
    # Raw string keeps the backslash literal: '\w' is not a valid escape
    # sequence and recent Python versions warn about it.
    for directory in Tree(r'W:\workorder', exts).not_okay:
        print(directory)
class Tree:
    """In-memory picture of a directory tree and the file extensions in it.

    The whole subtree below *root* is scanned eagerly while the object is
    constructed; ``is_okay`` and ``not_okay`` then answer from that snapshot.
    """

    def __init__(self, root, exts):
        """Scan *root* recursively.

        root -- path of the directory to analyse.
        exts -- collection of extensions (as returned by
                ``os.path.splitext``, e.g. '.pdf') that make a directory
                "okay".

        Raises ValueError when *root* is not a directory.
        """
        if not os.path.isdir(root):
            raise ValueError('root must be a directory')
        self.name = root
        self.exts = exts
        # Despite the name, this holds the *extensions* of the files that sit
        # directly in this directory, not the file names.
        self.files = set()
        # One Tree per immediate subdirectory.
        self.directories = []
        try:
            names = os.listdir(root)
        except OSError:
            # Directory could not be listed: treat it as empty.
            pass
        else:
            for child in names:
                path = os.path.join(root, child)
                if os.path.isfile(path):
                    self.files.add(os.path.splitext(child)[1])
                elif os.path.isdir(path):
                    self.directories.append(self.__class__(path, exts))
        # Cache for is_okay; None means "not computed yet".
        self._is_okay = None

    @property
    def is_okay(self):
        """True when this directory or any descendant holds a file in exts."""
        if self._is_okay is None:
            self._is_okay = (
                any(c.is_okay for c in self.directories)
                or any(c in self.exts for c in self.files)
            )
        return self._is_okay

    @property
    def not_okay(self):
        """Yield the topmost directories whose whole subtree has no match.

        An okay directory is never yielded itself; its children are searched
        instead. A directory that is not okay is yielded by name and its
        children are not examined further.
        """
        if self.is_okay:
            for child in self.directories:
                for not_okay in child.not_okay:
                    yield not_okay
        else:
            yield self.name
# Run main() only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
您是否从其他地方复制并粘贴了现有代码?因为文档字符串似乎是 os.walk
...
你的问题有几点不清楚:
- 您声明代码的目标是 "return a unique list (set) of all directories if they do not contain certain file types"。
- 首先
list
和set
是不同的数据结构 - 其次,您的代码创建了每个之一:
job_folders
是一个包含数字的文件夹名称set
,而folder_paths
是一个list
个文件夹的完整路径,无论它们是否包含数字。 - 你实际上想要什么作为这里的输出?
- 首先
- "those directories that DO NOT contain files in the out_list"应该递归定义,还是只包含那些目录的一级内容? 我的解决方案假设是后者
- 您的示例在这一点上是矛盾的:结果中列出了
34567891
,却没有列出 region3
。无论定义是否递归,region3
都应该包含在结果中,因为 region3
下没有任何带所列扩展名的文件。
- 您的示例在这一点上是矛盾的:它在结果中显示
-
job_folders
应该只填充满足其内容标准的目录,还是 所有 包含数字的文件夹名称? 我的解决方案假设是后者
我要强调的是,您的代码中有一个不良做法:使用了全局变量 out_list
和 job_folders
。我已将前者改为 get_filepaths
的第二个参数,将后者改为第二个返回值。
不管怎样,解决方法来了...
import os, re
ext_list = [".pdf", ".ppt", ".txt"]


def get_filepaths(directory, ext_list):
    """Walk *directory* and return ``(folder_paths, job_folders)``.

    folder_paths -- list of directories, relative to *directory* ('.' for
        the top itself), that do not directly contain any file whose
        extension is in *ext_list*. Files in subdirectories are not
        considered for the parent.
    job_folders -- set holding the first eight characters of the name of
        every visited directory whose name starts with an ASCII digit,
        regardless of what the directory contains.
    """
    folder_paths = []
    job_folders = set()

    # ``dirpath`` rather than ``dir`` so the builtin is not shadowed.
    for dirpath, _subdirs, filenames in os.walk(directory):
        _, lastlevel = os.path.split(dirpath)
        if re.search(r'^[0-9]', lastlevel):
            job_folders.add(lastlevel[:8])

        # No file here carries a listed extension -> report the directory.
        if not any(os.path.splitext(name)[1] in ext_list for name in filenames):
            folder_paths.append(os.path.relpath(dirpath, directory))

    return folder_paths, job_folders
我在 /tmp
下创建了一个与您的目录结构相同的目录结构,并运行了以下代码:
folder_paths, job_folders = get_filepaths(os.path.expandvars(r"%TEMP%\workorder"), ext_list)
# Single-argument print(...) with %-formatting produces the same text under
# Python 2 and Python 3; the original print statements were Python 2 only.
print("folder_paths = %s" % (folder_paths,))
print("job_folders = %s" % (job_folders,))
这是输出:
folder_paths = ['.', 'region1', 'region2', 'region2\23456789', 'region3', 'region3\34567891', 'region4', 'region4\456789123']
job_folders = set(['12345678', '23456789', '34567891', '45678912'])
如您所见,region1\12345678
和 region2\23456789\test
未包含在输出 folder_paths
中,因为它们直接包含指定扩展名的文件;所有其他子目录都包含在输出中,因为它们不直接包含指定扩展名的文件。
多亏了@DanLenski 和@NoctisSkytower,我才得以解决这个问题。
我的 WorkOrder
目录在遍历 in_path
时总是位于第 7 层文件夹,这是我通过按 os.sep
拆分路径找到的。
我借鉴了你们的两种解决方案并得出了以下结论:
import os, re
ext_list = [".pdf"]
# NOTE(review): a UNC share path normally begins with two backslashes, and the
# number of leading separators changes which component index 7 refers to --
# confirm this value against the real share.
in_path = r'\server\E\Data\WorkOrder'


def get_filepaths(directory, ext_list, job_depth=7):
    """Return the sorted job folders under *directory* that hold no listed file.

    Every directory visited by ``os.walk`` is split on ``os.sep``; the
    component at index *job_depth* is taken as the job folder that the
    directory belongs to. Directories whose path has no such component
    (they sit above the job level) are ignored.

    directory -- root of the tree to walk.
    ext_list  -- extensions (with leading dot) that mark a job folder as fine.
    job_depth -- index of the job-folder component in the split path;
                 defaults to 7, the value that was previously hard-coded.

    A job folder is reported when no file anywhere below it has an extension
    in *ext_list*. This includes job folders that contain no files at all,
    which the earlier per-file loop never saw.
    """
    seen = set()  # every job folder encountered
    okay = set()  # job folders with at least one listed file

    for dirpath, _subdirs, filenames in os.walk(directory):
        parts = dirpath.split(os.sep)
        if len(parts) <= job_depth:
            continue  # Path does not reach the job-folder level.
        job_folder = parts[job_depth]
        seen.add(job_folder)
        if any(os.path.splitext(name)[1] in ext_list for name in filenames):
            okay.add(job_folder)

    return sorted(seen - okay)


bad_list = get_filepaths(os.path.expandvars(in_path), ext_list)