My code is ugly: extracting only the files I want from a list of files
My code does the job, but it is ugly, too long, and clumsy. I have to process thousands of files that fall into 4 groups, and I only want one specific type.

I want:
'.docx'
I don't want:
'.pdf', 'SS.docx' or 'ss.docx'

I tried a few if not conditions, but they didn't really work. In the end I built lists of every file type and anti-joined them against the full list, so that only the files I am interested in are kept.

Questions:
Can my if/elif block be simplified? Is there a shorter way of getting the files I need directly?
Can the DataFrame generation be packed into a loop instead of doing it by hand each time?
#List all dirs under given dirs and subdirs
import os
import pandas as pd
import glob
import docx
from docx.api import Document
#fixed variable
location = r'C:\Data_analysis\N_TRACKING'
#all lists
dirs_in_dir = []
SS_files_in_dir = []
ss_files_in_dir = []
pdfs_in_dir = []
targets_in_dir = []
all_files = []
#active mapping of the directory tree and the files in it : List all dirs under given dirs and subdirs and add to list dirs_in_dirs
# r=>root, d=>directories, f=>files
for r, d, f in os.walk(location):
    for item in d:
        if '' in item:
            dirs_in_dir.append(os.path.join(r, item))
for r, d, f in os.walk(location):
    for item in f:
        if '' in item:
            all_files.append(os.path.join(r, item))
#active mapping: list all pdfs and add to list pdfs_in_dir,
#list all SS containing files and add to list files_in_dir,
#list all.docx files and add to list targets_in_dir
# r=>root, d=>directories, f=>files
for r, d, f in os.walk(location):
    for item in f:
        if '.pdf' in item:
            pdfs_in_dir.append(os.path.join(r, item))
        elif 'SS' in item:
            SS_files_in_dir.append(os.path.join(r, item))
        elif 'ss' in item:
            ss_files_in_dir.append(os.path.join(r, item))
        elif '.docx' in item:
            targets_in_dir.append(os.path.join(r, item))
#antijoin: step one creating df
SS_files_df = pd.DataFrame(SS_files_in_dir)
ss_files_df = pd.DataFrame(ss_files_in_dir)
pdfs_df = pd.DataFrame(pdfs_in_dir)
all_files_df = pd.DataFrame(all_files)
all_files_df.columns=['Files']
SS_files_df.columns=['Files']
ss_files_df.columns=['Files']
pdfs_df.columns=['Files']
all_files_df.columns=['Files']
#antijoin: step 2 subtract all other df from all_files_df
#remove pdf df
no_pdfs = all_files_df.merge(pdfs_df, on='Files', how='left', indicator=True)
index_names = no_pdfs[no_pdfs['_merge'] == 'both'].index
# drop these row indexes
# from dataFrame
no_pdfs.drop(index_names, inplace = True)
no_pdfs.drop(['_merge'], axis = 1, inplace = True)
no_ss = no_pdfs
#remove ss_files
no_ss = no_ss.merge(ss_files_df, on='Files', how='left', indicator=True)
index_names = no_ss[no_ss['_merge'] == 'both'].index
# drop these row indexes
# from dataFrame
no_ss.drop(index_names, inplace = True)
no_ss.drop(['_merge'], axis = 1, inplace = True)
no_SS = no_ss
#remove SS_files
no_SS = no_SS.merge(SS_files_df, on='Files', how='left', indicator=True)
index_names = no_SS[no_SS['_merge'] == 'both'].index
# drop these row indexes
# from dataFrame
no_SS.drop(index_names, inplace = True)
no_SS.drop(['_merge'], axis = 1, inplace = True)
Since you:
- only want '.docx' (i.e. the decision is made by the suffix)
- don't want '.pdf', 'SS.docx' or 'ss.docx' (i.e. files with those endings)
this can be done more simply, as shown below.
Code -- Option 1: using str.endswith
import os
def find_docx(location):
    all_files = []  # Will contain found files
    # Walk top-down through directory tree
    for (root, dirs, files) in os.walk(location, topdown=True):
        for f in files:
            name = f.lower()  # Lower-cased copy makes the checks case insensitive
            if name.endswith('.pdf'):
                continue  # Skip pdf
            if name.endswith('ss.docx'):
                continue  # Skip ss.docx or SS.docx
            if name.endswith('.docx'):
                # Desired docx (undesired docx was filtered out by the previous checks)
                all_files.append(os.path.join(root, f))  # Keep the original file name
    return all_files
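As a side note, the same filter can also be sketched with pathlib's recursive glob, assuming Python 3.5+ (the helper name find_docx_pathlib is just for illustration):

from pathlib import Path

def find_docx_pathlib(location):
    results = []  # Will contain found files
    for p in Path(location).rglob('*'):  # Recursive walk over everything under location
        name = p.name.lower()  # Case-insensitive comparison
        if p.is_file() and name.endswith('.docx') and not name.endswith('ss.docx'):
            results.append(str(p))  # Keep the original path
    return results

The behaviour matches Option 1: anything not ending in .docx (including .pdf) is ignored, and ss.docx / SS.docx endings are skipped.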
Code -- Option 2: using regular expressions
import os
import re

def find_docx(location):
    desired = re.compile(r".*\.docx$", re.IGNORECASE)  # .docx suffix
    undesired = re.compile(r".*(\.pdf|ss\.docx)$", flags=re.IGNORECASE)  # .pdf and ss.docx suffixes
    all_files = []  # Will contain found files
    # Walk top-down through directory tree
    for (root, dirs, files) in os.walk(location, topdown=True):
        for f in files:
            if desired.match(f) and not undesired.match(f):
                # Matches desired and doesn't match undesired
                all_files.append(os.path.join(root, f))
    return all_files
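As a quick sanity check, the two patterns behave as intended on a few made-up file names (the names below are purely illustrative):

import re

desired = re.compile(r".*\.docx$", re.IGNORECASE)
undesired = re.compile(r".*(\.pdf|ss\.docx)$", re.IGNORECASE)

for name in ['report.docx', 'report_SS.docx', 'notes.pdf', 'summary.DOCX']:
    keep = bool(desired.match(name)) and not undesired.match(name)
    print(name, '->', 'keep' if keep else 'skip')
# Prints: report.docx -> keep, report_SS.docx -> skip,
#         notes.pdf -> skip, summary.DOCX -> keep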
Usage
Use either of the find_docx functions above.
location = r'C:\Data_analysis\N_TRACKING'
all_files = find_docx(location)
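Regarding the second question (packing the DataFrame generation into a loop): if you want to keep the pandas anti-join approach, the three merge/drop blocks can be collapsed into one loop over the unwanted lists. A rough sketch, reusing the list names from the question:

import pandas as pd

remaining = pd.DataFrame({'Files': all_files})
for unwanted in (pdfs_in_dir, SS_files_in_dir, ss_files_in_dir):
    unwanted_df = pd.DataFrame({'Files': unwanted})
    merged = remaining.merge(unwanted_df, on='Files', how='left', indicator=True)
    # Anti-join: keep only rows that did not appear in the unwanted list
    remaining = merged[merged['_merge'] == 'left_only'].drop(columns='_merge')

An even shorter variant skips the merge entirely and uses remaining = remaining[~remaining['Files'].isin(unwanted)] inside the same loop. That said, filtering during the walk (as in the find_docx functions) avoids the intermediate lists and DataFrames altogether, so at most one DataFrame needs to be built at the end.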