python:仅在当前目录及其子目录中执行脚本,而不返回到根目录
python script only in subdirectories and not going back to root
我的 python 脚本使用 os.walk 遍历子目录。我是在当前工作目录中运行该脚本的,并希望脚本仅在当前工作目录及其子目录中工作。但是,脚本却返回到根目录,并从那里开始查找所有文件。
因此,例如,目录结构是:
文件夹1
- sub1
文件夹2
- sub2
- sub3
如果我在 folder1 中运行我的脚本,我只需要 folder1 和 sub1 中的文件,但我的脚本还会使用 folder2 以及 sub2 和 sub3 中的文件。
我的脚本很大,但希望有人能给我一个简短的答案,告诉我如何更改所有 os.walk 行以防止 python 进入其他文件夹。
#next step
print('Start merging contig files')
# os.walk(os.getcwd()) only ever descends INTO the current working directory,
# so this loop was never the part that escaped to other folders.
for root, dirs, files in os.walk(os.getcwd()):
    filepath = os.path.join(root, 'genes.faa.genespercontig.csv')
    if not os.path.isfile(filepath):
        continue  # only process folders that contain all three result files
    with open(filepath, 'r') as f1:
        df1 = pd.read_csv(f1, header=None, delim_whitespace=True,
                          names=["contig", "genes"])
    # label every row with the name of the folder the file came from
    df1['genome'] = os.path.basename(os.path.dirname(filepath))

    filepath = os.path.join(root, 'hmmer.analyze.txt.results.txt')
    if not os.path.isfile(filepath):
        continue
    with open(filepath, 'r') as f2:
        df2 = pd.read_csv(f2, header=None, delim_whitespace=True,
                          names=["contig", "SCM"])
    df2['genome'] = os.path.basename(os.path.dirname(filepath))

    filepath = os.path.join(root, 'genes.fna.output_blastplasmiddb.out.count_plasmiddbhit.out')
    if not os.path.isfile(filepath):
        continue
    with open(filepath, 'r') as f3:
        df3 = pd.read_csv(f3, header=None, delim_whitespace=True,
                          names=["contig", "plasmid_genes"])
    df3['genome'] = os.path.basename(os.path.dirname(filepath))

    #merge dataframes on genome + contig
    dfmerge1 = pd.merge(df1, df2, on=['genome', 'contig'], how='outer')
    df_end = pd.merge(dfmerge1, df3, on=['genome', 'contig'], how='outer')
    #set NaN in columns to 0 (a contig may be missing from one input file)
    nan_cols = df_end.columns[df_end.isnull().any(axis=0)]
    for col in nan_cols:
        df_end[col] = df_end[col].fillna(0).astype(int)
    #add column with SCM/genes, rounded to 2 decimals
    df_end['SCM/genes'] = (df_end['SCM'] / df_end['genes']).round(2)
    #add column with plasmid_genes/genes, rounded to 2 decimals
    df_end['plasmid_genes/genes'] = (df_end['plasmid_genes'] / df_end['genes']).round(2)
    df_end.to_csv(os.path.join(root, 'outputgenesdf.csv'))
print('extra columns;done')

#next step
#CURRENT DIRECTORY
# BUG FIX: os.path.dirname(os.getcwd()) is the PARENT of the current
# directory -- walking it is what made the script "go back to root" and pick
# up sibling folders (folder2, sub2, sub3). Walk the cwd itself instead.
cd = os.getcwd()
# concatenate csv files
dfList = []
for root, dirs, files in os.walk(cd):
    for fname in files:
        # exact comparison instead of re.match: the unescaped dots in the
        # pattern matched any character, and re.match only anchors the start.
        if fname == 'outputgenesdf.csv':
            frame = pd.read_csv(os.path.join(root, fname))
            dfList.append(frame)
df = pd.concat(dfList)
基于:
[...] and want that the script only works in the current working directory and its subdirectories.[...]
你可以试试这个:
def next_file(directory=None, max_depth=0, depth=0):
    """Yield the path of every file under *directory*.

    max_depth limits how many levels of subdirectories are entered:
    0 = only *directory* itself, 1 = direct subdirectories too,
    -1 = no limit (like os.walk). *depth* is internal recursion state.
    """
    # BUG FIX: the original default was directory=os.getcwd(), which is
    # evaluated ONCE at definition time -- a later os.chdir() was ignored.
    if directory is None:
        directory = os.getcwd()
    if max_depth < 0 or depth <= max_depth:
        for name in os.listdir(directory):
            with_path = os.path.join(directory, name)
            if os.path.isfile(with_path):
                yield with_path
            elif os.path.isdir(with_path):
                # BUG FIX: guard with isdir; the original bare `else`
                # recursed into non-directories (e.g. broken symlinks)
                # and crashed in os.listdir.
                for a_file in next_file(directory=with_path,
                                        max_depth=max_depth,
                                        depth=depth + 1):
                    yield a_file
并使用以下方法处理您的文件:
# Process every file in the current directory and its direct subdirectories.
for a_file in next_file(max_depth=1):
    # print() function instead of the Python 2 print statement, which is a
    # SyntaxError on Python 3 (the rest of the script uses print(...)).
    print('processing file: %s' % a_file)
    # do your stuff here
使用max_depth
来控制要处理的嵌套目录数。 0
读取当前目录中的文件,-1
处理所有目录。 (如 os.walk
)。
编辑
我对文件迭代方法做了一个小修改。
这是您脚本的完整(未经测试)版本:
def next_file(current_dir=None, max_depth=0, depth=0):
    """Yield (directory, filename) pairs for every file under *current_dir*.

    max_depth limits how many levels of subdirectories are entered:
    0 = only *current_dir* itself, 1 = direct subdirectories too,
    -1 = no limit (like os.walk). *depth* is internal recursion state.
    """
    # BUG FIX: current_dir=os.getcwd() as a default is evaluated once at
    # definition time; resolve the cwd per call instead.
    if current_dir is None:
        current_dir = os.getcwd()
    if max_depth < 0 or depth <= max_depth:
        for name in os.listdir(current_dir):
            with_path = os.path.join(current_dir, name)
            if os.path.isfile(with_path):
                yield current_dir, name
            elif os.path.isdir(with_path):
                # guard with isdir: the original bare `else` recursed into
                # non-directories (e.g. broken symlinks) and crashed
                for directory, found in next_file(current_dir=with_path,
                                                  max_depth=max_depth,
                                                  depth=depth + 1):
                    yield directory, found

for directory, name in next_file(max_depth=1):
    # print() function: the original used the Python 2 print statement,
    # which does not parse under Python 3.
    print('file: %s' % name)
print('Start merging contig files')
# FIXED VERSION of the pasted script. The paste had a syntax error
# (`with open(filepath, 'r')` without `as f1:`), set df['genome'] to the
# whole file path instead of the folder name (breaking the merge keys),
# and could merge df1/df2/df3 coming from *different* folders. This
# version processes one folder at a time -- exactly like the question's
# original script -- but only visits the current working directory and
# its direct subdirectories, so it never escapes into sibling folders.
cwd = os.getcwd()
folders = [cwd] + [os.path.join(cwd, entry) for entry in os.listdir(cwd)
                   if os.path.isdir(os.path.join(cwd, entry))]

for root in folders:
    filepath = os.path.join(root, 'genes.faa.genespercontig.csv')
    if not os.path.isfile(filepath):
        continue  # folder is missing one of the three result files: skip it
    with open(filepath, 'r') as f1:
        df1 = pd.read_csv(f1, header=None, delim_whitespace=True,
                          names=["contig", "genes"])
    df1['genome'] = os.path.basename(root)  # folder name, not the file path

    filepath = os.path.join(root, 'hmmer.analyze.txt.results.txt')
    if not os.path.isfile(filepath):
        continue
    with open(filepath, 'r') as f2:
        df2 = pd.read_csv(f2, header=None, delim_whitespace=True,
                          names=["contig", "SCM"])
    df2['genome'] = os.path.basename(root)

    filepath = os.path.join(root, 'genes.fna.output_blastplasmiddb.out.count_plasmiddbhit.out')
    if not os.path.isfile(filepath):
        continue
    with open(filepath, 'r') as f3:
        df3 = pd.read_csv(f3, header=None, delim_whitespace=True,
                          names=["contig", "plasmid_genes"])
    df3['genome'] = os.path.basename(root)

    #merge dataframes on genome + contig
    dfmerge1 = pd.merge(df1, df2, on=['genome', 'contig'], how='outer')
    df_end = pd.merge(dfmerge1, df3, on=['genome', 'contig'], how='outer')
    #set NaN in columns to 0 (a contig may be missing from one input file)
    nan_cols = df_end.columns[df_end.isnull().any(axis=0)]
    for col in nan_cols:
        df_end[col] = df_end[col].fillna(0).astype(int)
    #add ratio columns, rounded to 2 decimals
    df_end['SCM/genes'] = (df_end['SCM'] / df_end['genes']).round(2)
    df_end['plasmid_genes/genes'] = (df_end['plasmid_genes'] / df_end['genes']).round(2)
    # write the merged table next to its input files; the concatenation
    # step below expects one outputgenesdf.csv per folder
    df_end.to_csv(os.path.join(root, 'outputgenesdf.csv'))
print('extra columns;done')

#next step
# concatenate csv files -- walk the current directory, NOT its parent:
# os.path.dirname(os.getcwd()) returned the parent folder, which is what
# made the original script escape into sibling folders.
dfList = []
for root, dirs, files in os.walk(cwd):
    for fname in files:
        if fname == 'outputgenesdf.csv':  # exact match, no regex needed
            frame = pd.read_csv(os.path.join(root, fname))
            dfList.append(frame)
df = pd.concat(dfList)
请注意,您的文件处理逻辑未经测试(我只测试了文件迭代方法)。而且我不熟悉 pandas,但是在每个数据框中,您都设置了一个 genome 属性 df1['genome']
来保存文件的路径。我不确定那是不是你想要的。
最后,我不明白你将 outputgenesdf.csv
文件合并到一个数据框中的最后一步。
希望这对您有所帮助。
我的 python 脚本使用 os.walk 遍历子目录。我是在当前工作目录中运行该脚本的,并希望脚本仅在当前工作目录及其子目录中工作。但是,脚本却返回到根目录,并从那里开始查找所有文件。 因此,例如,目录结构是: 文件夹1 - sub1 文件夹2 - sub2 - sub3
如果我在 folder1 中运行我的脚本,我只需要 folder1 和 sub1 中的文件,但我的脚本还会使用 folder2 以及 sub2 和 sub3 中的文件。 我的脚本很大,但希望有人能给我一个简短的答案,告诉我如何更改所有 os.walk 行以防止 python 进入其他文件夹。
#next step
print('Start merging contig files')
# os.walk(os.getcwd()) only ever descends INTO the current working directory,
# so this loop was never the part that escaped to other folders.
for root, dirs, files in os.walk(os.getcwd()):
    filepath = os.path.join(root, 'genes.faa.genespercontig.csv')
    if not os.path.isfile(filepath):
        continue  # only process folders that contain all three result files
    with open(filepath, 'r') as f1:
        df1 = pd.read_csv(f1, header=None, delim_whitespace=True,
                          names=["contig", "genes"])
    # label every row with the name of the folder the file came from
    df1['genome'] = os.path.basename(os.path.dirname(filepath))

    filepath = os.path.join(root, 'hmmer.analyze.txt.results.txt')
    if not os.path.isfile(filepath):
        continue
    with open(filepath, 'r') as f2:
        df2 = pd.read_csv(f2, header=None, delim_whitespace=True,
                          names=["contig", "SCM"])
    df2['genome'] = os.path.basename(os.path.dirname(filepath))

    filepath = os.path.join(root, 'genes.fna.output_blastplasmiddb.out.count_plasmiddbhit.out')
    if not os.path.isfile(filepath):
        continue
    with open(filepath, 'r') as f3:
        df3 = pd.read_csv(f3, header=None, delim_whitespace=True,
                          names=["contig", "plasmid_genes"])
    df3['genome'] = os.path.basename(os.path.dirname(filepath))

    #merge dataframes on genome + contig
    dfmerge1 = pd.merge(df1, df2, on=['genome', 'contig'], how='outer')
    df_end = pd.merge(dfmerge1, df3, on=['genome', 'contig'], how='outer')
    #set NaN in columns to 0 (a contig may be missing from one input file)
    nan_cols = df_end.columns[df_end.isnull().any(axis=0)]
    for col in nan_cols:
        df_end[col] = df_end[col].fillna(0).astype(int)
    #add column with SCM/genes, rounded to 2 decimals
    df_end['SCM/genes'] = (df_end['SCM'] / df_end['genes']).round(2)
    #add column with plasmid_genes/genes, rounded to 2 decimals
    df_end['plasmid_genes/genes'] = (df_end['plasmid_genes'] / df_end['genes']).round(2)
    df_end.to_csv(os.path.join(root, 'outputgenesdf.csv'))
print('extra columns;done')

#next step
#CURRENT DIRECTORY
# BUG FIX: os.path.dirname(os.getcwd()) is the PARENT of the current
# directory -- walking it is what made the script "go back to root" and pick
# up sibling folders (folder2, sub2, sub3). Walk the cwd itself instead.
cd = os.getcwd()
# concatenate csv files
dfList = []
for root, dirs, files in os.walk(cd):
    for fname in files:
        # exact comparison instead of re.match: the unescaped dots in the
        # pattern matched any character, and re.match only anchors the start.
        if fname == 'outputgenesdf.csv':
            frame = pd.read_csv(os.path.join(root, fname))
            dfList.append(frame)
df = pd.concat(dfList)
基于:
[...] and want that the script only works in the current working directory and its subdirectories.[...]
你可以试试这个:
def next_file(directory=None, max_depth=0, depth=0):
    """Yield the path of every file under *directory*.

    max_depth limits how many levels of subdirectories are entered:
    0 = only *directory* itself, 1 = direct subdirectories too,
    -1 = no limit (like os.walk). *depth* is internal recursion state.
    """
    # BUG FIX: the original default was directory=os.getcwd(), which is
    # evaluated ONCE at definition time -- a later os.chdir() was ignored.
    if directory is None:
        directory = os.getcwd()
    if max_depth < 0 or depth <= max_depth:
        for name in os.listdir(directory):
            with_path = os.path.join(directory, name)
            if os.path.isfile(with_path):
                yield with_path
            elif os.path.isdir(with_path):
                # BUG FIX: guard with isdir; the original bare `else`
                # recursed into non-directories (e.g. broken symlinks)
                # and crashed in os.listdir.
                for a_file in next_file(directory=with_path,
                                        max_depth=max_depth,
                                        depth=depth + 1):
                    yield a_file
并使用以下方法处理您的文件:
# Process every file in the current directory and its direct subdirectories.
for a_file in next_file(max_depth=1):
    # print() function instead of the Python 2 print statement, which is a
    # SyntaxError on Python 3 (the rest of the script uses print(...)).
    print('processing file: %s' % a_file)
    # do your stuff here
使用max_depth
来控制要处理的嵌套目录数。 0
读取当前目录中的文件,-1
处理所有目录。 (如 os.walk
)。
编辑
我对文件迭代方法做了一个小修改。
这是您脚本的完整(未经测试)版本:
def next_file(current_dir=None, max_depth=0, depth=0):
    """Yield (directory, filename) pairs for every file under *current_dir*.

    max_depth limits how many levels of subdirectories are entered:
    0 = only *current_dir* itself, 1 = direct subdirectories too,
    -1 = no limit (like os.walk). *depth* is internal recursion state.
    """
    # BUG FIX: current_dir=os.getcwd() as a default is evaluated once at
    # definition time; resolve the cwd per call instead.
    if current_dir is None:
        current_dir = os.getcwd()
    if max_depth < 0 or depth <= max_depth:
        for name in os.listdir(current_dir):
            with_path = os.path.join(current_dir, name)
            if os.path.isfile(with_path):
                yield current_dir, name
            elif os.path.isdir(with_path):
                # guard with isdir: the original bare `else` recursed into
                # non-directories (e.g. broken symlinks) and crashed
                for directory, found in next_file(current_dir=with_path,
                                                  max_depth=max_depth,
                                                  depth=depth + 1):
                    yield directory, found

for directory, name in next_file(max_depth=1):
    # print() function: the original used the Python 2 print statement,
    # which does not parse under Python 3.
    print('file: %s' % name)
print('Start merging contig files')
# FIXED VERSION of the pasted script. The paste had a syntax error
# (`with open(filepath, 'r')` without `as f1:`), set df['genome'] to the
# whole file path instead of the folder name (breaking the merge keys),
# and could merge df1/df2/df3 coming from *different* folders. This
# version processes one folder at a time -- exactly like the question's
# original script -- but only visits the current working directory and
# its direct subdirectories, so it never escapes into sibling folders.
cwd = os.getcwd()
folders = [cwd] + [os.path.join(cwd, entry) for entry in os.listdir(cwd)
                   if os.path.isdir(os.path.join(cwd, entry))]

for root in folders:
    filepath = os.path.join(root, 'genes.faa.genespercontig.csv')
    if not os.path.isfile(filepath):
        continue  # folder is missing one of the three result files: skip it
    with open(filepath, 'r') as f1:
        df1 = pd.read_csv(f1, header=None, delim_whitespace=True,
                          names=["contig", "genes"])
    df1['genome'] = os.path.basename(root)  # folder name, not the file path

    filepath = os.path.join(root, 'hmmer.analyze.txt.results.txt')
    if not os.path.isfile(filepath):
        continue
    with open(filepath, 'r') as f2:
        df2 = pd.read_csv(f2, header=None, delim_whitespace=True,
                          names=["contig", "SCM"])
    df2['genome'] = os.path.basename(root)

    filepath = os.path.join(root, 'genes.fna.output_blastplasmiddb.out.count_plasmiddbhit.out')
    if not os.path.isfile(filepath):
        continue
    with open(filepath, 'r') as f3:
        df3 = pd.read_csv(f3, header=None, delim_whitespace=True,
                          names=["contig", "plasmid_genes"])
    df3['genome'] = os.path.basename(root)

    #merge dataframes on genome + contig
    dfmerge1 = pd.merge(df1, df2, on=['genome', 'contig'], how='outer')
    df_end = pd.merge(dfmerge1, df3, on=['genome', 'contig'], how='outer')
    #set NaN in columns to 0 (a contig may be missing from one input file)
    nan_cols = df_end.columns[df_end.isnull().any(axis=0)]
    for col in nan_cols:
        df_end[col] = df_end[col].fillna(0).astype(int)
    #add ratio columns, rounded to 2 decimals
    df_end['SCM/genes'] = (df_end['SCM'] / df_end['genes']).round(2)
    df_end['plasmid_genes/genes'] = (df_end['plasmid_genes'] / df_end['genes']).round(2)
    # write the merged table next to its input files; the concatenation
    # step below expects one outputgenesdf.csv per folder
    df_end.to_csv(os.path.join(root, 'outputgenesdf.csv'))
print('extra columns;done')

#next step
# concatenate csv files -- walk the current directory, NOT its parent:
# os.path.dirname(os.getcwd()) returned the parent folder, which is what
# made the original script escape into sibling folders.
dfList = []
for root, dirs, files in os.walk(cwd):
    for fname in files:
        if fname == 'outputgenesdf.csv':  # exact match, no regex needed
            frame = pd.read_csv(os.path.join(root, fname))
            dfList.append(frame)
df = pd.concat(dfList)
请注意,您的文件处理逻辑未经测试(我只测试了文件迭代方法)。而且我不熟悉 pandas,但是在每个数据框中,您都设置了一个 genome 属性 df1['genome']
来保存文件的路径。我不确定那是不是你想要的。
最后,我不明白你将 outputgenesdf.csv
文件合并到一个数据框中的最后一步。
希望这对您有所帮助。