python 仅在子目录中执行脚本并且不会返回到根目录

python script only in subdirectories and not going back to root

我的 python 脚本使用 os.walk 遍历子目录。我从当前工作目录运行脚本,并希望该脚本仅在当前工作目录及其子目录中运行。但是,该脚本会向上回到根目录,并从那里开始查找所有文件。例如,目录结构是:

folder1
  - sub1
folder2
  - sub2
  - sub3

如果我 运行 我的脚本在 folder1 中,我只需要 folder1 和 sub1 中的文件,但我的脚本还使用 folder2 和 sub2 和 sub3 中的文件。 我的脚本很大,但希望有人能给我一个简短的答案,告诉我如何更改所有 os.walk 行以防止 python 进入其他文件夹。

#next step
# Merge the three per-directory result files into one dataframe per
# directory and write it back next to its inputs as outputgenesdf.csv.
print('Start merging contig files')

# (filename, value column) for each of the three expected input files.
# The original code repeated the same read block three times; a loop over
# these specs keeps the behavior identical without the duplication.
_INPUT_SPECS = [
    ('genes.faa.genespercontig.csv', 'genes'),
    ('hmmer.analyze.txt.results.txt', 'SCM'),
    ('genes.fna.output_blastplasmiddb.out.count_plasmiddbhit.out', 'plasmid_genes'),
]

# NOTE: os.walk(os.getcwd()) visits only the current working directory and
# everything below it; it never ascends to the parent directory.
for root, dirs, files in os.walk(os.getcwd()):
    frames = []
    for fname, value_col in _INPUT_SPECS:
        filepath = os.path.join(root, fname)
        if not os.path.isfile(filepath):
            break  # one of the three inputs is missing: skip this directory
        frame = pd.read_csv(filepath, header=None, delim_whitespace=True,
                            names=["contig", value_col])
        # Tag every row with the directory name so the merges can key on it
        # (os.path.dirname(filepath) is exactly `root`).
        frame['genome'] = os.path.basename(root)
        frames.append(frame)
    if len(frames) != len(_INPUT_SPECS):
        continue

    # merge dataframes: outer joins on genome+contig keep contigs that are
    # missing from one of the files.
    df_end = frames[0]
    for frame in frames[1:]:
        df_end = pd.merge(df_end, frame, on=['genome', 'contig'], how='outer')

    # set NaN in columns to 0 (NaNs come from the outer merges; the columns
    # hold integer counts)
    nan_cols = df_end.columns[df_end.isnull().any(axis=0)]
    for col in nan_cols:
        df_end[col] = df_end[col].fillna(0).astype(int)

    # add column with SCM/genes, rounded to 2 decimals
    df_end['SCM/genes'] = (df_end['SCM'] / df_end['genes']).round(2)

    # add column with plasmid_genes/genes, rounded to 2 decimals
    df_end['plasmid_genes/genes'] = (df_end['plasmid_genes'] / df_end['genes']).round(2)

    df_end.to_csv(os.path.join(root, 'outputgenesdf.csv'))
print('extra columns;done')

#next step
# BUG FIX (this is the behavior the question complains about):
# os.path.dirname(os.getcwd()) is the *parent* of the current working
# directory, so the walk escaped into sibling folders (folder2, sub2, sub3).
# Walking os.getcwd() itself stays inside the current directory and its
# subdirectories only.
cd = os.getcwd()

# concatenate the per-directory outputgenesdf.csv files
dfList = []

for root, dirs, files in os.walk(cd):
    for fname in files:
        # Exact comparison instead of re.match: the unescaped '.' in the
        # regex would also match names like 'outputgenesdfXcsv'.
        if fname == 'outputgenesdf.csv':
            dfList.append(pd.read_csv(os.path.join(root, fname)))

# pd.concat raises ValueError on an empty list; fall back to an empty frame.
df = pd.concat(dfList) if dfList else pd.DataFrame()

基于:

[...] and want that the script only works in the current working directory and its subdirectories.[...]

你可以试试这个:

def next_file(directory=None, max_depth=0, depth=0):
    """Yield the paths of files under *directory*, descending at most
    *max_depth* levels into subdirectories.

    max_depth=0 reads only *directory* itself; max_depth=-1 descends without
    limit (like os.walk). *depth* is internal recursion bookkeeping.

    BUG FIX: the original default ``directory=os.getcwd()`` was evaluated
    once at definition time, so a later os.chdir() was silently ignored;
    ``None`` defers the lookup to call time.
    """
    if directory is None:
        directory = os.getcwd()

    if max_depth < 0 or depth <= max_depth:
        for name in os.listdir(directory):
            with_path = os.path.join(directory, name)

            if os.path.isfile(with_path):
                yield with_path
            elif os.path.isdir(with_path):
                # Explicit isdir check: entries that are neither regular
                # files nor directories (e.g. broken symlinks) previously
                # made the recursive os.listdir call raise.
                for a_file in next_file(directory=with_path,
                                        max_depth=max_depth,
                                        depth=depth + 1):
                    yield a_file

并使用以下方法处理您的文件:

# Process files in the current directory and one level of subdirectories.
for a_file in next_file(max_depth=1):
    # FIX: print() call instead of the Python 2 print statement, which is a
    # syntax error under Python 3 (the rest of the file uses print()).
    print('processing file: %s' % a_file)
    # do your stuff here

使用max_depth 来控制要处理的嵌套目录数。 0 读取当前目录中的文件,-1 处理所有目录。 (如 os.walk)。

编辑

我对文件迭代方法做了一个小修改。

这是您脚本的完整(未经测试)版本:

def next_file(current_dir=None, max_depth=0, depth=0):
    """Yield (directory, filename) pairs for files under *current_dir*,
    descending at most *max_depth* levels into subdirectories.

    max_depth=0 reads only *current_dir* itself; max_depth=-1 descends
    without limit (like os.walk). *depth* is internal recursion bookkeeping.

    BUG FIX: the original default ``current_dir=os.getcwd()`` was evaluated
    once at definition time; ``None`` defers the lookup to call time so a
    later os.chdir() is honoured.
    """
    if current_dir is None:
        current_dir = os.getcwd()

    if max_depth < 0 or depth <= max_depth:
        for name in os.listdir(current_dir):
            with_path = os.path.join(current_dir, name)

            if os.path.isfile(with_path):
                yield current_dir, name
            elif os.path.isdir(with_path):
                # Explicit isdir check so odd entries (e.g. broken symlinks)
                # do not make the recursive os.listdir call raise.
                for directory, fname in next_file(current_dir=with_path,
                                                  max_depth=max_depth,
                                                  depth=depth + 1):
                    yield directory, fname


# Quick demonstration: list files up to one directory level deep.
for directory, name in next_file(max_depth=1):
    # FIX: print() call instead of the Python 2 print statement, which is a
    # syntax error under Python 3 (the rest of the file uses print()).
    print('file: %s' % name)

print('Start merging contig files')

## for root, dirs, files in os.walk(os.getcwd()):
for directory, name in next_file(max_depth=1):

    ## filepath = os.path.join(root, 'genes.faa.genespercontig.csv')
    filepath = os.path.join(directory, name)

    ## if os.path.isfile(filepath):
    if name == 'genes.faa.genespercontig.csv':
        ## with open(filepath, 'r') as f1:
        # FIX: the posted version dropped 'as f1:' here, which is a syntax
        # error and would have left f1 unbound.
        with open(filepath, 'r') as f1:
            df1 = pd.read_csv(f1, header=None, delim_whitespace=True, names = ["contig", "genes"])
            ## df1['genome'] = os.path.basename(os.path.dirname(filepath))
            # NOTE(review): this stores the full file path, not the directory
            # name, as the merge key -- confirm that is really intended.
            df1['genome'] = filepath
    ## else:  # Not necessary
    ##     continue

    ## filepath = os.path.join(root, 'hmmer.analyze.txt.results.txt')
    ## if os.path.isfile(filepath):
    if name == 'hmmer.analyze.txt.results.txt':
        with open(filepath, 'r') as f2:
            df2 = pd.read_csv(f2, header=None, delim_whitespace=True, names = ["contig", "SCM"])
            ## df2['genome'] = os.path.basename(os.path.dirname(filepath))
            df2['genome'] = filepath
    ## else:
    ##     continue

    ## filepath = os.path.join(root, 'genes.fna.output_blastplasmiddb.out.count_plasmiddbhit.out')
    ## if os.path.isfile(filepath):
    if name == 'genes.fna.output_blastplasmiddb.out.count_plasmiddbhit.out':
        with open(filepath, 'r') as f3:
            df3 = pd.read_csv(f3, header=None, delim_whitespace=True, names = ["contig", "plasmid_genes"])
            ## df3['genome'] = os.path.basename(os.path.dirname(filepath))
            df3['genome'] = filepath
        ## else:
        ##     continue

#merge dataframes
# NOTE(review): df1/df2/df3 hold whatever the preceding loop assigned *last*;
# if any of the three file kinds was never found this raises NameError, and
# the merge combines only the last file of each kind rather than one set per
# directory -- confirm that is really the intent.
dfmerge1 = pd.merge(df1, df2, on=['genome', 'contig'], how='outer')
df_end = pd.merge(dfmerge1, df3, on=['genome', 'contig'], how='outer')

#set NaN in columns to 0 (NaNs come from the outer merges; counts are ints)
nan_cols = df_end.columns[df_end.isnull().any(axis=0)]
for col in nan_cols:
    df_end[col] = df_end[col].fillna(0).astype(int)

#add column with SCM/genes, rounded to 2 decimals
df_end['SCM/genes'] = (df_end['SCM'] / df_end['genes']).round(2)

#add column with plasmid_genes/genes, rounded to 2 decimals
df_end['plasmid_genes/genes'] = (df_end['plasmid_genes'] / df_end['genes']).round(2)

#CURRENT DIRECTORY
# BUG FIX: os.path.dirname(os.getcwd()) is the *parent* directory -- exactly
# what let the script escape into sibling folders. Use the current working
# directory itself.
cd = os.getcwd()
df_end.to_csv(os.path.join(cd, 'outputgenesdf.csv'))
print('extra columns;done')

#next step
# concatenate csv files
dfList = []

for root, dirs, files in os.walk(cd):
    for fname in files:
        # Exact comparison instead of re.match: the unescaped '.' in the
        # regex would also match names like 'outputgenesdfXcsv'.
        if fname == 'outputgenesdf.csv':
            dfList.append(pd.read_csv(os.path.join(root, fname)))

# pd.concat raises ValueError on an empty list; fall back to an empty frame.
df = pd.concat(dfList) if dfList else pd.DataFrame()

请注意,您的文件处理逻辑未经测试(我只测试了文件迭代方法)。而且我不熟悉 pandas,但是在每个数据框中,您都设置了一个 genome 属性 df1['genome'] 来保存文件的完整路径。我不确定那是不是你想要的。

最后,我不明白你将 outputgenesdf.csv 文件合并到一个数据框中的最后一步。

希望这对您有所帮助。