计算复杂文件夹结构中每个文件夹的文件数?

Compute number of files per folder in a complex folder structure?

我通过导入其中包含文件的文件夹结构创建了一个简单的 data.tree

# Install pacman on demand, then use it to fetch pathr from GitHub.
# requireNamespace() only checks that the package is available; pacman is
# always called through `::` below, so it does not need to be attached.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load_gh("trinker/pathr")

library(pathr)
library(data.tree)

# Import the folder (files included) as a data.tree structure.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
folder_structure <- pathr::tree(
  path = "/Users/username/Downloads/top_level/",
  use.data.tree = TRUE,
  include.files = TRUE
)

现在,我想将对象 folder_structure 转换为一个 data.frame,每个文件夹一行,一列指定每个文件夹包含多少文件。我怎样才能做到这一点?

例如,我的文件夹结构非常简单:

top_level_folder
    sub_folder_1
        file1.txt
    sub_folder_2
        file2.txt

回答这个问题将涉及创建如下所示的输出:

Folders             Files
top_level_folder    0
sub_folder_1        1
sub_folder_2        1

第一列可以通过调用list.dirs("/Users/username/Downloads/top_level/")简单地生成,但我不知道如何生成第二列。请注意,第二列是非递归的,这意味着不计算子文件夹中的文件(即 top_level_folder 包含 0 个文件,即使 top_level_folder 的子文件夹包含 2 个文件)。

如果您想查看您的解决方案是否可扩展,请下载 Rails 代码库:https://github.com/rails/rails/archive/master.zip 并在 Rails 更复杂的文件结构上尝试。

您可以将 dplyr 管道链与 pathr 包中的 parse_path() 函数结合使用。tree 函数基本上只是 parse_path 的包装器,因此直接使用 parse_path 更简单。例如:

library(pathr)
library(dplyr)

# Tally how many file paths fall under each entry at the chosen path depth.
fls <- dir("C:/RBuildTools/3.3", recursive = TRUE, full.names = TRUE) %>%
  parse_path() %>%
  index(4) %>% # this is where you indicate the level or "depth"
               # of the folder of which you want subfolder file counts
  data.frame(folders = .) %>%
  group_by(folders) %>%
  tally() %>%
  arrange(n)

# if you want to get rid of all the files in your starting folder
# just add a
# filter(n > 1) at the end of the dplyr chain
# (the count column produced by tally() is `n`; note that this also drops
# any folder that holds exactly one file)

对我来说,上面的代码产生了以下结果:

> fls
# A tibble: 12 × 2
        folders     n
         <fctr> <int>
1       COPYING     1
2    README.txt     1
3    Rtools.txt     1
4  unins000.dat     1
5  unins000.exe     1
6   VERSION.txt     1
7           bin    56
8    mingw_libs   200
9      texinfo5   356
10    gcc-4.6.3  3787
11     mingw_32 13707
12     mingw_64 14619
# Build the small example tree from the question in the working directory.
dir.create("top_level_folder")
dir.create("top_level_folder/sub_folder_1")
dir.create("top_level_folder/sub_folder_2")
a <- "hello"
save(a, file = "top_level_folder/sub_folder_1/file1.txt")
save(a, file = "top_level_folder/sub_folder_2/file2.txt")

path <- "top_level_folder"

# Parent directory of every file, relative to `path`. dirname() reports "."
# for files sitting directly in `path`, while list.dirs(full.names = FALSE)
# names that same directory "", so translate before matching.
files <- list.files(path, recursive = TRUE)
folders <- dirname(files)
folders[folders == "."] <- ""

# One row per directory (relative path), in list.dirs() order.
all_folders <- data.frame(
  Folders = list.dirs(path, full.names = FALSE, recursive = TRUE),
  stringsAsFactors = FALSE
)

# Count files per directory by matching on the full relative path, so nested
# folders and same-named folders in different parents are kept apart, and
# folders without files get 0.
output <- data.frame(
  Folders = all_folders$Folders,
  Files = tabulate(
    match(folders, all_folders$Folders),
    nbins = nrow(all_folders)
  ),
  stringsAsFactors = FALSE
)

# The first entry of list.dirs() is the root itself (""); label it by name.
all_folders$Folders[1] <- basename(path)
output$Folders[1] <- basename(path)

#            Folders Files
# 1 top_level_folder     0
# 2     sub_folder_1     1
# 3     sub_folder_2     1

list.dirs() 会返回从起始文件夹可到达的每个子目录组成的向量,这样数据框的第一列就解决了。很方便。

# Collect the starting folder together with every directory nested below it
dir <- "."
xs <- list.dirs(path = dir, full.names = TRUE, recursive = TRUE)

list.files() 可以告诉我们每个文件夹的内容,但它包括文件和文件夹。我们只想要文件。要获取文件数,我们需要使用谓词过滤 list.files() 的输出。 file.info() 可以告诉我们给定的文件是否是目录,因此我们以此为基础构建谓词。

# Predicates built on file.info(): is a path a directory, or is it not?
is_dir <- function(x) {
  info <- file.info(x)
  info[["isdir"]]
}
is_file <- function(...) !is_dir(...)

现在,我们解决如何获取单个文件夹中的文件数。对布尔值求和会返回其中 TRUE 的个数。

# Count the files (non-directories) directly inside a single folder.
#
# dir: path to one directory.
# Returns the number of entries listed in `dir` that are not directories;
# subfolders and anything inside them are not counted.
count_files_in_one_dir <- function(dir) {
  entries <- list.files(dir, full.names = TRUE)
  # dir.exists() never returns NA, so an entry whose status cannot be read
  # (e.g. a dangling symlink, for which file.info() reports NA) is counted as
  # a file instead of turning the whole sum into NA.
  sum(!dir.exists(entries))
}

为方便起见,我们包装了该函数以使其适用于许多文件夹。

# Apply count_files_in_one_dir() to each folder in `dir` and return the
# counts as an unnamed numeric vector, one element per folder
count_files_in_dir <- function(dir) {
  counts <- vapply(
    X = dir,
    FUN = function(folder) count_files_in_one_dir(folder),
    FUN.VALUE = numeric(1)
  )
  unname(counts)
}

现在我们可以统计文件了。

# One row per directory with its direct file count. tibble::tibble() replaces
# data_frame(), which the tibble package has deprecated.
df <- tibble::tibble(
  dir = xs,
  nfiles = count_files_in_dir(xs)
)

df
#> # A tibble: 688 x 2
#>                                                  dir nfiles
#>                                                <chr>  <dbl>
#>  1                                                 .     11
#>  2                                         ./.github      3
#>  3                                     ./actioncable      7
#>  4                                 ./actioncable/app      0
#>  5                          ./actioncable/app/assets      0
#>  6              ./actioncable/app/assets/javascripts      1
#>  7 ./actioncable/app/assets/javascripts/action_cable      5
#>  8                                 ./actioncable/bin      1
#>  9                                 ./actioncable/lib      1
#> 10                    ./actioncable/lib/action_cable      8
#> # ... with 678 more rows

list.files 返回所有文件和目录的路径。没有 is.file 函数,但是有 dir.exists。因为我们知道所有路径都是实际存在的节点,所以那些不是目录的路径将被计为文件。

# List every file and directory below top_level, as paths relative to it.
# The root is passed to list.files() directly instead of calling setwd(), so
# the session's working directory is left untouched.
top_level <- "~/rails-master"
subitems <- data.frame(
  path = list.files(
    path         = top_level,
    include.dirs = TRUE,
    recursive    = TRUE
  ),
  stringsAsFactors = FALSE
)
# The listed paths are relative to top_level, so re-anchor them before
# testing; anything that is not a directory is treated as a file.
subitems$is_file <- !dir.exists(file.path(top_level, subitems$path))

对于每一行,如果路径指向一个目录,那么它本身就是目录路径;如果路径指向一个文件,那么它的父目录就是目录路径。接下来只需按目录路径统计 is_file 为真的次数。

# A directory is its own dir_path; a file is assigned to its parent directory.
subitems$dir_path <- with(
  subitems,
  ifelse(is_file, dirname(path), path)
)
# Summing the logical is_file flags within each dir_path gives the number of
# files assigned to that directory.
file_counts <- tapply(
  X     = subitems$is_file,
  INDEX = subitems$dir_path,
  FUN   = sum
)
result <- data.frame(
  Folders = names(file_counts),
  Files   = file_counts
)

你真正需要做的就是用 list.dirs(默认 recursive = TRUE)创建一个目录列表并遍历它,对每个目录求 list.files 结果的长度(list.files 默认 recursive = FALSE)。整理成一个整洁的 data.frame:

library(purrr)

# For every directory below the library path, count the entries that are
# files. list.files() also returns subdirectories, so its length alone would
# over-count; dir.exists() is used to leave the subdirectories out.
# (Counts recorded with plain length(list.files()) also included subfolders.)
files <- .libPaths()[1] %>%    # omit for current directory or supply alternate path
    list.dirs() %>%
    map_df(~list(path = .x,
                 files = sum(!dir.exists(list.files(.x, full.names = TRUE)))))

files
#> # A tibble: 4,457 x 2
#>                                                                           path files
#>                                                                          <chr> <int>
#>  1              /Library/Frameworks/R.framework/Versions/3.4/Resources/library   314
#>  2        /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind     9
#>  3   /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/help     5
#>  4   /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/html     2
#>  5   /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/Meta     6
#>  6      /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/R     3
#>  7      /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack    14
#>  8 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack/help     5
#>  9 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack/html     2
#> 10 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack/libs     2
#> # ... with 4,447 more rows

如果你愿意,也可以全部用 base R 实现:

# Base-R version: one row per directory below the library path, holding the
# number of files directly inside it. The counts are computed in one vapply()
# pass and assembled into a single data.frame rather than rbind-ing one small
# data.frame per directory.
files <- local({
  dirs <- list.dirs(.libPaths()[1])
  data.frame(
    path = dirs,
    # list.files() returns subdirectories as well; keep only non-directories
    files = vapply(
      dirs,
      function(path) sum(!dir.exists(list.files(path, full.names = TRUE))),
      integer(1),
      USE.NAMES = FALSE
    ),
    stringsAsFactors = FALSE
  )
})

head(files)
#>                                                                        path files
#> 1            /Library/Frameworks/R.framework/Versions/3.4/Resources/library   314
#> 2      /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind     9
#> 3 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/help     5
#> 4 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/html     2
#> 5 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/Meta     6
#> 6    /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/R     3

这是一个非常紧凑的解决方案:

# Print only the non-leaf nodes, with a `files` column giving the number of
# leaf children of each printed node
print(
  folder_structure,
  files = function(node) {
    leaf_flags <- Get(node$children, "isLeaf")
    sum(leaf_flags)
  },
  filterFun = isNotLeaf,
  pruneMethod = NULL
)

这会产生如下内容:

                                                     levelName files
1   data.tree                                                     16
2    ¦--data                                                       2
3    ¦--data_gen                                                   2
4    ¦--.git                                                       8
5    ¦   ¦--hooks                                                  9
6    ¦   ¦--info                                                   1
7    ¦   ¦--logs                                                   1
8    ¦   ¦   °--refs                                               1
9    ¦   ¦       ¦--heads                                          4
10   ¦   ¦       ¦--remotes                                        0
11   ¦   ¦       ¦   °--origin                                     5
12   ¦   ¦--objects                                                0
13   ¦   ¦   ¦--01                                                 4
14   ¦   ¦   ¦--02                                                 5
...

但是请注意,这也将空文件夹计为文件。