在文件夹中查找数据并提供有关丢失数据的反馈
Find data in folders and give feedback about missing data
我有一个 R 脚本来创建一个大数据集(实际上是一个欧洲数据集)的几个小部分。与使用一个大数据集相比,我们需要这些小部分(图块)来更轻松地编辑这些图块。
现在我有 1 个 windows 文件夹,在这个文件夹中我有 966 个自动生成的文件夹 - 每个文件夹有 4 个数据集(我希望至少是 4 个)。我们需要知道文件夹中是否恰好有这 4 个数据集,如果缺少某些数据集,我们需要知道是哪一个。您可以在下面看到的代码正在创建文件夹。它的发布只是为了让您了解结构。
# Source raster that is cut into one GeoTIFF per tile directory.
in_file <- "P:/High_Resolution_Layers/Forest... .tif/2015/TCD_2015_020m_eu_03035_d04_full/TCD_2015_020m_eu_03035_d04_full.tif"

# For every tile: make sure <output_dir>/<tileID> exists, then run gdalwarp to
# write <tileID>_HRL_Forest.tif into it.
# `tiles`, `tile_list`, `output_dir` and `osgeo` are defined outside this snippet.
for (t in seq_along(tiles)) {
  tileID <- tiles[t]
  out_dir <- file.path(output_dir, tileID)
  # out_dir_tmp <- file.path(out_dir, "tmp")
  # dir.exists() tests the file system; exists() would only look for an R
  # object whose name equals the path, so the directory was re-created (with a
  # warning) on every run.
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }
  # if(!exists(out_dir)) {dir.create(out_dir_tmp, recursive = T)}
  # tmp_file <- file.path(out_dir_tmp, paste0(tileID, "_HRL_Forest.tif")) ## <- change the suffix ("_HRL_Forest.tif", _clc_2012.tif, _clc_2018.tif, _slope.tif)
  out_file <- file.path(out_dir, paste0(tileID, "_HRL_Forest.tif")) ## <- change the suffix ("_HRL_Forest.tif", _clc_2012.tif, _clc_2018.tif, _slope.tif)
  # The -te extent is a 100000 x 100000 square whose lower-left corner is read
  # from columns 3 and 4 of tile_list.
  cmd <- paste("gdalwarp",
               "-overwrite",
               "-s_srs EPSG:3035",
               "-t_srs EPSG:3035",
               "-r near",
               "-q",
               "-tr 20 20",
               "-te ", tile_list[t, 3], tile_list[t, 4], tile_list[t, 3] + 100000, tile_list[t, 4] + 100000,
               "-tap",
               "-of GTiff",
               in_file,
               out_file)
  # NOTE(review): `osgeo` is presumably the command that opens the OSGeo4W
  # shell; the gdalwarp call is fed to it on stdin - confirm against its definition.
  system(osgeo, input = cmd)
  # cmd <- sprintf('gdal_translate -ot Byte -a_nodata 255 -co "COMPRESS=LZW" %s %s', tmp_file, out_file)
  # system(osgeo, input=cmd)
  # unlink(out_dir_tmp,recursive=T)
}
我先虚构一个目录结构和文件列表作为示例。
- 目录
A
到 D
- 每个目录必须有文件
a.tif
到 c.tif
由于所有目录中必须包含相同的文件,我们可以对它们进行笛卡尔/outer
连接:
# Expected layout: four directories, each holding the same three files.
dirs <- c("A", "B", "C", "D")
files_each_dir <- paste(c("a", "b", "c"), "tif", sep = ".")
# Cartesian product of directories and file names, one row per directory.
all_files <- outer(dirs, files_each_dir, FUN = file.path)
all_files
# [,1] [,2] [,3]
# [1,] "A/a.tif" "A/b.tif" "A/c.tif"
# [2,] "B/a.tif" "B/b.tif" "B/c.tif"
# [3,] "C/a.tif" "C/b.tif" "C/c.tif"
# [4,] "D/a.tif" "D/b.tif" "D/c.tif"
由于我们不需要 matrix
,我将取消列出它们,然后创建 dirs/files:
# Flatten the matrix into a plain character vector (column-major order).
as.vector(all_files)
# [1] "A/a.tif" "B/a.tif" "C/a.tif" "D/a.tif" "A/b.tif" "B/b.tif" "C/b.tif"
# [8] "D/b.tif" "A/c.tif" "B/c.tif" "C/c.tif" "D/c.tif"
# Create the directories, then one small text file per expected path; each
# file's content is simply its own path.
for (dir_name in dirs) {
  dir.create(dir_name)
}
for (path in all_files) {
  writeLines(text = path, con = path)
}
所有预期的文件都存在
# Collect every .tif below the working directory. `pattern` is a regular
# expression, not a glob, so match the literal extension at the end of the name.
(files_found <- list.files(pattern = "\\.tif$", recursive = TRUE, full.names = TRUE))
# [1] "./A/a.tif" "./A/b.tif" "./A/c.tif" "./B/a.tif" "./B/b.tif" "./B/c.tif"
# [7] "./C/a.tif" "./C/b.tif" "./C/c.tif" "./D/a.tif" "./D/b.tif" "./D/c.tif"
### remove the leading "./"
# The backslash must be doubled inside an R string: "\." is an invalid escape.
(files_found <- gsub("^\\./", "", files_found))
# [1] "A/a.tif" "A/b.tif" "A/c.tif" "B/a.tif" "B/b.tif" "B/c.tif" "C/a.tif"
# [8] "C/b.tif" "C/c.tif" "D/a.tif" "D/b.tif" "D/c.tif"
# Is every expected file present?
all(all_files %in% files_found)
# [1] TRUE
# Which expected files are missing?
all_files[!all_files %in% files_found]
# character(0)
测试丢失的文件:
# Delete one expected file, rescan, and report what is now missing.
file.remove("B/c.tif")
# [1] TRUE
# `pattern` is a regular expression, so match the literal ".tif" extension.
files_found <- list.files(pattern = "\\.tif$", recursive = TRUE, full.names = TRUE)
# Strip the leading "./"; the backslash must be doubled inside an R string.
files_found <- gsub("^\\./", "", files_found)
all_files[!all_files %in% files_found]
# [1] "B/c.tif"
注意:我们不使用 files_each_dir
进行任何后续测试。只有当我们需要一组固定的文件名时才需要它。
计算每个目录中的文件数
如果文件名可能不同,那么我们可以统计每个目录中的文件数,而不考虑实际名称。
# Top-level directory of each path: split on "/" or "\" and keep the first
# piece. The backslash has to be written as "\\\\" in an R string so the regex
# receives "\\"; vapply() guarantees a character vector even for empty input.
top_dir <- vapply(strsplit(files_found, "[/\\\\]"), `[[`, character(1), 1)
# TRUE for directories holding exactly 3 files. A directory with no .tif at
# all never appears in files_found and is therefore absent from this result.
(len3 <- lengths(split(files_found, top_dir)) == 3)
# A B C D
# TRUE FALSE TRUE TRUE
# Directories whose file count is not 3.
names(len3)[ !len3 ]
# [1] "B"
文件内容
如果您需要测试内容以使某些条件为真,请尝试这样的操作。在这里,我使用简单的 shell 命令 grep
,但任何接受文件路径并返回所需信息(大小、属性等)的函数(R 或 shell)都可以使用。
# TRUE when the external `grep -lE '[a-z]'` lists `path`, i.e. the file
# contains at least one lowercase ASCII letter; FALSE when grep prints nothing.
func <- function(path) {
  listed <- system2("grep", c("-lE", "'[a-z]'", path), stdout = TRUE)
  length(listed) > 0
}
# Apply the check to every file that was found; names are the file paths.
proper_contents <- sapply(files_found, func)
proper_contents
# A/a.tif A/b.tif A/c.tif B/a.tif B/b.tif C/a.tif C/b.tif C/c.tif D/a.tif D/b.tif
# TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# D/c.tif
# TRUE
让我们更改一个文件的内容进行测试:
# Overwrite one file with digits only so the lowercase-letter check fails for it.
writeLines(text = "123", con = "D/a.tif")
proper_contents <- sapply(files_found, func)
# Warning in system2("grep", c("-lE", "'[a-z]'", path), stdout = TRUE) :
# running command '"grep" -lE '[a-z]' D/a.tif' had status 1
# Files that did not pass the content check.
names(which(!proper_contents))
# [1] "D/a.tif"
我有一个 R 脚本来创建一个大数据集(实际上是一个欧洲数据集)的几个小部分。与使用一个大数据集相比,我们需要这些小部分(图块)来更轻松地编辑这些图块。
现在我有 1 个 windows 文件夹,在这个文件夹中我有 966 个自动生成的文件夹 - 每个文件夹有 4 个数据集(我希望至少是 4 个)。我们需要知道文件夹中是否恰好有这 4 个数据集,如果缺少某些数据集,我们需要知道是哪一个。您可以在下面看到的代码正在创建文件夹。它的发布只是为了让您了解结构。
# Source raster that is cut into one GeoTIFF per tile directory.
in_file <- "P:/High_Resolution_Layers/Forest... .tif/2015/TCD_2015_020m_eu_03035_d04_full/TCD_2015_020m_eu_03035_d04_full.tif"

# For every tile: make sure <output_dir>/<tileID> exists, then run gdalwarp to
# write <tileID>_HRL_Forest.tif into it.
# `tiles`, `tile_list`, `output_dir` and `osgeo` are defined outside this snippet.
for (t in seq_along(tiles)) {
  tileID <- tiles[t]
  out_dir <- file.path(output_dir, tileID)
  # out_dir_tmp <- file.path(out_dir, "tmp")
  # dir.exists() tests the file system; exists() would only look for an R
  # object whose name equals the path, so the directory was re-created (with a
  # warning) on every run.
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }
  # if(!exists(out_dir)) {dir.create(out_dir_tmp, recursive = T)}
  # tmp_file <- file.path(out_dir_tmp, paste0(tileID, "_HRL_Forest.tif")) ## <- change the suffix ("_HRL_Forest.tif", _clc_2012.tif, _clc_2018.tif, _slope.tif)
  out_file <- file.path(out_dir, paste0(tileID, "_HRL_Forest.tif")) ## <- change the suffix ("_HRL_Forest.tif", _clc_2012.tif, _clc_2018.tif, _slope.tif)
  # The -te extent is a 100000 x 100000 square whose lower-left corner is read
  # from columns 3 and 4 of tile_list.
  cmd <- paste("gdalwarp",
               "-overwrite",
               "-s_srs EPSG:3035",
               "-t_srs EPSG:3035",
               "-r near",
               "-q",
               "-tr 20 20",
               "-te ", tile_list[t, 3], tile_list[t, 4], tile_list[t, 3] + 100000, tile_list[t, 4] + 100000,
               "-tap",
               "-of GTiff",
               in_file,
               out_file)
  # NOTE(review): `osgeo` is presumably the command that opens the OSGeo4W
  # shell; the gdalwarp call is fed to it on stdin - confirm against its definition.
  system(osgeo, input = cmd)
  # cmd <- sprintf('gdal_translate -ot Byte -a_nodata 255 -co "COMPRESS=LZW" %s %s', tmp_file, out_file)
  # system(osgeo, input=cmd)
  # unlink(out_dir_tmp,recursive=T)
}
我先虚构一个目录结构和文件列表作为示例。
- 目录
A
到D
- 每个目录必须有文件
a.tif
到 c.tif
由于所有目录中必须包含相同的文件,我们可以对它们进行笛卡尔/outer
连接:
# Expected layout: four directories, each holding the same three files.
dirs <- c("A", "B", "C", "D")
files_each_dir <- paste(c("a", "b", "c"), "tif", sep = ".")
# Cartesian product of directories and file names, one row per directory.
all_files <- outer(dirs, files_each_dir, FUN = file.path)
all_files
# [,1] [,2] [,3]
# [1,] "A/a.tif" "A/b.tif" "A/c.tif"
# [2,] "B/a.tif" "B/b.tif" "B/c.tif"
# [3,] "C/a.tif" "C/b.tif" "C/c.tif"
# [4,] "D/a.tif" "D/b.tif" "D/c.tif"
由于我们不需要 matrix
,我将取消列出它们,然后创建 dirs/files:
# Flatten the matrix into a plain character vector (column-major order).
as.vector(all_files)
# [1] "A/a.tif" "B/a.tif" "C/a.tif" "D/a.tif" "A/b.tif" "B/b.tif" "C/b.tif"
# [8] "D/b.tif" "A/c.tif" "B/c.tif" "C/c.tif" "D/c.tif"
# Create the directories, then one small text file per expected path; each
# file's content is simply its own path.
for (dir_name in dirs) {
  dir.create(dir_name)
}
for (path in all_files) {
  writeLines(text = path, con = path)
}
所有预期的文件都存在
# Collect every .tif below the working directory. `pattern` is a regular
# expression, not a glob, so match the literal extension at the end of the name.
(files_found <- list.files(pattern = "\\.tif$", recursive = TRUE, full.names = TRUE))
# [1] "./A/a.tif" "./A/b.tif" "./A/c.tif" "./B/a.tif" "./B/b.tif" "./B/c.tif"
# [7] "./C/a.tif" "./C/b.tif" "./C/c.tif" "./D/a.tif" "./D/b.tif" "./D/c.tif"
### remove the leading "./"
# The backslash must be doubled inside an R string: "\." is an invalid escape.
(files_found <- gsub("^\\./", "", files_found))
# [1] "A/a.tif" "A/b.tif" "A/c.tif" "B/a.tif" "B/b.tif" "B/c.tif" "C/a.tif"
# [8] "C/b.tif" "C/c.tif" "D/a.tif" "D/b.tif" "D/c.tif"
# Is every expected file present?
all(all_files %in% files_found)
# [1] TRUE
# Which expected files are missing?
all_files[!all_files %in% files_found]
# character(0)
测试丢失的文件:
# Delete one expected file, rescan, and report what is now missing.
file.remove("B/c.tif")
# [1] TRUE
# `pattern` is a regular expression, so match the literal ".tif" extension.
files_found <- list.files(pattern = "\\.tif$", recursive = TRUE, full.names = TRUE)
# Strip the leading "./"; the backslash must be doubled inside an R string.
files_found <- gsub("^\\./", "", files_found)
all_files[!all_files %in% files_found]
# [1] "B/c.tif"
注意:我们不使用 files_each_dir
进行任何后续测试。只有当我们需要一组固定的文件名时才需要它。
计算每个目录中的文件数
如果文件名可能不同,那么我们可以统计每个目录中的文件数,而不考虑实际名称。
# Top-level directory of each path: split on "/" or "\" and keep the first
# piece. The backslash has to be written as "\\\\" in an R string so the regex
# receives "\\"; vapply() guarantees a character vector even for empty input.
top_dir <- vapply(strsplit(files_found, "[/\\\\]"), `[[`, character(1), 1)
# TRUE for directories holding exactly 3 files. A directory with no .tif at
# all never appears in files_found and is therefore absent from this result.
(len3 <- lengths(split(files_found, top_dir)) == 3)
# A B C D
# TRUE FALSE TRUE TRUE
# Directories whose file count is not 3.
names(len3)[ !len3 ]
# [1] "B"
文件内容
如果您需要测试内容以使某些条件为真,请尝试这样的操作。在这里,我使用简单的 shell 命令 grep
,但任何接受文件路径并返回所需信息(大小、属性等)的函数(R 或 shell)都可以使用。
# TRUE when the external `grep -lE '[a-z]'` lists `path`, i.e. the file
# contains at least one lowercase ASCII letter; FALSE when grep prints nothing.
func <- function(path) {
  listed <- system2("grep", c("-lE", "'[a-z]'", path), stdout = TRUE)
  length(listed) > 0
}
# Apply the check to every file that was found; names are the file paths.
proper_contents <- sapply(files_found, func)
proper_contents
# A/a.tif A/b.tif A/c.tif B/a.tif B/b.tif C/a.tif C/b.tif C/c.tif D/a.tif D/b.tif
# TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# D/c.tif
# TRUE
让我们更改一个文件的内容进行测试:
# Overwrite one file with digits only so the lowercase-letter check fails for it.
writeLines(text = "123", con = "D/a.tif")
proper_contents <- sapply(files_found, func)
# Warning in system2("grep", c("-lE", "'[a-z]'", path), stdout = TRUE) :
# running command '"grep" -lE '[a-z]' D/a.tif' had status 1
# Files that did not pass the content check.
names(which(!proper_contents))
# [1] "D/a.tif"