从文本文件中递归提取值并遍历更多值并重新排列行和列
Recursively extracting values from within a text file and looping over more of them plus rearranging rows and columns
我想根据正则表达式模式从数百个 txt 文件中提取值,重新排列它们并将它们写入数据框。
文件的开头是这样的:http://pastebin.com/embed_js.php?i=vdbXfDhC
然后这样结束:http://pastebin.com/embed_js.php?i=hse7SDJd
我之前问过一个类似的问题,
rawr 在那个问题下为我提供了以下代码:
# List the per-image report files (image001.txt, image002.txt, ...).
# Backslashes inside R string literals must be doubled ('\\d' is the regex \d);
# the dot before the extension is escaped and the pattern anchored so that only
# names ending in ".txt" match.
(lf <- list.files('~/desktop', pattern = '^image\\d+\\.txt$', full.names = TRUE))
# [1] "/Users/rawr/desktop/image001.txt" "/Users/rawr/desktop/image002.txt"
# [3] "/Users/rawr/desktop/image003.txt"

# Parse every file into a single-row, wide data frame keyed by image name.
# `res` is a list with one such data frame per element of `lf`.
res <- lapply(lf, function(xx) {
  # readLines() opens and closes the file itself when given a path.
  rl <- readLines(xx, warn = FALSE)

  # The image name is the text between "file:" and ".tif" on the first line.
  img_name <- gsub('.*file:\\s+(.*)\\.tif', '\\1', rl[1])

  # Drop the header: everything up to and including the first '==' line.
  # Taking the first match explicitly avoids `1:grep(...)` misbehaving when
  # the separator occurs more than once or not at all.
  sep <- grep('==', rl)[1]
  if (is.na(sep)) {
    stop("no '==' separator line found in ", xx, call. = FALSE)
  }
  rl <- rl[-seq_len(sep)]

  # Strip leading blanks, then split each line into columns on runs of two or
  # more whitespace characters.
  rl <- gsub('^\\s+', '', rl)
  mat <- do.call('rbind', strsplit(rl, '\\s{2,}'))
  dat <- as.data.frame(mat, stringsAsFactors = FALSE)

  # The second column is split on '-', '/' or whitespace into the two parts
  # labelled 'Foreground' and 'Data pixels'.
  tmp <- `colnames<-`(do.call('rbind', strsplit(dat$V2, '[-\\/\\s]+', perl = TRUE)),
                      c('Foreground','Data pixels'))
  dat <- cbind(dat[, -2], tmp, image_name = img_name)

  # Work with plain character columns and treat empty cells as missing.
  dat[] <- lapply(dat, as.character)
  dat[dat == ''] <- NA
  names(dat)[1:2] <- c('MSPA-class','Frequency')

  # One row per image: every MSPA class becomes its own set of columns.
  zzz <- reshape(dat, direction = 'wide', idvar = 'image_name', timevar = 'MSPA-class')

  # reshape() names the new columns "<variable>.<class> ..."; rewrite them as
  # "<class>_<variable>" and drop whatever follows the last space.
  names(zzz)[-1] <- gsub('(.*)\\.(.*) (?:.*)', '\\2_\\1', names(zzz)[-1], perl = TRUE)
  zzz
})
然而,这段代码使用的是 txt 文件,其中每个文件只有一个分析步骤,现在我在一个 .log 文件中有很多分析,如 pastebin 示例所示(1/745 ...等)所以我不能使用相同的循环。
有人可以帮我修改上面发布的代码以提取
1) MeshSize: XXX [ha]
2) rel. fragmentation: XXX
3) MeshSize comp.time [sec]: XXX
针对每个图像(例如,============== 703/745 ============== 表示开始处理一个新图像,其后是包含图像名称的路径)
与我的另一个问题类似,我需要重新排列数据,使图像名称(路径中以 .tif 结尾的字符串,例如 20130815 225017 957 000000 0892 0464)成为行名(不需要 .tif 后缀),而 1) MeshSize [ha]、2) rel. fragmentation、3) MeshSize comp.time [sec] 成为列。
image name 1) mesh size 2)..... 3)......
row1 xx xx xx
row2
编辑:对 rawr 绝妙解决方案的补充
如果你想把整个事情循环并保存为 CSV,你可以这样做:
# Parse every batch log and write the combined table to one CSV file.
# '\\d' is the doubled-backslash form R needs for the regex \d; the dot before
# "log" is escaped and the pattern anchored so only "*.log" names match.
lf <- list.files(path = "xx", pattern = '^batch_mesh8\\d\\.log$', full.names = TRUE)

# Accumulates the rows of all logs; stays NULL when no file matches.
mesh2 <- NULL
for (i in lf) {
  #rawr's code here:
  # (the parsing code must read the current file `i` and define
  #  `img_names` and `mat` for it)
  #final lines of code:
  mesh1 <- cbind(data.frame('image_name' = img_names), mat)
  mesh2 <- rbind(mesh2, mesh1)
}
write.csv(mesh2, file = "all_mesh_th8.csv")
我觉得这个更直接。 (或者也许我让另一个问题比原来更难了)
# Parse one batch log that holds many analyses into a table with one row per
# image: image name, mesh size, relative fragmentation and computation time.
path <- '~/desktop/log.log'
# readLines() opens and closes the file itself when given a path.
x <- readLines(path)

# m <- gregexpr('(\\d+/\\d+)', x, perl = TRUE)
# img_names <- head(unlist(regmatches(x, m)), -1)
# completed <- tail(img_names, 1)

# Image names come from the "File: <path>" lines. Runs of backslashes are
# turned into '/' so basename() can split the path, then the extension is
# removed.
# NOTE(review): assumes the log stores Windows-style paths -- confirm against
# a real log file.
y <- x[grepl('File', x)]
img_names <- basename(gsub('File: ', '', gsub('\\\\+', '/', y), perl = TRUE))
img_names <- gsub('\\.([[:alnum:]]+)$', '', img_names)

# Keep only the lines that contain a decimal number (the result lines).
(x <- x[grepl('\\d+\\.\\d+', x)])
# [1] "MeshSize: 0.30289606 [ha]; rel. fragmentation: 83.1300"
# [2] "MeshSize comp.time [sec]: 0.00099992752"
# [3] "MeshSize: 0.39157622 [ha]; rel. fragmentation: 81.4600"
# [4] "MeshSize comp.time [sec]: 0.00099992752"
# [5] "MeshSize: 0.45971902 [ha]; rel. fragmentation: 76.8700"
# [6] "MeshSize comp.time [sec]: 0.00000000"
# [7] "MeshSize: 0.032965344 [ha]; rel. fragmentation: 94.5500"
# [8] "MeshSize comp.time [sec]: 0.00000000"
# [9] "MeshSize: 0.034653125 [ha]; rel. fragmentation: 93.6300"
# [10] "MeshSize comp.time [sec]: 0.00000000"
# [11] "MeshSize: 0.74313322 [ha]; rel. fragmentation: 90.2700"
# [12] "MeshSize comp.time [sec]: 0.00099992752"
# [13] "MeshSize: 0.48677515 [ha]; rel. fragmentation: 85.5700"
# [14] "MeshSize comp.time [sec]: 0.00099992752"

# Pull out every decimal number, in order of appearance.
nums <- unlist(regmatches(x, gregexpr('\\d+\\.\\d+', x, perl = TRUE)))

# Each image contributes exactly three numbers. Fail loudly on a truncated or
# unexpected log instead of letting matrix()/cbind() recycle values silently.
if (length(nums) != 3 * length(img_names)) {
  stop("expected 3 values per image but found ", length(nums), " values for ",
       length(img_names), " images in ", path, call. = FALSE)
}

# Values are kept as the character strings found in the log.
mat <- matrix(nums, ncol = 3, byrow = TRUE,
              dimnames = list(NULL, c('Mesh size','rel frag','comp time')))
cbind(data.frame('image_name' = img_names), mat)
# image_name Mesh size rel frag comp time
# 1 20130815 143656 507 000000 0952 0536 0.30289606 83.1300 0.00099992752
# 2 20130815 143657 673 000002 0244 0284 0.39157622 81.4600 0.00099992752
# 3 20130815 143657 706 000000 0764 0304 0.45971902 76.8700 0.00000000
# 4 20130815 143658 806 000000 0776 0672 0.032965344 94.5500 0.00000000
# 5 20130815 143700 005 000000 0232 0116 0.034653125 93.6300 0.00000000
# 6 20130815 225020 589 000000 0188 0564 0.74313322 90.2700 0.00099992752
# 7 20130815 225033 917 000000 0288 0804 0.48677515 85.5700 0.00099992752
我想根据正则表达式模式从数百个 txt 文件中提取值,重新排列它们并将它们写入数据框。
文件的开头是这样的:http://pastebin.com/embed_js.php?i=vdbXfDhC
然后这样结束:http://pastebin.com/embed_js.php?i=hse7SDJd
我之前问过一个类似的问题,当时得到了以下代码:
# List the per-image report files (image001.txt, image002.txt, ...).
# Backslashes inside R string literals must be doubled ('\\d' is the regex \d);
# the dot before the extension is escaped and the pattern anchored so that only
# names ending in ".txt" match.
(lf <- list.files('~/desktop', pattern = '^image\\d+\\.txt$', full.names = TRUE))
# [1] "/Users/rawr/desktop/image001.txt" "/Users/rawr/desktop/image002.txt"
# [3] "/Users/rawr/desktop/image003.txt"

# Parse every file into a single-row, wide data frame keyed by image name.
# `res` is a list with one such data frame per element of `lf`.
res <- lapply(lf, function(xx) {
  # readLines() opens and closes the file itself when given a path.
  rl <- readLines(xx, warn = FALSE)

  # The image name is the text between "file:" and ".tif" on the first line.
  img_name <- gsub('.*file:\\s+(.*)\\.tif', '\\1', rl[1])

  # Drop the header: everything up to and including the first '==' line.
  # Taking the first match explicitly avoids `1:grep(...)` misbehaving when
  # the separator occurs more than once or not at all.
  sep <- grep('==', rl)[1]
  if (is.na(sep)) {
    stop("no '==' separator line found in ", xx, call. = FALSE)
  }
  rl <- rl[-seq_len(sep)]

  # Strip leading blanks, then split each line into columns on runs of two or
  # more whitespace characters.
  rl <- gsub('^\\s+', '', rl)
  mat <- do.call('rbind', strsplit(rl, '\\s{2,}'))
  dat <- as.data.frame(mat, stringsAsFactors = FALSE)

  # The second column is split on '-', '/' or whitespace into the two parts
  # labelled 'Foreground' and 'Data pixels'.
  tmp <- `colnames<-`(do.call('rbind', strsplit(dat$V2, '[-\\/\\s]+', perl = TRUE)),
                      c('Foreground','Data pixels'))
  dat <- cbind(dat[, -2], tmp, image_name = img_name)

  # Work with plain character columns and treat empty cells as missing.
  dat[] <- lapply(dat, as.character)
  dat[dat == ''] <- NA
  names(dat)[1:2] <- c('MSPA-class','Frequency')

  # One row per image: every MSPA class becomes its own set of columns.
  zzz <- reshape(dat, direction = 'wide', idvar = 'image_name', timevar = 'MSPA-class')

  # reshape() names the new columns "<variable>.<class> ..."; rewrite them as
  # "<class>_<variable>" and drop whatever follows the last space.
  names(zzz)[-1] <- gsub('(.*)\\.(.*) (?:.*)', '\\2_\\1', names(zzz)[-1], perl = TRUE)
  zzz
})
然而,这段代码使用的是 txt 文件,其中每个文件只有一个分析步骤,现在我在一个 .log 文件中有很多分析,如 pastebin 示例所示(1/745 ...等)所以我不能使用相同的循环。
有人可以帮我修改上面发布的代码以提取
1) MeshSize: XXX [ha] 2) rel. fragmentation: XXX 3) MeshSize comp.time [sec]: XXX
针对每个图像(例如,============== 703/745 ============== 表示开始处理一个新图像,其后是包含图像名称的路径)
与我的另一个问题类似,我需要重新排列数据,使图像名称(路径中以 .tif 结尾的字符串,例如 20130815 225017 957 000000 0892 0464)成为行名(不需要 .tif 后缀),而 1) MeshSize [ha]、2) rel. fragmentation、3) MeshSize comp.time [sec] 成为列。
image name 1) mesh size 2)..... 3)......
row1 xx xx xx
row2
编辑:对 rawr 绝妙解决方案的补充
如果你想把整个事情循环并保存为 CSV,你可以这样做:
# Parse every batch log and write the combined table to one CSV file.
# '\\d' is the doubled-backslash form R needs for the regex \d; the dot before
# "log" is escaped and the pattern anchored so only "*.log" names match.
lf <- list.files(path = "xx", pattern = '^batch_mesh8\\d\\.log$', full.names = TRUE)

# Accumulates the rows of all logs; stays NULL when no file matches.
mesh2 <- NULL
for (i in lf) {
  #rawr's code here:
  # (the parsing code must read the current file `i` and define
  #  `img_names` and `mat` for it)
  #final lines of code:
  mesh1 <- cbind(data.frame('image_name' = img_names), mat)
  mesh2 <- rbind(mesh2, mesh1)
}
write.csv(mesh2, file = "all_mesh_th8.csv")
我觉得这个更直接。 (或者也许我让另一个问题比原来更难了)
# Parse one batch log that holds many analyses into a table with one row per
# image: image name, mesh size, relative fragmentation and computation time.
path <- '~/desktop/log.log'
# readLines() opens and closes the file itself when given a path.
x <- readLines(path)

# m <- gregexpr('(\\d+/\\d+)', x, perl = TRUE)
# img_names <- head(unlist(regmatches(x, m)), -1)
# completed <- tail(img_names, 1)

# Image names come from the "File: <path>" lines. Runs of backslashes are
# turned into '/' so basename() can split the path, then the extension is
# removed.
# NOTE(review): assumes the log stores Windows-style paths -- confirm against
# a real log file.
y <- x[grepl('File', x)]
img_names <- basename(gsub('File: ', '', gsub('\\\\+', '/', y), perl = TRUE))
img_names <- gsub('\\.([[:alnum:]]+)$', '', img_names)

# Keep only the lines that contain a decimal number (the result lines).
(x <- x[grepl('\\d+\\.\\d+', x)])
# [1] "MeshSize: 0.30289606 [ha]; rel. fragmentation: 83.1300"
# [2] "MeshSize comp.time [sec]: 0.00099992752"
# [3] "MeshSize: 0.39157622 [ha]; rel. fragmentation: 81.4600"
# [4] "MeshSize comp.time [sec]: 0.00099992752"
# [5] "MeshSize: 0.45971902 [ha]; rel. fragmentation: 76.8700"
# [6] "MeshSize comp.time [sec]: 0.00000000"
# [7] "MeshSize: 0.032965344 [ha]; rel. fragmentation: 94.5500"
# [8] "MeshSize comp.time [sec]: 0.00000000"
# [9] "MeshSize: 0.034653125 [ha]; rel. fragmentation: 93.6300"
# [10] "MeshSize comp.time [sec]: 0.00000000"
# [11] "MeshSize: 0.74313322 [ha]; rel. fragmentation: 90.2700"
# [12] "MeshSize comp.time [sec]: 0.00099992752"
# [13] "MeshSize: 0.48677515 [ha]; rel. fragmentation: 85.5700"
# [14] "MeshSize comp.time [sec]: 0.00099992752"

# Pull out every decimal number, in order of appearance.
nums <- unlist(regmatches(x, gregexpr('\\d+\\.\\d+', x, perl = TRUE)))

# Each image contributes exactly three numbers. Fail loudly on a truncated or
# unexpected log instead of letting matrix()/cbind() recycle values silently.
if (length(nums) != 3 * length(img_names)) {
  stop("expected 3 values per image but found ", length(nums), " values for ",
       length(img_names), " images in ", path, call. = FALSE)
}

# Values are kept as the character strings found in the log.
mat <- matrix(nums, ncol = 3, byrow = TRUE,
              dimnames = list(NULL, c('Mesh size','rel frag','comp time')))
cbind(data.frame('image_name' = img_names), mat)
# image_name Mesh size rel frag comp time
# 1 20130815 143656 507 000000 0952 0536 0.30289606 83.1300 0.00099992752
# 2 20130815 143657 673 000002 0244 0284 0.39157622 81.4600 0.00099992752
# 3 20130815 143657 706 000000 0764 0304 0.45971902 76.8700 0.00000000
# 4 20130815 143658 806 000000 0776 0672 0.032965344 94.5500 0.00000000
# 5 20130815 143700 005 000000 0232 0116 0.034653125 93.6300 0.00000000
# 6 20130815 225020 589 000000 0188 0564 0.74313322 90.2700 0.00099992752
# 7 20130815 225033 917 000000 0288 0804 0.48677515 85.5700 0.00099992752