AWK:用类似于FNR==NR的方法连接并处理三个或更多文件
AWK: Concatenate and process three or more files with a method similar to FNR==NR approach
自从自学 awk 以来,我发现 FNR==NR 是处理两个文件时非常常用的方法。如果 FNR==NR,那么当前读取的是第一个文件(first file);当 FNR 重置为 1 并继续从串联的文件中逐行读取时,就意味着 !(FNR==NR),显然此时读取的是第二个文件(second file)。
当涉及三个或更多文件时,我看不出如何区分第二个和第三个文件,因为它们都满足相同的 !(FNR==NR) 条件。这让我试图弄清楚:怎样才能有类似 FNR2 和 FNR3 这样的东西?
所以我用 awk 实现了一种处理三个文件的方法。假设每个文件分别对应 FNR1、FNR2、FNR3。我为每个文件各写了一个单独运行的 for 循环,每个循环的条件形式相同,都是 NR==FNR#,并且确实得到了我预期的结果。
所以我想知道,是否有更清晰、更简洁的方法,可以得到与下面的 awk 代码类似的结果。
示例文件内容
$ cat file1
X|A1|Z
X|A2|Z
X|A3|Z
X|A4|Z
$ cat file2
X|Y|A3
X|Y|A4
X|Y|A5
$ cat file3
A1|Y|Z
A4|Y|Z
AWK for 循环
$ cat fnrarray.sh
awk -v FS='[|]' '{ for(i=FNR ; i<=NR && i<=FNR && NR==FNR; i++) {x++; print "NR:",NR,"FNR1:",i,"FNR:",FNR,"\tfirst file\t"}
for(i=FNR ; i+x<=NR && i<=FNR && NR==FNR+x; i++) {y++; print "NR:",NR,"FNR2:",i+x,"FNR:",FNR,"\tsecond file\t"}
for(i=FNR ; i+x+y<=NR && i<=FNR && NR==FNR+x+y; i++) {print "NR:",NR,"FNR3:",i+x+y,"FNR:",FNR,"\tthird file\t"}
}' file1 file2 file3
当前和期望的输出
$ sh fnrarray.sh
NR: 1 FNR1: 1 FNR: 1 first file
NR: 2 FNR1: 2 FNR: 2 first file
NR: 3 FNR1: 3 FNR: 3 first file
NR: 4 FNR1: 4 FNR: 4 first file
NR: 5 FNR2: 5 FNR: 1 second file
NR: 6 FNR2: 6 FNR: 2 second file
NR: 7 FNR2: 7 FNR: 3 second file
NR: 8 FNR3: 8 FNR: 1 third file
NR: 9 FNR3: 9 FNR: 2 third file
您可以看到 NR 与 FNR# 对齐,并且可以清楚地看出每个 NR 对应哪个 file#。
另一种方法
我在这里(Handling 3 Files using awk)找到了 FNR==1{++f} f==1 {} 这种方法,
但是每次读取新行时,此方法都会替换 arr1[1]。
失败尝试 1
$ awk -v FS='[|]' 'FNR==1{++f} f==1 {split($2,arr1); print arr1[1]}' file1 file2 file3
A1
A2
A3
A4
for 循环成功(arr1[1] 不变)
$ awk -v FS='[|]' '{for(i=FNR ; i<=NR && i<=FNR && NR==FNR; i++) {arr1[++k]=$2; print arr1[1]}}' file1 file2 file3
A1
A1
A1
A1
When it comes to three or more files, I can't see a way to tell which is
the second and which is the third file, as both satisfy the same
!(FNR==NR) condition. This made me try to figure out how there could be
something like FNR2 and FNR3.
示例如下:
$ cat f1
X|A1|Z
X|A2|Z
X|A3|Z
X|A4|Z
$ cat f2
X|Y|A3
X|Y|A4
X|Y|A5
$ cat f3
A1|Y|Z
A4|Y|Z
示例输出:
$ awk -F '|' 'FNR==1{file++}{array[file, FNR]=$0; max=max>FNR?max:FNR}END{for(f=1; f<=file; f++){ for(row=1; row<=max; row++){ key=f SUBSEP row; if(key in array)print "file: "f,"row :"row,"record: "array[key] } }}' f1 f2 f3
file: 1 row :1 record: X|A1|Z
file: 1 row :2 record: X|A2|Z
file: 1 row :3 record: X|A3|Z
file: 1 row :4 record: X|A4|Z
file: 2 row :1 record: X|Y|A3
file: 2 row :2 record: X|Y|A4
file: 2 row :3 record: X|Y|A5
file: 3 row :1 record: A1|Y|Z
file: 3 row :2 record: A4|Y|Z
解释:
awk -F '|' 'FNR==1{ # FNR will reset for every file
file++ # so whenever FNR==1 increment variable file
}
{
# array name : array
# array key being : file, FNR
# array value : $0, which is the current record/row
array[file, FNR] = $0;
# here we track the largest row count among all the files read
max = max > FNR ? max : FNR
}
END{ # end block when all files are read
# start iterating over the files;
# as we know, the variable file holds the total number of files read
for(f=1; f<=file; f++)
{
# iterate now for record from each file
# variable max holds max row count
for(row=1; row<=max; row++)
{
# variable key will now hold
# key = file-number SUBSEP row-number
key=f SUBSEP row;
# if key exists in array
# print array value
if(key in array)
print "file: "f,"row :"row,"record: "array[key]
}
}
}' f1 f2 f3
另一种选择是使用真正的多维数组,如下所示。当然,这是 gawk 特有的功能。
这里假设文件名是唯一的;否则请使用 FNR==1{ file++ },并用 file 代替 FILENAME。
$ awk --version
GNU Awk 4.2.1, API: 2.0 (GNU MPFR 3.1.6-p2, GNU MP 6.1.2)
Copyright (C) 1989, 1991-2018 Free Software Foundation.
$ awk -F '|' '{
true_multi_array[FILENAME][FNR] = $0
}
END{
for(file in true_multi_array)
for(row in true_multi_array[file])
print "file:",file, "row :" row, "record:" true_multi_array[file][row]
}' f1 f2 f3
file: f1 row :1 record:X|A1|Z
file: f1 row :2 record:X|A2|Z
file: f1 row :3 record:X|A3|Z
file: f1 row :4 record:X|A4|Z
file: f2 row :1 record:X|Y|A3
file: f2 row :2 record:X|Y|A4
file: f2 row :3 record:X|Y|A5
file: f3 row :1 record:A1|Y|Z
file: f3 row :2 record:A4|Y|Z
使用 GNU awk 时,在任何情况下都可以按顺序识别文件:
awk '
ARGIND == 1 { do 1st file stuff }
ARGIND == 2 { do 2nd file stuff }
ARGIND == 3 { do 3rd file stuff }
' file1 file2 file3
例如,要用您提供的 3 个示例输入文件得到问题中“输出”部分的文本:
awk '
ARGIND == 1 { pos = "first" }
ARGIND == 2 { pos = "second" }
ARGIND == 3 { pos = "third" }
{ print "NR:", NR, "FNR" ARGIND ":", NR, "FNR:", FNR, pos " file" }
' file1 file2 file3
NR: 1 FNR1: 1 FNR: 1 first file
NR: 2 FNR1: 2 FNR: 2 first file
NR: 3 FNR1: 3 FNR: 3 first file
NR: 4 FNR1: 4 FNR: 4 first file
NR: 5 FNR2: 5 FNR: 1 second file
NR: 6 FNR2: 6 FNR: 2 second file
NR: 7 FNR2: 7 FNR: 3 second file
NR: 8 FNR3: 8 FNR: 1 third file
NR: 9 FNR3: 9 FNR: 2 third file
或者,如果所有文件名都是唯一的(无论文件是否为空),可以使用任意 awk:
awk '
FILENAME == ARGV[1] { do 1st file stuff }
FILENAME == ARGV[2] { do 2nd file stuff }
FILENAME == ARGV[3] { do 3rd file stuff }
' file1 file2 file3
或者,如果所有文件都不为空,则无论文件名是否唯一都可以这样做(注意 file1 在参数列表中出现了两次):
awk '
FNR == 1 { argind++ }
argind == 1 { do 1st file stuff }
argind == 2 { do 2nd file stuff }
argind == 3 { do 3rd file stuff }
' file1 file2 file1
如果同一个文件名可能在参数列表中多次出现,并且某些文件可能为空,那么使用非 GNU awk 就会变得比较棘手(这正是 GNU awk 提供 ARGIND 的原因),例如可以这样写(未经测试):
awk '
BEGIN {
for (i=1; i<ARGC; i++) {
fname = ARGV[i]
if ( (getline line < fname) > 0 ) {
# file is not empty so save its position in the args
# list in an array indexed by its name and the number
# of times that name has been seen so far
arginds[fname,++tmpcnt[fname]] = i
}
close(fname)
}
}
FNR == 1 { argind = arginds[FILENAME,++cnt[FILENAME]] }
argind == 1 { do 1st file stuff }
argind == 2 { do 2nd file stuff }
argind == 3 { do 3rd file stuff }
' file1 file2 file1
自从自学 awk 以来,我发现 FNR==NR 是处理两个文件时非常常用的方法。如果 FNR==NR,那么当前读取的是第一个文件(first file);当 FNR 重置为 1 并继续从串联的文件中逐行读取时,就意味着 !(FNR==NR),显然此时读取的是第二个文件(second file)。
当涉及三个或更多文件时,我看不出如何区分第二个和第三个文件,因为它们都满足相同的 !(FNR==NR) 条件。这让我试图弄清楚:怎样才能有类似 FNR2 和 FNR3 这样的东西?
所以我用 awk 实现了一种处理三个文件的方法。假设每个文件分别对应 FNR1、FNR2、FNR3。我为每个文件各写了一个单独运行的 for 循环,每个循环的条件形式相同,都是 NR==FNR#,并且确实得到了我预期的结果。
所以我想知道,是否有更清晰、更简洁的方法,可以得到与下面的 awk 代码类似的结果。
示例文件内容
$ cat file1
X|A1|Z
X|A2|Z
X|A3|Z
X|A4|Z
$ cat file2
X|Y|A3
X|Y|A4
X|Y|A5
$ cat file3
A1|Y|Z
A4|Y|Z
AWK for 循环
$ cat fnrarray.sh
awk -v FS='[|]' '{ for(i=FNR ; i<=NR && i<=FNR && NR==FNR; i++) {x++; print "NR:",NR,"FNR1:",i,"FNR:",FNR,"\tfirst file\t"}
for(i=FNR ; i+x<=NR && i<=FNR && NR==FNR+x; i++) {y++; print "NR:",NR,"FNR2:",i+x,"FNR:",FNR,"\tsecond file\t"}
for(i=FNR ; i+x+y<=NR && i<=FNR && NR==FNR+x+y; i++) {print "NR:",NR,"FNR3:",i+x+y,"FNR:",FNR,"\tthird file\t"}
}' file1 file2 file3
当前和期望的输出
$ sh fnrarray.sh
NR: 1 FNR1: 1 FNR: 1 first file
NR: 2 FNR1: 2 FNR: 2 first file
NR: 3 FNR1: 3 FNR: 3 first file
NR: 4 FNR1: 4 FNR: 4 first file
NR: 5 FNR2: 5 FNR: 1 second file
NR: 6 FNR2: 6 FNR: 2 second file
NR: 7 FNR2: 7 FNR: 3 second file
NR: 8 FNR3: 8 FNR: 1 third file
NR: 9 FNR3: 9 FNR: 2 third file
您可以看到 NR 与 FNR# 对齐,并且可以清楚地看出每个 NR 对应哪个 file#。
另一种方法
我在这里(Handling 3 Files using awk)找到了 FNR==1{++f} f==1 {} 这种方法,
但是每次读取新行时,此方法都会替换 arr1[1]。
失败尝试 1
$ awk -v FS='[|]' 'FNR==1{++f} f==1 {split($2,arr1); print arr1[1]}' file1 file2 file3
A1
A2
A3
A4
for 循环成功(arr1[1] 不变)
$ awk -v FS='[|]' '{for(i=FNR ; i<=NR && i<=FNR && NR==FNR; i++) {arr1[++k]=$2; print arr1[1]}}' file1 file2 file3
A1
A1
A1
A1
When it comes to three or more files, I can't see a way to tell which is the second and which is the third file, as both satisfy the same !(FNR==NR) condition. This made me try to figure out how there could be something like FNR2 and FNR3.
示例如下:
$ cat f1
X|A1|Z
X|A2|Z
X|A3|Z
X|A4|Z
$ cat f2
X|Y|A3
X|Y|A4
X|Y|A5
$ cat f3
A1|Y|Z
A4|Y|Z
示例输出:
$ awk -F '|' 'FNR==1{file++}{array[file, FNR]=$0; max=max>FNR?max:FNR}END{for(f=1; f<=file; f++){ for(row=1; row<=max; row++){ key=f SUBSEP row; if(key in array)print "file: "f,"row :"row,"record: "array[key] } }}' f1 f2 f3
file: 1 row :1 record: X|A1|Z
file: 1 row :2 record: X|A2|Z
file: 1 row :3 record: X|A3|Z
file: 1 row :4 record: X|A4|Z
file: 2 row :1 record: X|Y|A3
file: 2 row :2 record: X|Y|A4
file: 2 row :3 record: X|Y|A5
file: 3 row :1 record: A1|Y|Z
file: 3 row :2 record: A4|Y|Z
解释:
awk -F '|' 'FNR==1{ # FNR will reset for every file
file++ # so whenever FNR==1 increment variable file
}
{
# array name : array
# array key being : file, FNR
# array value : $0, which is the current record/row
array[file, FNR] = $0;
# here we track the largest row count among all the files read
max = max > FNR ? max : FNR
}
END{ # end block when all files are read
# start iterating over the files;
# as we know, the variable file holds the total number of files read
for(f=1; f<=file; f++)
{
# iterate now for record from each file
# variable max holds max row count
for(row=1; row<=max; row++)
{
# variable key will now hold
# key = file-number SUBSEP row-number
key=f SUBSEP row;
# if key exists in array
# print array value
if(key in array)
print "file: "f,"row :"row,"record: "array[key]
}
}
}' f1 f2 f3
另一种选择是使用真正的多维数组,如下所示。当然,这是 gawk 特有的功能。
这里假设文件名是唯一的;否则请使用 FNR==1{ file++ },并用 file 代替 FILENAME。
$ awk --version
GNU Awk 4.2.1, API: 2.0 (GNU MPFR 3.1.6-p2, GNU MP 6.1.2)
Copyright (C) 1989, 1991-2018 Free Software Foundation.
$ awk -F '|' '{
true_multi_array[FILENAME][FNR] = $0
}
END{
for(file in true_multi_array)
for(row in true_multi_array[file])
print "file:",file, "row :" row, "record:" true_multi_array[file][row]
}' f1 f2 f3
file: f1 row :1 record:X|A1|Z
file: f1 row :2 record:X|A2|Z
file: f1 row :3 record:X|A3|Z
file: f1 row :4 record:X|A4|Z
file: f2 row :1 record:X|Y|A3
file: f2 row :2 record:X|Y|A4
file: f2 row :3 record:X|Y|A5
file: f3 row :1 record:A1|Y|Z
file: f3 row :2 record:A4|Y|Z
使用 GNU awk 时,在任何情况下都可以按顺序识别文件:
awk '
ARGIND == 1 { do 1st file stuff }
ARGIND == 2 { do 2nd file stuff }
ARGIND == 3 { do 3rd file stuff }
' file1 file2 file3
例如,要用您提供的 3 个示例输入文件得到问题中“输出”部分的文本:
awk '
ARGIND == 1 { pos = "first" }
ARGIND == 2 { pos = "second" }
ARGIND == 3 { pos = "third" }
{ print "NR:", NR, "FNR" ARGIND ":", NR, "FNR:", FNR, pos " file" }
' file1 file2 file3
NR: 1 FNR1: 1 FNR: 1 first file
NR: 2 FNR1: 2 FNR: 2 first file
NR: 3 FNR1: 3 FNR: 3 first file
NR: 4 FNR1: 4 FNR: 4 first file
NR: 5 FNR2: 5 FNR: 1 second file
NR: 6 FNR2: 6 FNR: 2 second file
NR: 7 FNR2: 7 FNR: 3 second file
NR: 8 FNR3: 8 FNR: 1 third file
NR: 9 FNR3: 9 FNR: 2 third file
或者,如果所有文件名都是唯一的(无论文件是否为空),可以使用任意 awk:
awk '
FILENAME == ARGV[1] { do 1st file stuff }
FILENAME == ARGV[2] { do 2nd file stuff }
FILENAME == ARGV[3] { do 3rd file stuff }
' file1 file2 file3
或者,如果所有文件都不为空,则无论文件名是否唯一都可以这样做(注意 file1 在参数列表中出现了两次):
awk '
FNR == 1 { argind++ }
argind == 1 { do 1st file stuff }
argind == 2 { do 2nd file stuff }
argind == 3 { do 3rd file stuff }
' file1 file2 file1
如果同一个文件名可能在参数列表中多次出现,并且某些文件可能为空,那么使用非 GNU awk 就会变得比较棘手(这正是 GNU awk 提供 ARGIND 的原因),例如可以这样写(未经测试):
awk '
BEGIN {
for (i=1; i<ARGC; i++) {
fname = ARGV[i]
if ( (getline line < fname) > 0 ) {
# file is not empty so save its position in the args
# list in an array indexed by its name and the number
# of times that name has been seen so far
arginds[fname,++tmpcnt[fname]] = i
}
close(fname)
}
}
FNR == 1 { argind = arginds[FILENAME,++cnt[FILENAME]] }
argind == 1 { do 1st file stuff }
argind == 2 { do 2nd file stuff }
argind == 3 { do 3rd file stuff }
' file1 file2 file1