比较两个文本文件,如果第二个文件有一行包含第一个文件的两列,则删除该行
Compare two text files and if the second file has a row which contains both the columns of first file delete that row
我有两个文件,如下所示。file1 有两列;file2 以逗号分隔,每行的列数不固定。我想比较这两个文件:如果 file1 某一行的 $1 和 $2 同时出现在 file2 的某一行中,就删除 file2 的该行。如何使用 awk 或其他文本处理工具完成此操作?
文件1
5052 5051
4952 4951
file2
2001, 5052, 7001, 5051, 1000
2002, 5052, 7001, 1500, 2500
2003, 5051, 3500, 4500, 4952
2004, 4952, 4999, 4500, 4951
预期输出:
2002, 5052, 7001, 1500, 2500
2003, 5051, 3500, 4500, 4952
我尝试了下面的 awk 代码,但它无法正常运行。
# Attempt from the question, reproduced for reference only; it does not work.
awk 'NR==FNR{A[$1]=$1;A[$2]=$2; next} {if ($0=A[$1] && $0=A[$2]){next} else {print $0}' file1 file2 >> test.inp
# Print the lines of file2 that do not contain both values of any file1 pair (GNU awk, uses gensub).
awk 'NR==FNR { if (NF>=2) { map[NR]=$1; map1[NR]=$2 } next } { lin=gensub(/[[:space:]]+/,"","g",$0); cnt=split(lin,map3,","); delete seen; for (i=1;i<=cnt;i++) { seen[map3[i]] } ok=1; for (n in map) { if ((map[n] in seen) && (map1[n] in seen)) { ok=0; break } } if (ok==1) { print $0 } }' file1 file2
解释:
awk 'NR==FNR {                                  # Process the first file
    if (NF >= 2) {                              # Ignore blank or incomplete lines
        map[NR]  = $1                           # First value of the pair on this line
        map1[NR] = $2                           # Second value of the same pair
    }
    next                                        # Skip to the next record
}
{                                               # Process the second file
    lin = gensub(/[[:space:]]+/, "", "g", $0)   # Copy of the line with all whitespace removed
    cnt = split(lin, map3, ",")                 # Split that copy on commas into map3
    delete seen                                 # Forget the values of the previous line
    for (i = 1; i <= cnt; i++) {
        seen[map3[i]]                           # Record every value that occurs on this line
    }
    ok = 1                                      # Assume the line is kept
    for (n in map) {
        if ((map[n] in seen) && (map1[n] in seen)) {
            ok = 0                              # Both values of one pair are on this line
            break
        }
    }
    if (ok == 1) {
        print $0                                # Print the line only when no pair matched
    }
}' file1 file2
请您尝试以下操作:
# Print the lines of file2 that do not contain both values of any file1 pair (GNU awk, uses gensub).
awk 'NR==FNR {
    if (NF >= 2) pair[$1, $2]                   # remember every ($1, $2) pair of file1 as a composite key
    next                                        # skip the following code for file1
}
{
    # strip leading/trailing whitespace, then split the record of file2 on commas
    len = split(gensub(/^[[:space:]]+|[[:space:]]+$/, "", "g"), ary, /[[:space:]]*,[[:space:]]*/)
    matched = 0                                 # reset the flag for every record
    for (i = 1; i <= len && !matched; i++) {    # candidate for $1 of file1
        for (j = 1; j <= len; j++) {            # candidate for $2 of file1
            if ((ary[i], ary[j]) in pair) {     # both values of one pair occur in this record
                matched = 1                     # then set the flag
                break                           # and exit from the inner loop
            }
        }
    }
    if (!matched) print                         # print the line unless "matched"
}' file1 file2
所提供示例的结果是 除第 行之外的所有记录:
4651, 4651, 4652, 4752, 4751
假设 file1 中每一对的两个字段的值互不相同:
$ cat tst.awk
# tst.awk -- print the lines of file2 that do not contain both values of any file1 pair.
# Usage: awk -f tst.awk file1 file2
NR==FNR {                           # first file: one whitespace-separated pair per line
    pairs1[NR] = $1
    pairs2[NR] = $2
    next
}
{
    orig = $0                       # keep the untouched record for printing
    gsub(/[[:space:],]+/," ")       # replace comma/blank separators with one space; changing $0 re-splits the fields
    delete vals                     # forget the values of the previous record
    for (i=1; i<=NF; i++) {
        vals[$i]                    # referencing the element creates the key
    }
    for (nr in pairs1) {
        if ( (pairs1[nr] in vals) && (pairs2[nr] in vals) ) {
            next                    # both values of one pair are present: drop this record
        }
    }
    print orig
}
$ awk -f tst.awk file1 file2
2002, 5052, 7001, 1500, 2500
2003, 5051, 3500, 4500, 4952
我有两个文件,如下所示。file1 有两列;file2 以逗号分隔,每行的列数不固定。我想比较这两个文件:如果 file1 某一行的 $1 和 $2 同时出现在 file2 的某一行中,就删除 file2 的该行。如何使用 awk 或其他文本处理工具完成此操作?
文件1
5052 5051
4952 4951
file2
2001, 5052, 7001, 5051, 1000
2002, 5052, 7001, 1500, 2500
2003, 5051, 3500, 4500, 4952
2004, 4952, 4999, 4500, 4951
预期输出:
2002, 5052, 7001, 1500, 2500
2003, 5051, 3500, 4500, 4952
我尝试了下面的 awk 代码,但它无法正常运行。
# Attempt from the question, reproduced for reference only; it does not work.
awk 'NR==FNR{A[$1]=$1;A[$2]=$2; next} {if ($0=A[$1] && $0=A[$2]){next} else {print $0}' file1 file2 >> test.inp
# Print the lines of file2 that do not contain both values of any file1 pair (GNU awk, uses gensub).
awk 'NR==FNR { if (NF>=2) { map[NR]=$1; map1[NR]=$2 } next } { lin=gensub(/[[:space:]]+/,"","g",$0); cnt=split(lin,map3,","); delete seen; for (i=1;i<=cnt;i++) { seen[map3[i]] } ok=1; for (n in map) { if ((map[n] in seen) && (map1[n] in seen)) { ok=0; break } } if (ok==1) { print $0 } }' file1 file2
解释:
awk 'NR==FNR {                                  # Process the first file
    if (NF >= 2) {                              # Ignore blank or incomplete lines
        map[NR]  = $1                           # First value of the pair on this line
        map1[NR] = $2                           # Second value of the same pair
    }
    next                                        # Skip to the next record
}
{                                               # Process the second file
    lin = gensub(/[[:space:]]+/, "", "g", $0)   # Copy of the line with all whitespace removed
    cnt = split(lin, map3, ",")                 # Split that copy on commas into map3
    delete seen                                 # Forget the values of the previous line
    for (i = 1; i <= cnt; i++) {
        seen[map3[i]]                           # Record every value that occurs on this line
    }
    ok = 1                                      # Assume the line is kept
    for (n in map) {
        if ((map[n] in seen) && (map1[n] in seen)) {
            ok = 0                              # Both values of one pair are on this line
            break
        }
    }
    if (ok == 1) {
        print $0                                # Print the line only when no pair matched
    }
}' file1 file2
请您尝试以下操作:
# Print the lines of file2 that do not contain both values of any file1 pair (GNU awk, uses gensub).
awk 'NR==FNR {
    if (NF >= 2) pair[$1, $2]                   # remember every ($1, $2) pair of file1 as a composite key
    next                                        # skip the following code for file1
}
{
    # strip leading/trailing whitespace, then split the record of file2 on commas
    len = split(gensub(/^[[:space:]]+|[[:space:]]+$/, "", "g"), ary, /[[:space:]]*,[[:space:]]*/)
    matched = 0                                 # reset the flag for every record
    for (i = 1; i <= len && !matched; i++) {    # candidate for $1 of file1
        for (j = 1; j <= len; j++) {            # candidate for $2 of file1
            if ((ary[i], ary[j]) in pair) {     # both values of one pair occur in this record
                matched = 1                     # then set the flag
                break                           # and exit from the inner loop
            }
        }
    }
    if (!matched) print                         # print the line unless "matched"
}' file1 file2
所提供示例的结果是 除第 行之外的所有记录:
4651, 4651, 4652, 4752, 4751
假设 file1 中每一对的两个字段的值互不相同:
$ cat tst.awk
# tst.awk -- print the lines of file2 that do not contain both values of any file1 pair.
# Usage: awk -f tst.awk file1 file2
NR==FNR {                           # first file: one whitespace-separated pair per line
    pairs1[NR] = $1
    pairs2[NR] = $2
    next
}
{
    orig = $0                       # keep the untouched record for printing
    gsub(/[[:space:],]+/," ")       # replace comma/blank separators with one space; changing $0 re-splits the fields
    delete vals                     # forget the values of the previous record
    for (i=1; i<=NF; i++) {
        vals[$i]                    # referencing the element creates the key
    }
    for (nr in pairs1) {
        if ( (pairs1[nr] in vals) && (pairs2[nr] in vals) ) {
            next                    # both values of one pair are present: drop this record
        }
    }
    print orig
}
$ awk -f tst.awk file1 file2
2002, 5052, 7001, 1500, 2500
2003, 5051, 3500, 4500, 4952