awk:如何拆分字段并将空白替换为 NA
awk how to split and change blank by NA
我在用 awk 处理数据时遇到了麻烦。我想将一个文件拆分为 2 个文件,脚本基本上可以正常工作,但我还有最后一个问题:
这是我的输入文件之一:
samplexxx EH Tred GangSTR
dijen006 nofile nofile nofile
dijen006_100 22,30 22,27 19,25
dijen006_75 25,27 29 NA
dijen017 nofile nofile nofile
dijen017_100 75,121 54 24,24
dijen017_75 74,131 72 19,19
dijen081 63,84 32 40,40
dijen081_100 70,115 78 25,41
dijen081_75 79,143 95 24,104
dijen082 47,51 38 15,34
dijen082_100 46,61 52 6,32
dijen082_75 NA 55 17,17
dijen083 30,53 30,40 38,38
dijen083_100 43,53 30,59 23,32
dijen083_75 43,60 18,74 23,71
dijen1013 30 30 20,30
dijen1013_100 30 30 9,19
dijen1013_75 21 33 20,20
dijen1014 9,30 9,30 9,30
dijen1014_100 9,28 9,43 9,11
dijen1014_75 9,28 9,36 9,29
dijen1015 23,30 23,30 23,29
dijen1015_100 23,30 NA 13,22
dijen1015_75 25,27 21,42 22,39
dijen402 25,31 25,31 25,31
dijen402_100 30 29,36 14,30
dijen402_75 25,26 22,39 22,39
我正在使用此代码:
#!/bin/awk -f
#USAGE = awk -v my_var=$(basename $i .tsv) split_file_allelle.awk $i
# Splits every "a,b" field of a tab-separated file into two outputs:
# the first halves go to <my_var>_all1.tsv, the second halves to
# <my_var>_all2.tsv.
BEGIN { FS=OFS="\t" }
# Header line: copied unchanged to both outputs.
NR == 1 {
    str1 = str2 = $0
}
# Data lines: start both rows with the sample name ($1), then append
# one half of each remaining field.
NR > 1 {
    str1 = str2 = $1
    for (i=2; i<=NF; i++) {
        split($i,a,/,/)
        str1 = str1 OFS a[1]
        str2 = str2 OFS a[2]    # empty when the field contains no comma
    }
}
{
    print str1 > my_var"_all1.tsv"
    print str2 > my_var"_all2.tsv"
}
我得到了两个文件(各字段已按“,”拆分),其中第二个文件类似下面这样。您认为有没有办法让第二个文件中没有数字的位置显示 'NA' 而不是空白?
samplexxx EH Tred GangSTR
dijen006
dijen006_100 30 27 25
dijen006_75 27
dijen017
dijen017_100 121 24
dijen017_75 131 19
dijen081 84 40
dijen081_100 115 41
dijen081_75 143 104
dijen082 51 34
dijen082_100 61 32
dijen082_75 17
dijen083 53 40 38
dijen083_100 53 59 32
dijen083_75 60 74 71
dijen1013 30
dijen1013_100 19
dijen1013_75 20
dijen1014 30 30 30
dijen1014_100 28 43 11
dijen1014_75 28 36 29
dijen1015 30 30 29
dijen1015_100 30 22
dijen1015_75 27 42 39
dijen402 31 31 31
dijen402_100 36 30
dijen402_75 26 39 39
这是我目前得到的结果,但我想要的是类似这样的结果:
samplexxx EH Tred GangSTR
dijen006 NA NA NA
dijen006_100 30 27 25
dijen006_75 27 NA NA
dijen017 NA NA NA
dijen017_100 121 NA 24
....
感谢您的帮助!
# Splits every "a,b" field of a tab-separated file into two outputs:
# first halves go to <my_var>_all1.tsv, second halves to <my_var>_all2.tsv.
# A field without a comma yields "NA" in the second output.
BEGIN {
    FS = OFS = "\t"
    # Build the output file names once instead of once per input line.
    all1 = my_var "_all1.tsv"
    all2 = my_var "_all2.tsv"
}
# Header line: copied unchanged to both outputs.
NR == 1 {
    str1 = str2 = $0
}
# Data lines: start both rows with the sample name ($1), then append
# one half of each remaining field.
NR > 1 {
    str1 = str2 = $1
    for (i=2; i<=NF; i++) {
        # split() returns the number of pieces; 1 means the field had no comma.
        n = split($i,a,",")
        str1 = str1 OFS a[1]
        str2 = str2 OFS (n == 1 ? "NA" : a[2])
    }
}
{
    print str1 > all1
    print str2 > all2
}
要解决您询问的具体问题,并不需要把 print str1 > my_var"_all1.tsv"
改成 print str1 > all1
——真正解决问题的是对 split()
返回值进行判断的三元表达式。但是,按照 POSIX 标准,print str1 > my_var"_all1.tsv"
属于未定义行为,因此在某些 awk 实现中会失败;应当像我这样使用变量,或者在生成文件名的表达式两侧加上括号,写成 print str1 > (my_var"_all1.tsv")
。使用变量还更高效,因为字符串连接只执行一次,而不是每行执行一次。
我在用 awk 处理数据时遇到了麻烦。我想将一个文件拆分为 2 个文件,脚本基本上可以正常工作,但我还有最后一个问题:
这是我的输入文件之一:
samplexxx EH Tred GangSTR
dijen006 nofile nofile nofile
dijen006_100 22,30 22,27 19,25
dijen006_75 25,27 29 NA
dijen017 nofile nofile nofile
dijen017_100 75,121 54 24,24
dijen017_75 74,131 72 19,19
dijen081 63,84 32 40,40
dijen081_100 70,115 78 25,41
dijen081_75 79,143 95 24,104
dijen082 47,51 38 15,34
dijen082_100 46,61 52 6,32
dijen082_75 NA 55 17,17
dijen083 30,53 30,40 38,38
dijen083_100 43,53 30,59 23,32
dijen083_75 43,60 18,74 23,71
dijen1013 30 30 20,30
dijen1013_100 30 30 9,19
dijen1013_75 21 33 20,20
dijen1014 9,30 9,30 9,30
dijen1014_100 9,28 9,43 9,11
dijen1014_75 9,28 9,36 9,29
dijen1015 23,30 23,30 23,29
dijen1015_100 23,30 NA 13,22
dijen1015_75 25,27 21,42 22,39
dijen402 25,31 25,31 25,31
dijen402_100 30 29,36 14,30
dijen402_75 25,26 22,39 22,39
我正在使用此代码:
#!/bin/awk -f
#USAGE = awk -v my_var=$(basename $i .tsv) split_file_allelle.awk $i
# Splits every "a,b" field of a tab-separated file into two outputs:
# the first halves go to <my_var>_all1.tsv, the second halves to
# <my_var>_all2.tsv.
BEGIN { FS=OFS="\t" }
# Header line: copied unchanged to both outputs.
NR == 1 {
    str1 = str2 = $0
}
# Data lines: start both rows with the sample name ($1), then append
# one half of each remaining field.
NR > 1 {
    str1 = str2 = $1
    for (i=2; i<=NF; i++) {
        split($i,a,/,/)
        str1 = str1 OFS a[1]
        str2 = str2 OFS a[2]    # empty when the field contains no comma
    }
}
{
    print str1 > my_var"_all1.tsv"
    print str2 > my_var"_all2.tsv"
}
我得到了两个文件(各字段已按“,”拆分),其中第二个文件类似下面这样。您认为有没有办法让第二个文件中没有数字的位置显示 'NA' 而不是空白?
samplexxx EH Tred GangSTR
dijen006
dijen006_100 30 27 25
dijen006_75 27
dijen017
dijen017_100 121 24
dijen017_75 131 19
dijen081 84 40
dijen081_100 115 41
dijen081_75 143 104
dijen082 51 34
dijen082_100 61 32
dijen082_75 17
dijen083 53 40 38
dijen083_100 53 59 32
dijen083_75 60 74 71
dijen1013 30
dijen1013_100 19
dijen1013_75 20
dijen1014 30 30 30
dijen1014_100 28 43 11
dijen1014_75 28 36 29
dijen1015 30 30 29
dijen1015_100 30 22
dijen1015_75 27 42 39
dijen402 31 31 31
dijen402_100 36 30
dijen402_75 26 39 39
这是我目前得到的结果,但我想要的是类似这样的结果:
samplexxx EH Tred GangSTR
dijen006 NA NA NA
dijen006_100 30 27 25
dijen006_75 27 NA NA
dijen017 NA NA NA
dijen017_100 121 NA 24
....
感谢您的帮助!
# Splits every "a,b" field of a tab-separated file into two outputs:
# first halves go to <my_var>_all1.tsv, second halves to <my_var>_all2.tsv.
# A field without a comma yields "NA" in the second output.
BEGIN {
    FS = OFS = "\t"
    # Build the output file names once instead of once per input line.
    all1 = my_var "_all1.tsv"
    all2 = my_var "_all2.tsv"
}
# Header line: copied unchanged to both outputs.
NR == 1 {
    str1 = str2 = $0
}
# Data lines: start both rows with the sample name ($1), then append
# one half of each remaining field.
NR > 1 {
    str1 = str2 = $1
    for (i=2; i<=NF; i++) {
        # split() returns the number of pieces; 1 means the field had no comma.
        n = split($i,a,",")
        str1 = str1 OFS a[1]
        str2 = str2 OFS (n == 1 ? "NA" : a[2])
    }
}
{
    print str1 > all1
    print str2 > all2
}
要解决您询问的具体问题,并不需要把 print str1 > my_var"_all1.tsv"
改成 print str1 > all1
——真正解决问题的是对 split()
返回值进行判断的三元表达式。但是,按照 POSIX 标准,print str1 > my_var"_all1.tsv"
属于未定义行为,因此在某些 awk 实现中会失败;应当像我这样使用变量,或者在生成文件名的表达式两侧加上括号,写成 print str1 > (my_var"_all1.tsv")
。使用变量还更高效,因为字符串连接只执行一次,而不是每行执行一次。