GAWK 脚本 - 如果 = "" 和 STD 计算,则跳过 $6 OR $7 行
GAWK script- Skip rows $6 OR $7 if = "" and STD calculation
我在 awk 脚本中输入了 2 列(体重、身高)的平均值和标准偏差以及获得的奥运奖牌百分比(按 male/female 求和和分组)。
我现在只缺少标准计算。
有时体重或身高字段有空值。如果其中一个为空,我想跳过它们,以免影响计算。
$6=身高,$7=体重
Header 应该是:
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
输入数据进行测试:
id,name,nationality,sex,date_of_birth,height,weight,sport,gold,silver,bronze,info
736041664,A Jesus Garcia,ESP,male,1969-10-17,,64,athletics,1,0,0,
435962603,Aaron Brown,USA,male,1992-05-27,1.98,79,athletics,0,1,2,
266237702,Aaron Russell,USA,male,1993-06-04,2.05,98,volleyball,0,0,1,
87689776,Aauri Lorena Bokesa,ESP,female,1988-12-14,1.80,62,athletics,0,1,0,
997877719,Ababel Yeshaneh,ETH,female,1991-07-22,1.65,54,athletics,1,0,0,
343694681,Abadi Hadis,ETH,male,1997-11-06,1.70,63,athletics,0,4,0,
376068084,Abbey D'Agostino,USA,female,1992-05-25,1.61,49,athletics,0,0,1,
162792594,Abbey Weitzeil,USA,female,1996-12-03,1.78,68,aquatics,1,1,0,
803161695,Abdelaziz Merzougui,ESP,male,1991-08-30,1.75,,athletics,1,0,1,
脚本是:
BEGIN { FS="," }
NR>1 { medals_all+= ( + + ) # sum of ALL medals
if ( != country) next # if not the country of interest then go to next record
found_country=1
counts[]++ # count of athletes by sex
height_sum[]+= # sum of heights by sex
weight_sum[]+= # sum of weights by sex
medals_sum[]+= ( + + ) # sum of medals by sex
}
END { if (found_country != 1) {
printf "Sorry, country \"%s\" not found.\n", country
}
else {
print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
for (sex in counts)
printf "%s,%s,%.4f Kg,%s,%.3f m,%s,%.4f%\n",
country,sex,
(counts[sex]>0) ? (weight_sum[sex]/counts[sex]) : 0,"weight_std",
(counts[sex]>0) ? (height_sum[sex]/counts[sex]) : 0,"height_std",
(medals_all >0) ? (medals_sum[sex]/medals_all*100) : 0
}
}
我在想类似的东西:
if ( | = "" ) next
但是它给我一个错误,我不知道把它放在哪里(在 END 块之后还是之前?)
这不会尝试进行标准偏差计算和 idk 你是如何在你的预期输出中获得这些奖牌百分比数字的,但你应该很容易调整它来完成你需要的任何东西做:
$ cat tst.awk
BEGIN {
FS = OFS = ","
OFMT = "%.4f"
}
NR==1 {
for (i=1; i<=NF; i++) {
f[$i] = i
}
print "Country", "Sex", "Weight_avg", "Weight_std", "Height_avg", "Height_std", "% Medals"
next
}
$(f["nationality"]) == country_code {
sex = $(f["sex"])
ccWeight[sex] += $(f["weight"])
ccHeight[sex] += $(f["height"])
ccMedals[sex] += ( $(f["gold"]) + $(f["silver"]) + $(f["bronze"]) )
}
END {
for ( sex in ccWeight ) {
avgWeight = ccWeight[sex] / ccMedals[sex]
stdWeight = "foo"
avgHeight = ccWeight[sex] / ccMedals[sex]
stdHeight = "bar"
pctMedals = ( ccMedals[sex] / (NR - 1) ) * 100
print country_code, sex, avgWeight, stdWeight, avgHeight, stdHeight, pctMedals
}
}
$ awk -v country_code=USA -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,39,foo,39,bar,33.3333
USA,male,44.2500,foo,44.2500,bar,44.4444
$ awk -v country_code=ESP -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ESP,female,62,foo,62,bar,11.1111
ESP,male,43.6667,foo,43.6667,bar,33.3333
$ awk -v country_code=ETH -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ETH,male,15.7500,foo,15.7500,bar,44.4444
ETH,female,54,foo,54,bar,11.1111
根据您在下方的评论,这是关于如何计算平均奖牌的另一种可能的解释,但它仍然没有产生您想要的输出,所以我猜您的意思又不同了:
$ cat tst.awk
BEGIN {
FS = OFS = ","
OFMT = "%.4f"
}
NR==1 {
for (i=1; i<=NF; i++) {
f[$i] = i
}
print "Country", "Sex", "Weight_avg", "Weight_std", "Height_avg", "Height_std", "% Medals"
next
}
{
sex = $(f["sex"])
numMedals = ( $(f["gold"]) + $(f["silver"]) + $(f["bronze"]) )
allMedals[sex] += numMedals
}
$(f["nationality"]) == country_code {
ccWeight[sex] += $(f["weight"])
ccHeight[sex] += $(f["height"])
ccMedals[sex] += numMedals
}
END {
for ( sex in ccWeight ) {
avgWeight = ccWeight[sex] / ccMedals[sex]
stdWeight = "foo"
avgHeight = ccWeight[sex] / ccMedals[sex]
stdHeight = "bar"
pctMedals = ( ccMedals[sex] / allMedals[sex] ) * 100
print country_code, sex, avgWeight, stdWeight, avgHeight, stdHeight, pctMedals
}
}
$ awk -v country_code=USA -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,39,foo,39,bar,60
USA,male,44.2500,foo,44.2500,bar,36.3636
一个 awk
想法(没有标准偏差的代码)
$ cat athletes.awk
BEGIN { FS="," }
NR>1 { medals_all+= ( + + ) # sum of ALL medals
if ( != country) next # if not the country of interest then go to next record
found_country=1
counts[]++ # count of athletes by sex
height_sum[]+= # sum of heights by sex
weight_sum[]+= # sum of weights by sex
medals_sum[]+= ( + + ) # sum of medals by sex
}
END { if (found_country != 1) {
printf "Sorry, country \"%s\" not found.\n", country
}
else {
print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
for (sex in counts)
printf "%s,%s,%.4f Kg,%s,%.3f m,%s,%.4f%\n",
country,sex,
(counts[sex]>0) ? (weight_sum[sex]/counts[sex]) : 0,"weight_std",
(counts[sex]>0) ? (height_sum[sex]/counts[sex]) : 0,"height_std",
(medals_all >0) ? (medals_sum[sex]/medals_all*100) : 0
}
}
正在测试脚本:
$ awk -v country=USA -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,58.5000 Kg,weight_std,1.695 m,height_std,18.7500%
USA,male,88.5000 Kg,weight_std,2.015 m,height_std,25.0000%
$ awk -v country=ESP -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ESP,female,62.0000 Kg,weight_std,1.800 m,height_std,6.2500%
ESP,male,65.5000 Kg,weight_std,1.735 m,height_std,18.7500%
$ awk -v country=ETH -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ETH,male,63.0000 Kg,weight_std,1.700 m,height_std,25.0000%
ETH,female,54.0000 Kg,weight_std,1.650 m,height_std,6.2500%
$ awk -v country=XXX -f athletes.awk athletesv2.csv
Sorry, country "XXX" not found.
一个统一的解决方案,不再需要用户一次手动输入一个国家:
PROCINFO[ ]
位用于从 mawk
交换到 gawk
同时保留一些排序顺序
==
< test_athletesv2.csv\
\
| WHINY_USERS=1 mawk '
function sum(_,__,___) {
if(+__<-__) {
return sprintf(OFMT,$_)
}
___=""
for(_;_<=__;_+=3) {
___+=$_ }
return +___
}
function mean(_,__) {
return \
sprintf(OFMT,
(+__<-__ ? +$_ :2/__*sum(_,__))\
/(100.0^(_==1)))
}
function sd(_,__,___,____,_____) {
if(+__<-__) {
return "0.0000000"
}
____=""
_____=100^(_==1)
for(_;_<=+__;_+=3) {
____+=(($_)/_____-___)^2
}
return (____/(__/(_=2)))^_--^-_
}
function printreport(_,__,___) {
___=""
print substr(_,__~__,index(_,"=")-1),
substr(_, index(_,"=")+(_=1)),
___=mean(_,__),sd(_++,__,___),
___=mean(_,__),sd(_++,__,___),
sprintf("%8.4f-%%",sum(_,__)*100/_______)
}
BEGIN { _ = ""
PROCINFO[ "sorted_in" \
] = "@ind_str_asc";
___ = 3
______ = " Country,Gender,Weight_avg,Weight_std"\
",Height_avg,Height_std,%-Medals"
SUBSEP = "="
OFS = FS = ","
getline } { sub("$",sprintf("=%.f=%.f=%.f", \
int(100*),,-_\
+(_+=+++)),
_____[____[$___]=$___,])
} END {
_______ = +_
___ = 3
FS = SUBSEP
CONVFMT = OFMT ="%13.7f"
for(_ in ____) {
printf("%s%s%s",ORS,______,ORS)
for(__ in _____) {
if(index(__,_)) {
$+FS=substr(_____[__],—-___)
printreport(__,(-!!___)^(NF==++___)*NF)
}
}
}
}' | column -s',' -t | column -t | lgp3 3
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
ESP female 1.8000000 0.0000000 62.0000000 0.0000000 6.2500-%
ESP male 1.1566667 0.4723660 43.6666667 17.8688639 18.7500-%
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
ETH female 1.6500000 0.0000000 54.0000000 0.0000000 6.2500-%
ETH male 1.7000000 0.0000000 63.0000000 0.0000000 25.0000-%
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
USA female 1.1300000 0.4665119 39.0000000 17.7106371 18.7500-%
USA male 1.3400000 0.5476008 59.0000000 25.3048085 25.0000-%
我在 awk 脚本中输入了 2 列(体重、身高)的平均值和标准偏差以及获得的奥运奖牌百分比(按 male/female 求和和分组)。
我现在只缺少标准计算。
有时体重或身高字段有空值。如果其中一个为空,我想跳过它们,以免影响计算。
$6=身高,$7=体重
Header 应该是:
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
输入数据进行测试:
id,name,nationality,sex,date_of_birth,height,weight,sport,gold,silver,bronze,info
736041664,A Jesus Garcia,ESP,male,1969-10-17,,64,athletics,1,0,0,
435962603,Aaron Brown,USA,male,1992-05-27,1.98,79,athletics,0,1,2,
266237702,Aaron Russell,USA,male,1993-06-04,2.05,98,volleyball,0,0,1,
87689776,Aauri Lorena Bokesa,ESP,female,1988-12-14,1.80,62,athletics,0,1,0,
997877719,Ababel Yeshaneh,ETH,female,1991-07-22,1.65,54,athletics,1,0,0,
343694681,Abadi Hadis,ETH,male,1997-11-06,1.70,63,athletics,0,4,0,
376068084,Abbey D'Agostino,USA,female,1992-05-25,1.61,49,athletics,0,0,1,
162792594,Abbey Weitzeil,USA,female,1996-12-03,1.78,68,aquatics,1,1,0,
803161695,Abdelaziz Merzougui,ESP,male,1991-08-30,1.75,,athletics,1,0,1,
脚本是:
BEGIN { FS="," }
NR>1 { medals_all+= ( + + ) # sum of ALL medals
if ( != country) next # if not the country of interest then go to next record
found_country=1
counts[]++ # count of athletes by sex
height_sum[]+= # sum of heights by sex
weight_sum[]+= # sum of weights by sex
medals_sum[]+= ( + + ) # sum of medals by sex
}
END { if (found_country != 1) {
printf "Sorry, country \"%s\" not found.\n", country
}
else {
print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
for (sex in counts)
printf "%s,%s,%.4f Kg,%s,%.3f m,%s,%.4f%\n",
country,sex,
(counts[sex]>0) ? (weight_sum[sex]/counts[sex]) : 0,"weight_std",
(counts[sex]>0) ? (height_sum[sex]/counts[sex]) : 0,"height_std",
(medals_all >0) ? (medals_sum[sex]/medals_all*100) : 0
}
}
我在想类似的东西:
if ( | = "" ) next
但是它给我一个错误,我不知道把它放在哪里(在 END 块之后还是之前?)
这不会尝试进行标准偏差计算和 idk 你是如何在你的预期输出中获得这些奖牌百分比数字的,但你应该很容易调整它来完成你需要的任何东西做:
$ cat tst.awk
BEGIN {
FS = OFS = ","
OFMT = "%.4f"
}
NR==1 {
for (i=1; i<=NF; i++) {
f[$i] = i
}
print "Country", "Sex", "Weight_avg", "Weight_std", "Height_avg", "Height_std", "% Medals"
next
}
$(f["nationality"]) == country_code {
sex = $(f["sex"])
ccWeight[sex] += $(f["weight"])
ccHeight[sex] += $(f["height"])
ccMedals[sex] += ( $(f["gold"]) + $(f["silver"]) + $(f["bronze"]) )
}
END {
for ( sex in ccWeight ) {
avgWeight = ccWeight[sex] / ccMedals[sex]
stdWeight = "foo"
avgHeight = ccWeight[sex] / ccMedals[sex]
stdHeight = "bar"
pctMedals = ( ccMedals[sex] / (NR - 1) ) * 100
print country_code, sex, avgWeight, stdWeight, avgHeight, stdHeight, pctMedals
}
}
$ awk -v country_code=USA -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,39,foo,39,bar,33.3333
USA,male,44.2500,foo,44.2500,bar,44.4444
$ awk -v country_code=ESP -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ESP,female,62,foo,62,bar,11.1111
ESP,male,43.6667,foo,43.6667,bar,33.3333
$ awk -v country_code=ETH -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ETH,male,15.7500,foo,15.7500,bar,44.4444
ETH,female,54,foo,54,bar,11.1111
根据您在下方的评论,这是关于如何计算平均奖牌的另一种可能的解释,但它仍然没有产生您想要的输出,所以我猜您的意思又不同了:
$ cat tst.awk
BEGIN {
FS = OFS = ","
OFMT = "%.4f"
}
NR==1 {
for (i=1; i<=NF; i++) {
f[$i] = i
}
print "Country", "Sex", "Weight_avg", "Weight_std", "Height_avg", "Height_std", "% Medals"
next
}
{
sex = $(f["sex"])
numMedals = ( $(f["gold"]) + $(f["silver"]) + $(f["bronze"]) )
allMedals[sex] += numMedals
}
$(f["nationality"]) == country_code {
ccWeight[sex] += $(f["weight"])
ccHeight[sex] += $(f["height"])
ccMedals[sex] += numMedals
}
END {
for ( sex in ccWeight ) {
avgWeight = ccWeight[sex] / ccMedals[sex]
stdWeight = "foo"
avgHeight = ccWeight[sex] / ccMedals[sex]
stdHeight = "bar"
pctMedals = ( ccMedals[sex] / allMedals[sex] ) * 100
print country_code, sex, avgWeight, stdWeight, avgHeight, stdHeight, pctMedals
}
}
$ awk -v country_code=USA -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,39,foo,39,bar,60
USA,male,44.2500,foo,44.2500,bar,36.3636
一个 awk
想法(没有标准偏差的代码)
$ cat athletes.awk
BEGIN { FS="," }
NR>1 { medals_all+= ( + + ) # sum of ALL medals
if ( != country) next # if not the country of interest then go to next record
found_country=1
counts[]++ # count of athletes by sex
height_sum[]+= # sum of heights by sex
weight_sum[]+= # sum of weights by sex
medals_sum[]+= ( + + ) # sum of medals by sex
}
END { if (found_country != 1) {
printf "Sorry, country \"%s\" not found.\n", country
}
else {
print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
for (sex in counts)
printf "%s,%s,%.4f Kg,%s,%.3f m,%s,%.4f%\n",
country,sex,
(counts[sex]>0) ? (weight_sum[sex]/counts[sex]) : 0,"weight_std",
(counts[sex]>0) ? (height_sum[sex]/counts[sex]) : 0,"height_std",
(medals_all >0) ? (medals_sum[sex]/medals_all*100) : 0
}
}
正在测试脚本:
$ awk -v country=USA -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,58.5000 Kg,weight_std,1.695 m,height_std,18.7500%
USA,male,88.5000 Kg,weight_std,2.015 m,height_std,25.0000%
$ awk -v country=ESP -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ESP,female,62.0000 Kg,weight_std,1.800 m,height_std,6.2500%
ESP,male,65.5000 Kg,weight_std,1.735 m,height_std,18.7500%
$ awk -v country=ETH -f athletes.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ETH,male,63.0000 Kg,weight_std,1.700 m,height_std,25.0000%
ETH,female,54.0000 Kg,weight_std,1.650 m,height_std,6.2500%
$ awk -v country=XXX -f athletes.awk athletesv2.csv
Sorry, country "XXX" not found.
一个统一的解决方案,不再需要用户一次手动输入一个国家:
PROCINFO[ ]
位用于从mawk
交换到gawk
同时保留一些排序顺序
==
< test_athletesv2.csv\
\
| WHINY_USERS=1 mawk '
function sum(_,__,___) {
if(+__<-__) {
return sprintf(OFMT,$_)
}
___=""
for(_;_<=__;_+=3) {
___+=$_ }
return +___
}
function mean(_,__) {
return \
sprintf(OFMT,
(+__<-__ ? +$_ :2/__*sum(_,__))\
/(100.0^(_==1)))
}
function sd(_,__,___,____,_____) {
if(+__<-__) {
return "0.0000000"
}
____=""
_____=100^(_==1)
for(_;_<=+__;_+=3) {
____+=(($_)/_____-___)^2
}
return (____/(__/(_=2)))^_--^-_
}
function printreport(_,__,___) {
___=""
print substr(_,__~__,index(_,"=")-1),
substr(_, index(_,"=")+(_=1)),
___=mean(_,__),sd(_++,__,___),
___=mean(_,__),sd(_++,__,___),
sprintf("%8.4f-%%",sum(_,__)*100/_______)
}
BEGIN { _ = ""
PROCINFO[ "sorted_in" \
] = "@ind_str_asc";
___ = 3
______ = " Country,Gender,Weight_avg,Weight_std"\
",Height_avg,Height_std,%-Medals"
SUBSEP = "="
OFS = FS = ","
getline } { sub("$",sprintf("=%.f=%.f=%.f", \
int(100*),,-_\
+(_+=+++)),
_____[____[$___]=$___,])
} END {
_______ = +_
___ = 3
FS = SUBSEP
CONVFMT = OFMT ="%13.7f"
for(_ in ____) {
printf("%s%s%s",ORS,______,ORS)
for(__ in _____) {
if(index(__,_)) {
$+FS=substr(_____[__],—-___)
printreport(__,(-!!___)^(NF==++___)*NF)
}
}
}
}' | column -s',' -t | column -t | lgp3 3
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
ESP female 1.8000000 0.0000000 62.0000000 0.0000000 6.2500-%
ESP male 1.1566667 0.4723660 43.6666667 17.8688639 18.7500-%
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
ETH female 1.6500000 0.0000000 54.0000000 0.0000000 6.2500-%
ETH male 1.7000000 0.0000000 63.0000000 0.0000000 25.0000-%
Country Gender Weight_avg Weight_std Height_avg Height_std %-Medals
USA female 1.1300000 0.4665119 39.0000000 17.7106371 18.7500-%
USA male 1.3400000 0.5476008 59.0000000 25.3048085 25.0000-%