GAWK 脚本 - 如果 $6 或 $7 为 ""，则跳过该行，以及 STD（标准偏差）计算

Question

我需要在 awk 脚本中计算两列（体重、身高）的平均值和标准偏差，以及所获奥运奖牌的百分比（按 male/female 分组求和）。

我现在只缺少标准偏差（STD）的计算。

有时体重或身高字段有空值。如果其中一个为空，我想跳过它们，以免影响计算。

$6=身高,$7=体重

Header 应该是：

Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals

输入数据进行测试：

id,name,nationality,sex,date_of_birth,height,weight,sport,gold,silver,bronze,info
736041664,A Jesus Garcia,ESP,male,1969-10-17,,64,athletics,1,0,0,
435962603,Aaron Brown,USA,male,1992-05-27,1.98,79,athletics,0,1,2,
266237702,Aaron Russell,USA,male,1993-06-04,2.05,98,volleyball,0,0,1,
87689776,Aauri Lorena Bokesa,ESP,female,1988-12-14,1.80,62,athletics,0,1,0,
997877719,Ababel Yeshaneh,ETH,female,1991-07-22,1.65,54,athletics,1,0,0,
343694681,Abadi Hadis,ETH,male,1997-11-06,1.70,63,athletics,0,4,0,
376068084,Abbey D'Agostino,USA,female,1992-05-25,1.61,49,athletics,0,0,1,
162792594,Abbey Weitzeil,USA,female,1996-12-03,1.78,68,aquatics,1,1,0,
803161695,Abdelaziz Merzougui,ESP,male,1991-08-30,1.75,,athletics,1,0,1,

脚本是：

# Per-sex weight/height averages and medal share for a single country.
# The country code is read from the awk variable "country", which the caller
# must set (for example: awk -v country=ESP -f script.awk file.csv).
# Columns used: $3 nationality, $4 sex, $6 height, $7 weight,
#               $9 gold, $10 silver, $11 bronze.
# Note: rows with an empty height or weight still increment counts[] and add 0
# to the sums, so they pull the averages down.
BEGIN { FS="," }
NR>1  { medals_all += ($9 + $10 + $11)       # sum of ALL medals (NR>1 skips the header line)
        if ($3 != country) next              # if not the country of interest then go to next record
        found_country=1
        counts[$4]++                         # count of athletes by sex
        height_sum[$4] += $6                 # sum of heights by sex
        weight_sum[$4] += $7                 # sum of weights by sex
        medals_sum[$4] += ($9 + $10 + $11)   # sum of medals by sex
      }
END   { if (found_country != 1) {
           printf "Sorry, country \"%s\" not found.\n", country
        }
        else {
           print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
           # "weight_std" / "height_std" are literal placeholders: no standard
           # deviation is computed yet.  "%%" prints a literal percent sign.
           for (sex in counts)
               printf "%s,%s,%.4f Kg,%s,%.3f m,%s,%.4f%%\n",
                      country,sex,
                      (counts[sex]>0) ? (weight_sum[sex]/counts[sex])    : 0,"weight_std",
                      (counts[sex]>0) ? (height_sum[sex]/counts[sex])    : 0,"height_std",
                      (medals_all >0) ? (medals_sum[sex]/medals_all*100) : 0
        }
      }

我在想类似的东西：

if ($6 | $7 = "" ) next

但是它给我一个错误，我不知道把它放在哪里（在 END 块之后还是之前？）

Answer 1

下面的脚本没有尝试计算标准偏差，而且我不知道你预期输出中的那些奖牌百分比数字是怎么得到的，但你应该可以很容易地调整它来完成你需要做的任何事情：

$ cat tst.awk
# tst.awk: per-sex aggregates for the country whose code is given in the
# awk variable country_code (set by the caller with -v).
BEGIN {
    FS = OFS = ","      # comma-separated input and output
    OFMT = "%.4f"       # format print uses for non-integer numbers
}
# Header line: map each column name to its field number, then print the
# output header.
NR==1 {
    for (i=1; i<=NF; i++) {
        f[$i] = i
    }
    print "Country", "Sex", "Weight_avg", "Weight_std", "Height_avg", "Height_std", "% Medals"
    next
}
# Data rows whose nationality equals country_code: accumulate totals by sex.
# An empty weight or height field contributes 0 to its sum.
$(f["nationality"]) == country_code {
    sex = $(f["sex"])
    ccWeight[sex] += $(f["weight"])
    ccHeight[sex] += $(f["height"])
    ccMedals[sex] += ( $(f["gold"]) + $(f["silver"]) + $(f["bronze"]) )
}
END {
    for ( sex in ccWeight ) {
        # NOTE(review): the divisor is the medal total for this sex, not a
        # count of athletes, so this is not a per-athlete mean; the divisor
        # is zero when that sex won no medals.
        avgWeight = ccWeight[sex] / ccMedals[sex]
        stdWeight = "foo"       # placeholder: no standard deviation is computed
        # NOTE(review): this reads ccWeight again; ccHeight is accumulated
        # above but never used.
        avgHeight = ccWeight[sex] / ccMedals[sex]
        stdHeight = "bar"       # placeholder: no standard deviation is computed
        # Medal total for this sex relative to the number of data rows read
        # (NR minus the header line), as a percentage.
        pctMedals = ( ccMedals[sex] / (NR - 1) ) * 100

        print country_code, sex, avgWeight, stdWeight, avgHeight, stdHeight, pctMedals
    }
}

$ awk -v country_code=USA -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,39,foo,39,bar,33.3333
USA,male,44.2500,foo,44.2500,bar,44.4444

$ awk -v country_code=ESP -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ESP,female,62,foo,62,bar,11.1111
ESP,male,43.6667,foo,43.6667,bar,33.3333

$ awk -v country_code=ETH -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ETH,male,15.7500,foo,15.7500,bar,44.4444
ETH,female,54,foo,54,bar,11.1111

根据您在下方的评论，这是关于如何计算平均奖牌的另一种可能的解释，但它仍然没有产生您想要的输出，所以我猜您的意思又不同了：

$ cat tst.awk
# tst.awk (second variant): like the first, but the medal percentage is
# computed against the medal total of the same sex across all countries.
# The country code is read from the awk variable country_code.
BEGIN {
    FS = OFS = ","      # comma-separated input and output
    OFMT = "%.4f"       # format print uses for non-integer numbers
}
# Header line: map each column name to its field number, then print the
# output header.
NR==1 {
    for (i=1; i<=NF; i++) {
        f[$i] = i
    }
    print "Country", "Sex", "Weight_avg", "Weight_std", "Height_avg", "Height_std", "% Medals"
    next
}
# Every data row (the header was consumed by "next" above): remember the sex
# and medal count of this row and add the medals to the all-country total.
{
    sex = $(f["sex"])
    numMedals = ( $(f["gold"]) + $(f["silver"]) + $(f["bronze"]) )
    allMedals[sex] += numMedals
}
# Rows of the requested country: accumulate totals by sex, reusing the sex
# and numMedals values set by the rule above for the same record.
$(f["nationality"]) == country_code {
    ccWeight[sex] += $(f["weight"])
    ccHeight[sex] += $(f["height"])
    ccMedals[sex] += numMedals
}
END {
    for ( sex in ccWeight ) {
        # NOTE(review): the divisor is the medal total for this sex, not a
        # count of athletes, so this is not a per-athlete mean; the divisor
        # is zero when that sex won no medals.
        avgWeight = ccWeight[sex] / ccMedals[sex]
        stdWeight = "foo"       # placeholder: no standard deviation is computed
        # NOTE(review): this reads ccWeight again; ccHeight is accumulated
        # above but never used.
        avgHeight = ccWeight[sex] / ccMedals[sex]
        stdHeight = "bar"       # placeholder: no standard deviation is computed
        # Share of this sex's medals (all countries) won by the requested country.
        pctMedals = ( ccMedals[sex] / allMedals[sex] ) * 100

        print country_code, sex, avgWeight, stdWeight, avgHeight, stdHeight, pctMedals
    }
}

$ awk -v country_code=USA -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,39,foo,39,bar,60
USA,male,44.2500,foo,44.2500,bar,36.3636

Answer 2

一个 awk 想法（没有标准偏差的代码）

$ cat athletes.awk
# athletes.awk: per-sex weight/height averages and medal share for one country.
# Usage: awk -v country=USA -f athletes.awk athletesv2.csv
# Columns used: $3 nationality, $4 sex, $6 height, $7 weight,
#               $9 gold, $10 silver, $11 bronze.
# Note: rows with an empty height or weight still increment counts[] and add 0
# to the sums, so they pull the averages down.
BEGIN { FS="," }
NR>1  { medals_all += ($9 + $10 + $11)       # sum of ALL medals (NR>1 skips the header line)
        if ($3 != country) next              # if not the country of interest then go to next record
        found_country=1
        counts[$4]++                         # count of athletes by sex
        height_sum[$4] += $6                 # sum of heights by sex
        weight_sum[$4] += $7                 # sum of weights by sex
        medals_sum[$4] += ($9 + $10 + $11)   # sum of medals by sex
      }
END   { if (found_country != 1) {
           printf "Sorry, country \"%s\" not found.\n", country
        }
        else {
           print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
           # "weight_std" / "height_std" are literal placeholders: no standard
           # deviation is computed.  "%%" prints a literal percent sign.
           for (sex in counts)
               printf "%s,%s,%.4f Kg,%s,%.3f m,%s,%.4f%%\n",
                      country,sex,
                      (counts[sex]>0) ? (weight_sum[sex]/counts[sex])    : 0,"weight_std",
                      (counts[sex]>0) ? (height_sum[sex]/counts[sex])    : 0,"height_std",
                      (medals_all >0) ? (medals_sum[sex]/medals_all*100) : 0
        }
      }

正在测试脚本：

$ awk -v country=USA -f athletes.awk athletesv2.csv

Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,58.5000 Kg,weight_std,1.695 m,height_std,18.7500%
USA,male,88.5000 Kg,weight_std,2.015 m,height_std,25.0000%

$ awk -v country=ESP -f athletes.awk athletesv2.csv

Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ESP,female,62.0000 Kg,weight_std,1.800 m,height_std,6.2500%
ESP,male,65.5000 Kg,weight_std,1.735 m,height_std,18.7500%

$ awk -v country=ETH -f athletes.awk athletesv2.csv

Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ETH,male,63.0000 Kg,weight_std,1.700 m,height_std,25.0000%
ETH,female,54.0000 Kg,weight_std,1.650 m,height_std,6.2500%

$ awk -v country=XXX -f athletes.awk athletesv2.csv

Sorry, country "XXX" not found.

Answer 3

一个统一的解决方案，不再需要用户一次手动输入一个国家：

PROCINFO[ ] 那一部分是为了在把 mawk 换成 gawk 时仍能保持一定的排序顺序

==

# Pipeline: feeds test_athletesv2.csv on stdin to an awk program run under
# mawk (with WHINY_USERS=1 in its environment), then aligns the
# comma-separated report with column(1).
# NOTE(review): "lgp3" at the end is not defined anywhere in view; confirm
# what it is before reusing this pipeline.
< test_athletesv2.csv\
\
| WHINY_USERS=1 mawk '
  # sum(_, __): with a negative __ returns field $_ formatted with OFMT;
  # otherwise adds up every third field from index _ through __.
  # The third parameter is only used as a local accumulator.
  function sum(_,__,___) { 
      if(+__<-__) { 
           return sprintf(OFMT,$_) 
      }
      ___=""
      for(_;_<=__;_+=3) {
         ___+=$_ }
      return +___ 
   } 
   # mean(_, __): formatted with OFMT; uses field $_ when __ is negative,
   # otherwise 2/__ * sum(_, __). The value is divided by 100 when _ is 1.
   function mean(_,__) {
       return \
          sprintf(OFMT,
          (+__<-__ ? +$_ :2/__*sum(_,__))\
                          /(100.0^(_==1)))
   }
   # sd(_, __, ___): returns the string 0.0000000 when __ is negative.
   # Otherwise accumulates (field/scale - ___)^2 over every third field from
   # _ through __, where scale is 100 when _ is 1 and 1 otherwise; ___ is the
   # value supplied by the caller. The last two parameters are locals.
   # NOTE(review): the return expression presumably takes a square root of
   # the accumulated sum divided by __/2; confirm the operator precedence.
   function sd(_,__,___,____,_____) {
       if(+__<-__) {
           return "0.0000000"
       }
       ____=""
       _____=100^(_==1)
       for(_;_<=+__;_+=3) {
           ____+=(($_)/_____-___)^2
       }
       return (____/(__/(_=2)))^_--^-_ 
  }
  # printreport(_, __): prints one report row: the parts of the first
  # argument before and after its first "=" character, then mean and sd for
  # two successive starting field indexes, then sum() as a percentage of the
  # global _______.
  # NOTE(review): relies on the print arguments being evaluated left to
  # right, since _ and ___ are reassigned inside the argument list.
  function printreport(_,__,___) {
      ___=""
      print substr(_,__~__,index(_,"=")-1),
            substr(_,      index(_,"=")+(_=1)),
            ___=mean(_,__),sd(_++,__,___),
            ___=mean(_,__),sd(_++,__,___),
            sprintf("%8.4f-%%",sum(_,__)*100/_______)  
 } 
 # Setup: request string-ordered array traversal via PROCINFO, keep the
 # report header text in ______, use "=" as SUBSEP and "," as FS/OFS, then
 # getline consumes the first input line before the main rule runs.
 BEGIN {  _ = ""
 PROCINFO[    "sorted_in"  \
          ] = "@ind_str_asc";
        ___ = 3
     ______ = " Country,Gender,Weight_avg,Weight_std"\
              ",Height_avg,Height_std,%-Medals"
     SUBSEP =   "="
        OFS = FS = ","
       
 # NOTE(review): the main rule below has empty operands (for example
 # "int(100*)" and a trailing comma before "]"), so it does not parse as
 # written; field references look to have been lost. Restore them from the
 # original post before running.
 getline } { sub("$",sprintf("=%.f=%.f=%.f", \
                             int(100*),,-_\
                             +(_+=+++)),
                      _____[____[$___]=$___,]) 
 } END {
      # _______ keeps the numeric value of _ for the percentage column;
      # records are then re-split on SUBSEP and numbers use 7 decimals.
      _______ = +_
          ___ =  3
           FS = SUBSEP
      CONVFMT = OFMT ="%13.7f"
 
      # One header plus report rows per key of ____; a key of _____ is
      # reported when it contains the current ____ key.
      for(_ in ____) {
          printf("%s%s%s",ORS,______,ORS)
          for(__ in _____) {
              if(index(__,_)) {
                  # NOTE(review): the dash sequence in the substr call is not
                  # an awk operator as written; presumably a double minus in
                  # the original. Confirm before running.
                  $+FS=substr(_____[__],—-___)
                  printreport(__,(-!!___)^(NF==++___)*NF) 
              } 
          }
      }
 }' | column -s',' -t | column -t |   lgp3 3 

Country  Gender  Weight_avg  Weight_std  Height_avg  Height_std  %-Medals
ESP      female  1.8000000   0.0000000   62.0000000  0.0000000   6.2500-%
ESP      male    1.1566667   0.4723660   43.6666667  17.8688639  18.7500-%

Country  Gender  Weight_avg  Weight_std  Height_avg  Height_std  %-Medals
ETH      female  1.6500000   0.0000000   54.0000000  0.0000000   6.2500-%
ETH      male    1.7000000   0.0000000   63.0000000  0.0000000   25.0000-%

Country  Gender  Weight_avg  Weight_std  Height_avg  Height_std  %-Medals
USA      female  1.1300000   0.4665119   39.0000000  17.7106371  18.7500-%
USA      male    1.3400000   0.5476008   59.0000000  25.3048085  25.0000-%

GAWK 脚本 - 如果 $6 或 $7 为 ""，则跳过该行，以及 STD（标准偏差）计算

GAWK script- Skip rows $6 OR $7 if = "" and STD calculation

csv

awk