如何使用 awk 规范化 csv 的特定列的值？

Question

我有一个包含多个变量的 csv，我只想利用标准差对其中某些特定列进行标准化，即用每个值减去该变量的均值，再除以该变量的标准差。

$z = \frac{x-\overline{x}}{\sigma}$

文件以逗号分隔，只需使用 awk 对变量 months_loan_duration 和 amount 进行这种转换。

输入看起来像这样但是有一千行：

checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,6,critical,radio/tv,1169.53
1 - 200 DM,48,repaid,radio/tv,5951.78
,12,critical,education,2096.23

输出会是这样的：

checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,-1.236,critical,radio/tv,-0.745
1 - 200 DM,2.248,repaid,radio/tv,0.95
,-0.738,critical,education,-0.417

到目前为止，我尝试了以下未成功的方法：

#! /usr/bin/awk -f
# Attempt posted in the question; it does NOT produce the desired output.
# It is kept as posted, except that the first field reference in the final
# print statement has been restored. Intent: replace columns 2 and 5 with
# (value - mean) / standard deviation.

# NOTE(review): BEGIN runs before any record is read, so NF does not yet hold
# the number of columns here and numberColumn is not the real column count.
BEGIN{FS=","; OFS=",";numberColumn=NF}
NR!=1   # a pattern alone on its line gets the default action (print the record)
{
# This brace block starts on its own line, so it is a separate rule with no
# pattern and runs for every record, header included.

# Accumulate the per-column sum and sum of squares.
for(i=1;i <=  numberColumn;i++)
        {
        total[i]+=$i;
        totalSquared[i]+=$i^2;
        }

# Mean and standard deviation from the records read so far only (a running
# value, not the statistics of the whole file).
for (i=1;i <= numberColumn;i++)
        {
        avg[i]=total[i]/(NR-1);
        std[i]=sqrt((totalSquared[i]/(NR-1))-avg[i]^2);
        }

# Normalized value of each column of the current record.
for (i=1;i <= numberColumn;i++)
        {
        norm[i]=(($i-avg[i])/std[i])
        }
}
{
# NOTE(review): $norm[2] is a field reference whose index is the value of
# norm[2], not the value norm[2] itself; 3 and 4 are printed as literal numbers.
print $1,$norm[2],3,4,$norm[5]
}

Answer 1

读取文件两次会更容易：

# Standardize (z-score) columns 2 and 5 of a comma-separated file with a header.
# The file is named twice so that awk reads it in two passes: the first pass
# accumulates the sums, the second pass rewrites the two columns.
awk -F, -v OFS=, '
    NR==FNR {                   # 1st pass: accumulate values
        if (FNR > 1) {          # skip the header line
            sx2 += $2           # sum of col2
            sxx2 += $2 * $2     # sum of col2^2
            sx5 += $5           # sum of col5
            sxx5 += $5 * $5     # sum of col5^2
            n++                 # count of samples
        }
        next
    }
    FNR==1 {                    # 2nd pass, 1st line: calc means and stdevs
        if (n > 1) {            # a sample stdev needs two or more rows;
                                # otherwise sd2/sd5 stay 0 and rows pass through
            ave2 = sx2 / n      # mean of col2
            # sample variance: (sum(x^2) - n * mean^2) / (n - 1)
            var2 = sxx2 / (n - 1) - ave2 * ave2 * n / (n - 1)
            if (var2 < 0) var2 = 0  # avoid rounding error
            sd2 = sqrt(var2)        # stdev of col2
            ave5 = sx5 / n          # mean of col5
            var5 = sxx5 / (n - 1) - ave5 * ave5 * n / (n - 1)
            if (var5 < 0) var5 = 0
            sd5 = sqrt(var5)        # stdev of col5
        }
        print                   # print the header line
    }
    FNR>1 {                     # 2nd pass, data lines
        # a column with zero spread is left unchanged (no division by zero)
        if (sd2 > 0) $2 = ($2 - ave2) / sd2
        if (sd5 > 0) $5 = ($5 - ave5) / sd5
        print
    }
' input_file.csv input_file.csv

输出：

checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,-0.704361,critical,radio/tv,-0.750328
1 - 200 DM,1.14459,repaid,radio/tv,1.13527
,-0.440225,critical,education,-0.384939

请注意，计算值与您的预期结果不同：这里的均值和标准差仅由示例中的 3 行数据算出。

Answer 2

几千行对 awk 来说并不算大文件，不妨一次性全部加载。这里我创建了一个 2360 万行（23.6 mn rows）的合成版本（在 gawk 和 mawk 上都做了测试）：

虽然总体性能与其他解决方案相近，但这段代码无需显式地把输入文件列出两次，就能完成等效的两遍处理。

输入

     rows = 23,622,127. | UTF8 chars = 799192890. | bytes = 799192890.

     1  checking_balance,months_loan_duration,credit_history,purpose,amount
     2  < 0 DM,889,critical,luna,758.61
     3  ,150,critical,terra,1823.93
     4  1 - 200 DM,883,repaid,stablecoin,2525.55
     5  1 - 200 DM,65,repaid,terra,2405.67
     6  < 0 DM,9,critical,luna,4059.34
     7  < 0 DM,201,critical,stablecoin,5043
     8  1 - 200 DM,549,repaid,terra,471.92
     9  < 0 DM,853,critical,stablecoin,422.78
    10  < 0 DM,659,critical,luna,684.94

代码

    # gawk profile, created Tue May 24 04:11:02 2022
    # Single-record variant: FS and RS are set to "^$" so that, as the
    # surrounding text describes, the whole input is loaded at once; all work
    # happens in END. The numbers in the left margin are profile execution
    # counts, not part of the program.

    'function abs(_) { 
         return \
                 +_<-_?-_:_ 
    } BEGIN {
        # _ = length("^$") = 2 and _____ = 4; the split creates the array ____.
        # NOTE(review): this split appears to seed ____[1] with "4", which is
        # later used as the column 2 sum; confirm the offset is acceptable.
        split(_____=(_=length(FS = RS = "^$"))+_,____,"")
    }
    END {
            # Put a comma in front of every newline, then re-split $0 on commas
            # so each input row contributes exactly five fields.
     1      gsub("\n", ",&")
     1      FS = "["(OFS= ",")"]"
     1     $!_ = $!( __ = _)
            # Drop the trailing field; __ becomes NF + 2.
     1      __+= --NF

            # Walk the fields backwards, one row per iteration, stopping before
            # the five header fields. Accumulators:
            #   ____[4] sum of col5^2   ____[3] sum of col5
            #   ____[2] sum of col2^2   ____[1] sum of col2
            #   ____[6] minus the row count
23622126    while ((_____+_) < (__-=_)) {
23622126        ____[___=_____] += ($__)^_
23622126          ____[ --___ ] += ($__)
23622126          ____[___ * _] += -_^!_
23622126          ____[___-=+_] += ($(__-=_+_^!_))
23622126          ____[ ++___ ] += ($__)^_
        }
            # __ = row count n, ___ = n - 1.
     1      ___ = (__=-____[_+_+_])-_^!_

            # NR = mean of col2, RS = minus the sample stdev of col2
            # (the exponent _^(_/-_) is 2^-1, i.e. a square root).
     1       RS = -(abs((____[(_)]/___-(((NR=____[+_^!+_]/__)^_)*__/___)))^_^(_/-_)

            # RT = mean of col5, ___ = minus the sample stdev of col5.
            ___ = -(abs((____[_+_]/___-(((RT=____[_+_^!_]/__)^_)*__/___)))^_^(_/-_)
     1      ORS = "\n"
            # Remove the newlines that are still embedded in the fields.
     1      gsub(ORS, "")
     1      OFS = ","
            # Print the header (fields 1 to 5); _ ends at 5.
     1      print $(_^=_<_), $(__=++_), $++_, $++_, $++_
            # Appears to set OFMT to "%.4f" and __ to NF - 4 (relies on
            # left-to-right evaluation of the concatenation).
     1      OFMT = "%."(__*__+!(__=NF-__-__))"f"
            # Emit one row per iteration; (mean - x) / (-stdev) equals
            # (x - mean) / stdev.
23622126    while (++_ <= __) {
23622126        print $_, (NR-$++_)/RS, $++_, $++_, (RT-$++_)/___
        }
    }'

输出

     out9:  837MiB 0:00:28 [29.2MiB/s] [29.2MiB/s] [ <=> ]
      in0:  762MiB 0:00:00 [2.95GiB/s] [2.95GiB/s] [======>] 100%            
    ( pvE 0.1 in0 < "${f}" | LC_ALL=C mawk2 ; )
    
     26.98s user 1.58s system 99% cpu 28.681 total

23622127 878032266 878032266 testfile_stdnorm_test_004.txt_out.txt

 1  checking_balance,months_loan_duration,credit_history,purpose,amount
 2  < 0 DM,1.2000,critical,luna,-1.2939
 3  ,-1.2949,critical,terra,-0.6788
 4  1 - 200 DM,1.1798,repaid,stablecoin,-0.2737
 5  1 - 200 DM,-1.5818,repaid,terra,-0.3429
 6  < 0 DM,-1.7709,critical,luna,0.6119
 7  < 0 DM,-1.1227,critical,stablecoin,1.1798
 8  1 - 200 DM,0.0522,repaid,terra,-1.4594
 9  < 0 DM,1.0785,critical,stablecoin,-1.4878

针对较小输入优化的替代解决方案（例如，最多 10^6（100 万）行）

    # gawk profile, created Tue May 24 06:19:24 2022
    # Variant for smaller inputs: the input is handled as one record and all
    # work happens in END. The numbers in the left margin are profile
    # execution counts, not part of the program.

    # BEGIN rule(s)

    BEGIN {
            # Sets FS and RS to "^$" and ORS to ""; the product leaves __ = 0.
     1      __ = (FS = RS = "^$") * (ORS = "")
    }

    # END rule(s)

    END {
            # Work on a copy of the whole record ($0, since __ is 0).
     1      _ = $__
            # Rows that start with a comma get "_" as a placeholder first field.
     1      gsub("[\n][,]","\n_,",_)
            # Strip everything through the header line ending in "amount".
            # NOTE(review): replacing with "&" alone leaves the text unchanged;
            # marker characters may have been lost from this line - confirm.
     1      sub("^.+amount\n","",_)+gsub("[,][0-9.+-]+[,\n]", "&", _)
     1      _____ = "[^0-9.+-]+"
            # NOTE(review): the bracket expression "[^]+" looks incomplete;
            # characters may have been lost from it as well - confirm.
     1      gsub("^" (_____) "|[^]+","",_)
            # Split the remaining text into numbers, stored in ___[1..n].
     1      _____ = __ = split(_,___,_____)
            # Consume the numbers from the end, two per iteration, accumulating
            # sum and sum of squares under the keys "_" / "=" and "~" / "^",
            # and the pair count under ":". Presumably the first of each pair
            # is the amount column and the second is column 2 - verify.
1048575     while (-(--__) < +__) {
1048575         ___["_"] += _=___[(__)]
1048575         ___["="] += _*_
1048575         ___["~"] += _=___[--__]
1048575         ___["^"] += _*_
1048575         ___[":"]++
        }
            # __ = count n, _ = n - 1, and ____ ends up as 2.
     1      _ = (__=___[":"])-(____ ^= _<_)
     1      ++____
            # "{" and "(" hold the two means; "}" and ")" hold minus the
            # corresponding sample standard deviations (exponent 2^-1).
     1      ___["}"] = -(abs((___["^"]/_)-(((___["{"] = ___["~"] / __)^____)*__/_)))^____^(-(_^(!_)))
     1      ___[")"] = -(abs((___["="]/_)-(((___["("] = ___["_"] / __)^____)*__/_)))^____^(-(_^(!_)))
            # The condition _ < _ is never true, so this debug dump is disabled.
     1      if (_ < _) {
            for (_ in ___) {
                print "debug", _, ___[_]
            }
        }
            # Split the original record into lines and restore ORS to "\n".
     1      ____ = split($(_ < _), ______, ORS = "\n")
            # FS = "[,]", OFS = ","; index() of "," in "[,]" leaves _ = 2.
     1      _ = index(FS = "[" (OFS = ",") "]", OFS)
            # Print the header line (element 1).
     1      print ______[_ ^ (! _)]
            # For each data line: assign it to $0 so it is re-split, then
            # format fields 1, 3 and 4 as-is and the two numeric columns as
            # (mean - x) / (-stdev) with a sign and 4 decimals.
            # Relies on left-to-right evaluation of the sprintf arguments.
1048574     for (__ += __ ^= _ < _; __ < ____; __++) {
1048574         print sprintf("%.*s%s,%+.*f,%s,%s,%+.*f", ! __, $! _ = ______[__], $(_ ~ _), _ + _, (___["{"] - $_) / ___["}"], $++_, $(--_ + _), _ + _, (___["("] - $NF) / ___[")"])
        }
    }


    # Functions, listed alphabetically

    # Absolute value of the argument.
     2  function abs(_)
    {
     2      return (+_ < -_ ? -_ : _)
    }

解决方案 # 2 的性能：End-to-End 2^20 行需要 2.57 秒

 rows = 1048575. | UTF8 chars = 39912117. | bytes = 39912117.

 ( pvE 0.1 in0 < "${f}" | LC_ALL=C mawk2 ; )  
 2.46s user 0.13s system 100% cpu 2.573 total

如何使用 awk 规范化 csv 的特定列的值？

How to normalize the values of specific columns of a csv with awk?

bash

awk