如何使用 awk 规范化 csv 的特定列的值?
How to normalize the values of specific columns of a csv with awk?
我有一个包含多个变量的 csv,我只想使用标准差对某些特定列进行归一化。
该值减去变量的均值除以变量的标准差。
文件以逗号分隔,仅需使用 awk 对变量 months_loan_duration 和 金额.
输入看起来像这样但是有一千行:
checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,6,critical,radio/tv,1169.53
1 - 200 DM,48,repaid,radio/tv,5951.78
,12,critical,education,2096.23
输出会是这样的:
checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,-1.236,critical,radio/tv,-0.745
1 - 200 DM,2.248,repaid,radio/tv,0.95
,-0.738,critical,education,-0.417
到目前为止,我尝试了以下未成功的方法:
#! /usr/bin/awk -f
BEGIN{FS=","; OFS=",";numberColumn=NF}
NR!=1
{
for(i=1;i <= numberColumn;i++)
{
total[i]+=$i;
totalSquared[i]+=$i^2;
}
for (i=1;i <= numberColumn;i++)
{
avg[i]=total[i]/(NR-1);
std[i]=sqrt((totalSquared[i]/(NR-1))-avg[i]^2);
}
for (i=1;i <= numberColumn;i++)
{
norm[i]=(($i-avg[i])/std[i])
}
}
{
print ,$norm[2],3,4,$norm[5]
}
读取文件两次会更容易:
awk -F, -v OFS=, '
NR==FNR { # 1st pass: accumulate values
if (FNR > 1) {
sx2 += # sum of col2
sxx2 += * # sum of col2^2
sx5 += # sum of col5
sxx5 += * # sum of col5^2
n++ # count of samples
}
next
}
FNR==1 { # 2nd pass, 1st line: calc means and stdevs
ave2 = sx2 / n # mean of col2
var2 = sxx2 / (n - 1) - ave2 * ave2 * n / (n - 1)
if (var2 < 0) var2 = 0 # avoid rounding error
sd2 = sqrt(var2) # stdev of col2
ave5 = sx5 / n
var5 = sxx5 / (n - 1) - ave5 * ave5 * n / (n - 1)
if (var5 < 0) var5 = 0
sd5 = sqrt(var5)
print # print the header line
}
FNR>1 {
if (sd2 > 0) = ( - ave2) / sd2
if (sd5 > 0) = ( - ave5) / sd5
print
}
' input_file.csv input_file.csv
输出:
checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,-0.704361,critical,radio/tv,-0.750328
1 - 200 DM,1.14459,repaid,radio/tv,1.13527
,-0.440225,critical,education,-0.384939
请注意计算值与您的预期结果不同。
几千行对于 awk
来说不是那么大的文件:不妨一次加载它 - 这里我创建了它的 23.6 mn rows
合成版本(在两者上都进行了测试gawk
和 mawk
) -
- 虽然总体性能与其他解决方案相似,但此代码避免了显式列出输入文件 两次 来执行其等效的2 遍处理
输入
rows = 23,622,127. | UTF8 chars = 799192890. | bytes = 799192890.
1 checking_balance,months_loan_duration,credit_history,purpose,amount
2 < 0 DM,889,critical,luna,758.61
3 ,150,critical,terra,1823.93
4 1 - 200 DM,883,repaid,stablecoin,2525.55
5 1 - 200 DM,65,repaid,terra,2405.67
6 < 0 DM,9,critical,luna,4059.34
7 < 0 DM,201,critical,stablecoin,5043
8 1 - 200 DM,549,repaid,terra,471.92
9 < 0 DM,853,critical,stablecoin,422.78
10 < 0 DM,659,critical,luna,684.94
代码
# gawk profile, created Tue May 24 04:11:02 2022
'function abs(_) {
return \
+_<-_?-_:_
} BEGIN {
split(_____=(_=length(FS = RS = "^$"))+_,____,"")
}
END {
1 gsub("\n", ",&")
1 FS = "["(OFS= ",")"]"
1 $!_ = $!( __ = _)
1 __+= --NF
23622126 while ((_____+_) < (__-=_)) {
23622126 ____[___=_____] += ($__)^_
23622126 ____[ -—___ ] += ($__)
23622126 ____[___ * _] += -_^!_
23622126 ____[___-=+_] += ($(__-=_+_^!_))
23622126 ____[ ++___ ] += ($__)^_
}
1 ___ = (__=-____[_+_+_])-_^!_
1 RS = -(abs((____[(_)]/___-(((NR=____[+_^!+_]/__)^_)*__/___)))^_^(_/-_)
___ = -(abs((____[_+_]/___-(((RT=____[_+_^!_]/__)^_)*__/___)))^_^(_/-_)
1 ORS = "\n"
1 gsub(ORS, "")
1 OFS = ","
1 print $(_^=_<_), $(__=++_), $++_, $++_, $++_
1 OFMT = "%."(__*__+!(__=NF-__-__))"f"
23622126 while (++_ <= __) {
23622126 print $_, (NR-$++_)/RS, $++_, $++_, (RT-$++_)/___
}
}'
输出
out9: 837MiB 0:00:28 [29.2MiB/s] [29.2MiB/s] [ <=> ]
in0: 762MiB 0:00:00 [2.95GiB/s] [2.95GiB/s] [======>] 100%
( pvE 0.1 in0 < "${f}" | LC_ALL=C mawk2 ; )
26.98s user 1.58s system 99% cpu 28.681 total
23622127 878032266 878032266 testfile_stdnorm_test_004.txt_out.txt
1 checking_balance,months_loan_duration,credit_history,purpose,amount
2 < 0 DM,1.2000,critical,luna,-1.2939
3 ,-1.2949,critical,terra,-0.6788
4 1 - 200 DM,1.1798,repaid,stablecoin,-0.2737
5 1 - 200 DM,-1.5818,repaid,terra,-0.3429
6 < 0 DM,-1.7709,critical,luna,0.6119
7 < 0 DM,-1.1227,critical,stablecoin,1.1798
8 1 - 200 DM,0.0522,repaid,terra,-1.4594
9 < 0 DM,1.0785,critical,stablecoin,-1.4878
针对较小输入优化的替代解决方案(例如,最多 10^6(100 万)行)
# gawk profile, created Tue May 24 06:19:24 2022
# BEGIN rule(s)
BEGIN {
1 __ = (FS = RS = "^$") * (ORS = "")
}
# END rule(s)
END {
1 _ = $__
1 gsub("[\n][,]","\n_,",_)
1 sub("^.+amount\n","",_)+gsub("[,][0-9.+-]+[,\n]", "&", _)
1 _____ = "[^0-9.+-]+"
1 gsub("^" (_____) "|[^]+","",_)
1 _____ = __ = split(_,___,_____)
1048575 while (-(--__) < +__) {
1048575 ___["_"] += _=___[(__)]
1048575 ___["="] += _*_
1048575 ___["~"] += _=___[--__]
1048575 ___["^"] += _*_
1048575 ___[":"]++
}
1 _ = (__=___[":"])-(____ ^= _<_)
1 ++____
1 ___["}"] = -(abs((___["^"]/_)-(((___["{"] = ___["~"] / __)^____)*__/_)))^____^(-(_^(!_)))
1 ___[")"] = -(abs((___["="]/_)-(((___["("] = ___["_"] / __)^____)*__/_)))^____^(-(_^(!_)))
1 if (_ < _) {
for (_ in ___) {
print "debug", _, ___[_]
}
}
1 ____ = split($(_ < _), ______, ORS = "\n")
1 _ = index(FS = "[" (OFS = ",") "]", OFS)
1 print ______[_ ^ (! _)]
1048574 for (__ += __ ^= _ < _; __ < ____; __++) {
1048574 print sprintf("%.*s%s,%+.*f,%s,%s,%+.*f", ! __, $! _ = ______[__], $(_ ~ _), _ + _, (___["{"] - $_) / ___["}"], $++_, $(--_ + _), _ + _, (___["("] - $NF) / ___[")"])
}
}
# Functions, listed alphabetically
2 function abs(_)
{
2 return (+_ < -_ ? -_ : _)
}
解决方案 # 2 的性能:End-to-End 2^20 行需要 2.57 秒
rows = 1048575. | UTF8 chars = 39912117. | bytes = 39912117.
( pvE 0.1 in0 < "${f}" | LC_ALL=C mawk2 ; )
2.46s user 0.13s system 100% cpu 2.573 total
我有一个包含多个变量的 csv,我只想使用标准差对某些特定列进行归一化。 该值减去变量的均值除以变量的标准差。
文件以逗号分隔,仅需使用 awk 对变量 months_loan_duration 和 金额.
输入看起来像这样但是有一千行:
checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,6,critical,radio/tv,1169.53
1 - 200 DM,48,repaid,radio/tv,5951.78
,12,critical,education,2096.23
输出会是这样的:
checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,-1.236,critical,radio/tv,-0.745
1 - 200 DM,2.248,repaid,radio/tv,0.95
,-0.738,critical,education,-0.417
到目前为止,我尝试了以下未成功的方法:
#! /usr/bin/awk -f
BEGIN{FS=","; OFS=",";numberColumn=NF}
NR!=1
{
for(i=1;i <= numberColumn;i++)
{
total[i]+=$i;
totalSquared[i]+=$i^2;
}
for (i=1;i <= numberColumn;i++)
{
avg[i]=total[i]/(NR-1);
std[i]=sqrt((totalSquared[i]/(NR-1))-avg[i]^2);
}
for (i=1;i <= numberColumn;i++)
{
norm[i]=(($i-avg[i])/std[i])
}
}
{
print ,$norm[2],3,4,$norm[5]
}
读取文件两次会更容易:
awk -F, -v OFS=, '
NR==FNR { # 1st pass: accumulate values
if (FNR > 1) {
sx2 += # sum of col2
sxx2 += * # sum of col2^2
sx5 += # sum of col5
sxx5 += * # sum of col5^2
n++ # count of samples
}
next
}
FNR==1 { # 2nd pass, 1st line: calc means and stdevs
ave2 = sx2 / n # mean of col2
var2 = sxx2 / (n - 1) - ave2 * ave2 * n / (n - 1)
if (var2 < 0) var2 = 0 # avoid rounding error
sd2 = sqrt(var2) # stdev of col2
ave5 = sx5 / n
var5 = sxx5 / (n - 1) - ave5 * ave5 * n / (n - 1)
if (var5 < 0) var5 = 0
sd5 = sqrt(var5)
print # print the header line
}
FNR>1 {
if (sd2 > 0) = ( - ave2) / sd2
if (sd5 > 0) = ( - ave5) / sd5
print
}
' input_file.csv input_file.csv
输出:
checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,-0.704361,critical,radio/tv,-0.750328
1 - 200 DM,1.14459,repaid,radio/tv,1.13527
,-0.440225,critical,education,-0.384939
请注意计算值与您的预期结果不同。
几千行对于 awk
来说不是那么大的文件:不妨一次加载它 - 这里我创建了它的 23.6 mn rows
合成版本(在两者上都进行了测试gawk
和 mawk
) -
- 虽然总体性能与其他解决方案相似,但此代码避免了显式列出输入文件 两次 来执行其等效的2 遍处理
输入
rows = 23,622,127. | UTF8 chars = 799192890. | bytes = 799192890.
1 checking_balance,months_loan_duration,credit_history,purpose,amount
2 < 0 DM,889,critical,luna,758.61
3 ,150,critical,terra,1823.93
4 1 - 200 DM,883,repaid,stablecoin,2525.55
5 1 - 200 DM,65,repaid,terra,2405.67
6 < 0 DM,9,critical,luna,4059.34
7 < 0 DM,201,critical,stablecoin,5043
8 1 - 200 DM,549,repaid,terra,471.92
9 < 0 DM,853,critical,stablecoin,422.78
10 < 0 DM,659,critical,luna,684.94
代码
# gawk profile, created Tue May 24 04:11:02 2022
'function abs(_) {
return \
+_<-_?-_:_
} BEGIN {
split(_____=(_=length(FS = RS = "^$"))+_,____,"")
}
END {
1 gsub("\n", ",&")
1 FS = "["(OFS= ",")"]"
1 $!_ = $!( __ = _)
1 __+= --NF
23622126 while ((_____+_) < (__-=_)) {
23622126 ____[___=_____] += ($__)^_
23622126 ____[ -—___ ] += ($__)
23622126 ____[___ * _] += -_^!_
23622126 ____[___-=+_] += ($(__-=_+_^!_))
23622126 ____[ ++___ ] += ($__)^_
}
1 ___ = (__=-____[_+_+_])-_^!_
1 RS = -(abs((____[(_)]/___-(((NR=____[+_^!+_]/__)^_)*__/___)))^_^(_/-_)
___ = -(abs((____[_+_]/___-(((RT=____[_+_^!_]/__)^_)*__/___)))^_^(_/-_)
1 ORS = "\n"
1 gsub(ORS, "")
1 OFS = ","
1 print $(_^=_<_), $(__=++_), $++_, $++_, $++_
1 OFMT = "%."(__*__+!(__=NF-__-__))"f"
23622126 while (++_ <= __) {
23622126 print $_, (NR-$++_)/RS, $++_, $++_, (RT-$++_)/___
}
}'
输出
out9: 837MiB 0:00:28 [29.2MiB/s] [29.2MiB/s] [ <=> ]
in0: 762MiB 0:00:00 [2.95GiB/s] [2.95GiB/s] [======>] 100%
( pvE 0.1 in0 < "${f}" | LC_ALL=C mawk2 ; )
26.98s user 1.58s system 99% cpu 28.681 total
23622127 878032266 878032266 testfile_stdnorm_test_004.txt_out.txt
1 checking_balance,months_loan_duration,credit_history,purpose,amount
2 < 0 DM,1.2000,critical,luna,-1.2939
3 ,-1.2949,critical,terra,-0.6788
4 1 - 200 DM,1.1798,repaid,stablecoin,-0.2737
5 1 - 200 DM,-1.5818,repaid,terra,-0.3429
6 < 0 DM,-1.7709,critical,luna,0.6119
7 < 0 DM,-1.1227,critical,stablecoin,1.1798
8 1 - 200 DM,0.0522,repaid,terra,-1.4594
9 < 0 DM,1.0785,critical,stablecoin,-1.4878
针对较小输入优化的替代解决方案(例如,最多 10^6(100 万)行)
# gawk profile, created Tue May 24 06:19:24 2022
# BEGIN rule(s)
BEGIN {
1 __ = (FS = RS = "^$") * (ORS = "")
}
# END rule(s)
END {
1 _ = $__
1 gsub("[\n][,]","\n_,",_)
1 sub("^.+amount\n","",_)+gsub("[,][0-9.+-]+[,\n]", "&", _)
1 _____ = "[^0-9.+-]+"
1 gsub("^" (_____) "|[^]+","",_)
1 _____ = __ = split(_,___,_____)
1048575 while (-(--__) < +__) {
1048575 ___["_"] += _=___[(__)]
1048575 ___["="] += _*_
1048575 ___["~"] += _=___[--__]
1048575 ___["^"] += _*_
1048575 ___[":"]++
}
1 _ = (__=___[":"])-(____ ^= _<_)
1 ++____
1 ___["}"] = -(abs((___["^"]/_)-(((___["{"] = ___["~"] / __)^____)*__/_)))^____^(-(_^(!_)))
1 ___[")"] = -(abs((___["="]/_)-(((___["("] = ___["_"] / __)^____)*__/_)))^____^(-(_^(!_)))
1 if (_ < _) {
for (_ in ___) {
print "debug", _, ___[_]
}
}
1 ____ = split($(_ < _), ______, ORS = "\n")
1 _ = index(FS = "[" (OFS = ",") "]", OFS)
1 print ______[_ ^ (! _)]
1048574 for (__ += __ ^= _ < _; __ < ____; __++) {
1048574 print sprintf("%.*s%s,%+.*f,%s,%s,%+.*f", ! __, $! _ = ______[__], $(_ ~ _), _ + _, (___["{"] - $_) / ___["}"], $++_, $(--_ + _), _ + _, (___["("] - $NF) / ___[")"])
}
}
# Functions, listed alphabetically
2 function abs(_)
{
2 return (+_ < -_ ? -_ : _)
}
解决方案 # 2 的性能:End-to-End 2^20 行需要 2.57 秒
rows = 1048575. | UTF8 chars = 39912117. | bytes = 39912117.
( pvE 0.1 in0 < "${f}" | LC_ALL=C mawk2 ; )
2.46s user 0.13s system 100% cpu 2.573 total