基于4个字段分组并汇总
Group by based on 4 fields and sum up
我的输入是这样的:
aNumber bNumber startDate timeZone duration currencyType cost dicatedAccused balanceAfter trafficCase teleServiceCode location dataVolume numberOfEvents fafIndicator netWorkID serviceProvideID serviceClass nAno nBno bNumberZnCode fileNamedID Destination Operator unknown3 MainAmount ReAnalyse DEDICATEDACCBALBEF DEDICATEDACCBALAFT ACCOUNTGROUPID SERVICEOFFERINGS SELECTEDCOMMUNITYID BALANCEBEFORE
22677512549 778 2014-07-02 10:16:35.000 NULL NULL localCurrency 0,00 2 11.50 0 3 22676020076 NULL NULL NULL NULL NULL 34 77512549 778 NULL 1131257 OTHER Short Code 126244088 0.0000 0 NULL NULL NULL NULL NULL 11.5000
22675557361 76457227 2014-07-02 10:16:38.000 NULL NULL localCurrency 10,00 2 1009.10 0 3 22676613028 NULL NULL 1 NULL NULL 35 75557361 76457227 NULL 1131257 Airtel Airtel 4132206314 10.0000 0 NULL NULL NULL NULL NULL 1019.1000
22677521277 778 2014-07-04 10:16:42.000 NULL NULL localCurrency 0,00 NULL 0.00 0 4 22676020078 NULL NULL NULL NULL NULL 34 77521277 778 NULL 1131257 OTHER Short Code 130071591 0.0000 0 NULL NULL NULL NULL NULL 0.0000
22676099496 77250331 2014-07-03 10:16:42.000 NULL NULL localCurrency 1,00 9 0.50 0 4 22676613028 NULL NULL NULL NULL NULL 35 76099496 77250331 NULL 1131257 Airtel Airtel 4132218551 0.0000 0 4.0000 3.0000 NULL NULL NULL 0.5000
22667222160 22667262389 2014-07-02 10:16:43.000 NULL NULL localCurrency 10,00 1 16070.00 0 4 22676613028 NULL NULL NULL NULL NULL 35 67222160 67262389 NULL 1131257 Airtel Airtel 4132222628 10.0000 0 NULL NULL NULL NULL NULL 16080.0000
我需要按日期(startDate)、dicatedAccused、trafficCase 和 teleServiceCode 这 4 个字段进行分组,然后对每个分组的 duration、cost、balanceAfter、MainAmount 和 BALANCEBEFORE 分别求和。
如果仅基于一个字段求和是没有问题的,但我们必须使用 4 个字段进行分组
这是我正在使用的 awk 脚本
# Asker's single-pass attempt: accumulate the numeric columns and print one
# tab-separated totals row each time the group-change rule fires, then align
# the result with column -t.
# NOTE(review): the positional field references were missing from this snippet.
# $3/$6/$8/$9/$10 are reconstructed from the header row and the sample data
# (startDate occupies two whitespace-separated fields, date and time); the $9
# in the group-change pattern is a best guess -- confirm against the original.
awk '
BEGIN {
    print "date Duration Cost BalanceAfter MainAmount DedicatedAccBalBefore DedicatedAccBalAfter BalanceBefore"
}

NR == 1 { next }    # skip the column-name row

# Print the totals accumulated since the last reset.
function showday() {
    printf "%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", date, duration, cost, bAfter, main, dedAccbBefore, dedAccbAfter, bBefore
}

# NOTE(review): DedicatedAccUsed is never assigned in this script, so this rule
# fires for every record whose $9 differs from the empty/zero value, i.e. the
# totals are flushed and reset row by row rather than per group.
DedicatedAccUsed != $9 {
    if (date) showday()
    date = $3
    duration = cost = bAfter = main = bBefore = dedAccbBefore = dedAccbAfter = 0
}

{
    sub(/,/, ".", $8)           # cost is written with a decimal comma (e.g. 10,00)
    duration += $6              # non-numeric values such as NULL add 0
    cost += $8
    bAfter += $10
    # Trailing columns are addressed from the end of the record because a text
    # column in the middle can contain a space (e.g. Short Code).
    main += $(NF-7)
    dedAccbBefore += $(NF-5)
    dedAccbAfter += $(NF-4)
    bBefore += $NF
}

END {
    if (date) showday()         # nothing to flush when there were no data rows
}
' test.txt | column -t
这是我正在寻找的输出
startDate dicatedAccused trafficCase teleServiceCode duration cost balanceAfter MainAmount BALANCEBEFORE
02/07/2014 2 0 3 0 10 1020.60 10 1020.60
02/07/2014 1 0 4 0 10 16070.00 10 16080
03/07/2014 9 0 4 0 1 0 0.0000 0,5
04/07/2014 NULL 0 4 0 0 0 0.0000 0
运行后得到的正确输出:
date dAccused TrafficCase ServiceCode Duration Cost BalanceAfter MainAmount BalanceBefore
2014-07-02 1 0 4 0 10 16070 10 16080
2014-07-03 9 0 4 0 1 0.5 0 0.5
2014-07-04 NULL 0 4 0 0 0 0 0
2014-07-02 2 0 3 0 10 1020.6 10 1030.6
我稍微修改了您的脚本,改为只用数组来保存各项总和;修改后的脚本如下,并附有注释:
# Group the records of test.txt by (date, dicatedAccused, trafficCase,
# teleServiceCode) and print, per group, the sums of duration, cost,
# balanceAfter, MainAmount and BALANCEBEFORE.
# The field numbers follow the header row; startDate occupies two
# whitespace-separated fields (date in $3, time in $4).
awk '
NR == 1 { next }                # skip the column-name row
{
    sub(/,/, ".", $8)           # cost is written with a decimal comma (e.g. 10,00)

    # Build the "group by" key from the four grouping fields. It is already
    # padded for display, so it can be printed as-is in the END block.
    key = sprintf("%-10s %10s %12s %12s", $3, $9, $11, $12)

    duration[key] += $6         # non-numeric values such as NULL add 0
    cost[key]     += $8
    bAfter[key]   += $10
    # Trailing columns are addressed from the end of the record because a text
    # column in the middle can contain a space (e.g. Short Code).
    main[key]     += $(NF-7)
    dedAccbBefore += $(NF-5)    # global totals kept from the original script; not printed
    dedAccbAfter  += $(NF-4)
    bBefore[key]  += $NF
}
END {
    # Header row.
    printf "%-10s %10s %12s %12s %10s %10s %10s %10s %10s\n", "date","dAccused","TrafficCase","ServiceCode","Duration","Cost","BalanceAfter","MainAmount","BalanceBefore"
    # Every sum array shares the same keys, so iterating over one of them
    # visits every group. The iteration order of for-in is unspecified.
    for (i in duration) {
        printf "%-47s %10s %10s %10s %10s %10s\n", i,duration[i],cost[i],bAfter[i],main[i],bBefore[i]
    }
}' test.txt
希望它足够清楚,如果需要更多详细信息,请告诉我。
我的输入是这样的:
aNumber bNumber startDate timeZone duration currencyType cost dicatedAccused balanceAfter trafficCase teleServiceCode location dataVolume numberOfEvents fafIndicator netWorkID serviceProvideID serviceClass nAno nBno bNumberZnCode fileNamedID Destination Operator unknown3 MainAmount ReAnalyse DEDICATEDACCBALBEF DEDICATEDACCBALAFT ACCOUNTGROUPID SERVICEOFFERINGS SELECTEDCOMMUNITYID BALANCEBEFORE
22677512549 778 2014-07-02 10:16:35.000 NULL NULL localCurrency 0,00 2 11.50 0 3 22676020076 NULL NULL NULL NULL NULL 34 77512549 778 NULL 1131257 OTHER Short Code 126244088 0.0000 0 NULL NULL NULL NULL NULL 11.5000
22675557361 76457227 2014-07-02 10:16:38.000 NULL NULL localCurrency 10,00 2 1009.10 0 3 22676613028 NULL NULL 1 NULL NULL 35 75557361 76457227 NULL 1131257 Airtel Airtel 4132206314 10.0000 0 NULL NULL NULL NULL NULL 1019.1000
22677521277 778 2014-07-04 10:16:42.000 NULL NULL localCurrency 0,00 NULL 0.00 0 4 22676020078 NULL NULL NULL NULL NULL 34 77521277 778 NULL 1131257 OTHER Short Code 130071591 0.0000 0 NULL NULL NULL NULL NULL 0.0000
22676099496 77250331 2014-07-03 10:16:42.000 NULL NULL localCurrency 1,00 9 0.50 0 4 22676613028 NULL NULL NULL NULL NULL 35 76099496 77250331 NULL 1131257 Airtel Airtel 4132218551 0.0000 0 4.0000 3.0000 NULL NULL NULL 0.5000
22667222160 22667262389 2014-07-02 10:16:43.000 NULL NULL localCurrency 10,00 1 16070.00 0 4 22676613028 NULL NULL NULL NULL NULL 35 67222160 67262389 NULL 1131257 Airtel Airtel 4132222628 10.0000 0 NULL NULL NULL NULL NULL 16080.0000
我需要按日期(startDate)、dicatedAccused、trafficCase 和 teleServiceCode 这 4 个字段进行分组,然后对每个分组的 duration、cost、balanceAfter、MainAmount 和 BALANCEBEFORE 分别求和。
如果仅基于一个字段求和是没有问题的,但我们必须使用 4 个字段进行分组
这是我正在使用的 awk 脚本
# Asker's single-pass attempt: accumulate the numeric columns and print one
# tab-separated totals row each time the group-change rule fires, then align
# the result with column -t.
# NOTE(review): the positional field references were missing from this snippet.
# $3/$6/$8/$9/$10 are reconstructed from the header row and the sample data
# (startDate occupies two whitespace-separated fields, date and time); the $9
# in the group-change pattern is a best guess -- confirm against the original.
awk '
BEGIN {
    print "date Duration Cost BalanceAfter MainAmount DedicatedAccBalBefore DedicatedAccBalAfter BalanceBefore"
}

NR == 1 { next }    # skip the column-name row

# Print the totals accumulated since the last reset.
function showday() {
    printf "%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", date, duration, cost, bAfter, main, dedAccbBefore, dedAccbAfter, bBefore
}

# NOTE(review): DedicatedAccUsed is never assigned in this script, so this rule
# fires for every record whose $9 differs from the empty/zero value, i.e. the
# totals are flushed and reset row by row rather than per group.
DedicatedAccUsed != $9 {
    if (date) showday()
    date = $3
    duration = cost = bAfter = main = bBefore = dedAccbBefore = dedAccbAfter = 0
}

{
    sub(/,/, ".", $8)           # cost is written with a decimal comma (e.g. 10,00)
    duration += $6              # non-numeric values such as NULL add 0
    cost += $8
    bAfter += $10
    # Trailing columns are addressed from the end of the record because a text
    # column in the middle can contain a space (e.g. Short Code).
    main += $(NF-7)
    dedAccbBefore += $(NF-5)
    dedAccbAfter += $(NF-4)
    bBefore += $NF
}

END {
    if (date) showday()         # nothing to flush when there were no data rows
}
' test.txt | column -t
这是我正在寻找的输出
startDate dicatedAccused trafficCase teleServiceCode duration cost balanceAfter MainAmount BALANCEBEFORE
02/07/2014 2 0 3 0 10 1020.60 10 1020.60
02/07/2014 1 0 4 0 10 16070.00 10 16080
03/07/2014 9 0 4 0 1 0 0.0000 0,5
04/07/2014 NULL 0 4 0 0 0 0.0000 0
运行后得到的正确输出:
date dAccused TrafficCase ServiceCode Duration Cost BalanceAfter MainAmount BalanceBefore
2014-07-02 1 0 4 0 10 16070 10 16080
2014-07-03 9 0 4 0 1 0.5 0 0.5
2014-07-04 NULL 0 4 0 0 0 0 0
2014-07-02 2 0 3 0 10 1020.6 10 1030.6
我稍微修改了您的脚本,改为只用数组来保存各项总和;修改后的脚本如下,并附有注释:
# Group the records of test.txt by (date, dicatedAccused, trafficCase,
# teleServiceCode) and print, per group, the sums of duration, cost,
# balanceAfter, MainAmount and BALANCEBEFORE.
# The field numbers follow the header row; startDate occupies two
# whitespace-separated fields (date in $3, time in $4).
awk '
NR == 1 { next }                # skip the column-name row
{
    sub(/,/, ".", $8)           # cost is written with a decimal comma (e.g. 10,00)

    # Build the "group by" key from the four grouping fields. It is already
    # padded for display, so it can be printed as-is in the END block.
    key = sprintf("%-10s %10s %12s %12s", $3, $9, $11, $12)

    duration[key] += $6         # non-numeric values such as NULL add 0
    cost[key]     += $8
    bAfter[key]   += $10
    # Trailing columns are addressed from the end of the record because a text
    # column in the middle can contain a space (e.g. Short Code).
    main[key]     += $(NF-7)
    dedAccbBefore += $(NF-5)    # global totals kept from the original script; not printed
    dedAccbAfter  += $(NF-4)
    bBefore[key]  += $NF
}
END {
    # Header row.
    printf "%-10s %10s %12s %12s %10s %10s %10s %10s %10s\n", "date","dAccused","TrafficCase","ServiceCode","Duration","Cost","BalanceAfter","MainAmount","BalanceBefore"
    # Every sum array shares the same keys, so iterating over one of them
    # visits every group. The iteration order of for-in is unspecified.
    for (i in duration) {
        printf "%-47s %10s %10s %10s %10s %10s\n", i,duration[i],cost[i],bAfter[i],main[i],bBefore[i]
    }
}' test.txt
希望它足够清楚,如果需要更多详细信息,请告诉我。