计算以输入文本文件的每个字符开头的行数

Count how many lines start with each character of input textfiles

我想写一个 bash 脚本,使用 awk 来统计以每个字符开头的行各有多少。

示例输入: ./script.sh txt1 txt2 text1 text2(文件名也可以是随机的)

txt1

asdaga
dasdag
asdasdag
awqr
zvvbrh
tqetvh
xbrrte

txt2

npoajd
pojta
pskdna
nghir
asdt
bmkgjk

示例输出:

--- txt1 ---
a : 3
b : 0
c : 0
...
z : 1
...
ascii255 : 0

--- txt2 ---
a : 1
b : 1
...
p : 2
...

--- text3 ---
etc

其中 [character] : [number of rows that start with that character] 是正确的格式。

在逐个打印每个文件的结果之后,我还想打印一个汇总结果,格式相同:每个字符的计数是该字符在所有文本文件中的计数之和。因此在给定的示例中(仅针对 txt1 和 txt2),输出将是:

a : 4
b : 1
...

(解释:txt1 中有 3 行以 a 开头,txt2 中有 1 行以 a 开头,因此总数为 3+1 = 4)

这是我写的代码,但我卡住了,它不起作用,我对 awk 语法感到困惑:

#!/bin/bash
#
# Count how many lines start with each character, per input file and in total.
#
# Usage:  ./script.sh FILE...
#         (reads standard input when no FILE is given)
# Output: a "--- FILE ---" section for every input file that contains at least
#         one line, then a "--- Total ---" section. Each section lists one
#         "CHAR : COUNT" row for every character code from 1 to 255.

count_first_chars() {
  awk '
    # Print a titled section with one "CHAR : COUNT" row per code in lo..hi.
    # counts is indexed by the character itself; characters never seen print 0.
    # code is a local variable (extra parameter).
    function report(title, counts,    code) {
      printf("--- %s ---\n", title)
      for (code = lo; code <= hi; code++) {
        printf("%c : %d\n", code, counts[sprintf("%c", code)])
      }
    }

    # Range of character codes that every section lists.
    BEGIN { lo = 1; hi = 255 }

    # First line of a second or later file: the previous file is complete,
    # so print its section and start counting from scratch.
    FNR == 1 && NR > 1 {
      report(name, seen)
      split("", seen)   # portable idiom for emptying an array
    }

    {
      # FILENAME can be empty while reading standard input; show "-" then.
      name = (FILENAME == "" ? "-" : FILENAME)
      first = substr($0, 1, 1)
      if (first != "") {   # an empty line starts with no character at all
        seen[first]++
        total[first]++
      }
    }

    END {
      # Section of the last file; skipped when no line was read at all.
      if (NR > 0) {
        report(name, seen)
      }
      report("Total", total)
    }
  ' "$@"
}

count_first_chars "$@"

此解决方案会安全地跳过行首的多字节(multi-byte)字符;在 gawk 的字节模式或 Unicode 模式下,它的行为相同:

 % pv -q  < "${m3t}" | mawk2 '
        
     # NOTE(review): pv, mawk2 and ecp are external commands and m3t, m3l, m3m
     # are shell variables; none of them is defined in this snippet.
     #
     # printreport(__, ___, _, ____)
     #   __    array of line counts indexed by the first character of a line
     #   ___   title of the report (the caller passes the input file name)
     #   _     extra parameter used as a local: loop counter over character codes
     #   ____  extra parameter used as a local: running total of the counts
     # Prints one row per character code 32..126 followed by the total, then
     # empties __. Returns the empty title unchanged when there is no title,
     # otherwise the result of split("", __).
     function printreport(__,___,_,____) { 
          # An empty title means no file has been started yet (the caller sets
          # the title on the first line of each file), so print nothing.
          if (___=="") {
               return ___ 
          }
          printf(" ======= %s ================\n",___)
        
          # 2^3*4 is 32 and 4^3*2-1 is 127, so _ runs over codes 32 through 126.
          for (_=2^3*4;_<(4^3*2-1);_++) {
        
              # ___ (a local copy here) is reused for the character with code _.
              # Columns: the character, its count, the running total so far.
              # NOTE(review): relies on the printf arguments being evaluated
              # left to right, so that ___ is assigned before __[___] is read.
              printf("   [ %s ] = %9.f | %15.f \n",
                            ___=sprintf("%c",_),
                         __[___], ____+=__[___]) 
           }
           # Footer with the sum over all reported characters.
           printf(" =====================================\n"\
                  " ASCII 32(spc)-126(~) sum = %10.f\n\n",____)
        # split with an empty string deletes every element of __, so the
        # counts of the next file start from zero.
        return split("",__) 
     } 
     # split(___,__) on the still unset ___ returns 0, so _ becomes 1 and FS
     # becomes substr("^$", 1), that is "^$".
     # NOTE(review): presumably FS is set this way so that lines are not
     # broken into fields - confirm.
     BEGIN {   FS = substr("^$",\
                _ = !split(___,__))
     # The next rule runs on the first line of every input file (FNR == 1,
     # because _ is 1).
     } FNR==+_ {
             # printreport first prints the report of the previous file, if
             # any. Its return value (empty string or 0) becomes 1 through !-
             # so substr returns the whole new title, which is FILENAME or a
             # fixed placeholder when FILENAME is "-".
             ___=substr(FILENAME != "-" ? FILENAME \
                 : " /dev/fd/0 :: STDIN ", !-printreport(__,___)) 
     }  { 
            # $!_ is $(!_), which is $0 while _ is 1; substr(..., 1, 1) is the
            # first character of the line, and its counter is incremented.
            __[substr($!_,_,_)]++ 
     } END { 
            # Report for the last input file.
            printreport(__,___) } ' "${m3l}" "${m3m}" '/dev/stdin' | ecp;

 
 ======= .../m23lyricsFLT_05.txt ================
   [   ] =         7 |               7 
   [ ! ] =         0 |               7 
   [ " ] =        51 |              58 
   [ # ] =        62 |             120 
   [ $ ] =         3 |             123 
   [ % ] =         0 |             123 
   [ & ] =         0 |             123 
   [ ' ] =       443 |             566 
   [ ( ] =      1766 |            2332 
   [ ) ] =         2 |            2334 
   [ * ] =       944 |            3278 
   [ + ] =         1 |            3279 
   [ , ] =         1 |            3280 
   [ - ] =        75 |            3355 
   [ . ] =        22 |            3377 
   [ / ] =        58 |            3435 
   [ 0 ] =    158142 |          161577 
   [ 1 ] =      2090 |          163667 
   [ 2 ] =       131 |          163798 
   [ 3 ] =        57 |          163855 
   [ 4 ] =        31 |          163886 
   [ 5 ] =        53 |          163939 
   [ 6 ] =        16 |          163955 
   [ 7 ] =        38 |          163993 
   [ 8 ] =        11 |          164004 
   [ 9 ] =        22 |          164026 
   [ : ] =         6 |          164032 
   [ ; ] =         1 |          164033 
   [ < ] =       158 |          164191 
   [ = ] =         0 |          164191 
   [ > ] =         3 |          164194 
   [ ? ] =        18 |          164212 
   [ @ ] =         8 |          164220 
   [ A ] =      1552 |          165772 
   [ B ] =      1407 |          167179 
   [ C ] =      1210 |          168389 
   [ D ] =      1186 |          169575 
   [ E ] =       570 |          170145 
   [ F ] =       568 |          170713 
   [ G ] =       796 |          171509 
   [ H ] =      2211 |          173720 
   [ I ] =      6825 |          180545 
   [ J ] =       397 |          180942 
   [ K ] =       160 |          181102 
   [ L ] =      1516 |          182618 
   [ M ] =       941 |          183559 
   [ N ] =       737 |          184296 
   [ O ] =      1640 |          185936 
   [ P ] =       460 |          186396 
   [ Q ] =        40 |          186436 
   [ R ] =       925 |          187361 
   [ S ] =      2286 |          189647 
   [ T ] =      2119 |          191766 
   [ U ] =       348 |          192114 
   [ V ] =       943 |          193057 
   [ W ] =      2353 |          195410 
   [ X ] =        14 |          195424 
   [ Y ] =      2941 |          198365 
   [ Z ] =        30 |          198395 
   [ [ ] =      3669 |          202064 
   [ \ ] =         0 |          202064 
   [ ] ] =         0 |          202064 
   [ ^ ] =         0 |          202064 
   [ _ ] =         0 |          202064 
   [ ` ] =         0 |          202064 
   [ a ] =       291 |          202355 
   [ b ] =       251 |          202606 
   [ c ] =       246 |          202852 
   [ d ] =       127 |          202979 
   [ e ] =        88 |          203067 
   [ f ] =        74 |          203141 
   [ g ] =       108 |          203249 
   [ h ] =       403 |          203652 
   [ i ] =       572 |          204224 
   [ j ] =        62 |          204286 
   [ k ] =        48 |          204334 
   [ l ] =       204 |          204538 
   [ m ] =       174 |          204712 
   [ n ] =       135 |          204847 
   [ o ] =       363 |          205210 
   [ p ] =        77 |          205287 
   [ q ] =         6 |          205293 
   [ r ] =       292 |          205585 
   [ s ] =       376 |          205961 
   [ t ] =       288 |          206249 
   [ u ] =        98 |          206347 
   [ v ] =       319 |          206666 
   [ w ] =       404 |          207070 
   [ x ] =        11 |          207081 
   [ y ] =       522 |          207603 
   [ z ] =        22 |          207625 
   [ { ] =         4 |          207629 
   [ | ] =         0 |          207629 
   [ } ] =         0 |          207629 
   [ ~ ] =         3 |          207632 
 =====================================
 ASCII 32(spc)-126(~) sum =     207632

 ======= .../m3vid_genie26.txt ================
   [   ] =         0 |               0 
   [ ! ] =         1 |               1 
   [ " ] =         4 |               5 
   [ # ] =       106 |             111 
   [ $ ] =         8 |             119 
   [ % ] =         1 |             120 
   [ & ] =         6 |             126 
   [ ' ] =       294 |             420 
   [ ( ] =       188 |             608 
   [ ) ] =         0 |             608 
   [ * ] =         5 |             613 
   [ + ] =         2 |             615 
   [ , ] =         0 |             615 
   [ - ] =         4 |             619 
   [ . ] =        50 |             669 
   [ / ] =         0 |             669 
   [ 0 ] =        86 |             755 
   [ 1 ] =       521 |            1276 
   [ 2 ] =       457 |            1733 
   [ 3 ] =       198 |            1931 
   [ 4 ] =       178 |            2109 
   [ 5 ] =       150 |            2259 
   [ 6 ] =        86 |            2345 
   [ 7 ] =       126 |            2471 
   [ 8 ] =        91 |            2562 
   [ 9 ] =       123 |            2685 
   [ : ] =         0 |            2685 
   [ ; ] =         0 |            2685 
   [ < ] =        46 |            2731 
   [ = ] =         0 |            2731 
   [ > ] =         3 |            2734 
   [ ? ] =         6 |            2740 
   [ @ ] =         0 |            2740 
   [ A ] =      3190 |            5930 
   [ B ] =      4078 |           10008 
   [ C ] =      3279 |           13287 
   [ D ] =      3330 |           16617 
   [ E ] =      1474 |           18091 
   [ F ] =      2745 |           20836 
   [ G ] =      2337 |           23173 
   [ H ] =      3139 |           26312 
   [ I ] =      5411 |           31723 
   [ J ] =       981 |           32704 
   [ K ] =       893 |           33597 
   [ L ] =      4264 |           37861 
   [ M ] =      4134 |           41995 
   [ N ] =      1972 |           43967 
   [ O ] =      1996 |           45963 
   [ P ] =      2409 |           48372 
   [ Q ] =        94 |           48466 
   [ R ] =      2262 |           50728 
   [ S ] =      6701 |           57429 
   [ T ] =      5794 |           63223 
   [ U ] =       717 |           63940 
   [ V ] =       554 |           64494 
   [ W ] =      4119 |           68613 
   [ X ] =       106 |           68719 
   [ Y ] =      1644 |           70363 
   [ Z ] =       145 |           70508 
   [ [ ] =     20079 |           90587 
   [ \ ] =         0 |           90587 
   [ ] ] =         0 |           90587 
   [ ^ ] =         0 |           90587 
   [ _ ] =         0 |           90587 
   [ ` ] =         0 |           90587 
   [ a ] =       117 |           90704 
   [ b ] =       132 |           90836 
   [ c ] =       128 |           90964 
   [ d ] =        83 |           91047 
   [ e ] =        60 |           91107 
   [ f ] =       114 |           91221 
   [ g ] =       104 |           91325 
   [ h ] =       103 |           91428 
   [ i ] =       143 |           91571 
   [ j ] =        26 |           91597 
   [ k ] =        21 |           91618 
   [ l ] =       117 |           91735 
   [ m ] =       145 |           91880 
   [ n ] =        72 |           91952 
   [ o ] =        67 |           92019 
   [ p ] =        95 |           92114 
   [ q ] =         4 |           92118 
   [ r ] =        68 |           92186 
   [ s ] =       222 |           92408 
   [ t ] =       149 |           92557 
   [ u ] =        16 |           92573 
   [ v ] =        22 |           92595 
   [ w ] =       167 |           92762 
   [ x ] =         2 |           92764 
   [ y ] =        47 |           92811 
   [ z ] =         4 |           92815 
   [ { ] =         0 |           92815 
   [ | ] =         0 |           92815 
   [ } ] =         0 |           92815 
   [ ~ ] =         3 |           92818 
 =====================================
 ASCII 32(spc)-126(~) sum =      92818

 ======= /dev/stdin ================
   [   ] =         0 |               0 
   [ ! ] =         5 |               5 
   [ " ] =      7062 |            7067 
   [ # ] =      3889 |           10956 
   [ $ ] =       308 |           11264 
   [ % ] =       165 |           11429 
   [ & ] =      3210 |           14639 
   [ ' ] =     38770 |           53409 
   [ ( ] =    105671 |          159080 
   [ ) ] =       307 |          159387 
   [ * ] =     11556 |          170943 
   [ + ] =       240 |          171183 
   [ , ] =         0 |          171183 
   [ - ] =     14565 |          185748 
   [ . ] =        27 |          185775 
   [ / ] =      2010 |          187785 
   [ 0 ] =      5489 |          193274 
   [ 1 ] =     51256 |          244530 
   [ 2 ] =     41364 |          285894 
   [ 3 ] =     20015 |          305909 
   [ 4 ] =     12961 |          318870 
   [ 5 ] =      9864 |          328734 
   [ 6 ] =      7294 |          336028 
   [ 7 ] =      6514 |          342542 
   [ 8 ] =      5800 |          348342 
   [ 9 ] =      5525 |          353867 
   [ : ] =         7 |          353874 
   [ ; ] =         0 |          353874 
   [ < ] =      2433 |          356307 
   [ = ] =         0 |          356307 
   [ > ] =       226 |          356533 
   [ ? ] =        17 |          356550 
   [ @ ] =       281 |          356831 
   [ A ] =    375661 |          732492 
   [ B ] =    331981 |         1064473 
   [ C ] =    271228 |         1335701 
   [ D ] =    270206 |         1605907 
   [ E ] =    144476 |         1750383 
   [ F ] =    262067 |         2012450 
   [ G ] =    158453 |         2170903 
   [ H ] =    204592 |         2375495 
   [ I ] =    501327 |         2876822 
   [ J ] =    119037 |         2995859 
   [ K ] =     94295 |         3090154 
   [ L ] =    280855 |         3371009 
   [ M ] =    312797 |         3683806 
   [ N ] =    160272 |         3844078 
   [ O ] =    160304 |         4004382 
   [ P ] =    197434 |         4201816 
   [ Q ] =     19418 |         4221234 
   [ R ] =    163032 |         4384266 
   [ S ] =    494497 |         4878763 
   [ T ] =    461447 |         5340210 
   [ U ] =     51570 |         5391780 
   [ V ] =     79325 |         5471105 
   [ W ] =    269542 |         5740647 
   [ X ] =      6973 |         5747620 
   [ Y ] =    162431 |         5910051 
   [ Z ] =     19564 |         5929615 
   [ [ ] =     36976 |         5966591 
   [ \ ] =         0 |         5966591 
   [ ] ] =       199 |         5966790 
   [ ^ ] =        13 |         5966803 
   [ _ ] =       594 |         5967397 
   [ ` ] =         0 |         5967397 
   [ a ] =     59000 |         6026397 
   [ b ] =     39103 |         6065500 
   [ c ] =     23406 |         6088906 
   [ d ] =     17316 |         6106222 
   [ e ] =      9960 |         6116182 
   [ f ] =     27632 |         6143814 
   [ g ] =     15660 |         6159474 
   [ h ] =     21529 |         6181003 
   [ i ] =     43845 |         6224848 
   [ j ] =      7824 |         6232672 
   [ k ] =      5854 |         6238526 
   [ l ] =     25302 |         6263828 
   [ m ] =     25061 |         6288889 
   [ n ] =     17172 |         6306061 
   [ o ] =     29060 |         6335121 
   [ p ] =     11470 |         6346591 
   [ q ] =      1561 |         6348152 
   [ r ] =     10232 |         6358384 
   [ s ] =     42816 |         6401200 
   [ t ] =     72947 |         6474147 
   [ u ] =      6623 |         6480770 
   [ v ] =      1806 |         6482576 
   [ w ] =     57864 |         6540440 
   [ x ] =       969 |         6541409 
   [ y ] =     38921 |         6580330 
   [ z ] =      1544 |         6581874 
   [ { ] =       272 |         6582146 
   [ | ] =         0 |         6582146 
   [ } ] =         3 |         6582149 
   [ ~ ] =       406 |         6582555 
 =====================================
 ASCII 32(spc)-126(~) sum =    6582555

下面这段脚本可能就是您想要实现的功能,它适用于任何 awk:

$ cat tst.sh
#!/usr/bin/env bash
#
# For every input file, and for all files combined, report how many lines
# begin with each character in a fixed code range (lowercase a-z by default).
#
# Usage:  ./tst.sh [FILE...]
#         With no FILE, "-" is handed to awk so that standard input is read.
# Output: "--- FILE ---" followed by one "CHAR : COUNT" line per character,
#         repeated per file, then the same listing under "--- Total ---".

count_leading_chars() {
    awk '
        {
            # Key each count by file name and by the first character of the line.
            char = substr($0,1,1)
            cnt[FILENAME,char]++
        }
        END {
            OFS = " : "
            # Character codes of the first and last character to report.
            beg = 97
            end = 122

            # Walk ARGV so the sections appear in command-line order.
            # NOTE(review): assumes every ARGV entry is a file operand
            # (no var=value assignments among the arguments).
            for ( fileNr=1; fileNr<ARGC; fileNr++ ) {
                fname = ARGV[fileNr]
                print "--- " fname " ---"
                for ( charNr=beg; charNr<=end; charNr++ ) {
                    char = sprintf("%c", charNr)
                    # +0 turns a never-seen (empty) counter into 0.
                    print char, cnt[fname,char]+0
                    tot[char] += cnt[fname,char]
                }
            }

            print "--- Total ---"
            for ( charNr=beg; charNr<=end; charNr++ ) {
                char = sprintf("%c", charNr)
                print char, tot[char]
            }
        }
    ' "${@:--}"
}

count_leading_chars "$@"

$ ./tst.sh txt1 txt2
--- txt1 ---
a : 3
b : 0
c : 0
d : 1
e : 0
f : 0
g : 0
h : 0
i : 0
j : 0
k : 0
l : 0
m : 0
n : 0
o : 0
p : 0
q : 0
r : 0
s : 0
t : 1
u : 0
v : 0
w : 0
x : 1
y : 0
z : 1
--- txt2 ---
a : 1
b : 1
c : 0
d : 0
e : 0
f : 0
g : 0
h : 0
i : 0
j : 0
k : 0
l : 0
m : 0
n : 2
o : 0
p : 2
q : 0
r : 0
s : 0
t : 0
u : 0
v : 0
w : 0
x : 0
y : 0
z : 0
--- Total ---
a : 4
b : 1
c : 0
d : 1
e : 0
f : 0
g : 0
h : 0
i : 0
j : 0
k : 0
l : 0
m : 0
n : 2
o : 0
p : 2
q : 0
r : 0
s : 0
t : 1
u : 0
v : 0
w : 0
x : 1
y : 0
z : 1

如果您想遍历更大范围的字符,只需更改 beg 和 end 这两个变量的设置即可。