使用 AWK 组织文件

Question

嗯，我有以下文件：

Progeny Sire    Dam    Sex  PENAS       P35         P41
13254   11908   11421   M   47.275811   1322.828674 1719.183748
13323   11335   11386   M   43.29896    1225.57111  1634.436447
13562   11864   11895   M   47.884191   1228.568357 1615.427502
13338   11335   11970   M   45.780973   1196.32757  1561.900145

我需要把 PENAS、P35 和 P41 这三列从宽格式转换为长格式：原来的列名放入新列 AGE（年龄），对应的数值放入新列 Peso。直观地说，我需要得到这样的文件：

Progeny Sire    Dam     Sex AGE     Peso
13254   11908   11421   M   PENAS   47.275811
13254   11908   11421   M   P35     1322.828674
13254   11908   11421   M   P41     1719.183748
13323   11335   11386   M   PENAS   43.29896
13323   11335   11386   M   P35     1225.57111
13323   11335   11386   M   P41     1634.436447
13562   11864   11895   M   PENAS   47.884191
13562   11864   11895   M   P35     1228.568357
13562   11864   11895   M   P41     1615.427502
13338   11335   11970   M   PENAS   45.780973
13338   11335   11970   M   P35     1196.32757
13338   11335   11970   M   P41     1561.900145

我试过这个命令，但没有用：

# NOTE(review): the positional field references (presumably $1, $2, ...) appear
# to have been lost from this snippet -- "a[]" and "( in a)" are not valid awk
# as written. Confirm against the original post before trying to run it.
awk 'NR==1{h= OFS  OFS  OFS  OFS  OFS  OFS ; next}
            {a[]=(( in a)?(a[] OFS $NF):(OFS  OFS  OFS  OFS "AGE"));
             if(!( in b)) {h=h OFS ; b[]}}
        END{print h; for(k in a) print k,a[k]}' a.txt | column -t > b

我就卡在这一点上了，有什么建议吗？谢谢。注意，我的原始数据集有 1400 行。

Answer 1

我会这样做：

transpose.awk

# transpose.awk -- turn the wide table (one row per animal, one column per
# age) into long form (one row per animal and age).
# Expected input columns: Progeny Sire Dam Sex PENAS P35 P41

# Header line: rewrite it as "Progeny Sire Dam Sex AGE Peso".
NR == 1 {
  NF     -= 2         # Remove last two header columns
  $NF     = "AGE"     # Add AGE column header
  $(NF+1) = "Peso"    # Add Peso column header
  print               # Print header
  next                # Skip to next line
}

# Data lines: emit one output row per measurement column (fields 5..7).
{
  for (i=5; i<=7; i++) {
    if(i==5) s = "PENAS"
    if(i==6) s = "P35"
    if(i==7) s = "P41"

    # Repeat the four identifying columns, then the age label and its value.
    print $1, $2, $3, $4, s, $i
  }
}

例如，可以这样运行：

awk -v OFS='\t' -f transpose.awk infile

输出：

Progeny Sire    Dam     Sex AGE     Peso
13254   11908   11421   M   PENAS   47.275811
13254   11908   11421   M   P35     1322.828674
13254   11908   11421   M   P41     1719.183748
13323   11335   11386   M   PENAS   43.29896
13323   11335   11386   M   P35     1225.57111
13323   11335   11386   M   P41     1634.436447
13562   11864   11895   M   PENAS   47.884191
13562   11864   11895   M   P35     1228.568357
13562   11864   11895   M   P41     1615.427502
13338   11335   11970   M   PENAS   45.780973
13338   11335   11970   M   P35     1196.32757
13338   11335   11970   M   P41     1561.900145

警告

请注意 Ed Morton 在评论中的警告：直接修改 NF（例如减小 NF 来删除字段）在不同的 awk 实现中行为可能不一致，使用时需要小心。

Answer 2

这是另一种 awk 写法，不依赖于列数……

$ awk 'NR==1{n=split($0,h);          # h[] keeps the header names by position
             for(i=1;i<=NF-3;i++) printf "%s", $i OFS; 
             printf "%s\n", "AGE" OFS "Peso"; next} 
            {split($0,p);            # p[] keeps the original field values
             NF--;                   # the last two remaining fields become AGE and Peso
             for(i=1;i<=3;i++) 
               {$(NF-1)=h[NF-2+i];   # age label taken from the header
                $NF=p[NF-2+i];       # matching measurement from this row
                print}}' file | column -t


Progeny  Sire   Dam    Sex  AGE    Peso
13254    11908  11421  M    PENAS  47.275811
13254    11908  11421  M    P35    1322.828674
13254    11908  11421  M    P41    1719.183748
13323    11335  11386  M    PENAS  43.29896
13323    11335  11386  M    P35    1225.57111
13323    11335  11386  M    P41    1634.436447
13562    11864  11895  M    PENAS  47.884191
13562    11864  11895  M    P35    1228.568357
13562    11864  11895  M    P41    1615.427502
13338    11335  11970  M    PENAS  45.780973
13338    11335  11970  M    P35    1196.32757
13338    11335  11970  M    P41    1561.900145

Answer 3

以下 awk 也可能对您有所帮助。

awk '
# Header line: remember the age column names, then print the new header.
FNR==1{
  for(i=5;i<=NF;i++){
    a[++h]=$i};
  NF-=2;
  $NF="AGE Peso";
  print;
  next}
# Data lines: one output row per measurement column (field 5 onwards).
{
  for(j=5;j<=NF;j++){
    printf("%s %s %s %s %s %s\n",$1,$2,$3,$4,a[++k],$j);
    k=j==NF?k="":k}   # reset the label index after the last column
}' Input_file | column -t

Answer 4

使用 GNU awk（需要其 gensub() 函数）：

$ cat tst.awk
# tst.awk -- reshape wide measurement columns into AGE/Peso rows (GNU awk).
# numPfx is the number of leading identifying columns copied to every row.
BEGIN { numPfx=4 }
# pfx = the first numPfx fields of the line with their original spacing.
# Backslashes are doubled because the regexp and the replacement are written
# as string literals.
{ pfx = gensub("((\\S+\\s+){"numPfx"}).*","\\1",1) }
NR==1 {
    split($0,ages)              # ages[i] = header name of column i
    print pfx, "AGE", "Peso"
    next
}
{
    # One output row per column after the prefix.
    for (i=numPfx+1; i<=NF; i++) {
        print pfx, ages[i], $i
    }
}

$ awk -f tst.awk file | column -t
Progeny  Sire   Dam    Sex  AGE    Peso
13254    11908  11421  M    PENAS  47.275811
13254    11908  11421  M    P35    1322.828674
13254    11908  11421  M    P41    1719.183748
13323    11335  11386  M    PENAS  43.29896
13323    11335  11386  M    P35    1225.57111
13323    11335  11386  M    P41    1634.436447
13562    11864  11895  M    PENAS  47.884191
13562    11864  11895  M    P35    1228.568357
13562    11864  11895  M    P41    1615.427502
13338    11335  11970  M    PENAS  45.780973
13338    11335  11970  M    P35    1196.32757
13338    11335  11970  M    P41    1561.900145

对于其他 awk，您只需将 gensub() 替换为变量加 sub()，将 \S 替换为 [^[:space:]]，将 \s 替换为 [[:space:]]。

使用 AWK 组织文件

Organize a file with AWK

awk

transpose

organization

dataset

警告