跨列计算每行的最大值和最小值并输出关联的列名

Counting max and min per row across columns and outputting associated column names

我正在尝试跨列计算每行的最大值和最小值(0 除外)并输出关联的列名。

我正在尝试这个:

BEGIN{OFS="\t"}
     NR==1{print ,,"ref","max","ref","min";
         for(i=3;i<=6;++i)BASES[i]=$(i);
     }
     NR>1{l=1;basemax=BASES[3];basemin=BASES[3]; max=; min=;
         for(i=4;i<=6;++i){
             if($i>max){basemax=BASES[i];max=$i;}
             else if($i==max){basemax=basemax","BASES[i];++l}
        }
         for(i=4;i<=6;++i){
             if($i<min && $i !=0){basemmin=BASES[i];mim=$i}
             else if($i==min){basemin=basemin","BASES[i];++l}
         }
         print ,,basemax,max,basemin,min
     }

在这样的输入中

chr     pos     C       T       A       G
NC_044998.1     3732    22      0       7       0
NC_044998.1     3733    22      0       0       0
NC_044998.1     3734    22      3       3       0
NC_044998.1     3735    22      0       0       3
NC_044998.1     3736    0       7       22      3
NC_044998.1     3737    0       0       0       25
NC_044998.1     3738    22      7       0       0
NC_044998.1     3739    7       3       22      25
NC_044998.1     3740    0       22      22      0
NC_044998.1     3741    22      0       0       0

期望的输出是

chr          pos   ref  max  ref    min
NC_044998.1  3732  C    22   A      7
NC_044998.1  3733  C    22   C      22
NC_044998.1  3734  C    22   T,A    3
NC_044998.1  3735  C    22   G      3
NC_044998.1  3736  A    22   G      3
NC_044998.1  3737  G    25   G      25
NC_044998.1  3738  C    22   C      22
NC_044998.1  3739  G    25   C      7
NC_044998.1  3740  T,A  22   T,A    22
NC_044998.1  3741  C    22   C      22

但它输出的是这个

chr          pos   ref  max  ref    min
NC_044998.1  3732  C    22   C      22
NC_044998.1  3733  C    22   C      22
NC_044998.1  3734  C    22   C      22
NC_044998.1  3735  C    22   C      22
NC_044998.1  3736  A    22   C      0
NC_044998.1  3737  G    25   C,T,A  0
NC_044998.1  3738  C    22   C      22
NC_044998.1  3739  G    25   C      7
NC_044998.1  3740  T    22   C,A,G  0
NC_044998.1  3741  C    22   C      22
  • 您在变量名中有一些拼写错误,例如 basemminmim
  • 如果C的计数是0min的值没有机会被更新。
  • 您可以将两个 for 循环合并为一个。
  • 未使用变量l

那么请您尝试以下操作:

awk -v OFS="\t" '
NR==1 {
    print , , "ref", "max", "ref", "min"
    for (i = 3; i <= 6; i++) bases[i] = $i
}
NR>1 {
    basemax = bases[3]; basemin = bases[3]; max = ; min = 
    for (i = 4; i <= 6; i++) {
        if ($i > max) {basemax = bases[i]; max = $i}
        else if ($i == max) {basemax = basemax "," bases[i]}
        if ($i < min && $i != 0 || min == 0) {basemin = bases[i]; min = $i}
        else if ($i == min) {basemin = basemin "," bases[i]}
    }
    print , , basemax, max, basemin, min
}' input_file

输出:

chr     pos     ref     max     ref     min
NC_044998.1     3732    C       22      A       7
NC_044998.1     3733    C       22      C       22
NC_044998.1     3734    C       22      T,A     3
NC_044998.1     3735    C       22      G       3
NC_044998.1     3736    A       22      G       3
NC_044998.1     3737    G       25      G       25
NC_044998.1     3738    C       22      T       7
NC_044998.1     3739    G       25      T       3
NC_044998.1     3740    T,A     22      T,A     22
NC_044998.1     3741    C       22      C       22

请注意输出与您想要的输出略有不同,其中可能包含拼写错误。

使用您显示的示例,请尝试以下 awk 代码。在 GNU awk.

中编写和测试
awk -v startField="3" -v endField="6" '
BEGIN{ OFS="\t"; print "chr             pos     ref     max     ref     min"}
FNR==1{
  for(i=startField;i<=endField;i++){
    heading[i]=$i
  }
  next
}
{
  min=max2=maxInd2=minInd=max=maxInd=minAllInd=maxAllInd=maxAllInd2=""
  for(i=startField;i<=endField;i++){
    if($i!=0){
      minInd=(min>$i?i:(min==$i?minInd","i:(minInd!=""?minInd:i)))
      min=(min>$i?$i:(min!=""?min:$i))
    }
    maxInd=(max<$i?i:(max==$i?maxInd","i:(maxInd!=""?maxInd:i)))
    max=(max<$i?$i:(max!=""?max:$i))
  }
  for(i=startField+1;i<=endField;i++){
    maxInd2=(max2<$i?i:(max2==$i?maxInd2","i:(maxInd2!=""?maxInd2:i)))
    max2=(max2<$i?$i:(max2!=""?max2:$i))
  }
  num1=split(maxInd,arr1,",")
  num2=split(minInd,arr2,",")
  num3=split(maxInd2,arr3,",")
  if(num1>1){
    for(k=1;k<=num1;k++){
       maxAllInd = (maxAllInd?maxAllInd ",":"") heading[arr1[k]]
    }
  }
  else{
    maxAllInd = heading[maxInd]
  }
  if(num2>1){
    for(k=1;k<=num2;k++){
       minAllInd = (minAllInd?minAllInd ",":"") heading[arr2[k]]
    }
  }
  else{
    minAllInd = heading[minInd]
  }
  if(num3>1){
    for(k=1;k<=num3;k++){
       maxAllInd2 = (maxAllInd2?maxAllInd2 ",":"") heading[arr3[k]]
    }
  }
  else{
    maxAllInd2 = heading[maxInd2]
  }
  if(startField>1){
    NF=(startField-1)
      if(min !=0 ){
         print [=10=],maxAllInd,max,minAllInd,min
      }
      if(min == 0 && max2 != 0){
         print [=10=],maxAllInd,max,maxAllInd2,max2
      }
      if(min == 0 && max2 == 0){
         print [=10=],maxAllInd,max,maxAllInd,max
      }  
  }
  else{
      if(min !=0 ){
         print maxAllInd,max,minAllInd,min
      }
      if(min == 0 && max2 != 0){
         print maxAllInd,max,maxAllInd2,max2
      }
      if(min == 0 && max2 == 0){
         print maxAllInd,max,maxAllInd,max
      }       
  }
}
'  Input_file

这个 awk 脚本应该适合你:

cat maxmin.awk

NR == 1 {
   for (i=b; i<=NF; ++i)
      hdr[i] = $i
   print , , "ref", "max", "ref", "min"
   next
}
{
   for (i=b; i<=NF; ++i) {
      max = ($i > max ? $i : max)
      min = ($i && (min == "" || $i < min) ? $i : min)
   }
   for (i=b; i<=NF; ++i) {
      if ($i == min)
         rmin = (rmin ? rmin "," : "") hdr[i]
      if ($i == max)
         rmax = (rmax ? rmax "," : "") hdr[i]
   }
   print , , rmax, max, rmin, min
   max = min = rmax = rmin = ""
}

并将其用作:

awk -v b=3 -f maxmin.awk gg | column -t
chr          pos   ref  max  ref  min
NC_044998.1  3732  C    22   A    7
NC_044998.1  3733  C    22   C    22
NC_044998.1  3734  C    22   T,A  3
NC_044998.1  3735  C    22   G    3
NC_044998.1  3736  A    22   G    3
NC_044998.1  3737  G    25   G    25
NC_044998.1  3738  C    22   T    7
NC_044998.1  3739  G    25   T    3
NC_044998.1  3740  T,A  22   T,A  22
NC_044998.1  3741  C    22   C    22

column -t 仅用于表格输出。