用awk计算滑动window的中值
Calculate median of a sliding window with awk
我需要在数百万行的数据上生成滑动 window,并计算第 3 列的中位数。我的数据如下所示:第 1 列始终相同,第 2 列等于行号,第 3 列是我需要计算中位数的数据:
HiC_scaffold_1 1 34
HiC_scaffold_1 2 34
HiC_scaffold_1 3 36
HiC_scaffold_1 4 37
HiC_scaffold_1 5 38
HiC_scaffold_1 6 39
HiC_scaffold_1 7 40
HiC_scaffold_1 8 40
HiC_scaffold_1 9 40
HiC_scaffold_1 10 41
HiC_scaffold_1 11 41
HiC_scaffold_1 12 41
HiC_scaffold_1 13 44
HiC_scaffold_1 14 44
HiC_scaffold_1 15 55
我需要这样的结果,假设滑动 window 为 4 并四舍五入到最接近的整数。在真实数据集中,我可能会使用 1000 的滑动 window:
HiC_scaffold_1 4 35
HiC_scaffold_1 5 37
HiC_scaffold_1 6 38
HiC_scaffold_1 7 39
HiC_scaffold_1 8 40
HiC_scaffold_1 9 40
HiC_scaffold_1 10 40
HiC_scaffold_1 11 41
HiC_scaffold_1 12 41
HiC_scaffold_1 13 41
HiC_scaffold_1 14 43
HiC_scaffold_1 15 44
我在 here 找到了以下脚本,它能实现我想要的滑动 window,但计算的是平均值而不是中位数:
# Sliding-window mean (not median) of column 3: keeps a running sum over the
# last `window` values in a ring buffer and prints on every `slide`-th record.
awk -v OFS="\t" 'BEGIN {
    window = 4
    slide = 1
}
{
    # Ring-buffer slot for this record; once the window is full it still
    # holds the value that is about to leave the window.
    mod = NR % window
    if (NR <= window) {
        count++
    } else {
        sum -= array[mod]
    }
    sum += $3
    array[mod] = $3
}
(NR % slide) == 0 {
    # Column 1, the record number and the mean of the current window.
    print $1, NR, sum / count
}
' file.txt
还有这个来自 here 的、用 awk 计算中位数的脚本:
# Median of column 3 over the whole input: sort numerically on column 3, then
# take the middle value (odd count) or the mean of the two middle values (even).
sort -n -k3 file.txt |
awk '{
    # Input arrives sorted, so arr[1..NR] is in ascending order.
    arr[NR] = $3
}
END {
    if (NR % 2 == 1) {
        print arr[(NR + 1) / 2]
    } else {
        # In END, $1 and $2 still refer to the last record read
        # (gawk and POSIX awk keep it available).
        print $1 "\t" $2 "\t" (arr[NR / 2] + arr[NR / 2 + 1]) / 2
    }
}
'
但我无法让它们协同工作。另一个问题是,中值计算需要经过排序的输入。我还找到了这个 datamash 解决方案,但我不知道如何让它高效地配合滑动 window 使用。
使用 GNU awk 的以下脚本似乎生成了您提供的输出:
awk -v OFS='\t' -v window=4 '
{
    # Keep the last `window` values of column 3 in a ring buffer `nums`
    # indexed 1 ... window.
    mod = NR % window + 1;
    nums[mod] = $3;
}
# Once at least `window` records have been read the buffer is full,
# so the median can be calculated.
NR >= window {
    # Sort a copy so the ring buffer keeps its insertion slots.
    for (i = 1; i <= window; ++i) {
        copy[i] = nums[i];
    }
    # asort is a GNU awk extension; other awks need a hand-written sort.
    asort(copy);
    # Middle element (odd window) or the lower of the two middle
    # elements (even window).
    half = int( (window + 1) / 2 );
    if (window % 2 == 0) {
        # Mean of the two middle values with .5 rounded up:
        # add 1 before halving, then truncate.
        median = int( (copy[half] + copy[half + 1] + 1) / 2 );
    } else {
        median = copy[half];
    }
    # Output: column 1, column 2 and the window median.
    print $1, $2, median
}'
以下脚本假定可以使用 GNU awk (gawk) 提供的 asort 函数。程序通过 wsize(即 window 大小,这里是 4)进行参数化:
gawk -v wsize=4 '
BEGIN {
    # m1 and m2 are the 1-based positions of the middle element(s) in the
    # sorted window; they coincide when wsize is odd.
    if (wsize % 2 == 0) { m1=wsize/2; m2=m1+1; } else { m1 = m2 = (wsize+1)/2; }
}
# Return the median of the current window; for even sizes the mean of the
# two middle values is rounded with .5 going up.
function roundedmedian() {
    # asort (gawk extension) writes the sorted values into a, indexed from 1.
    asort(window, a);
    return (m1==m2) ? a[m1] : int(0.5 + ((a[m1] + a[m2]) / 2));
}
# Overwrite the oldest slot of the ring buffer with the newest value.
function push(value) {
    window[NR % wsize] = value;
}
# Fill phase: fewer than wsize records so far, so store and print nothing.
NR < wsize { window[NR]=$3; next; }
{ push($3);
  # Replace column 3 with the window median and print the rebuilt record.
  $3 = roundedmedian();
  print $0;
}'
使用 GNU awk 的 asort():
$ cat tst.awk
# Sliding-window median of column 3, rounded to the nearest integer.
# Requires GNU awk for asort().
BEGIN {
    OFS = "\t"
    window = 4
    # 1-based positions of the two middle values in the sorted window.
    # For an odd window both point at the single middle value
    # (int(window / 2) alone would be one position too low).
    befMid = int((window + 1) / 2)
    aftMid = int(window / 2) + 1
}
# Ring buffer: slot NR % window is overwritten by the newest value.
{ array[NR % window] = $3 }
NR >= window {
    # asort() leaves array untouched and stores the sorted values in
    # vals[1..window].
    asort(array,vals)
    print $1, $2, int( (vals[befMid] + vals[aftMid]) / 2 + 0.5 )
}
.
$ awk -f tst.awk file
HiC_scaffold_1 4 35
HiC_scaffold_1 5 37
HiC_scaffold_1 6 38
HiC_scaffold_1 7 39
HiC_scaffold_1 8 40
HiC_scaffold_1 9 40
HiC_scaffold_1 10 40
HiC_scaffold_1 11 41
HiC_scaffold_1 12 41
HiC_scaffold_1 13 41
HiC_scaffold_1 14 43
HiC_scaffold_1 15 44
我需要在数百万行的数据上生成滑动 window,并计算第 3 列的中位数。我的数据如下所示:第 1 列始终相同,第 2 列等于行号,第 3 列是我需要计算中位数的数据:
HiC_scaffold_1 1 34
HiC_scaffold_1 2 34
HiC_scaffold_1 3 36
HiC_scaffold_1 4 37
HiC_scaffold_1 5 38
HiC_scaffold_1 6 39
HiC_scaffold_1 7 40
HiC_scaffold_1 8 40
HiC_scaffold_1 9 40
HiC_scaffold_1 10 41
HiC_scaffold_1 11 41
HiC_scaffold_1 12 41
HiC_scaffold_1 13 44
HiC_scaffold_1 14 44
HiC_scaffold_1 15 55
我需要这样的结果,假设滑动 window 为 4 并四舍五入到最接近的整数。在真实数据集中,我可能会使用 1000 的滑动 window:
HiC_scaffold_1 4 35
HiC_scaffold_1 5 37
HiC_scaffold_1 6 38
HiC_scaffold_1 7 39
HiC_scaffold_1 8 40
HiC_scaffold_1 9 40
HiC_scaffold_1 10 40
HiC_scaffold_1 11 41
HiC_scaffold_1 12 41
HiC_scaffold_1 13 41
HiC_scaffold_1 14 43
HiC_scaffold_1 15 44
我在 here 找到了以下脚本,它能实现我想要的滑动 window,但计算的是平均值而不是中位数:
# Sliding-window mean (not median) of column 3: keeps a running sum over the
# last `window` values in a ring buffer and prints on every `slide`-th record.
awk -v OFS="\t" 'BEGIN {
    window = 4
    slide = 1
}
{
    # Ring-buffer slot for this record; once the window is full it still
    # holds the value that is about to leave the window.
    mod = NR % window
    if (NR <= window) {
        count++
    } else {
        sum -= array[mod]
    }
    sum += $3
    array[mod] = $3
}
(NR % slide) == 0 {
    # Column 1, the record number and the mean of the current window.
    print $1, NR, sum / count
}
' file.txt
还有这个来自 here 的、用 awk 计算中位数的脚本:
sort -n -k3 file.txt |
awk '{
    # Median of column 3 over the whole input. The input is expected to be
    # sorted numerically on column 3 already, so arr[1..NR] is ascending.
    arr[NR] = $3
}
END {
    if (NR % 2 == 1) {
        # Odd count: the single middle value.
        print arr[(NR + 1) / 2]
    } else {
        # Even count: mean of the two middle values. In END, $1 and $2
        # still refer to the last record read (gawk and POSIX awk keep it).
        print $1 "\t" $2 "\t" (arr[NR / 2] + arr[NR / 2 + 1]) / 2
    }
}
'
但我无法让它们协同工作。另一个问题是,中值计算需要经过排序的输入。我还找到了这个 datamash 解决方案,但我不知道如何让它高效地配合滑动 window 使用。
使用 GNU awk 的以下脚本似乎生成了您提供的输出:
awk -v OFS='\t' -v window=4 '
{
    # Keep the last `window` values of column 3 in a ring buffer `nums`
    # indexed 1 ... window.
    mod = NR % window + 1;
    nums[mod] = $3;
}
# Once at least `window` records have been read the buffer is full,
# so the median can be calculated.
NR >= window {
    # Sort a copy so the ring buffer keeps its insertion slots.
    for (i = 1; i <= window; ++i) {
        copy[i] = nums[i];
    }
    # asort is a GNU awk extension; other awks need a hand-written sort.
    asort(copy);
    # Middle element (odd window) or the lower of the two middle
    # elements (even window).
    half = int( (window + 1) / 2 );
    if (window % 2 == 0) {
        # Mean of the two middle values with .5 rounded up:
        # add 1 before halving, then truncate.
        median = int( (copy[half] + copy[half + 1] + 1) / 2 );
    } else {
        median = copy[half];
    }
    # Output: column 1, column 2 and the window median.
    print $1, $2, median
}'
以下脚本假定可以使用 GNU awk (gawk) 提供的 asort 函数。程序通过 wsize(即 window 大小,这里是 4)进行参数化:
gawk -v wsize=4 '
BEGIN {
    # m1 and m2 are the 1-based positions of the middle element(s) in the
    # sorted window; they coincide when wsize is odd.
    if (wsize % 2 == 0) { m1=wsize/2; m2=m1+1; } else { m1 = m2 = (wsize+1)/2; }
}
# Return the median of the current window; for even sizes the mean of the
# two middle values is rounded with .5 going up.
function roundedmedian() {
    # asort (gawk extension) writes the sorted values into a, indexed from 1.
    asort(window, a);
    return (m1==m2) ? a[m1] : int(0.5 + ((a[m1] + a[m2]) / 2));
}
# Overwrite the oldest slot of the ring buffer with the newest value.
function push(value) {
    window[NR % wsize] = value;
}
# Fill phase: fewer than wsize records so far, so store and print nothing.
NR < wsize { window[NR]=$3; next; }
{ push($3);
  # Replace column 3 with the window median and print the rebuilt record.
  $3 = roundedmedian();
  print $0;
}'
使用 GNU awk 的 asort():
$ cat tst.awk
# Sliding-window median of column 3, rounded to the nearest integer.
# Requires GNU awk for asort().
BEGIN {
    OFS = "\t"
    window = 4
    # 1-based positions of the two middle values in the sorted window.
    # For an odd window both point at the single middle value
    # (int(window / 2) alone would be one position too low).
    befMid = int((window + 1) / 2)
    aftMid = int(window / 2) + 1
}
# Ring buffer: slot NR % window is overwritten by the newest value.
{ array[NR % window] = $3 }
NR >= window {
    # asort() leaves array untouched and stores the sorted values in
    # vals[1..window].
    asort(array,vals)
    print $1, $2, int( (vals[befMid] + vals[aftMid]) / 2 + 0.5 )
}
.
$ awk -f tst.awk file
HiC_scaffold_1 4 35
HiC_scaffold_1 5 37
HiC_scaffold_1 6 38
HiC_scaffold_1 7 39
HiC_scaffold_1 8 40
HiC_scaffold_1 9 40
HiC_scaffold_1 10 40
HiC_scaffold_1 11 41
HiC_scaffold_1 12 41
HiC_scaffold_1 13 41
HiC_scaffold_1 14 43
HiC_scaffold_1 15 44