将核碱基表示形式从 ASCII 转换为 UCSC .2 位

Converting nucleobase representation from ASCII to UCSC .2bit

明确的 DNA 序列仅由核碱基腺嘌呤 (A)、胞嘧啶 (C)、鸟嘌呤 (G)、胸腺嘧啶 (T) 组成。对于人类消费,碱基可以用大写或小写的相应 char 表示:ACGTacgt。然而,当需要存储长序列时,这种表示是低效的。由于只需要存储四个符号,因此可以为每个符号分配一个 2 位代码。 UCSC 常用的 .2bit-格式 specified 正是这样做的,使用以下编码:T = 0b00, C = 0b01, A = 0b10, G = 0b11.

下面的 C 代码显示了为清楚起见而编写的参考实现。将表示为 char 序列的基因组序列转换的各种开源软件通常使用 256 条目查找 table,由每个 char 序列索引。这也与 char 的内部表示隔离开来。然而,内存访问非常昂贵,即使访问的是片上缓存,而且通用 table 查找很难 SIMDize。因此,如果转换可以通过简单的整数算术来完成,那将是有利的。鉴于 ASCII 是占主导地位的 char 编码,因此可以对其进行限制。

将作为 ASCII 字符给出的核碱基转换为其 .2bit 表示形式的有效计算方法是什么?

/* Convert nucleobases A, C, G, T represented as either uppercase or lowercase
   ASCII characters into UCSC .2bit-presentation. Passing any values other than
   those corresponding to 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't' results in an
   indeterminate response.
*/
unsigned int ascii_to_2bit (unsigned int a)
{
    const unsigned int UCSC_2BIT_A = 2;
    const unsigned int UCSC_2BIT_C = 1;
    const unsigned int UCSC_2BIT_G = 3;
    const unsigned int UCSC_2BIT_T = 0;

    switch (a) {
    case 'A':
    case 'a':
        return UCSC_2BIT_A;
        break;
    case 'C':
    case 'c':
        return UCSC_2BIT_C;
        break;
    case 'G':
    case 'g':
        return UCSC_2BIT_G;
        break;
    case 'T':
    case 't':
    default:
        return UCSC_2BIT_T;
        break;
    }
}

如果仔细观察核碱基 ASCII 字符的二进制代码,就会清楚第 1 位和第 2 位提供了唯一的两位代码:A = 0b01000001 -> 0b00, C = 0b01000011 -> 0b01, G = 0b01000111 -> 0b11, T = 0b01010100 -> 0b10。类似于小写 ASCII 字符,它们仅在第 5 位不同。不幸的是,这个简单的映射与 .2bit 编码不完全匹配,因为 A 和 T 的代码被交换了。解决此问题的一种方法是使用简单的四项排列 table 存储在变量中,可能在优化后分配给寄存器(“寄存器内查找-table”):

unsigned int ascii_to_2bit_perm (unsigned int a)
{
    unsigned int perm = (2 << 0) | (1 << 2) | (0 << 4) | (3 << 6);
    return (perm >> (a & 6)) & 3;
}

另一种方法通过观察 AT 的第 1 位为 0 来纠正生成的“原始”代码,通过简单的位操作即时提取第 1 位和第 2 位,但是1 表示 CG。因此,我们可以通过对输入的第 1 位的 与初始代码的第 1 位进行异或来交换 A 和 T 的编码:

unsigned int ascii_to_2bit_twiddle (unsigned int a)
{
    return ((a >> 1) & 3) ^ (~a & 2);
}

此版本在具有快速位域提取指令的处理器和没有桶形移位器的低端处理器上具有优势,因为只需要右移一位位置。在乱序处理器上,这种方法比排列 table 提供更多的指令级并行性。由于在所有字节通道中使用相同的移位计数,因此适应 SIMD 实现似乎也更容易。

在我专心研究相关 ASCII 字符的二进制编码之前,我研究过使用简单的数学计算。对小乘数和除数的简单暴力搜索产生:

unsigned int ascii_to_2bit_math (unsigned int a)
{
    return ((18 * (a & 31)) % 41) & 3;
}

乘法器 18 对没有快速整数乘法器的处理器很友好。现代编译器可以有效地处理具有编译时常数除数的模计算,并且不需要除法。即便如此,我注意到即使是最好的编译器也难以利用非常有限的输入和输出范围,所以我不得不手动修改它以简化代码:

unsigned int ascii_to_2bit_math (unsigned int a)
{
    unsigned int t = 18 * (a & 31);
    return (t - ((t * 25) >> 10)) & 3;
}

即使采用这种形式并假设单周期乘法的可用性,这似乎通常也无法与之前的两种方法竞争,因为它会产生更多的指令和更长的依赖链。但是,在 64 位平台上,整个计算可以替换为 64 位、32 项查找 table,如果可以将此 64 位 table 有效地放入寄存器,则可以提供具有竞争力的性能,这是 x86-64 的情况,它作为立即加载。

unsigned int ascii_to_2bit_tab (unsigned int a)
{
    uint64_t tab = ((0ULL <<  0) | (2ULL <<  2) | (0ULL <<  4) | (1ULL <<  6) |
                    (3ULL <<  8) | (0ULL << 10) | (2ULL << 12) | (3ULL << 14) |
                    (1ULL << 16) | (3ULL << 18) | (0ULL << 20) | (2ULL << 22) |
                    (3ULL << 24) | (1ULL << 26) | (2ULL << 28) | (0ULL << 30) |
                    (1ULL << 32) | (3ULL << 34) | (1ULL << 36) | (2ULL << 38) |
                    (0ULL << 40) | (1ULL << 42) | (3ULL << 44) | (0ULL << 46) |
                    (2ULL << 48) | (0ULL << 50) | (1ULL << 52) | (3ULL << 54) |
                    (0ULL << 56) | (2ULL << 58) | (3ULL << 60) | (1ULL << 62));
    return (tab >> (2 * (a & 31))) & 3;
}

我附上我的测试框架以供参考:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define ORIGINAL_MATH (1)

unsigned int ascii_to_2bit_perm (unsigned int a)
{
    unsigned int perm = (2 << 0) | (1 << 2) | (0 << 4) | (3 << 6);
    return (perm >> (a & 6)) & 3;
}

unsigned int ascii_to_2bit_twiddle (unsigned int a)
{
    return ((a >> 1) & 3) ^ (~a & 2);
}

unsigned int ascii_to_2bit_math (unsigned int a)
{
#if ORIGINAL_MATH
    return ((18 * (a & 31)) % 41) & 3;
#else // ORIGINAL_MATH
    unsigned int t = 18 * (a & 31);
    return (t - ((t * 25) >> 10)) & 3;
#endif // ORIGINAL_MATH
}

unsigned int ascii_to_2bit_tab (unsigned int a)
{
    uint64_t tab = ((0ULL <<  0) | (2ULL <<  2) | (0ULL <<  4) | (1ULL <<  6) |
                    (3ULL <<  8) | (0ULL << 10) | (2ULL << 12) | (3ULL << 14) |
                    (1ULL << 16) | (3ULL << 18) | (0ULL << 20) | (2ULL << 22) |
                    (3ULL << 24) | (1ULL << 26) | (2ULL << 28) | (0ULL << 30) |
                    (1ULL << 32) | (3ULL << 34) | (1ULL << 36) | (2ULL << 38) |
                    (0ULL << 40) | (1ULL << 42) | (3ULL << 44) | (0ULL << 46) |
                    (2ULL << 48) | (0ULL << 50) | (1ULL << 52) | (3ULL << 54) |
                    (0ULL << 56) | (2ULL << 58) | (3ULL << 60) | (1ULL << 62));
    return (tab >> (2 * (a & 31))) & 3;
}

/* Convert nucleobases A, C, G, T represented as either uppercase or lowercase
   ASCII characters into UCSC .2bit-presentation. Passing any values other than
   those corresponding to 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't' results in an
   inderminate response.
*/
unsigned int ascii_to_2bit (unsigned int a)
{
    const unsigned int UCSC_2BIT_A = 2;
    const unsigned int UCSC_2BIT_C = 1;
    const unsigned int UCSC_2BIT_G = 3;
    const unsigned int UCSC_2BIT_T = 0;

    switch (a) {
    case 'A':
    case 'a':
        return UCSC_2BIT_A;
        break;
    case 'C':
    case 'c':
        return UCSC_2BIT_C;
        break;
    case 'G':
    case 'g':
        return UCSC_2BIT_G;
        break;
    case 'T':
    case 't':
    default:
        return UCSC_2BIT_T;
        break;
    }
}

int main (void)
{
    char nucleobase[8] = {'A', 'C', 'G', 'T', 'a', 'c', 'g', 't'};
    printf ("Testing permutation variant:\n");
    for (unsigned int i = 0; i < sizeof nucleobase; i++) {
        unsigned int ref = ascii_to_2bit (nucleobase[i]);
        unsigned int res = ascii_to_2bit_perm (nucleobase[i]);
        printf ("i=%2u %c res=%u ref=%u %c\n", 
                i, nucleobase[i], res, ref, (res == ref) ? 'p' : 'F');
    }
    printf ("Testing bit-twiddling variant:\n");
    for (unsigned int i = 0; i < sizeof nucleobase; i++) {
        unsigned int ref = ascii_to_2bit (nucleobase[i]);
        unsigned int res = ascii_to_2bit_twiddle (nucleobase[i]);
        printf ("i=%2u %c res=%u ref=%u %c\n", 
                i, nucleobase[i], res, ref, (res == ref) ? 'p' : 'F');
    }
    printf ("Testing math-based variant:\n");
    for (unsigned int i = 0; i < sizeof nucleobase; i++) {
        unsigned int ref = ascii_to_2bit (nucleobase[i]);
        unsigned int res = ascii_to_2bit_math (nucleobase[i]);
        printf ("i=%2u %c res=%u ref=%u %c\n", 
                i, nucleobase[i], res, ref, (res == ref) ? 'p' : 'F');
    }
    printf ("Testing table-based variant:\n");
    for (unsigned int i = 0; i < sizeof nucleobase; i++) {
        unsigned int ref = ascii_to_2bit (nucleobase[i]);
        unsigned int res = ascii_to_2bit_tab (nucleobase[i]);
        printf ("i=%2u %c res=%u ref=%u %c\n", 
                i, nucleobase[i], res, ref, (res == ref) ? 'p' : 'F');
    }
    return EXIT_SUCCESS;
}

一个选项如下:

unsigned int ascii_to_2bit (unsigned int a)
{
    return ((0xad - a) & 6) >> 1;
}

这样做的好处是它只需要 8 位,不会溢出,并且不包含非常量移位,因此即使没有特定的 SIMD 指令,它也可以立即用于并行化,例如在 64 位字中输入 8 个字符

unsigned int ascii_to_2bit_8bytes (uint64_t a)
{
    return ((0xadadadadadadadad - a) & 0x0606060606060606) >> 1;
}

在每个字节的底部保留两个输出位。

以下例程将用 ASCII 字符串的内容填充一个 uint32_t 值的数组,其中填充了您声明的字符,并将保存状态以便能够附加第二个、第三个等. 数组的字符串数。它的使用方式通过 main() 例程进行说明,该例程从命令行获取字符串参数并将它们传递给 TOTAL 元素的数组。例程的参数在其上方的注释中进行了描述。

#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define UCSC_2BIT_T  (0)
#define UCSC_2BIT_C  (1)
#define UCSC_2BIT_A  (2)
#define UCSC_2BIT_G  (3)

#define PAIRS_PER_INTEGER   16

/**
 * Converts a string of nucleobases in ASCII form into an array
 * of 32bit integers in 2bitform.  As it should be possible to
 * continue the array of integers, a status reference is
 * maintained in order to determine determine where in the
 * integer to start putting the two bit sequences.  For this,
 * a (*state) variable is maintained (initialize it to 0) to
 * remember where to start putting bitpairs in the array.
 *
 * @param state_ref reference to the state variable to be maintained
 *               with the position on which to put the base in the
 *               array entry.
 * @param dna    string with the ASCII chain of bases.
 * @param twobit array reference to start filling.
 * @param sz     size of the array ***in array entries***.
 * @return the number of array elements written.  This allows to
 *         use a pointer and advance it at each function call
 *         with the number of entries consumed on each call.
 */
ssize_t
dna2twobit(
    int *state_ref,
    char *dna,
    uint32_t twobit[],
    size_t sz)
{
    /* local copy so we only change the state in case of
     * successful execution */
    int state = 30 - *state_ref;
    if (state == 30) *twobit = 0;
    ssize_t total_nb = 0; /* total number of array elements consumed */
    while (*dna && sz) {
        char base = toupper(*dna++);
        uint32_t tb;
        switch (base) {
        case 'A': tb = UCSC_2BIT_A; break;
        case 'C': tb = UCSC_2BIT_C; break;
        case 'T': tb = UCSC_2BIT_T; break;
        case 'G': tb = UCSC_2BIT_G; break;
        default: return -1;
        }
        *twobit |= tb << state;
        state -= 2;
        if (state < 0) {
            --sz; ++twobit;
            state += 32;
            *twobit = 0;
            total_nb++;
        }
    }
    *state_ref = 30 - state;
    return total_nb;
}

此函数可以单独链接到您希望的任何程序中,但我提供了一个 main() 代码来说明该函数的使用。您可以使用在命令行参数中编码的 ASCII 中的实际链来调用程序。该程序将在前一个之后对它们进行编码,以演示多重转换(16 个碱基适合一个 32 位整数,如您在问题中发布的格式定义所述,因此如果编码不是 16 的倍数,状态反映了最后一个已经覆盖了多少位。

代码继续下面的主要功能:


#define TOTAL 16

int main(int argc, char **argv)
{
    int state = 0, i;
    uint32_t twobit_array[TOTAL], *p = twobit_array;
    size_t twobit_cap = TOTAL;

    for (i = 1; i < argc; ++i) {
        printf("Adding the following chain: [%s]\n", argv[i]);
        ssize_t n = dna2twobit(
                        &state,
                        argv[i],
                        p,
                        twobit_cap);
        if (n < 0) {
            fprintf(stderr,
                    "argument invalid: %s\n"
                    "continuing to next\n",
                    argv[i]);
            continue;
        }
        twobit_cap -= n;
        p += n;
    }
    if (!state) --p;
    uint32_t *q = twobit_array;
    size_t off = 0;
    for (int j = 0; q <= p; j++) {
        char *sep = "";
        printf("%09zd: ", off);
        for (int k = 0; k < 4 & q <= p; k++) {
            printf("%s%08x", sep, *q);
            sep = "-";
            q++;
            off += 16;
        }
        printf("\n");
    }
    return 0;
}