C中使用位操作压缩字符串

Compress string using bits Operation in C

我想使用按位运算减少字符的内存存储,

例如

输入:{"ACGT"}

输出:{"0xE4"}

这里我们将二进制数转换为十六进制数

如果 A=00,C=01,G=10,T=11

所以 ACGT = 0xE4 = 11100100b

我想不通所以这就是我到目前为止所做的

enum NucleicAc {
    A = 0,
    C = 1,
    G = 2,
    T = 3,

} ;


struct _DNA {
    char * seq ;
};

DNAString DSCreate(char * mseq) {
    
    DNAString dna = malloc(sizeof(struct _DNA));
    if (!dna){
       exit(-1);
    }
    
    const int length = strlen(mseq);
    
    
  
    //  left shift will create the base , in the case of ACGT --> 0000,0000  
    int base   = 0 << length * 2;
    //loop and set bits
    int i = 0 ;
    int k = 0 ; // the counter where i want to modify the current bit
    //simple looping gor the string 
    for ( ; i < length ; i++ ) {
        switch (*(mseq+i)) {
            case 'A': // 00
                k++;
                break;
            case 'C': // 0 1
                  modifyBit(&base, k, 1);
                  k++;
                break;
                
            case 'G': //10
                k++;
                modifyBit(&base,k , 1);
                break;
                
            case 'T': // 11
                modifyBit(&base, k, 1);
                k++;
                modifyBit(&base, k,1);
                break;
            default:
                break;
                
        } //end of switch
        k++;
     
    }//end of for
    

    char * generatedSeq ;

    //convert the base to hex ??

    return dna;
}

void bin(unsigned n){
    unsigned i;
    for (i = 1 << 7; i > 0; i = i / 2){
        (n & i) ? printf("1") : printf("0");
    }
}

如果我们打印基数,则该值为预期的 11100100b,

如何将十六进制表示形式存储为字符串到结构中的char *mseq? 任何方向或确切的解决方案或有更好的方法吗?

后来我想只使用示例索引来获取字母

DSGet(dsStruct , '0')--> 将 return 'A' 因此 dsStruct 在编码之前包含“ACGT”?

这应该可以满足您的要求。我认为压缩是一个很酷的主意,所以我很快就写了这篇文章。正如@kaylum所提到的,十六进制编码只是一种读取内存中底层数据的方式,它总是只是位。所以,你只需要担心打印语句。

让我知道这是否有效,或者您对我所做的有任何疑问。

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

typedef struct {
    unsigned char *bits;
    unsigned long length; // use this to store the number of letters encoded, for safety
} DNA;

typedef enum {
    A = 0,
    C = 1,
    G = 2,
    T = 3
} NucleicAc;

这个 returns 给定索引处的基数,带有一些边界检查

char base_at_index(DNA *dna, unsigned long index) {
    if (index >= dna->length) {
        fputs("error: base_at_index: index out of range", stderr);
        exit(EXIT_FAILURE);
    }
    // offset is index / 4, this gives us the correct byte
    // shift amount is index % 4 to give us the correct 2 bits within the byte.
    // This must then be multiplied by 2 because
    // each base takes 2 bits to encode
    // then we have to bitwise-and this value with 
    // 3 (0000 0011 in binary) to retrieve the bits we want.
    // so, the formula we need is
    // (dna->bits[index / 4] >> (2 * (index % 4))) & 3
    switch((dna->bits[index / 4] >> 2 * (index % 4)) & 3) {
        case A: return 'A';
        case C: return 'C';
        case G: return 'G';
        case T: return 'T';
        default: 
            fputs("error: base_at_index: invalid encoding", stderr);
            exit(EXIT_FAILURE);
    }
}

这将一串碱基编码为字节

/* you can fit four 2-bit DNA codes in each byte (unsigned char).
    len is the maximum number of characters to read. result must be at least len bytes long
*/
void encode_dna(unsigned char *result, char *sequence, unsigned long len) {
    // keep track of what byte we are on in the result
    unsigned result_index = 0;
    // our shift for writing to the correct position in the byte
    unsigned shift = 0;
    // first clear result or else bitwise operations will produce errors
    // this could be removed if you were certain result parameter was zero-filled
    memset(result, 0, len);
    // iterate through characters of the sequence
    while(*sequence) {
        switch (*sequence) {
            // do nothing for 'A' since it is just zero
            case 'A': break;
            case 'C':
            // we are doing a bitwise or with the current byte
            // and C (1) shifted to the appropriate position within
            // the byte, and then assigning the byte with the result
                result[result_index] |= C << shift;
                break;
            case 'G':
                result[result_index] |= G << shift;
                break;
            case 'T':
                result[result_index] |= T << shift;
                break;
            default:
                fputs("error: encode_dna: invalid base pair", stderr);
                exit(EXIT_FAILURE);
        }
        // increase shift amount by 2 to the next 2-bit slot in the byte
        shift += 2;

        // on every 4th iteration, reset our shift to zero since the byte is now full
        // and move to the next byte in our result buffer
        if (shift == 8) {
            shift = 0;
            result_index++;
        }

        // advance sequence to next nucleotide character
        sequence++;
    }
}

这是一个测试

int main(int argc, char **argv) {
    // allocate some storage for encoded DNA 
    unsigned char encoded_dna[32];

    const unsigned long sample_length = 15;
    
    // encode the given sample sequence
    encode_dna(encoded_dna, "ACGTAGTCGTCATAG", sample_length);

    // hh here means half of half word, which is a byte
    // capital X for capitalized hex output
    // here we print some bytes
    printf("0x%hhX\n", encoded_dna[0]); // should output 0xE4
    printf("0x%hhX\n", encoded_dna[1]); // should be 0x78
    printf("0x%hhX\n", encoded_dna[2]); // should be 0x1E
    printf("0x%hhX\n", encoded_dna[3]); // should be 0x23

    DNA test_dna; // allocate a sample DNA structure
    test_dna.bits = encoded_dna;
    test_dna.length = sample_length; // length of the sample sequence above

    // test some indices and see if the results are correct
    printf("test_dna index 4: %c\n", base_at_index(&test_dna, 4));
    printf("test_dna index 7: %c\n", base_at_index(&test_dna, 7));
    printf("test_dna index 12: %c\n", base_at_index(&test_dna, 12));
    return 0;
}

输出:

0xE4
0x78
0x1E
0x23
test_dna index 4: A
test_dna index 7: C
test_dna index 12: T

假设您真的想将 dna 字符串编码为十六进制字符串,并且您希望从左到右读取输入字符串,但从右到左输出十六进制字符,这是一个简单但速度稍慢的实现。

首先,您的 DNAString 需要跟踪列表中是否确实存在偶数或奇数个酸序列。这将使额外的附件更容易。

struct DNAString
{
    char* seq;
    bool odd; // if odd bit is set, then the front char is already allocated and holds one acid
};

现在让我们介绍一个小辅助函数,将 ACGT 转换为 0,1,2,3。

char acid_to_value(char c)
{
    switch (c)
    {
        case 'A': return 0;
        case 'C': return 1;
        case 'G': return 2;
        case 'T': return 3;
    }
    // ASSERT(false)
    return 0;
}

那么核心实现就是将新的十六进制字符“添加到”您正在构建的字符串中。如果字符串已经是奇数长度,代码将通过将其从十六进制转换为整数来“修复”前面的字符,然后将新的酸值移入其中,然后将其转换回十六进制 char

extern char fixup(char previous, char acid);
{
    char hexchars[16] = { '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F' };
    char tmp[2] = { previous, '[=12=]' };
    unsigned long asnumber = strtol(tmp, nullptr, 16);
    asnumber = asnumber & 0x3; // last two bits
    asnumber = asnumber | (acid_to_value(acid) << 2);
    return hexchars[asnumber];
}

void prepend_nucleic_acid_to_hexstring(struct DNAString* dnastring, char acid)
{
    if (dnastring->odd)
    {
        // find the first char in the string and fix it up hexwise
        dnastring->seq[0] = fixup(dnastring->seq[0], acid);
        dnastring->odd = false;
    }
    else
    {
        size_t currentlength = dnastring->seq ? strlen(dnastring->seq) : 0;
        const char* currentstring = dnastring->seq ? dnastring->seq : "";
        char* newseq = (char*)calloc(currentlength + 2, sizeof(char)); // +1 for new char and +1 for null char
        newseq[0] = acid_to_value(acid) + '0';  // prepend the next hex char
        strcpy(newseq + 1, currentstring);  // copy the old string into the new string space

        free(dnastring->seq);
        dnastring->seq = newseq;
        dnastring->odd = true;
    }
}

那么你的 DNACreate 函数真的很简单:

struct DNAString DSCreate(const char* mseq)
{
    DNAString dnastring = { 0 };

    while (*mseq)
    {
        prepend_nucleic_acid_to_hexstring(&dnastring, *mseq);
        mseq++;
    }
    return dnastring;
}

我不认为这种方法是有效的,因为他实际上一直在为每个字符重新分配内存。但它确实使您能够在以后灵活地调用前置函数以进行额外的排序。

然后进行测试:

int main()
{
    struct DNAString dnastring = DSCreate("ACGT");
    printf("0x%s\n", dnastring.seq);
    return 0;

}

您可以通过多种方式对序列进行编码。您的 enum 很好,但是对于您的编码序列,将字节捕获为 unsigned char 的结构、原始序列长度和以字节为单位的编码大小将允许轻松解码。您将获得 4 比 1 压缩(如果您的序列不能被 4 整除,则加 1 个字节)。枚举和结构可以是:

enum { A, C, G, T };

typedef struct {
  unsigned char *seq;
  size_t len, size;
} encoded;

要将字符串中的字符映射到编码值一个简单的函数,returns 匹配字符的枚举值就是您所需要的(不要忘记处理任何错误)

/* convert character to encoded value */
unsigned char getencval (const char c)
{
  if (c == 'A')
    return A;
  else if (c == 'C')
    return C;
  else if (c == 'G')
    return G;
  else if (c == 'T')
    return T;
  
  /* exit on anything other than A, C, G, T */
  fprintf (stderr, "error: invalid sequence character '%c'\n", c);
  exit (EXIT_FAILURE);
}

要对原始序列进行编码,您将使用原始长度 (len) 和保存编码字符串所需的字节数 (size) 填充 encoded 结构。不要忘记接下来的 4 个字符中的任何 1 个字符都需要另一个字节的存储空间。您可以使用简单的加法和除法来计算序列的任何部分 4 字符结尾部分,例如

/* encode sequence of characters as 2-bit pairs (4-characters per-byte)
 * returns encoded struct with allocated .seq member, on failure the .seq
 * member is NULL. User is resposible for freeing .seq member when done.
 */
encoded encode_seq (const char *seq)
{
  size_t  len = strlen(seq),
          size = (len + 3) / 4;             /* integer division intentional */
  encoded enc = { .seq = calloc (1, size),  /* encoded sequence struct */
                  .len = len,
                  .size = size };
  
  if (!enc.seq) { /* validate allication */
    perror ("calloc-enc.seq");
    return enc;
  }
  
  /* loop over each char (i) with byte index (ndx) 
   * shifting each 2-bit pair by (shift * 2) amount.
   */
  for (int i = 0, ndx = 0, shift = 4; seq[i] && seq[i] != '\n'; i++) {
    if (!shift--)           /* decrement shift, reset if 0 */
      shift = 3;
    if (i && i % 4 == 0)    /* after each 4th char, increment ndx */
      ndx += 1;
    
    /* shift each encoded value (multiply by 2 for shift of 6, 4, 2, 0) */
    enc.seq[ndx] |= getencval (seq[i]) << shift * 2;
  }
  
  return enc;   /* return encoded struct with allocated .seq member */
}

要从您的 encoded 结构中获取原始序列,使用查找 table(下面显示完整代码)使它变得轻而易举。您只需遍历所有存储的字节值,附加查找 table 中的相应字符串,直到最后一个字节。对于最后一个字节,您需要确定它是否是部分字符串,如果是,则需要复制多少个字符remain。 (这就是为什么将原始序列长度存储在结构中的原因)。然后简单地使用 strncat 从最后一个字节追加那么多字符,例如

/* decodes encoded sequence. Allocates storage for decoded sequence
 * and loops over each encoded byte using lookup-table to obtain
 * original 4-character string from byte value. User is responsible
 * for freeing returned string when done. Returns NULL on allocation
 * failure.
 */
char *decode_seq (encoded *eseq)
{
  char *seq = malloc (eseq->len + 1);   /* allocate storage for sequence */
  size_t i = 0, offset = 0, remain;
  
  if (!seq) { /* validate allocation */
    perror ("malloc-seq");
    return NULL;
  }
  
  /* loop appending strings from lookup table for all but last byte */
  for (; i < eseq->size - 1; i++) {
    memcpy (seq + offset, lookup[eseq->seq[i]], 4);
    offset += 4;  /* increment offset by 4 */
  }
  
  /* determine the number of characters in last byte */
  remain = eseq->len - (eseq->size - 1) * 4;
  memcpy (seq + offset, lookup[eseq->seq[i]], remain);
  seq[offset + remain] = 0;   /* nul-terminate seq */
  
  return seq;     /* return allocated sequence */
}

添加查找 table 并将所有部分放在一起,解决此问题的一种方法是:

(edit: 查找 table 重新排序以匹配您的字节值编码,优化 decode_seq() 不扫描字符串结尾复制)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum { A, C, G, T };

typedef struct {
  unsigned char *seq;
  size_t len, size;
} encoded;

const char lookup[][5] = {
    "AAAA","CAAA","GAAA","TAAA","ACAA","CCAA","GCAA","TCAA",
    "AGAA","CGAA","GGAA","TGAA","ATAA","CTAA","GTAA","TTAA",
    "AACA","CACA","GACA","TACA","ACCA","CCCA","GCCA","TCCA",
    "AGCA","CGCA","GGCA","TGCA","ATCA","CTCA","GTCA","TTCA",
    "AAGA","CAGA","GAGA","TAGA","ACGA","CCGA","GCGA","TCGA",
    "AGGA","CGGA","GGGA","TGGA","ATGA","CTGA","GTGA","TTGA",
    "AATA","CATA","GATA","TATA","ACTA","CCTA","GCTA","TCTA",
    "AGTA","CGTA","GGTA","TGTA","ATTA","CTTA","GTTA","TTTA",
    "AAAC","CAAC","GAAC","TAAC","ACAC","CCAC","GCAC","TCAC",
    "AGAC","CGAC","GGAC","TGAC","ATAC","CTAC","GTAC","TTAC",
    "AACC","CACC","GACC","TACC","ACCC","CCCC","GCCC","TCCC",
    "AGCC","CGCC","GGCC","TGCC","ATCC","CTCC","GTCC","TTCC",
    "AAGC","CAGC","GAGC","TAGC","ACGC","CCGC","GCGC","TCGC",
    "AGGC","CGGC","GGGC","TGGC","ATGC","CTGC","GTGC","TTGC",
    "AATC","CATC","GATC","TATC","ACTC","CCTC","GCTC","TCTC",
    "AGTC","CGTC","GGTC","TGTC","ATTC","CTTC","GTTC","TTTC",
    "AAAG","CAAG","GAAG","TAAG","ACAG","CCAG","GCAG","TCAG",
    "AGAG","CGAG","GGAG","TGAG","ATAG","CTAG","GTAG","TTAG",
    "AACG","CACG","GACG","TACG","ACCG","CCCG","GCCG","TCCG",
    "AGCG","CGCG","GGCG","TGCG","ATCG","CTCG","GTCG","TTCG",
    "AAGG","CAGG","GAGG","TAGG","ACGG","CCGG","GCGG","TCGG",
    "AGGG","CGGG","GGGG","TGGG","ATGG","CTGG","GTGG","TTGG",
    "AATG","CATG","GATG","TATG","ACTG","CCTG","GCTG","TCTG",
    "AGTG","CGTG","GGTG","TGTG","ATTG","CTTG","GTTG","TTTG",
    "AAAT","CAAT","GAAT","TAAT","ACAT","CCAT","GCAT","TCAT",
    "AGAT","CGAT","GGAT","TGAT","ATAT","CTAT","GTAT","TTAT",
    "AACT","CACT","GACT","TACT","ACCT","CCCT","GCCT","TCCT",
    "AGCT","CGCT","GGCT","TGCT","ATCT","CTCT","GTCT","TTCT",
    "AAGT","CAGT","GAGT","TAGT","ACGT","CCGT","GCGT","TCGT",
    "AGGT","CGGT","GGGT","TGGT","ATGT","CTGT","GTGT","TTGT",
    "AATT","CATT","GATT","TATT","ACTT","CCTT","GCTT","TCTT",
    "AGTT","CGTT","GGTT","TGTT","ATTT","CTTT","GTTT","TTTT"};

/* convert character to encoded value */
unsigned char getencval (const char c)
{
  if (c == 'A')
    return A;
  else if (c == 'C')
    return C;
  else if (c == 'G')
    return G;
  else if (c == 'T')
    return T;
  
  /* exit on anything other than A, C, G, T */
  fprintf (stderr, "error: invalid sequence character '%c'\n", c);
  exit (EXIT_FAILURE);
}

/* encode sequence of characters as 2-bit pairs (4-characters per-byte)
 * returns encoded struct with allocated .seq member, on failure the .seq
 * member is NULL. User is resposible for freeing .seq member when done.
 */
encoded encode_seq (const char *seq)
{
  size_t  len = strlen(seq),
          size = (len + 3) / 4;             /* integer division intentional */
  encoded enc = { .seq = calloc (1, size),  /* encoded sequence struct */
                  .len = len,
                  .size = size };
  
  if (!enc.seq) { /* validate allication */
    perror ("calloc-enc.seq");
    return enc;
  }
  
  /* loop over each char (i) with byte index (ndx) 
   * shifting each 2-bit pair by (shift * 2) amount.
   */
  for (int i = 0, ndx = 0, shift = 0; seq[i] && seq[i] != '\n'; i++, shift++) {
    if (shift == 4)         /* reset to 0 */
      shift = 0;
    if (i && i % 4 == 0)    /* after each 4th char, increment ndx */
      ndx += 1;
    
    /* shift each encoded value (multiply by 2 for shift of 0, 2, 4, 6) */
    enc.seq[ndx] |= getencval (seq[i]) << shift * 2;
  }
  
  return enc;   /* return encoded struct with allocated .seq member */
}

/* decodes encoded sequence. Allocates storage for decoded sequence
 * and loops over each encoded byte using lookup-table to obtain
 * original 4-character string from byte value. User is responsible
 * for freeing returned string when done. Returns NULL on allocation
 * failure.
 */
char *decode_seq (encoded *eseq)
{
  char *seq = malloc (eseq->len + 1);   /* allocate storage for sequence */
  size_t i = 0, offset = 0, remain;
  
  if (!seq) { /* validate allocation */
    perror ("malloc-seq");
    return NULL;
  }
  
  /* loop appending strings from lookup table for all but last byte */
  for (; i < eseq->size - 1; i++) {
    memcpy (seq + offset, lookup[eseq->seq[i]], 4);
    offset += 4;  /* increment offset by 4 */
  }
  
  /* determine the number of characters in last byte */
  remain = eseq->len - (eseq->size - 1) * 4;
  memcpy (seq + offset, lookup[eseq->seq[i]], remain);
  seq[offset + remain] = 0;   /* nul-terminate seq */
  
  return seq;     /* return allocated sequence */
}

/* short example program that takes string to encode as 1st argument
 * using "ACGT" if no argument is provided by default
 */
int main (int argc, char **argv) {
  
  char *seq = NULL;
  encoded enc = encode_seq(argc > 1 ? argv[1] : "ACGT");
  
  if (!enc.seq)   /* validate encoded allocation */
    return 1;
  
  /* output original string, length and encoded size */
  printf ("encoded str  : %s\nencoded len  : %zu\nencoded size : %zu\n", 
          argc > 1 ? argv[1] : "ACGT", enc.len, enc.size);
  
  /* loop outputting byte-values of encoded string */
  fputs ("encoded seq  :", stdout);
  for (size_t i = 0; i < enc.size; i++)
     printf (" 0x%02x", enc.seq[i]);
  putchar ('\n');
  
  seq = decode_seq (&enc);              /* decode seq from byte values */
  printf ("decoded seq  : %s\n", seq);  /* output decoded string */
  
  free (seq);         /* don't forget to free what you allocated */
  free (enc.seq);
}

在大多数情况下,与在解码期间计算和构建每个 4 字符的字符串相比,查找 -table 提供了大量的效率优势。在大多数情况下,查找 table 驻留在缓存中可以增强这一点。

您可以编码和解码的 DNA 序列的长度仅受您可用的虚拟内存量的限制。

例子Use/Output

程序将要编码和解码的序列作为第一个参数(默认值"ACGT")。所以默认输出是:

$ ./bin/dnaencodedecode
encoded str  : ACGT
encoded len  : 4
encoded size : 1
encoded seq  : 0xe4
decoded seq  : ACGT

4 字节编码为 1 字节。由于 table 排序,请注意 0x1b 而不是 0xe4 的字节值。

更长的例子:

./bin/dnaencodedecode ACGTGGGTCAGACTTA
encoded str  : ACGTGGGTCAGACTTA
encoded len  : 16
encoded size : 4
encoded seq  : 0xe4 0xea 0x21 0x3d
decoded seq  : ACGTGGGTCAGACTTA

16 个字符以 4 个字节编码。

最后,如果一个序列不能被 4 整除,那么最后一个编码字节中只有部分字符?这也被处理,例如

$ ./bin/dnaencodedecode ACGTGGGTCAGACTTAG
encoded str  : ACGTGGGTCAGACTTAG
encoded len  : 17
encoded size : 5
encoded seq  : 0xe4 0xea 0x21 0x3d 0x02
decoded seq  : ACGTGGGTCAGACTTAG

17 个字符以 5 个字节编码。 (不是纯粹的4比1压缩,而是随着序列大小的增加,最后一个字节中任何部分字符组的重要性都变得可以忽略不计)

就性能而言,对于 100,000 个字符的序列和字节值和字符串的输出替换为一个简单的循环,该循环将解码的 seq 与原始的 argv[1] 进行比较只需要千分之几秒(在带有 SSD 的旧 i7 Gen2 笔记本电脑上)进行编码、解码和验证,例如

$ 
time ./bin/dnaencodedecodet2big $(< dat/dnaseq100k.txt)
encoded len  : 100000
encoded size : 25000
all tests passed

real    0m0.014s
user    0m0.012s
sys     0m0.003s

有很多方法可以做到这一点,但根据你的描述,这就是我想到的你试图完成的。这里有很多,所以你慢慢看吧。

检查一下(代码已注释),如果您有其他问题,请告诉我。请在下方发表评论。