C中使用位操作压缩字符串
Compress string using bits Operation in C
我想使用按位运算减少字符的内存存储,
例如
输入:{"ACGT"}
输出:{"0xE4"}
这里我们将二进制数转换为十六进制数
如果 A=00,C=01,G=10,T=11
所以 ACGT = 0xE4 = 11100100b
我想不通所以这就是我到目前为止所做的
enum NucleicAc {
A = 0,
C = 1,
G = 2,
T = 3,
} ;
struct _DNA {
char * seq ;
};
DNAString DSCreate(char * mseq) {
DNAString dna = malloc(sizeof(struct _DNA));
if (!dna){
exit(-1);
}
const int length = strlen(mseq);
// left shift will create the base , in the case of ACGT --> 0000,0000
int base = 0 << length * 2;
//loop and set bits
int i = 0 ;
int k = 0 ; // the counter where i want to modify the current bit
//simple looping gor the string
for ( ; i < length ; i++ ) {
switch (*(mseq+i)) {
case 'A': // 00
k++;
break;
case 'C': // 0 1
modifyBit(&base, k, 1);
k++;
break;
case 'G': //10
k++;
modifyBit(&base,k , 1);
break;
case 'T': // 11
modifyBit(&base, k, 1);
k++;
modifyBit(&base, k,1);
break;
default:
break;
} //end of switch
k++;
}//end of for
char * generatedSeq ;
//convert the base to hex ??
return dna;
}
void bin(unsigned n){
unsigned i;
for (i = 1 << 7; i > 0; i = i / 2){
(n & i) ? printf("1") : printf("0");
}
}
如果我们打印基数,则该值为预期的 11100100b,
如何将十六进制表示形式存储为字符串到结构中的char *mseq?
任何方向或确切的解决方案或有更好的方法吗?
后来我想只使用示例索引来获取字母
DSGet(dsStruct , '0')--> 将 return 'A' 因此 dsStruct 在编码之前包含“ACGT”?
这应该可以满足您的要求。我认为压缩是一个很酷的主意,所以我很快就写了这篇文章。正如@kaylum所提到的,十六进制编码只是一种读取内存中底层数据的方式,它总是只是位。所以,你只需要担心打印语句。
让我知道这是否有效,或者您对我所做的有任何疑问。
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
typedef struct {
unsigned char *bits;
unsigned long length; // use this to store the number of letters encoded, for safety
} DNA;
typedef enum {
A = 0,
C = 1,
G = 2,
T = 3
} NucleicAc;
这个 returns 给定索引处的基数,带有一些边界检查
char base_at_index(DNA *dna, unsigned long index) {
if (index >= dna->length) {
fputs("error: base_at_index: index out of range", stderr);
exit(EXIT_FAILURE);
}
// offset is index / 4, this gives us the correct byte
// shift amount is index % 4 to give us the correct 2 bits within the byte.
// This must then be multiplied by 2 because
// each base takes 2 bits to encode
// then we have to bitwise-and this value with
// 3 (0000 0011 in binary) to retrieve the bits we want.
// so, the formula we need is
// (dna->bits[index / 4] >> (2 * (index % 4))) & 3
switch((dna->bits[index / 4] >> 2 * (index % 4)) & 3) {
case A: return 'A';
case C: return 'C';
case G: return 'G';
case T: return 'T';
default:
fputs("error: base_at_index: invalid encoding", stderr);
exit(EXIT_FAILURE);
}
}
这将一串碱基编码为字节
/* you can fit four 2-bit DNA codes in each byte (unsigned char).
len is the maximum number of characters to read. result must be at least len bytes long
*/
void encode_dna(unsigned char *result, char *sequence, unsigned long len) {
// keep track of what byte we are on in the result
unsigned result_index = 0;
// our shift for writing to the correct position in the byte
unsigned shift = 0;
// first clear result or else bitwise operations will produce errors
// this could be removed if you were certain result parameter was zero-filled
memset(result, 0, len);
// iterate through characters of the sequence
while(*sequence) {
switch (*sequence) {
// do nothing for 'A' since it is just zero
case 'A': break;
case 'C':
// we are doing a bitwise or with the current byte
// and C (1) shifted to the appropriate position within
// the byte, and then assigning the byte with the result
result[result_index] |= C << shift;
break;
case 'G':
result[result_index] |= G << shift;
break;
case 'T':
result[result_index] |= T << shift;
break;
default:
fputs("error: encode_dna: invalid base pair", stderr);
exit(EXIT_FAILURE);
}
// increase shift amount by 2 to the next 2-bit slot in the byte
shift += 2;
// on every 4th iteration, reset our shift to zero since the byte is now full
// and move to the next byte in our result buffer
if (shift == 8) {
shift = 0;
result_index++;
}
// advance sequence to next nucleotide character
sequence++;
}
}
这是一个测试
int main(int argc, char **argv) {
// allocate some storage for encoded DNA
unsigned char encoded_dna[32];
const unsigned long sample_length = 15;
// encode the given sample sequence
encode_dna(encoded_dna, "ACGTAGTCGTCATAG", sample_length);
// hh here means half of half word, which is a byte
// capital X for capitalized hex output
// here we print some bytes
printf("0x%hhX\n", encoded_dna[0]); // should output 0xE4
printf("0x%hhX\n", encoded_dna[1]); // should be 0x78
printf("0x%hhX\n", encoded_dna[2]); // should be 0x1E
printf("0x%hhX\n", encoded_dna[3]); // should be 0x23
DNA test_dna; // allocate a sample DNA structure
test_dna.bits = encoded_dna;
test_dna.length = sample_length; // length of the sample sequence above
// test some indices and see if the results are correct
printf("test_dna index 4: %c\n", base_at_index(&test_dna, 4));
printf("test_dna index 7: %c\n", base_at_index(&test_dna, 7));
printf("test_dna index 12: %c\n", base_at_index(&test_dna, 12));
return 0;
}
输出:
0xE4
0x78
0x1E
0x23
test_dna index 4: A
test_dna index 7: C
test_dna index 12: T
假设您真的想将 dna 字符串编码为十六进制字符串,并且您希望从左到右读取输入字符串,但从右到左输出十六进制字符,这是一个简单但速度稍慢的实现。
首先,您的 DNAString 需要跟踪列表中是否确实存在偶数或奇数个酸序列。这将使额外的附件更容易。
struct DNAString
{
char* seq;
bool odd; // if odd bit is set, then the front char is already allocated and holds one acid
};
现在让我们介绍一个小辅助函数,将 ACGT 转换为 0,1,2,3。
char acid_to_value(char c)
{
switch (c)
{
case 'A': return 0;
case 'C': return 1;
case 'G': return 2;
case 'T': return 3;
}
// ASSERT(false)
return 0;
}
那么核心实现就是将新的十六进制字符“添加到”您正在构建的字符串中。如果字符串已经是奇数长度,代码将通过将其从十六进制转换为整数来“修复”前面的字符,然后将新的酸值移入其中,然后将其转换回十六进制 char
extern char fixup(char previous, char acid);
{
char hexchars[16] = { '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F' };
char tmp[2] = { previous, '[=12=]' };
unsigned long asnumber = strtol(tmp, nullptr, 16);
asnumber = asnumber & 0x3; // last two bits
asnumber = asnumber | (acid_to_value(acid) << 2);
return hexchars[asnumber];
}
void prepend_nucleic_acid_to_hexstring(struct DNAString* dnastring, char acid)
{
if (dnastring->odd)
{
// find the first char in the string and fix it up hexwise
dnastring->seq[0] = fixup(dnastring->seq[0], acid);
dnastring->odd = false;
}
else
{
size_t currentlength = dnastring->seq ? strlen(dnastring->seq) : 0;
const char* currentstring = dnastring->seq ? dnastring->seq : "";
char* newseq = (char*)calloc(currentlength + 2, sizeof(char)); // +1 for new char and +1 for null char
newseq[0] = acid_to_value(acid) + '0'; // prepend the next hex char
strcpy(newseq + 1, currentstring); // copy the old string into the new string space
free(dnastring->seq);
dnastring->seq = newseq;
dnastring->odd = true;
}
}
那么你的 DNACreate 函数真的很简单:
struct DNAString DSCreate(const char* mseq)
{
DNAString dnastring = { 0 };
while (*mseq)
{
prepend_nucleic_acid_to_hexstring(&dnastring, *mseq);
mseq++;
}
return dnastring;
}
我不认为这种方法是有效的,因为他实际上一直在为每个字符重新分配内存。但它确实使您能够在以后灵活地调用前置函数以进行额外的排序。
然后进行测试:
int main()
{
struct DNAString dnastring = DSCreate("ACGT");
printf("0x%s\n", dnastring.seq);
return 0;
}
您可以通过多种方式对序列进行编码。您的 enum
很好,但是对于您的编码序列,将字节捕获为 unsigned char
的结构、原始序列长度和以字节为单位的编码大小将允许轻松解码。您将获得 4 比 1 压缩(如果您的序列不能被 4 整除,则加 1 个字节)。枚举和结构可以是:
enum { A, C, G, T };
typedef struct {
unsigned char *seq;
size_t len, size;
} encoded;
要将字符串中的字符映射到编码值一个简单的函数,returns 匹配字符的枚举值就是您所需要的(不要忘记处理任何错误)
/* convert character to encoded value */
unsigned char getencval (const char c)
{
if (c == 'A')
return A;
else if (c == 'C')
return C;
else if (c == 'G')
return G;
else if (c == 'T')
return T;
/* exit on anything other than A, C, G, T */
fprintf (stderr, "error: invalid sequence character '%c'\n", c);
exit (EXIT_FAILURE);
}
要对原始序列进行编码,您将使用原始长度 (len
) 和保存编码字符串所需的字节数 (size
) 填充 encoded
结构。不要忘记接下来的 4 个字符中的任何 1 个字符都需要另一个字节的存储空间。您可以使用简单的加法和除法来计算序列的任何部分 4 字符结尾部分,例如
/* encode sequence of characters as 2-bit pairs (4-characters per-byte)
* returns encoded struct with allocated .seq member, on failure the .seq
* member is NULL. User is resposible for freeing .seq member when done.
*/
encoded encode_seq (const char *seq)
{
size_t len = strlen(seq),
size = (len + 3) / 4; /* integer division intentional */
encoded enc = { .seq = calloc (1, size), /* encoded sequence struct */
.len = len,
.size = size };
if (!enc.seq) { /* validate allication */
perror ("calloc-enc.seq");
return enc;
}
/* loop over each char (i) with byte index (ndx)
* shifting each 2-bit pair by (shift * 2) amount.
*/
for (int i = 0, ndx = 0, shift = 4; seq[i] && seq[i] != '\n'; i++) {
if (!shift--) /* decrement shift, reset if 0 */
shift = 3;
if (i && i % 4 == 0) /* after each 4th char, increment ndx */
ndx += 1;
/* shift each encoded value (multiply by 2 for shift of 6, 4, 2, 0) */
enc.seq[ndx] |= getencval (seq[i]) << shift * 2;
}
return enc; /* return encoded struct with allocated .seq member */
}
要从您的 encoded
结构中获取原始序列,使用查找 table(下面显示完整代码)使它变得轻而易举。您只需遍历所有存储的字节值,附加查找 table 中的相应字符串,直到最后一个字节。对于最后一个字节,您需要确定它是否是部分字符串,如果是,则需要复制多少个字符remain
。 (这就是为什么将原始序列长度存储在结构中的原因)。然后简单地使用 strncat
从最后一个字节追加那么多字符,例如
/* decodes encoded sequence. Allocates storage for decoded sequence
* and loops over each encoded byte using lookup-table to obtain
* original 4-character string from byte value. User is responsible
* for freeing returned string when done. Returns NULL on allocation
* failure.
*/
char *decode_seq (encoded *eseq)
{
char *seq = malloc (eseq->len + 1); /* allocate storage for sequence */
size_t i = 0, offset = 0, remain;
if (!seq) { /* validate allocation */
perror ("malloc-seq");
return NULL;
}
/* loop appending strings from lookup table for all but last byte */
for (; i < eseq->size - 1; i++) {
memcpy (seq + offset, lookup[eseq->seq[i]], 4);
offset += 4; /* increment offset by 4 */
}
/* determine the number of characters in last byte */
remain = eseq->len - (eseq->size - 1) * 4;
memcpy (seq + offset, lookup[eseq->seq[i]], remain);
seq[offset + remain] = 0; /* nul-terminate seq */
return seq; /* return allocated sequence */
}
添加查找 table 并将所有部分放在一起,解决此问题的一种方法是:
(edit: 查找 table 重新排序以匹配您的字节值编码,优化 decode_seq() 不扫描字符串结尾复制)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
enum { A, C, G, T };
typedef struct {
unsigned char *seq;
size_t len, size;
} encoded;
const char lookup[][5] = {
"AAAA","CAAA","GAAA","TAAA","ACAA","CCAA","GCAA","TCAA",
"AGAA","CGAA","GGAA","TGAA","ATAA","CTAA","GTAA","TTAA",
"AACA","CACA","GACA","TACA","ACCA","CCCA","GCCA","TCCA",
"AGCA","CGCA","GGCA","TGCA","ATCA","CTCA","GTCA","TTCA",
"AAGA","CAGA","GAGA","TAGA","ACGA","CCGA","GCGA","TCGA",
"AGGA","CGGA","GGGA","TGGA","ATGA","CTGA","GTGA","TTGA",
"AATA","CATA","GATA","TATA","ACTA","CCTA","GCTA","TCTA",
"AGTA","CGTA","GGTA","TGTA","ATTA","CTTA","GTTA","TTTA",
"AAAC","CAAC","GAAC","TAAC","ACAC","CCAC","GCAC","TCAC",
"AGAC","CGAC","GGAC","TGAC","ATAC","CTAC","GTAC","TTAC",
"AACC","CACC","GACC","TACC","ACCC","CCCC","GCCC","TCCC",
"AGCC","CGCC","GGCC","TGCC","ATCC","CTCC","GTCC","TTCC",
"AAGC","CAGC","GAGC","TAGC","ACGC","CCGC","GCGC","TCGC",
"AGGC","CGGC","GGGC","TGGC","ATGC","CTGC","GTGC","TTGC",
"AATC","CATC","GATC","TATC","ACTC","CCTC","GCTC","TCTC",
"AGTC","CGTC","GGTC","TGTC","ATTC","CTTC","GTTC","TTTC",
"AAAG","CAAG","GAAG","TAAG","ACAG","CCAG","GCAG","TCAG",
"AGAG","CGAG","GGAG","TGAG","ATAG","CTAG","GTAG","TTAG",
"AACG","CACG","GACG","TACG","ACCG","CCCG","GCCG","TCCG",
"AGCG","CGCG","GGCG","TGCG","ATCG","CTCG","GTCG","TTCG",
"AAGG","CAGG","GAGG","TAGG","ACGG","CCGG","GCGG","TCGG",
"AGGG","CGGG","GGGG","TGGG","ATGG","CTGG","GTGG","TTGG",
"AATG","CATG","GATG","TATG","ACTG","CCTG","GCTG","TCTG",
"AGTG","CGTG","GGTG","TGTG","ATTG","CTTG","GTTG","TTTG",
"AAAT","CAAT","GAAT","TAAT","ACAT","CCAT","GCAT","TCAT",
"AGAT","CGAT","GGAT","TGAT","ATAT","CTAT","GTAT","TTAT",
"AACT","CACT","GACT","TACT","ACCT","CCCT","GCCT","TCCT",
"AGCT","CGCT","GGCT","TGCT","ATCT","CTCT","GTCT","TTCT",
"AAGT","CAGT","GAGT","TAGT","ACGT","CCGT","GCGT","TCGT",
"AGGT","CGGT","GGGT","TGGT","ATGT","CTGT","GTGT","TTGT",
"AATT","CATT","GATT","TATT","ACTT","CCTT","GCTT","TCTT",
"AGTT","CGTT","GGTT","TGTT","ATTT","CTTT","GTTT","TTTT"};
/* convert character to encoded value */
unsigned char getencval (const char c)
{
if (c == 'A')
return A;
else if (c == 'C')
return C;
else if (c == 'G')
return G;
else if (c == 'T')
return T;
/* exit on anything other than A, C, G, T */
fprintf (stderr, "error: invalid sequence character '%c'\n", c);
exit (EXIT_FAILURE);
}
/* encode sequence of characters as 2-bit pairs (4-characters per-byte)
* returns encoded struct with allocated .seq member, on failure the .seq
* member is NULL. User is resposible for freeing .seq member when done.
*/
encoded encode_seq (const char *seq)
{
size_t len = strlen(seq),
size = (len + 3) / 4; /* integer division intentional */
encoded enc = { .seq = calloc (1, size), /* encoded sequence struct */
.len = len,
.size = size };
if (!enc.seq) { /* validate allication */
perror ("calloc-enc.seq");
return enc;
}
/* loop over each char (i) with byte index (ndx)
* shifting each 2-bit pair by (shift * 2) amount.
*/
for (int i = 0, ndx = 0, shift = 0; seq[i] && seq[i] != '\n'; i++, shift++) {
if (shift == 4) /* reset to 0 */
shift = 0;
if (i && i % 4 == 0) /* after each 4th char, increment ndx */
ndx += 1;
/* shift each encoded value (multiply by 2 for shift of 0, 2, 4, 6) */
enc.seq[ndx] |= getencval (seq[i]) << shift * 2;
}
return enc; /* return encoded struct with allocated .seq member */
}
/* decodes encoded sequence. Allocates storage for decoded sequence
* and loops over each encoded byte using lookup-table to obtain
* original 4-character string from byte value. User is responsible
* for freeing returned string when done. Returns NULL on allocation
* failure.
*/
char *decode_seq (encoded *eseq)
{
char *seq = malloc (eseq->len + 1); /* allocate storage for sequence */
size_t i = 0, offset = 0, remain;
if (!seq) { /* validate allocation */
perror ("malloc-seq");
return NULL;
}
/* loop appending strings from lookup table for all but last byte */
for (; i < eseq->size - 1; i++) {
memcpy (seq + offset, lookup[eseq->seq[i]], 4);
offset += 4; /* increment offset by 4 */
}
/* determine the number of characters in last byte */
remain = eseq->len - (eseq->size - 1) * 4;
memcpy (seq + offset, lookup[eseq->seq[i]], remain);
seq[offset + remain] = 0; /* nul-terminate seq */
return seq; /* return allocated sequence */
}
/* short example program that takes string to encode as 1st argument
* using "ACGT" if no argument is provided by default
*/
int main (int argc, char **argv) {
char *seq = NULL;
encoded enc = encode_seq(argc > 1 ? argv[1] : "ACGT");
if (!enc.seq) /* validate encoded allocation */
return 1;
/* output original string, length and encoded size */
printf ("encoded str : %s\nencoded len : %zu\nencoded size : %zu\n",
argc > 1 ? argv[1] : "ACGT", enc.len, enc.size);
/* loop outputting byte-values of encoded string */
fputs ("encoded seq :", stdout);
for (size_t i = 0; i < enc.size; i++)
printf (" 0x%02x", enc.seq[i]);
putchar ('\n');
seq = decode_seq (&enc); /* decode seq from byte values */
printf ("decoded seq : %s\n", seq); /* output decoded string */
free (seq); /* don't forget to free what you allocated */
free (enc.seq);
}
在大多数情况下,与在解码期间计算和构建每个 4 字符的字符串相比,查找 -table 提供了大量的效率优势。在大多数情况下,查找 table 驻留在缓存中可以增强这一点。
您可以编码和解码的 DNA 序列的长度仅受您可用的虚拟内存量的限制。
例子Use/Output
程序将要编码和解码的序列作为第一个参数(默认值"ACGT"
)。所以默认输出是:
$ ./bin/dnaencodedecode
encoded str : ACGT
encoded len : 4
encoded size : 1
encoded seq : 0xe4
decoded seq : ACGT
4 字节编码为 1 字节。由于 table 排序,请注意 0x1b
而不是 0xe4
的字节值。
更长的例子:
./bin/dnaencodedecode ACGTGGGTCAGACTTA
encoded str : ACGTGGGTCAGACTTA
encoded len : 16
encoded size : 4
encoded seq : 0xe4 0xea 0x21 0x3d
decoded seq : ACGTGGGTCAGACTTA
16 个字符以 4 个字节编码。
最后,如果一个序列不能被 4 整除,那么最后一个编码字节中只有部分字符?这也被处理,例如
$ ./bin/dnaencodedecode ACGTGGGTCAGACTTAG
encoded str : ACGTGGGTCAGACTTAG
encoded len : 17
encoded size : 5
encoded seq : 0xe4 0xea 0x21 0x3d 0x02
decoded seq : ACGTGGGTCAGACTTAG
17 个字符以 5 个字节编码。 (不是纯粹的4比1压缩,而是随着序列大小的增加,最后一个字节中任何部分字符组的重要性都变得可以忽略不计)
就性能而言,对于 100,000 个字符的序列和字节值和字符串的输出替换为一个简单的循环,该循环将解码的 seq
与原始的 argv[1]
进行比较只需要千分之几秒(在带有 SSD 的旧 i7 Gen2 笔记本电脑上)进行编码、解码和验证,例如
$
time ./bin/dnaencodedecodet2big $(< dat/dnaseq100k.txt)
encoded len : 100000
encoded size : 25000
all tests passed
real 0m0.014s
user 0m0.012s
sys 0m0.003s
有很多方法可以做到这一点,但根据你的描述,这就是我想到的你试图完成的。这里有很多,所以你慢慢看吧。
检查一下(代码已注释),如果您有其他问题,请告诉我。请在下方发表评论。
我想使用按位运算减少字符的内存存储,
例如
输入:{"ACGT"}
输出:{"0xE4"}
这里我们将二进制数转换为十六进制数
如果 A=00,C=01,G=10,T=11
所以 ACGT = 0xE4 = 11100100b
我想不通所以这就是我到目前为止所做的
enum NucleicAc {
A = 0,
C = 1,
G = 2,
T = 3,
} ;
struct _DNA {
char * seq ;
};
DNAString DSCreate(char * mseq) {
DNAString dna = malloc(sizeof(struct _DNA));
if (!dna){
exit(-1);
}
const int length = strlen(mseq);
// left shift will create the base , in the case of ACGT --> 0000,0000
int base = 0 << length * 2;
//loop and set bits
int i = 0 ;
int k = 0 ; // the counter where i want to modify the current bit
//simple looping gor the string
for ( ; i < length ; i++ ) {
switch (*(mseq+i)) {
case 'A': // 00
k++;
break;
case 'C': // 0 1
modifyBit(&base, k, 1);
k++;
break;
case 'G': //10
k++;
modifyBit(&base,k , 1);
break;
case 'T': // 11
modifyBit(&base, k, 1);
k++;
modifyBit(&base, k,1);
break;
default:
break;
} //end of switch
k++;
}//end of for
char * generatedSeq ;
//convert the base to hex ??
return dna;
}
void bin(unsigned n){
unsigned i;
for (i = 1 << 7; i > 0; i = i / 2){
(n & i) ? printf("1") : printf("0");
}
}
如果我们打印基数,则该值为预期的 11100100b,
如何将十六进制表示形式存储为字符串到结构中的char *mseq? 任何方向或确切的解决方案或有更好的方法吗?
后来我想只使用示例索引来获取字母
DSGet(dsStruct , '0')--> 将 return 'A' 因此 dsStruct 在编码之前包含“ACGT”?
这应该可以满足您的要求。我认为压缩是一个很酷的主意,所以我很快就写了这篇文章。正如@kaylum所提到的,十六进制编码只是一种读取内存中底层数据的方式,它总是只是位。所以,你只需要担心打印语句。
让我知道这是否有效,或者您对我所做的有任何疑问。
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
typedef struct {
unsigned char *bits;
unsigned long length; // use this to store the number of letters encoded, for safety
} DNA;
typedef enum {
A = 0,
C = 1,
G = 2,
T = 3
} NucleicAc;
这个 returns 给定索引处的基数,带有一些边界检查
char base_at_index(DNA *dna, unsigned long index) {
if (index >= dna->length) {
fputs("error: base_at_index: index out of range", stderr);
exit(EXIT_FAILURE);
}
// offset is index / 4, this gives us the correct byte
// shift amount is index % 4 to give us the correct 2 bits within the byte.
// This must then be multiplied by 2 because
// each base takes 2 bits to encode
// then we have to bitwise-and this value with
// 3 (0000 0011 in binary) to retrieve the bits we want.
// so, the formula we need is
// (dna->bits[index / 4] >> (2 * (index % 4))) & 3
switch((dna->bits[index / 4] >> 2 * (index % 4)) & 3) {
case A: return 'A';
case C: return 'C';
case G: return 'G';
case T: return 'T';
default:
fputs("error: base_at_index: invalid encoding", stderr);
exit(EXIT_FAILURE);
}
}
这将一串碱基编码为字节
/* you can fit four 2-bit DNA codes in each byte (unsigned char).
len is the maximum number of characters to read. result must be at least len bytes long
*/
void encode_dna(unsigned char *result, char *sequence, unsigned long len) {
// keep track of what byte we are on in the result
unsigned result_index = 0;
// our shift for writing to the correct position in the byte
unsigned shift = 0;
// first clear result or else bitwise operations will produce errors
// this could be removed if you were certain result parameter was zero-filled
memset(result, 0, len);
// iterate through characters of the sequence
while(*sequence) {
switch (*sequence) {
// do nothing for 'A' since it is just zero
case 'A': break;
case 'C':
// we are doing a bitwise or with the current byte
// and C (1) shifted to the appropriate position within
// the byte, and then assigning the byte with the result
result[result_index] |= C << shift;
break;
case 'G':
result[result_index] |= G << shift;
break;
case 'T':
result[result_index] |= T << shift;
break;
default:
fputs("error: encode_dna: invalid base pair", stderr);
exit(EXIT_FAILURE);
}
// increase shift amount by 2 to the next 2-bit slot in the byte
shift += 2;
// on every 4th iteration, reset our shift to zero since the byte is now full
// and move to the next byte in our result buffer
if (shift == 8) {
shift = 0;
result_index++;
}
// advance sequence to next nucleotide character
sequence++;
}
}
这是一个测试
int main(int argc, char **argv) {
// allocate some storage for encoded DNA
unsigned char encoded_dna[32];
const unsigned long sample_length = 15;
// encode the given sample sequence
encode_dna(encoded_dna, "ACGTAGTCGTCATAG", sample_length);
// hh here means half of half word, which is a byte
// capital X for capitalized hex output
// here we print some bytes
printf("0x%hhX\n", encoded_dna[0]); // should output 0xE4
printf("0x%hhX\n", encoded_dna[1]); // should be 0x78
printf("0x%hhX\n", encoded_dna[2]); // should be 0x1E
printf("0x%hhX\n", encoded_dna[3]); // should be 0x23
DNA test_dna; // allocate a sample DNA structure
test_dna.bits = encoded_dna;
test_dna.length = sample_length; // length of the sample sequence above
// test some indices and see if the results are correct
printf("test_dna index 4: %c\n", base_at_index(&test_dna, 4));
printf("test_dna index 7: %c\n", base_at_index(&test_dna, 7));
printf("test_dna index 12: %c\n", base_at_index(&test_dna, 12));
return 0;
}
输出:
0xE4
0x78
0x1E
0x23
test_dna index 4: A
test_dna index 7: C
test_dna index 12: T
假设您真的想将 dna 字符串编码为十六进制字符串,并且您希望从左到右读取输入字符串,但从右到左输出十六进制字符,这是一个简单但速度稍慢的实现。
首先,您的 DNAString 需要跟踪列表中是否确实存在偶数或奇数个酸序列。这将使额外的附件更容易。
struct DNAString
{
char* seq;
bool odd; // if odd bit is set, then the front char is already allocated and holds one acid
};
现在让我们介绍一个小辅助函数,将 ACGT 转换为 0,1,2,3。
char acid_to_value(char c)
{
switch (c)
{
case 'A': return 0;
case 'C': return 1;
case 'G': return 2;
case 'T': return 3;
}
// ASSERT(false)
return 0;
}
那么核心实现就是将新的十六进制字符“添加到”您正在构建的字符串中。如果字符串已经是奇数长度,代码将通过将其从十六进制转换为整数来“修复”前面的字符,然后将新的酸值移入其中,然后将其转换回十六进制 char
extern char fixup(char previous, char acid);
{
char hexchars[16] = { '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F' };
char tmp[2] = { previous, '[=12=]' };
unsigned long asnumber = strtol(tmp, nullptr, 16);
asnumber = asnumber & 0x3; // last two bits
asnumber = asnumber | (acid_to_value(acid) << 2);
return hexchars[asnumber];
}
void prepend_nucleic_acid_to_hexstring(struct DNAString* dnastring, char acid)
{
if (dnastring->odd)
{
// find the first char in the string and fix it up hexwise
dnastring->seq[0] = fixup(dnastring->seq[0], acid);
dnastring->odd = false;
}
else
{
size_t currentlength = dnastring->seq ? strlen(dnastring->seq) : 0;
const char* currentstring = dnastring->seq ? dnastring->seq : "";
char* newseq = (char*)calloc(currentlength + 2, sizeof(char)); // +1 for new char and +1 for null char
newseq[0] = acid_to_value(acid) + '0'; // prepend the next hex char
strcpy(newseq + 1, currentstring); // copy the old string into the new string space
free(dnastring->seq);
dnastring->seq = newseq;
dnastring->odd = true;
}
}
那么你的 DNACreate 函数真的很简单:
struct DNAString DSCreate(const char* mseq)
{
DNAString dnastring = { 0 };
while (*mseq)
{
prepend_nucleic_acid_to_hexstring(&dnastring, *mseq);
mseq++;
}
return dnastring;
}
我不认为这种方法是有效的,因为他实际上一直在为每个字符重新分配内存。但它确实使您能够在以后灵活地调用前置函数以进行额外的排序。
然后进行测试:
int main()
{
struct DNAString dnastring = DSCreate("ACGT");
printf("0x%s\n", dnastring.seq);
return 0;
}
您可以通过多种方式对序列进行编码。您的 enum
很好,但是对于您的编码序列,将字节捕获为 unsigned char
的结构、原始序列长度和以字节为单位的编码大小将允许轻松解码。您将获得 4 比 1 压缩(如果您的序列不能被 4 整除,则加 1 个字节)。枚举和结构可以是:
enum { A, C, G, T };
typedef struct {
unsigned char *seq;
size_t len, size;
} encoded;
要将字符串中的字符映射到编码值一个简单的函数,returns 匹配字符的枚举值就是您所需要的(不要忘记处理任何错误)
/* convert character to encoded value */
unsigned char getencval (const char c)
{
if (c == 'A')
return A;
else if (c == 'C')
return C;
else if (c == 'G')
return G;
else if (c == 'T')
return T;
/* exit on anything other than A, C, G, T */
fprintf (stderr, "error: invalid sequence character '%c'\n", c);
exit (EXIT_FAILURE);
}
要对原始序列进行编码,您将使用原始长度 (len
) 和保存编码字符串所需的字节数 (size
) 填充 encoded
结构。不要忘记接下来的 4 个字符中的任何 1 个字符都需要另一个字节的存储空间。您可以使用简单的加法和除法来计算序列的任何部分 4 字符结尾部分,例如
/* encode sequence of characters as 2-bit pairs (4-characters per-byte)
* returns encoded struct with allocated .seq member, on failure the .seq
* member is NULL. User is resposible for freeing .seq member when done.
*/
encoded encode_seq (const char *seq)
{
size_t len = strlen(seq),
size = (len + 3) / 4; /* integer division intentional */
encoded enc = { .seq = calloc (1, size), /* encoded sequence struct */
.len = len,
.size = size };
if (!enc.seq) { /* validate allication */
perror ("calloc-enc.seq");
return enc;
}
/* loop over each char (i) with byte index (ndx)
* shifting each 2-bit pair by (shift * 2) amount.
*/
for (int i = 0, ndx = 0, shift = 4; seq[i] && seq[i] != '\n'; i++) {
if (!shift--) /* decrement shift, reset if 0 */
shift = 3;
if (i && i % 4 == 0) /* after each 4th char, increment ndx */
ndx += 1;
/* shift each encoded value (multiply by 2 for shift of 6, 4, 2, 0) */
enc.seq[ndx] |= getencval (seq[i]) << shift * 2;
}
return enc; /* return encoded struct with allocated .seq member */
}
要从您的 encoded
结构中获取原始序列,使用查找 table(下面显示完整代码)使它变得轻而易举。您只需遍历所有存储的字节值,附加查找 table 中的相应字符串,直到最后一个字节。对于最后一个字节,您需要确定它是否是部分字符串,如果是,则需要复制多少个字符remain
。 (这就是为什么将原始序列长度存储在结构中的原因)。然后简单地使用 strncat
从最后一个字节追加那么多字符,例如
/* decodes encoded sequence. Allocates storage for decoded sequence
* and loops over each encoded byte using lookup-table to obtain
* original 4-character string from byte value. User is responsible
* for freeing returned string when done. Returns NULL on allocation
* failure.
*/
char *decode_seq (encoded *eseq)
{
char *seq = malloc (eseq->len + 1); /* allocate storage for sequence */
size_t i = 0, offset = 0, remain;
if (!seq) { /* validate allocation */
perror ("malloc-seq");
return NULL;
}
/* loop appending strings from lookup table for all but last byte */
for (; i < eseq->size - 1; i++) {
memcpy (seq + offset, lookup[eseq->seq[i]], 4);
offset += 4; /* increment offset by 4 */
}
/* determine the number of characters in last byte */
remain = eseq->len - (eseq->size - 1) * 4;
memcpy (seq + offset, lookup[eseq->seq[i]], remain);
seq[offset + remain] = 0; /* nul-terminate seq */
return seq; /* return allocated sequence */
}
添加查找 table 并将所有部分放在一起,解决此问题的一种方法是:
(edit: 查找 table 重新排序以匹配您的字节值编码,优化 decode_seq() 不扫描字符串结尾复制)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
enum { A, C, G, T };
typedef struct {
unsigned char *seq;
size_t len, size;
} encoded;
const char lookup[][5] = {
"AAAA","CAAA","GAAA","TAAA","ACAA","CCAA","GCAA","TCAA",
"AGAA","CGAA","GGAA","TGAA","ATAA","CTAA","GTAA","TTAA",
"AACA","CACA","GACA","TACA","ACCA","CCCA","GCCA","TCCA",
"AGCA","CGCA","GGCA","TGCA","ATCA","CTCA","GTCA","TTCA",
"AAGA","CAGA","GAGA","TAGA","ACGA","CCGA","GCGA","TCGA",
"AGGA","CGGA","GGGA","TGGA","ATGA","CTGA","GTGA","TTGA",
"AATA","CATA","GATA","TATA","ACTA","CCTA","GCTA","TCTA",
"AGTA","CGTA","GGTA","TGTA","ATTA","CTTA","GTTA","TTTA",
"AAAC","CAAC","GAAC","TAAC","ACAC","CCAC","GCAC","TCAC",
"AGAC","CGAC","GGAC","TGAC","ATAC","CTAC","GTAC","TTAC",
"AACC","CACC","GACC","TACC","ACCC","CCCC","GCCC","TCCC",
"AGCC","CGCC","GGCC","TGCC","ATCC","CTCC","GTCC","TTCC",
"AAGC","CAGC","GAGC","TAGC","ACGC","CCGC","GCGC","TCGC",
"AGGC","CGGC","GGGC","TGGC","ATGC","CTGC","GTGC","TTGC",
"AATC","CATC","GATC","TATC","ACTC","CCTC","GCTC","TCTC",
"AGTC","CGTC","GGTC","TGTC","ATTC","CTTC","GTTC","TTTC",
"AAAG","CAAG","GAAG","TAAG","ACAG","CCAG","GCAG","TCAG",
"AGAG","CGAG","GGAG","TGAG","ATAG","CTAG","GTAG","TTAG",
"AACG","CACG","GACG","TACG","ACCG","CCCG","GCCG","TCCG",
"AGCG","CGCG","GGCG","TGCG","ATCG","CTCG","GTCG","TTCG",
"AAGG","CAGG","GAGG","TAGG","ACGG","CCGG","GCGG","TCGG",
"AGGG","CGGG","GGGG","TGGG","ATGG","CTGG","GTGG","TTGG",
"AATG","CATG","GATG","TATG","ACTG","CCTG","GCTG","TCTG",
"AGTG","CGTG","GGTG","TGTG","ATTG","CTTG","GTTG","TTTG",
"AAAT","CAAT","GAAT","TAAT","ACAT","CCAT","GCAT","TCAT",
"AGAT","CGAT","GGAT","TGAT","ATAT","CTAT","GTAT","TTAT",
"AACT","CACT","GACT","TACT","ACCT","CCCT","GCCT","TCCT",
"AGCT","CGCT","GGCT","TGCT","ATCT","CTCT","GTCT","TTCT",
"AAGT","CAGT","GAGT","TAGT","ACGT","CCGT","GCGT","TCGT",
"AGGT","CGGT","GGGT","TGGT","ATGT","CTGT","GTGT","TTGT",
"AATT","CATT","GATT","TATT","ACTT","CCTT","GCTT","TCTT",
"AGTT","CGTT","GGTT","TGTT","ATTT","CTTT","GTTT","TTTT"};
/* convert character to encoded value */
unsigned char getencval (const char c)
{
if (c == 'A')
return A;
else if (c == 'C')
return C;
else if (c == 'G')
return G;
else if (c == 'T')
return T;
/* exit on anything other than A, C, G, T */
fprintf (stderr, "error: invalid sequence character '%c'\n", c);
exit (EXIT_FAILURE);
}
/* encode sequence of characters as 2-bit pairs (4-characters per-byte)
* returns encoded struct with allocated .seq member, on failure the .seq
* member is NULL. User is resposible for freeing .seq member when done.
*/
encoded encode_seq (const char *seq)
{
size_t len = strlen(seq),
size = (len + 3) / 4; /* integer division intentional */
encoded enc = { .seq = calloc (1, size), /* encoded sequence struct */
.len = len,
.size = size };
if (!enc.seq) { /* validate allication */
perror ("calloc-enc.seq");
return enc;
}
/* loop over each char (i) with byte index (ndx)
* shifting each 2-bit pair by (shift * 2) amount.
*/
for (int i = 0, ndx = 0, shift = 0; seq[i] && seq[i] != '\n'; i++, shift++) {
if (shift == 4) /* reset to 0 */
shift = 0;
if (i && i % 4 == 0) /* after each 4th char, increment ndx */
ndx += 1;
/* shift each encoded value (multiply by 2 for shift of 0, 2, 4, 6) */
enc.seq[ndx] |= getencval (seq[i]) << shift * 2;
}
return enc; /* return encoded struct with allocated .seq member */
}
/* decodes encoded sequence. Allocates storage for decoded sequence
* and loops over each encoded byte using lookup-table to obtain
* original 4-character string from byte value. User is responsible
* for freeing returned string when done. Returns NULL on allocation
* failure.
*/
char *decode_seq (encoded *eseq)
{
char *seq = malloc (eseq->len + 1); /* allocate storage for sequence */
size_t i = 0, offset = 0, remain;
if (!seq) { /* validate allocation */
perror ("malloc-seq");
return NULL;
}
/* loop appending strings from lookup table for all but last byte */
for (; i < eseq->size - 1; i++) {
memcpy (seq + offset, lookup[eseq->seq[i]], 4);
offset += 4; /* increment offset by 4 */
}
/* determine the number of characters in last byte */
remain = eseq->len - (eseq->size - 1) * 4;
memcpy (seq + offset, lookup[eseq->seq[i]], remain);
seq[offset + remain] = 0; /* nul-terminate seq */
return seq; /* return allocated sequence */
}
/* short example program that takes string to encode as 1st argument
* using "ACGT" if no argument is provided by default
*/
int main (int argc, char **argv) {
char *seq = NULL;
encoded enc = encode_seq(argc > 1 ? argv[1] : "ACGT");
if (!enc.seq) /* validate encoded allocation */
return 1;
/* output original string, length and encoded size */
printf ("encoded str : %s\nencoded len : %zu\nencoded size : %zu\n",
argc > 1 ? argv[1] : "ACGT", enc.len, enc.size);
/* loop outputting byte-values of encoded string */
fputs ("encoded seq :", stdout);
for (size_t i = 0; i < enc.size; i++)
printf (" 0x%02x", enc.seq[i]);
putchar ('\n');
seq = decode_seq (&enc); /* decode seq from byte values */
printf ("decoded seq : %s\n", seq); /* output decoded string */
free (seq); /* don't forget to free what you allocated */
free (enc.seq);
}
在大多数情况下,与在解码期间计算和构建每个 4 字符的字符串相比,查找 -table 提供了大量的效率优势。在大多数情况下,查找 table 驻留在缓存中可以增强这一点。
您可以编码和解码的 DNA 序列的长度仅受您可用的虚拟内存量的限制。
例子Use/Output
程序将要编码和解码的序列作为第一个参数(默认值"ACGT"
)。所以默认输出是:
$ ./bin/dnaencodedecode
encoded str : ACGT
encoded len : 4
encoded size : 1
encoded seq : 0xe4
decoded seq : ACGT
4 字节编码为 1 字节。由于 table 排序,请注意 0x1b
而不是 0xe4
的字节值。
更长的例子:
./bin/dnaencodedecode ACGTGGGTCAGACTTA
encoded str : ACGTGGGTCAGACTTA
encoded len : 16
encoded size : 4
encoded seq : 0xe4 0xea 0x21 0x3d
decoded seq : ACGTGGGTCAGACTTA
16 个字符以 4 个字节编码。
最后,如果一个序列不能被 4 整除,那么最后一个编码字节中只有部分字符?这也被处理,例如
$ ./bin/dnaencodedecode ACGTGGGTCAGACTTAG
encoded str : ACGTGGGTCAGACTTAG
encoded len : 17
encoded size : 5
encoded seq : 0xe4 0xea 0x21 0x3d 0x02
decoded seq : ACGTGGGTCAGACTTAG
17 个字符以 5 个字节编码。 (不是纯粹的4比1压缩,而是随着序列大小的增加,最后一个字节中任何部分字符组的重要性都变得可以忽略不计)
就性能而言,对于 100,000 个字符的序列和字节值和字符串的输出替换为一个简单的循环,该循环将解码的 seq
与原始的 argv[1]
进行比较只需要千分之几秒(在带有 SSD 的旧 i7 Gen2 笔记本电脑上)进行编码、解码和验证,例如
$
time ./bin/dnaencodedecodet2big $(< dat/dnaseq100k.txt)
encoded len : 100000
encoded size : 25000
all tests passed
real 0m0.014s
user 0m0.012s
sys 0m0.003s
有很多方法可以做到这一点,但根据你的描述,这就是我想到的你试图完成的。这里有很多,所以你慢慢看吧。
检查一下(代码已注释),如果您有其他问题,请告诉我。请在下方发表评论。