如何在 Python 中创建 GSM-7 编码?
How to create a GSM-7 encoding in Python?
GSM-7 字符集由基本字符映射表加上扩展字符映射表组成(参见 https://en.wikipedia.org/wiki/GSM_03.38#GSM_7-bit_default_alphabet_and_extension_table_of_3GPP_TS_23.038_.2F_GSM_03.38)。
也就是说,u'@' 应该映射到 b'\x00'(长度为 1 的字节串),而 u'[' 应该映射到 b'\x1b<',即 b'\x1b\x3c'(长度为 2 的字节串)。
我通过扩展 encoding_table 让编码部分正常工作了,但我不确定解码时应该如何使用 decoding_table。
为了完整性,这里是编解码器样板:
import codecs
from encodings import normalize_encoding
class GSM7Codec(codecs.Codec):
    """Stateless GSM-7 codec driven by the module-level charmap tables."""

    def encode(self, input, errors='strict'):
        """Encode ``input`` through ``encoding_table``.

        Returns the ``(encoded, length_consumed)`` tuple produced by
        ``codecs.charmap_encode``.
        """
        return codecs.charmap_encode(input, errors, encoding_table)

    def decode(self, input, errors='strict'):
        """Decode ``input`` through ``decoding_table``.

        Returns the ``(decoded, length_consumed)`` tuple produced by
        ``codecs.charmap_decode``.  Only the single-byte table is consulted
        here, so ESC-prefixed extension sequences are not combined.
        """
        return codecs.charmap_decode(input, errors, decoding_table)


class GSM7IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder; each call is encoded independently."""

    def encode(self, input, final=False):
        """Return the encoded form of ``input`` (``final`` is not used)."""
        return codecs.charmap_encode(input, self.errors, encoding_table)[0]


class GSM7IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder; each call is decoded independently."""

    def decode(self, input, final=False):
        """Return the decoded form of ``input`` (``final`` is not used)."""
        return codecs.charmap_decode(input, self.errors, decoding_table)[0]


# The stream classes must take their encode()/decode() from GSM7Codec.
# Listing the abstract codecs.Codec first (as before) supplied no working
# encode()/decode(), and on Python 3 the class statement itself fails with
# an MRO TypeError because codecs.StreamWriter/StreamReader already derive
# from codecs.Codec.
class GSM7StreamWriter(GSM7Codec, codecs.StreamWriter):
    """Stream writer that encodes with ``GSM7Codec.encode``."""


class GSM7StreamReader(GSM7Codec, codecs.StreamReader):
    """Stream reader that decodes with ``GSM7Codec.decode``."""
# CodecInfo objects already built, keyed by *normalized* encoding name.
_cache = {}


def search_function(encoding):
    """Codec search function for the gsm-7 encoding.

    Registered with ``codecs.register`` below.  Takes an encoding name and
    returns a ``codecs.CodecInfo`` when the normalized name is one of
    ``gsm_7``, ``g7`` or ``gsm7``; returns ``None`` for any other name.
    """
    # Normalize before consulting the cache: entries are stored under the
    # normalized name, so looking up the raw name would miss for every
    # spelling that normalization changes (e.g. "gsm-7").
    norm_encoding = normalize_encoding(encoding)
    try:
        return _cache[norm_encoding]
    except KeyError:
        pass

    if norm_encoding not in ('gsm_7', 'g7', 'gsm7'):
        return None

    cinfo = codecs.CodecInfo(
        name='gsm-7',
        encode=GSM7Codec().encode,
        decode=GSM7Codec().decode,
        incrementalencoder=GSM7IncrementalEncoder,
        incrementaldecoder=GSM7IncrementalDecoder,
        streamreader=GSM7StreamReader,
        streamwriter=GSM7StreamWriter,
    )
    _cache[norm_encoding] = cinfo
    return cinfo


codecs.register(search_function)
这里是 table 定义:
decoding_table = (
u"@£$¥èéùìòÇ\nØø\rÅå" +
u"Δ_ΦΓΛΩΠΨΣΘΞ\x1bÆæßÉ" +
u" !\"#¤%&'()*+,-./" +
u"0123456789:;<=>?" +
u"¡ABCDEFGHIJKLMNO" +
u"PQRSTUVWXYZÄÖÑܧ" +
u"¿abcdefghijklmno" +
u"pqrstuvwxyzäöñüà"
)
encoding_table = codecs.charmap_build(
decoding_table + '[=12=]' * (256 - len(decoding_table))
)
# extending the encoding table with extension characters
encoding_table[ord(u'|')] = '\x1b\x40'
encoding_table[ord(u'^')] = '\x1b\x14'
encoding_table[ord(u'€')] = '\x1b\x65'
encoding_table[ord(u'{')] = '\x1b\x28'
encoding_table[ord(u'}')] = '\x1b\x29'
encoding_table[ord(u'[')] = '\x1b\x3C'
encoding_table[ord(u'~')] = '\x1b\x3D'
encoding_table[ord(u']')] = '\x1b\x3E'
encoding_table[ord(u'\')] = '\x1b\x2F'
编码部分现在可用,但解码不可用:
>>> u'['.encode('g7')
'\x1b<'
>>> _.decode('g7')
u'\x1b<'
>>>
我没能找到关于如何编写自定义编解码器的优质文档。
根据@Martijn Pieters 的评论,我将代码更改为:
def decode_gsm7(txt, errors, table=None):
    """Decode GSM-7 bytes, including ESC-prefixed extension characters.

    Args:
        txt: the encoded byte string.
        errors: error-handling scheme passed on to ``codecs.charmap_decode``.
        table: optional basic decoding table (string indexed by byte value);
            defaults to the module-level ``decoding_table``.

    Returns:
        A ``(decoded_text, len(txt))`` tuple, the shape the codec API expects.

    An ESC (0x1b) followed by a code from the extension table yields the
    extension character.  If the code is not in the extension table the byte
    is decoded with the basic table instead (the fallback 3GPP TS 23.038
    describes).  An ESC with nothing after it is dropped.
    """
    if table is None:
        table = decoding_table
    ext_table = {
        b'\x40': u'|',
        b'\x14': u'^',
        b'\x65': u'€',
        b'\x28': u'{',
        b'\x29': u'}',
        b'\x3C': u'[',
        b'\x3D': u'~',
        b'\x3E': u']',
        b'\x2F': u'\\',
    }
    pieces = bytes(txt).split(b'\x1b')  # split on ESC

    # Whatever precedes the first ESC is ordinary basic-table text; only the
    # pieces *after* an ESC start with an extension code.
    out = [codecs.charmap_decode(pieces[0], errors, table)[0]]
    for piece in pieces[1:]:
        if not piece:
            # ESC directly followed by another ESC, or a trailing ESC.
            continue
        # Slicing (not indexing) keeps a bytes object on Python 3 as well.
        code, tail = piece[:1], piece[1:]
        if code in ext_table:
            out.append(ext_table[code])
        else:
            tail = piece
        if tail:
            # charmap_decode returns a (text, length) tuple.
            out.append(codecs.charmap_decode(tail, errors, table)[0])
    return u''.join(out), len(txt)
class GSM7Codec(codecs.Codec):
    """GSM-7 codec: charmap-based encoding, ``decode_gsm7``-based decoding."""

    def encode(self, txt, errors='strict'):
        """Encode ``txt`` via ``encoding_table``; returns ``(bytes, length)``."""
        encoded, consumed = codecs.charmap_encode(txt, errors, encoding_table)
        return encoded, consumed

    def decode(self, txt, errors='strict'):
        """Decode ``txt`` by delegating to ``decode_gsm7``."""
        decoded, consumed = decode_gsm7(txt, errors)
        return decoded, consumed
这似乎有效:-)
GSM-7 字符集由基本字符映射表加上扩展字符映射表组成(参见 https://en.wikipedia.org/wiki/GSM_03.38#GSM_7-bit_default_alphabet_and_extension_table_of_3GPP_TS_23.038_.2F_GSM_03.38)。
也就是说,u'@' 应该映射到 b'\x00'(长度为 1 的字节串),而 u'[' 应该映射到 b'\x1b<',即 b'\x1b\x3c'(长度为 2 的字节串)。
我通过扩展 encoding_table 让编码部分正常工作了,但我不确定解码时应该如何使用 decoding_table。
为了完整性,这里是编解码器样板:
import codecs
from encodings import normalize_encoding
class GSM7Codec(codecs.Codec):
    """Stateless GSM-7 codec driven by the module-level charmap tables."""

    def encode(self, input, errors='strict'):
        """Encode ``input`` through ``encoding_table``.

        Returns the ``(encoded, length_consumed)`` tuple produced by
        ``codecs.charmap_encode``.
        """
        return codecs.charmap_encode(input, errors, encoding_table)

    def decode(self, input, errors='strict'):
        """Decode ``input`` through ``decoding_table``.

        Returns the ``(decoded, length_consumed)`` tuple produced by
        ``codecs.charmap_decode``.  Only the single-byte table is consulted
        here, so ESC-prefixed extension sequences are not combined.
        """
        return codecs.charmap_decode(input, errors, decoding_table)


class GSM7IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder; each call is encoded independently."""

    def encode(self, input, final=False):
        """Return the encoded form of ``input`` (``final`` is not used)."""
        return codecs.charmap_encode(input, self.errors, encoding_table)[0]


class GSM7IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder; each call is decoded independently."""

    def decode(self, input, final=False):
        """Return the decoded form of ``input`` (``final`` is not used)."""
        return codecs.charmap_decode(input, self.errors, decoding_table)[0]


# The stream classes must take their encode()/decode() from GSM7Codec.
# Listing the abstract codecs.Codec first (as before) supplied no working
# encode()/decode(), and on Python 3 the class statement itself fails with
# an MRO TypeError because codecs.StreamWriter/StreamReader already derive
# from codecs.Codec.
class GSM7StreamWriter(GSM7Codec, codecs.StreamWriter):
    """Stream writer that encodes with ``GSM7Codec.encode``."""


class GSM7StreamReader(GSM7Codec, codecs.StreamReader):
    """Stream reader that decodes with ``GSM7Codec.decode``."""
# CodecInfo objects already built, keyed by *normalized* encoding name.
_cache = {}


def search_function(encoding):
    """Codec search function for the gsm-7 encoding.

    Registered with ``codecs.register`` below.  Takes an encoding name and
    returns a ``codecs.CodecInfo`` when the normalized name is one of
    ``gsm_7``, ``g7`` or ``gsm7``; returns ``None`` for any other name.
    """
    # Normalize before consulting the cache: entries are stored under the
    # normalized name, so looking up the raw name would miss for every
    # spelling that normalization changes (e.g. "gsm-7").
    norm_encoding = normalize_encoding(encoding)
    try:
        return _cache[norm_encoding]
    except KeyError:
        pass

    if norm_encoding not in ('gsm_7', 'g7', 'gsm7'):
        return None

    cinfo = codecs.CodecInfo(
        name='gsm-7',
        encode=GSM7Codec().encode,
        decode=GSM7Codec().decode,
        incrementalencoder=GSM7IncrementalEncoder,
        incrementaldecoder=GSM7IncrementalDecoder,
        streamreader=GSM7StreamReader,
        streamwriter=GSM7StreamWriter,
    )
    _cache[norm_encoding] = cinfo
    return cinfo


codecs.register(search_function)
这里是 table 定义:
decoding_table = (
u"@£$¥èéùìòÇ\nØø\rÅå" +
u"Δ_ΦΓΛΩΠΨΣΘΞ\x1bÆæßÉ" +
u" !\"#¤%&'()*+,-./" +
u"0123456789:;<=>?" +
u"¡ABCDEFGHIJKLMNO" +
u"PQRSTUVWXYZÄÖÑܧ" +
u"¿abcdefghijklmno" +
u"pqrstuvwxyzäöñüà"
)
encoding_table = codecs.charmap_build(
decoding_table + '[=12=]' * (256 - len(decoding_table))
)
# extending the encoding table with extension characters
encoding_table[ord(u'|')] = '\x1b\x40'
encoding_table[ord(u'^')] = '\x1b\x14'
encoding_table[ord(u'€')] = '\x1b\x65'
encoding_table[ord(u'{')] = '\x1b\x28'
encoding_table[ord(u'}')] = '\x1b\x29'
encoding_table[ord(u'[')] = '\x1b\x3C'
encoding_table[ord(u'~')] = '\x1b\x3D'
encoding_table[ord(u']')] = '\x1b\x3E'
encoding_table[ord(u'\')] = '\x1b\x2F'
编码部分现在可用,但解码不可用:
>>> u'['.encode('g7')
'\x1b<'
>>> _.decode('g7')
u'\x1b<'
>>>
我没能找到关于如何编写自定义编解码器的优质文档。
根据@Martijn Pieters 的评论,我将代码更改为:
def decode_gsm7(txt, errors, table=None):
    """Decode GSM-7 bytes, including ESC-prefixed extension characters.

    Args:
        txt: the encoded byte string.
        errors: error-handling scheme passed on to ``codecs.charmap_decode``.
        table: optional basic decoding table (string indexed by byte value);
            defaults to the module-level ``decoding_table``.

    Returns:
        A ``(decoded_text, len(txt))`` tuple, the shape the codec API expects.

    An ESC (0x1b) followed by a code from the extension table yields the
    extension character.  If the code is not in the extension table the byte
    is decoded with the basic table instead (the fallback 3GPP TS 23.038
    describes).  An ESC with nothing after it is dropped.
    """
    if table is None:
        table = decoding_table
    ext_table = {
        b'\x40': u'|',
        b'\x14': u'^',
        b'\x65': u'€',
        b'\x28': u'{',
        b'\x29': u'}',
        b'\x3C': u'[',
        b'\x3D': u'~',
        b'\x3E': u']',
        b'\x2F': u'\\',
    }
    pieces = bytes(txt).split(b'\x1b')  # split on ESC

    # Whatever precedes the first ESC is ordinary basic-table text; only the
    # pieces *after* an ESC start with an extension code.
    out = [codecs.charmap_decode(pieces[0], errors, table)[0]]
    for piece in pieces[1:]:
        if not piece:
            # ESC directly followed by another ESC, or a trailing ESC.
            continue
        # Slicing (not indexing) keeps a bytes object on Python 3 as well.
        code, tail = piece[:1], piece[1:]
        if code in ext_table:
            out.append(ext_table[code])
        else:
            tail = piece
        if tail:
            # charmap_decode returns a (text, length) tuple.
            out.append(codecs.charmap_decode(tail, errors, table)[0])
    return u''.join(out), len(txt)
class GSM7Codec(codecs.Codec):
    """GSM-7 codec: charmap-based encoding, ``decode_gsm7``-based decoding."""

    def encode(self, txt, errors='strict'):
        """Encode ``txt`` via ``encoding_table``; returns ``(bytes, length)``."""
        encoded, consumed = codecs.charmap_encode(txt, errors, encoding_table)
        return encoded, consumed

    def decode(self, txt, errors='strict'):
        """Decode ``txt`` by delegating to ``decode_gsm7``."""
        decoded, consumed = decode_gsm7(txt, errors)
        return decoded, consumed
这似乎有效:-)