优化 Python 字节数组转义的性能
Optimizing byte array escaping performance in Python
我需要对 Python 中的字节数组执行自定义转义。但是，在遍历过程中 Python 会把每个字节转换为整数，这使得性能优化非常困难。我怎样才能加快我的转义函数？
# Maps each byte value that must be escaped to the two byte values
# (a backslash followed by a marker character) that replace it.
ESCAPE_DICT = {
    0x00: [0x5C, 0x7A],  # null        -> \z  0x5c 0x7a
    0x22: [0x5C, 0x71],  # "           -> \q  0x5c 0x71
    0x3B: [0x5C, 0x73],  # ;           -> \s  0x5c 0x73
    0x5C: [0x5C, 0x5C],  # \           -> \\  0x5c 0x5c
    0x0A: [0x5C, 0x6E],  # line-feed   -> \n  0x5c 0x6e
    0x0C: [0x5C, 0x66],  # form-feed   -> \f  0x5c 0x66
    0x0D: [0x5C, 0x63],  # carr-return -> \c  0x5c 0x63
}


def escape(string: bytes) -> bytes:
    """Return a copy of *string* with the bytes in ESCAPE_DICT escaped.

    Every byte that is a key of ESCAPE_DICT is replaced by its two-byte
    escape sequence; all other bytes are copied through unchanged.

    Args:
        string: The raw bytes to escape.

    Returns:
        A new ``bytes`` object containing the escaped data.
    """
    escaped = bytearray()
    # Iterating over bytes yields ints, which are exactly the dict keys.
    for byte in string:
        replacement = ESCAPE_DICT.get(byte)
        if replacement is None:
            # Not a special byte: copy it through unchanged.
            escaped.append(byte)
        else:
            # Special byte: emit its escape sequence instead.
            escaped.extend(replacement)
    return bytes(escaped)
import re
# Maps each single byte that must be escaped to its two-byte replacement.
# The values are raw bytes literals so the backslash is kept verbatim.
ESCAPE_DICT = {
    b'\x00': rb'\z',  # null
    b'"': rb'\q',
    b';': rb'\s',
    b'\\': rb'\\',  # a backslash becomes two backslashes
    b'\n': rb'\n',  # linefeed
    b'\f': rb'\f',  # formfeed
    b'\r': rb'\c',  # carriage return
}
# Character class matching any one byte that needs escaping; each key is
# written as a \xNN regex escape so no key can be misread as regex syntax.
ESCAPE_CLASS = '[' + ''.join(r'\x' + e.hex() for e in ESCAPE_DICT) + ']'
ESCAPE_REGEX = re.compile(ESCAPE_CLASS.encode())


def escape(string: bytes) -> bytes:
    """Return a copy of *string* with the bytes in ESCAPE_DICT escaped.

    Every byte matched by ESCAPE_REGEX is replaced by its entry in
    ESCAPE_DICT; all other bytes are left untouched.

    Args:
        string: The raw bytes to escape.

    Returns:
        A new ``bytes`` object containing the escaped data.
    """
    # A callable replacement is inserted as-is, so the backslashes in the
    # replacement values are not reinterpreted as regex template escapes.
    return ESCAPE_REGEX.sub(lambda m: ESCAPE_DICT[m.group(0)], string)
# Demonstration: escape a sample that contains most of the special bytes.
# The backslash before "xyz" is doubled so that it is a literal backslash
# and not the start of an (invalid) \x hex escape.
x = b'"abc\ndef\rpqr\x00stu\\xyz"'
y = escape(x)
from pprint import pprint
pprint(ESCAPE_CLASS)
pprint(ESCAPE_REGEX)
pprint(x)
pprint(y)
# =>
# '[\\x00\\x22\\x3b\\x5c\\x0a\\x0c\\x0d]'
# re.compile(b'[\\x00\\x22\\x3b\\x5c\\x0a\\x0c\\x0d]')
# b'"abc\ndef\rpqr\x00stu\\xyz"'
# b'\\qabc\\ndef\\cpqr\\zstu\\\\xyz\\q'
您可以将 rb 前缀读作“原始字节”（raw bytes）。
不过你的转义序列有点奇怪。例如，回车（carriage return）通常写作 \r，而不是 \c；并且 \s 通常代表通用空白字符。
我需要对 Python 中的字节数组执行自定义转义。但是，在遍历过程中 Python 会把每个字节转换为整数，这使得性能优化非常困难。我怎样才能加快我的转义函数？
# Maps each byte value that must be escaped to the two byte values
# (a backslash followed by a marker character) that replace it.
ESCAPE_DICT = {
    0x00: [0x5C, 0x7A],  # null        -> \z  0x5c 0x7a
    0x22: [0x5C, 0x71],  # "           -> \q  0x5c 0x71
    0x3B: [0x5C, 0x73],  # ;           -> \s  0x5c 0x73
    0x5C: [0x5C, 0x5C],  # \           -> \\  0x5c 0x5c
    0x0A: [0x5C, 0x6E],  # line-feed   -> \n  0x5c 0x6e
    0x0C: [0x5C, 0x66],  # form-feed   -> \f  0x5c 0x66
    0x0D: [0x5C, 0x63],  # carr-return -> \c  0x5c 0x63
}


def escape(string: bytes) -> bytes:
    """Return a copy of *string* with the bytes in ESCAPE_DICT escaped.

    Every byte that is a key of ESCAPE_DICT is replaced by its two-byte
    escape sequence; all other bytes are copied through unchanged.

    Args:
        string: The raw bytes to escape.

    Returns:
        A new ``bytes`` object containing the escaped data.
    """
    escaped = bytearray()
    # Iterating over bytes yields ints, which are exactly the dict keys.
    for byte in string:
        replacement = ESCAPE_DICT.get(byte)
        if replacement is None:
            # Not a special byte: copy it through unchanged.
            escaped.append(byte)
        else:
            # Special byte: emit its escape sequence instead.
            escaped.extend(replacement)
    return bytes(escaped)
import re
# Maps each single byte that must be escaped to its two-byte replacement.
# The values are raw bytes literals so the backslash is kept verbatim.
ESCAPE_DICT = {
    b'\x00': rb'\z',  # null
    b'"': rb'\q',
    b';': rb'\s',
    b'\\': rb'\\',  # a backslash becomes two backslashes
    b'\n': rb'\n',  # linefeed
    b'\f': rb'\f',  # formfeed
    b'\r': rb'\c',  # carriage return
}
# Character class matching any one byte that needs escaping; each key is
# written as a \xNN regex escape so no key can be misread as regex syntax.
ESCAPE_CLASS = '[' + ''.join(r'\x' + e.hex() for e in ESCAPE_DICT) + ']'
ESCAPE_REGEX = re.compile(ESCAPE_CLASS.encode())


def escape(string: bytes) -> bytes:
    """Return a copy of *string* with the bytes in ESCAPE_DICT escaped.

    Every byte matched by ESCAPE_REGEX is replaced by its entry in
    ESCAPE_DICT; all other bytes are left untouched.

    Args:
        string: The raw bytes to escape.

    Returns:
        A new ``bytes`` object containing the escaped data.
    """
    # A callable replacement is inserted as-is, so the backslashes in the
    # replacement values are not reinterpreted as regex template escapes.
    return ESCAPE_REGEX.sub(lambda m: ESCAPE_DICT[m.group(0)], string)
# Demonstration: escape a sample that contains most of the special bytes.
# The backslash before "xyz" is doubled so that it is a literal backslash
# and not the start of an (invalid) \x hex escape.
x = b'"abc\ndef\rpqr\x00stu\\xyz"'
y = escape(x)
from pprint import pprint
pprint(ESCAPE_CLASS)
pprint(ESCAPE_REGEX)
pprint(x)
pprint(y)
# =>
# '[\\x00\\x22\\x3b\\x5c\\x0a\\x0c\\x0d]'
# re.compile(b'[\\x00\\x22\\x3b\\x5c\\x0a\\x0c\\x0d]')
# b'"abc\ndef\rpqr\x00stu\\xyz"'
# b'\\qabc\\ndef\\cpqr\\zstu\\\\xyz\\q'
您可以将 rb 前缀读作“原始字节”（raw bytes）。
不过你的转义序列有点奇怪。例如，回车（carriage return）通常写作 \r，而不是 \c；并且 \s 通常代表通用空白字符。