为简体中文标识符构建令牌
Build a token for Simplified Chinese Identifiers
我正在尝试为简体中文标识符构建一个令牌。
规范中对简体中文标识符的定义如下:
simplified-Chinese-identifier = first-sChinese-identifier-character *subsequent-sChinese-identifier-character
first-sChinese-identifier-character = (first-Latin-identifier-character / CP936-initial-character)
subsequent-sChinese-identifier-character = (subsequent-Latin-identifier-character / CP936-subsequent-character)
CP936-initial-character = < character ranges specified in section 3.3.5.1.3>
CP936-subsequent-character = < character ranges specified in section 3.3.5.1.3>
这里是 UNICODE-BESTFIT 和 Windows 代码页 936。
比如,我在页面中查找 %xA3C1,然后取其对应的代码,即 0xff21。于是,我找到了 %xA3C1、%xA3DA、%xA3E1、%xA3FA、%xA1A2、%xA1AA、%xA1AC、%xA1AD、%xA1B2、%xA1E6、%xA1E8、%xA1EF、%xA2B1、%xA2FC、%xA4A1、%xFE4F 的对应代码,并按如下方式构建 CP936-initial-character:
let cP936_initial_character = [%sedlex.regexp? 0xff21 .. 0xff3a | 0xff41 .. 0xff5a | 0x3001 .. 0x2014 | 0x2016 .. 0x2026 | 0x3014 .. 0x2103 | 0x00a4 .. 0x2605 | 0x2488 .. 0x216b | 0x3041 .. 0xfa29]
但是,问题是有些范围看起来很奇怪:例如,0x00a4 .. 0x2605 和 0x2488 .. 0x216b 的顺序不正确;0x3041 .. 0xfa29 看起来太大了。
有谁知道构建此令牌的正确方法是什么?
请参阅 WindowsBestFit/readme.txt,特别是其中对 WCTABLE 部分多字节映射记录的描述(WCTABLE 标记表示 Unicode UTF-16(WideChar)到“MultiByte”字节映射的开始……)。
以下是带有部分注释的 Python 3 脚本(抱歉,我不会 VBA):
- 逐行读取(事先下载的)bestfit936.txt 文件,
- 解析其 WCTABLE 部分并构建一个 Unicode 代码点数组,其中对应的 gb2312 代码点(代码页 936)符合 Simplified Chinese Identifiers 的规则,且 Unicode 类别为字母('Ll'、'Lu'、'Lo')(参见变量 init_chars_16),
- 对变量 init_chars_16 中的代码点进行排序,并创建相应的字符数组(变量 init_chars_utf16),
- 将变量 init_chars_16 中的代码点分组为最长的连续序列(变量 init_chars_groups),并且
- 打印若干字符、它们的代码点以及相应的连续范围(提供参数 1 可打印所有字符)。共有 15477 个适用的代码点,(不幸的是)分布在 1977 个连续范围中。
以上针对的是 CP936-initial-character,但同样的方法也适用于 CP936-subsequent-character(提供参数 2,另请参阅用法和输出示例)。
from itertools import groupby
from operator import itemgetter
import unicodedata
import sys
# Select the run mode from the first command-line argument:
#   '1' -> print every CP936-initial-character,
#   '2' -> print the CP936-subsequent-character addendum,
#   anything else (or no argument) -> short demo of mode 1.
# sys.argv items are always strings, so only '1' and '2' can ever match.
if len(sys.argv) > 1 and sys.argv[1] in ('1', '2'):
    init_chars_test = int(sys.argv[1])
    demo_chars_test = False
else:
    init_chars_test = 1
    demo_chars_test = True
def first_last(some_list):
    """Format a run of code points as a hexadecimal range string.

    Args:
        some_list: Non-empty sequence of integers; only the first and the
            last element are used.

    Returns:
        '0xAAAA..0xBBBB' (first and last element) when the sequence has more
        than one element, otherwise just '0xAAAA'. Values are rendered as
        lower-case hex, zero-padded to at least four digits.

    Raises:
        IndexError: If ``some_list`` is empty.
    """
    first = '0x{:04x}'.format(some_list[0])
    if len(some_list) > 1:
        return first + '..' + '0x{:04x}'.format(some_list[-1])
    return first
# Pick the Unicode general categories and the inclusive CP936 code ranges
# that define the requested character class, and announce the mode.
if init_chars_test == 1:
    unicode_category = ['Ll', 'Lu', 'Lo']  # letters
    print(str('DEMO ' if demo_chars_test else '')
          + 'CP936-initial-character:', unicode_category, '\n')
    cp936_ranges = [
        (0xA3C1, 0xA3DA),
        (0xA3E1, 0xA3FA),
        (0xA1A2, 0xA1AA),
        (0xA1AC, 0xA1AD),
        (0xA1B2, 0xA1E6),
        (0xA1E8, 0xA1EF),
        (0xA2B1, 0xA2FC),
        (0xA4A1, 0xFE4F),
    ]
else:
    unicode_category = ['Ll', 'Lu', 'Lo', 'Nd', 'Pc']  # letters or numbers or underscore
    print('CP936-subsequent-character addendum:', unicode_category, '\n')
    cp936_ranges = [
        (0xA3DF, 0xA3DF),
        (0xA3B0, 0xA3B9),
    ]
# Expand the inclusive (low, high) pairs into one sorted list of CP936 codes.
init_chars_CP936 = sorted(
    code for low, high in cp936_ranges for code in range(low, high + 1))
wctable = False        # True while the lines being read belong to the WCTABLE section
init_chars_16 = []     # Unicode UTF-16 codepoints (as integers)
init_chars_undef = []  # CP936 codes (as read, hex text) that were in range but rejected
i = 0                  # 1-based number of the last line read (debugging aid)

# Tags that start a section other than WCTABLE; seeing one ends WCTABLE parsing.
# NOTE(review): the original list only had 'DBSCTABLE', which looks like a
# transposition of 'DBCSTABLE'; both are accepted here -- confirm against the
# bestfit readme.
other_section_tags = (
    'CODEPAGE', 'CPINFO', 'MBTABLE', 'DBCSRANGE', 'DBSCTABLE', 'DBCSTABLE',
    'ENDCODEPAGE')

# Build the lookup once: testing membership in the list would rescan it for
# every line of the input file.
cp936_wanted = set(init_chars_CP936)

with open(r'D:\Downloads\Unicode\bestfit936.txt',
          mode='r', encoding='gb2312', errors='backslashreplace') as ff:
    for i, rawline in enumerate(ff, start=1):
        line = rawline.split('\t')
        if line[0].upper() in other_section_tags:
            wctable = False
        if wctable:
            if len(line) > 1:
                # Second field: CP936 code; fields not starting with 0x count as 0.
                code_936 = int(line[1], 16) if line[1].lower().startswith('0x') else 0
                if code_936 in cp936_wanted:
                    # First field: Unicode code point, parsed the same way.
                    code16be = int(line[0], 16) if line[0].lower().startswith('0x') else 0
                    # µ vs. μ error
                    #   0x00b5  0xa6cc  ;μ  # at line 24608: 'Micro Sign'
                    #   0x03bc  0xa6cc  ;μ  # at line 24718: 'Greek Small Letter Mu'
                    if (code16be > 0x00ff  # exclude 0x00b5 and permit only letters
                            and unicodedata.category(chr(code16be)) in unicode_category
                            # and len( chr(code16be).encode('gb2312','ignore'))> 0
                            ):
                        # if len(unicodedata.normalize('NFKD',chr(code16be))) == 1:
                        init_chars_16.append(code16be)
                    else:
                        init_chars_undef.append(line[1])
        else:
            # The WCTABLE tag marks the start of the Unicode UTF-16 (WideChar)
            # to "MultiByte" bytes
            wctable = rawline.startswith('WCTABLE')
            # for debugging purposes: if wctable: print(i, rawline)
# Deduplicate and order the accepted code points, then derive the matching
# characters.
init_chars_16 = sorted(set(init_chars_16))
init_chars_utf16 = [chr(x) for x in init_chars_16]
# Groups of consecutive code points: within a run of consecutive values the
# difference (position - value) stays constant, so groupby on that difference
# splits the sorted list into maximal runs; each run is formatted by first_last.
init_chars_groups = [
    first_last([code for _, code in run])
    for _, run in groupby(enumerate(init_chars_16), lambda ix: ix[0] - ix[1])
]
def finalprint(od, do, odg, dog, sep):
    """Print a slice of the collected characters, code points and ranges.

    Reads the module-level lists init_chars_utf16, init_chars_16 and
    init_chars_groups (read-only, so no ``global`` declaration is needed).

    Args:
        od: Start index of the slice of characters / code points to print.
        do: End index (exclusive) of that slice; None means "to the end".
        odg: Start index of the slice of range groups to print.
        dog: End index (exclusive) of that slice; None means "to the end".
        sep: Separator placed between the printed range groups.
    """
    print(''.join(init_chars_utf16[od:do]) + '\n')          # characters
    print(', '.join('0x{:04x}'.format(x)
                    for x in init_chars_16[od:do]) + '\n')  # their Unicode codepoints
    print(sep.join(init_chars_groups[odg:dog]) + '\n')      # groups of Unicode codepoints
    print(len(init_chars_groups), ':', odg, dog)  # total number of groups and displayed range
# Demo mode shows a small window of the results; otherwise print everything,
# with the ranges joined by '|'.
if init_chars_test == 1 and demo_chars_test:
    finalprint(354, 380, 23, 30, ' ')
else:
    finalprint(0, None, 0, None, '|')
# Summary line: mode, number of CP936 codes considered, number rejected,
# number of accepted characters, number of consecutive ranges.
print(init_chars_test, len(init_chars_CP936), len(init_chars_undef),
      len(init_chars_utf16), len(init_chars_groups))
if demo_chars_test:
    # No valid mode argument was given, so also show how to call the script.
    print('\nUsage:\n\t%s [ 1 | 2 ]\n' % sys.argv[0])
    print('Examples:\n\t%s // prints CP936-initial-character DEMO' % sys.argv[0])
    print('\n\t%s 1 // prints CP936-initial-character' % sys.argv[0])
    print('\n\t%s 2 // prints CP936-subsequent-character addendum' % sys.argv[0])
输出:.\SO766804.py
DEMO CP936-initial-character: ['Ll', 'Lu', 'Lo']
一丁七万丈三上下丌不与丐丑专且丕世丘丙业丛东丝丞丢两
0x4e00, 0x4e01, 0x4e03, 0x4e07, 0x4e08, 0x4e09, 0x4e0a, 0x4e0b, 0x4e0c, 0x4e0d, 0x4e0e, 0x4e10, 0x4e11, 0x4e13, 0x4e14, 0x4e15, 0x4e16, 0x4e18, 0x4e19, 0x4e1a, 0x4e1b, 0x4e1c, 0x4e1d, 0x4e1e, 0x4e22, 0x4e24
0x4e00..0x4e01 0x4e03 0x4e07..0x4e0e 0x4e10..0x4e11 0x4e13..0x4e16 0x4e18..0x4e1e 0x4e22
1977 : 23 30
1 23159 2085 15477 1977
Usage:
D:\bat\SO766804.py [ 1 | 2 ]
Examples:
D:\bat\SO766804.py // prints CP936-initial-character DEMO
D:\bat\SO766804.py 1 // prints CP936-initial-character
D:\bat\SO766804.py 2 // prints CP936-subsequent-character addendum
输出:.\SO766804.py 2
CP936-subsequent-character addendum: ['Ll', 'Lu', 'Lo', 'Nd', 'Pc']
0123456789_
0xff10, 0xff11, 0xff12, 0xff13, 0xff14, 0xff15, 0xff16, 0xff17, 0xff18, 0xff19, 0xff3f
0xff10..0xff19|0xff3f
2 : 0 None
2 11 0 11 2
我正在尝试为简体中文标识符构建一个令牌。
规范中对简体中文标识符的定义如下:
simplified-Chinese-identifier = first-sChinese-identifier-character *subsequent-sChinese-identifier-character
first-sChinese-identifier-character = (first-Latin-identifier-character / CP936-initial-character)
subsequent-sChinese-identifier-character = (subsequent-Latin-identifier-character / CP936-subsequent-character)
CP936-initial-character = < character ranges specified in section 3.3.5.1.3>
CP936-subsequent-character = < character ranges specified in section 3.3.5.1.3>
这里是 UNICODE-BESTFIT 和 Windows 代码页 936。
比如,我在页面中查找 %xA3C1,然后取其对应的代码,即 0xff21。于是,我找到了 %xA3C1、%xA3DA、%xA3E1、%xA3FA、%xA1A2、%xA1AA、%xA1AC、%xA1AD、%xA1B2、%xA1E6、%xA1E8、%xA1EF、%xA2B1、%xA2FC、%xA4A1、%xFE4F 的对应代码,并按如下方式构建 CP936-initial-character:
let cP936_initial_character = [%sedlex.regexp? 0xff21 .. 0xff3a | 0xff41 .. 0xff5a | 0x3001 .. 0x2014 | 0x2016 .. 0x2026 | 0x3014 .. 0x2103 | 0x00a4 .. 0x2605 | 0x2488 .. 0x216b | 0x3041 .. 0xfa29]
但是,问题是有些范围看起来很奇怪:例如,0x00a4 .. 0x2605 和 0x2488 .. 0x216b 的顺序不正确;0x3041 .. 0xfa29 看起来太大了。
有谁知道构建此令牌的正确方法是什么?
请参阅 WindowsBestFit/readme.txt,特别是其中对 WCTABLE 部分多字节映射记录的描述(WCTABLE 标记表示 Unicode UTF-16(WideChar)到“MultiByte”字节映射的开始……)。
以下是带有部分注释的 Python 3 脚本(抱歉,我不会 VBA):
- 逐行读取(事先下载的)bestfit936.txt 文件,
- 解析其 WCTABLE 部分并构建一个 Unicode 代码点数组,其中对应的 gb2312 代码点(代码页 936)符合 Simplified Chinese Identifiers 的规则,且 Unicode 类别为字母('Ll'、'Lu'、'Lo')(参见变量 init_chars_16),
- 对变量 init_chars_16 中的代码点进行排序,并创建相应的字符数组(变量 init_chars_utf16),
- 将变量 init_chars_16 中的代码点分组为最长的连续序列(变量 init_chars_groups),并且
- 打印若干字符、它们的代码点以及相应的连续范围(提供参数 1 可打印所有字符)。共有 15477 个适用的代码点,(不幸的是)分布在 1977 个连续范围中。
以上针对的是 CP936-initial-character,但同样的方法也适用于 CP936-subsequent-character(提供参数 2,另请参阅用法和输出示例)。
from itertools import groupby
from operator import itemgetter
import unicodedata
import sys
# Select the run mode from the first command-line argument:
#   '1' -> print every CP936-initial-character,
#   '2' -> print the CP936-subsequent-character addendum,
#   anything else (or no argument) -> short demo of mode 1.
# sys.argv items are always strings, so only '1' and '2' can ever match.
if len(sys.argv) > 1 and sys.argv[1] in ('1', '2'):
    init_chars_test = int(sys.argv[1])
    demo_chars_test = False
else:
    init_chars_test = 1
    demo_chars_test = True
def first_last(some_list):
    """Format a run of code points as a hexadecimal range string.

    Args:
        some_list: Non-empty sequence of integers; only the first and the
            last element are used.

    Returns:
        '0xAAAA..0xBBBB' (first and last element) when the sequence has more
        than one element, otherwise just '0xAAAA'. Values are rendered as
        lower-case hex, zero-padded to at least four digits.

    Raises:
        IndexError: If ``some_list`` is empty.
    """
    first = '0x{:04x}'.format(some_list[0])
    if len(some_list) > 1:
        return first + '..' + '0x{:04x}'.format(some_list[-1])
    return first
# Pick the Unicode general categories and the inclusive CP936 code ranges
# that define the requested character class, and announce the mode.
if init_chars_test == 1:
    unicode_category = ['Ll', 'Lu', 'Lo']  # letters
    print(str('DEMO ' if demo_chars_test else '')
          + 'CP936-initial-character:', unicode_category, '\n')
    cp936_ranges = [
        (0xA3C1, 0xA3DA),
        (0xA3E1, 0xA3FA),
        (0xA1A2, 0xA1AA),
        (0xA1AC, 0xA1AD),
        (0xA1B2, 0xA1E6),
        (0xA1E8, 0xA1EF),
        (0xA2B1, 0xA2FC),
        (0xA4A1, 0xFE4F),
    ]
else:
    unicode_category = ['Ll', 'Lu', 'Lo', 'Nd', 'Pc']  # letters or numbers or underscore
    print('CP936-subsequent-character addendum:', unicode_category, '\n')
    cp936_ranges = [
        (0xA3DF, 0xA3DF),
        (0xA3B0, 0xA3B9),
    ]
# Expand the inclusive (low, high) pairs into one sorted list of CP936 codes.
init_chars_CP936 = sorted(
    code for low, high in cp936_ranges for code in range(low, high + 1))
wctable = False        # True while the lines being read belong to the WCTABLE section
init_chars_16 = []     # Unicode UTF-16 codepoints (as integers)
init_chars_undef = []  # CP936 codes (as read, hex text) that were in range but rejected
i = 0                  # 1-based number of the last line read (debugging aid)

# Tags that start a section other than WCTABLE; seeing one ends WCTABLE parsing.
# NOTE(review): the original list only had 'DBSCTABLE', which looks like a
# transposition of 'DBCSTABLE'; both are accepted here -- confirm against the
# bestfit readme.
other_section_tags = (
    'CODEPAGE', 'CPINFO', 'MBTABLE', 'DBCSRANGE', 'DBSCTABLE', 'DBCSTABLE',
    'ENDCODEPAGE')

# Build the lookup once: testing membership in the list would rescan it for
# every line of the input file.
cp936_wanted = set(init_chars_CP936)

with open(r'D:\Downloads\Unicode\bestfit936.txt',
          mode='r', encoding='gb2312', errors='backslashreplace') as ff:
    for i, rawline in enumerate(ff, start=1):
        line = rawline.split('\t')
        if line[0].upper() in other_section_tags:
            wctable = False
        if wctable:
            if len(line) > 1:
                # Second field: CP936 code; fields not starting with 0x count as 0.
                code_936 = int(line[1], 16) if line[1].lower().startswith('0x') else 0
                if code_936 in cp936_wanted:
                    # First field: Unicode code point, parsed the same way.
                    code16be = int(line[0], 16) if line[0].lower().startswith('0x') else 0
                    # µ vs. μ error
                    #   0x00b5  0xa6cc  ;μ  # at line 24608: 'Micro Sign'
                    #   0x03bc  0xa6cc  ;μ  # at line 24718: 'Greek Small Letter Mu'
                    if (code16be > 0x00ff  # exclude 0x00b5 and permit only letters
                            and unicodedata.category(chr(code16be)) in unicode_category
                            # and len( chr(code16be).encode('gb2312','ignore'))> 0
                            ):
                        # if len(unicodedata.normalize('NFKD',chr(code16be))) == 1:
                        init_chars_16.append(code16be)
                    else:
                        init_chars_undef.append(line[1])
        else:
            # The WCTABLE tag marks the start of the Unicode UTF-16 (WideChar)
            # to "MultiByte" bytes
            wctable = rawline.startswith('WCTABLE')
            # for debugging purposes: if wctable: print(i, rawline)
# Deduplicate and order the accepted code points, then derive the matching
# characters.
init_chars_16 = sorted(set(init_chars_16))
init_chars_utf16 = [chr(x) for x in init_chars_16]
# Groups of consecutive code points: within a run of consecutive values the
# difference (position - value) stays constant, so groupby on that difference
# splits the sorted list into maximal runs; each run is formatted by first_last.
init_chars_groups = [
    first_last([code for _, code in run])
    for _, run in groupby(enumerate(init_chars_16), lambda ix: ix[0] - ix[1])
]
def finalprint(od, do, odg, dog, sep):
    """Print a slice of the collected characters, code points and ranges.

    Reads the module-level lists init_chars_utf16, init_chars_16 and
    init_chars_groups (read-only, so no ``global`` declaration is needed).

    Args:
        od: Start index of the slice of characters / code points to print.
        do: End index (exclusive) of that slice; None means "to the end".
        odg: Start index of the slice of range groups to print.
        dog: End index (exclusive) of that slice; None means "to the end".
        sep: Separator placed between the printed range groups.
    """
    print(''.join(init_chars_utf16[od:do]) + '\n')          # characters
    print(', '.join('0x{:04x}'.format(x)
                    for x in init_chars_16[od:do]) + '\n')  # their Unicode codepoints
    print(sep.join(init_chars_groups[odg:dog]) + '\n')      # groups of Unicode codepoints
    print(len(init_chars_groups), ':', odg, dog)  # total number of groups and displayed range
# Demo mode shows a small window of the results; otherwise print everything,
# with the ranges joined by '|'.
if init_chars_test == 1 and demo_chars_test:
    finalprint(354, 380, 23, 30, ' ')
else:
    finalprint(0, None, 0, None, '|')
# Summary line: mode, number of CP936 codes considered, number rejected,
# number of accepted characters, number of consecutive ranges.
print(init_chars_test, len(init_chars_CP936), len(init_chars_undef),
      len(init_chars_utf16), len(init_chars_groups))
if demo_chars_test:
    # No valid mode argument was given, so also show how to call the script.
    print('\nUsage:\n\t%s [ 1 | 2 ]\n' % sys.argv[0])
    print('Examples:\n\t%s // prints CP936-initial-character DEMO' % sys.argv[0])
    print('\n\t%s 1 // prints CP936-initial-character' % sys.argv[0])
    print('\n\t%s 2 // prints CP936-subsequent-character addendum' % sys.argv[0])
输出:.\SO766804.py
DEMO CP936-initial-character: ['Ll', 'Lu', 'Lo']
一丁七万丈三上下丌不与丐丑专且丕世丘丙业丛东丝丞丢两
0x4e00, 0x4e01, 0x4e03, 0x4e07, 0x4e08, 0x4e09, 0x4e0a, 0x4e0b, 0x4e0c, 0x4e0d, 0x4e0e, 0x4e10, 0x4e11, 0x4e13, 0x4e14, 0x4e15, 0x4e16, 0x4e18, 0x4e19, 0x4e1a, 0x4e1b, 0x4e1c, 0x4e1d, 0x4e1e, 0x4e22, 0x4e24
0x4e00..0x4e01 0x4e03 0x4e07..0x4e0e 0x4e10..0x4e11 0x4e13..0x4e16 0x4e18..0x4e1e 0x4e22
1977 : 23 30
1 23159 2085 15477 1977
Usage:
D:\bat\SO766804.py [ 1 | 2 ]
Examples:
D:\bat\SO766804.py // prints CP936-initial-character DEMO
D:\bat\SO766804.py 1 // prints CP936-initial-character
D:\bat\SO766804.py 2 // prints CP936-subsequent-character addendum
输出:.\SO766804.py 2
CP936-subsequent-character addendum: ['Ll', 'Lu', 'Lo', 'Nd', 'Pc']
0123456789_
0xff10, 0xff11, 0xff12, 0xff13, 0xff14, 0xff15, 0xff16, 0xff17, 0xff18, 0xff19, 0xff3f
0xff10..0xff19|0xff3f
2 : 0 None
2 11 0 11 2