Python 将文本解析为音素的规则模式匹配

Python pattern matching of rules to parse text to phonemes

我有一组规则可用于将文本转换为一组音素。这些规则的应用将导致如下转换:

a            uh
ability      ae-b-ih-l-ih-t-ee
aboard       uh-b-oh-r-d
abort        uh-b-oh-r-t
affirmative  ah-f-eh-r-m-ah-t-ih-v
all          aw-l
alter        ah-l-t-r
an           ae-n
and          ae-n-d
Andy         ae-n-d-ee
any          eh-n-ee
anybody      ae-n-ee-b-ah-d-ee
at           ae-t
attacked     uh-t-ae-k-t

我想创建一个可以应用于文本的函数,让它使用这些转换规则返回与该文本对应的音素。

一条规则由几个部分组成。第一部分是正在考虑的文本标记。第二部分是在所考虑的标记之前找到的文本标记。第三部分是在所考虑的标记之后找到的文本标记。第四部分是应该导致转换的适当音素。规则可以这样写,不同的部分用斜线隔开:

text found/text before text found/text after text found/phoneme

给定这种形式的规则,将它们应用于文本字符串的好方法是什么?我想尝试构建一个可以解析文本以找到规则匹配的函数。


规则如下:

#  one or more vowels (AEIOUY)
+  one of E, I, Y (a front vowel)
:  zero or more consonants (BCDFGHJKLMNPQRSTVWXZ)
^  one consonant
.  one of B, V, D, G, J, L, M, N, R, W, Z (a voiced consonant)
%  one of ER, E, ES, ED, ING, ELY (a suffix)
&  one of S, C, G, Z, X, J, CH, SH (a sibilant)
@  one of T, S, R, D, L, Z, N, J, TH, CH, SH (a consonant influencing following u)

" /// "
"A// /UH"
"ARE/ / /AH-R"
"AR/ /O/UH-R"
"AR//#/EH-R"
"AS/ ^/#/AE-A-S"
"A//WA/UH"
"AW///AW"
"ANY/ ://EH-N-EE"
"A//^+#/AE-A"
"ALLY/#://UH-L-EE"
"AL/ /#/UH-L"
"AGAIN///UH-G-EH-N"
"AG/#:/E/IH-J"
"A//^+:#/AE"
"A/ :/^+/AE-A"
"ARR/ //UH-R"
"ARR///AE-R"
"AR/ ://AH-R"
"AR// /AE-R"
"AR///AH-R"
"AIR///EH-R"
"AI///AE-A"
"AY///AE-A"
"AU///AW"
"AL/#:/ /UH-L"
"ALS/#:/ /UH-L-Z"
"ALK///AW-K"
"AL//^/AW-L"
"ABLE/ ://AE-A-B-UH-L"
"ABLE///UH-B-UH-L"
"ANG//+/AE-A-N-J"
"ATHE/ C/ /AE-TH-EE"
"A//A/AH"
"A///AE"
"BE/ /^#/B-IH"
"BEING///B-EE-IH-N"
"BOTH/ / /B-OH-TH"
"BUS/ /#/B-IH-Z"
"BUIL///B-IH-L"
"B/ / /B-EE"
"B///B"
"CH/ /^/K"
"CH/^E//K"
"CH///CH"
"CI/ S/#/S-AH-EE"
"CI//A/SH"
"CI//O/SH"
"CI//EN/SH"
"C//+/S"
"CK///K"
"COM//%/K-AH-M"
"C/ / /S-EE"
"C///K"
"DED/#:/ /D-IH-D"
"D/.E/ /D"
"D/#^:E/ /T"
"DE/ /^#/D-IH"
"DO/ / /D-OO"
"DOES/ //D-UH-Z"
"DOING/ //D-OO-IH-N"
"DOW/ //D-OH"
"DU//A/J-OO"
"D/ / /D-EE"
"DOUGH///D-OH"
"D///D"
"E/#:/ /"
"E/'^:/ /"
"E/ :/ /EE"
"ED/#/ /D"
"E/#:/D /"
"ER//EV/EH-V"
"EVEN/ EL//EH-V-EH-N"
"EVEN/ S//EH-V-EH-N"
"E//^%/EE"
"E//PH%/EE"
"ERI//#/EE-R-EE"
"ER/#:/#/AE-R"
"ER//#/EH-R"
"ER///AE-R"
"EVEN/ //EE-V-EH-N"
"E/#:/W/"
"EW/@//OO"
"EW///Y-OO"
"E//O/EE"
"ES/#:&/ /IH-Z"
"E/#:/S /"
"ELY/#://L-EE"
"EMENT/#://M-EH-N-T"
"EFUL///F-U-L"
"EE///EE"
"EARN///AE-R-N"
"EAR/ /^/AE-R"
"EAD///EH-D"
"EA/#:/ /EE-UH"
"EA//SU/EH"
"EA///EE"
"EIGH///AE-A"
"EI///EE"
"EYE/ //AH-EE"
"EY///EE"
"EU///Y-OO"
"E/ / /EE"
"E/^/ /"
"E///EH"
"FUL///F-U-L"
"F/F//"
"F/ / /EH-F"
"F///F"
"GIV///G-IH-V"
"G/ /I^/G"
"GE//T/G-EH"
"GGES/SU//G-J-EH-SS"
"G/G//"
"G/ B#//G"
"G//+/J"
"GREAT///G-R-AE-A-T"
"GH/#//"
"G/ / /G-EE"
"G///G"
"HAV/ //H-AE-V"
"HERE/ //H-EE-R"
"HOUR/ //OH-AE-R"
"HOW///H-OH"
"H//#/H"
"H/ / /H-AE-CH"
"H///"
"IN/ //IH-N"
"I/ / /AH-EE"
"IN//D/IH-N"
"IER///EE-AE-R"
"IED/#:R//EE-D"
"IED// /AH-EE-D"
"IEN///EE-EH-N"
"IE//T/AH-EE-EH"
"I/ :/%/AH-EE"
"I//%/EE"
"IE///EE"
"INE/N//AH-EE-N"
"IME/T//AH-EE-M"
"I//^+:#/IH"
"IR//#/AH-EE-R"
"IS//%/AH-EE-S"
"IX//%/IH-K-S"
"IZ//%/AH-EE-Z"
"I//D%/AH-EE"
"I/+^/^+/IH"
"I//T%/AH-EE"
"I/#^:/^+/IH"
"I//^+/AH-EE"
"IR///AE-R"
"IGH///AH-EE"
"ILD///AH-EE-L-D"
"IGN// /AH-EE-N"
"IGN//^/AH-EE-N"
"IGN//%/AH-EE-N"
"IQUE///EE-K"
"I///IH"
"J/ / /J-A-EE"
"J///J"
"K//N/"
"K/ / /K-A-EE"
"K///K"
"LO//C#/L-OH"
"L/L//"
"L/#^:/%/UH-L"
"LEAD///L-EE-D"
"L/ / /AE-L"
"L///L"
"MOV///M-OO-V"
"M/ / /EH-M"
"M///M"
"NG/E/+/N-J"
"NG//R/N"
"NG//#/N"
"NGL//%/N-UH-L"
"NG///N"
"NK///N-K"
"NOW/ / /N-OH"
"N/ / /EH-N"
"N/N//"
"N///N"
"OF// /UH-V"
"OROUGH///AE-R-OH"
"OR/ F/TY/OH-R"
"OR/#:/ /AE-R"
"ORS/#:/ /AE-R-Z"
"OR///AW-R"
"ONE/ //W-UH-N"
"OW//EL/OH"
"OW///OH"
"OVER/ //OH-V-AE-R"
"OV///UH-V"
"O//^%/OH"
"O//^EN/OH"
"O//^I#/OH"
"OL//D/OH-L"
"OUGHT///AH-T"
"OUGH///UH-F"
"OU/ /^L/UH"
"OU/ //OH"
"OU/H/S#/OH"
"OUS///UH-S"
"OUR/ F//OH-R"
"OUR///AW-R"
"OUD///U-D"
"OUP///OO-P"
"OU///OH"
"OY///AW-EE"
"OING///OH-IH-N"
"OI///AW-EE"
"OOR///OH-R"
"OOK///U-K"
"OOD///U-D"
"OO///OO"
"O//E/OH"
"O// /OH"
"OA// /OH"
"ONLY/ //OH-N-L-EE"
"ONCE/ //W-UH-N-S"
"ON'T// /OH-N-T"
"O/C/N/AH"
"O//NG/AH"
"O/^:/N/UH"
"ON/I//UH-N"
"ON/#:/ /UH-N"
"ON/#^//UH-N"
"O//ST /OH"
"OF//^/AW-F"
"OTHER///UH-TH-AE-R"
"OSS// /AW-S"
"OM/#^:/ /UH-M"
"O///AH"
"PH///F"
"PEOP///P-EE-P"
"POW///P-OH"
"PUT// /P-U-T"
"P/ / /P-EE"
"P/P//"
"P///P"
"QUAR///K-W-AW-R"
"QU/ //K-W"
"QU///K"
"Q/ / /K-OO"
"Q///K"
"RE/ /^#/R-EE"
"R/ / /AH"
"R/R//"
"R///R"
"SH///SH"
"SION/#//ZH-UH-N"
"SOME///S-AH-M"
"SUR/#/#/ZH-AE-R"
"SUR//#/SH-AE-R"
"SU/#/#/ZH-OO"
"SSU/#/#/SH-OO"
"SED/#/ /Z-D"
"S/#/#/Z"
"SAID///S-EH-D"
"SION/^//SH-UH-N"
"S/S//"
"S/./ /Z"
"S/#:.E/ /Z"
"S/#^:##/ /Z"
"S/#^:#/ /S"
"S/U/ /S"
"S/ :#/ /Z"
"SCH/ //S-K"
"S//C+/"
"SM/#//Z-M"
"SN/#/ /Z-UH-N"
"S/ / /EH-S"
"S///S"
"THE/ / /TH-UH"
"TO// /T-OO"
"THAT///TH-AE-T"
"THIS/ / /TH-IH-S"
"THEY/ //TH-AE-A"
"THERE/ //TH-EH-R"
"THER///TH-AE-R"
"THEIR///TH-EH-EH"
"THAN/ / /TH-AE-N"
"THEM/ / /TH-EH-M"
"THESE// /TH-EE-Z"
"THEN/ //TH-EH-N"
"THROUGH///TH-R-OO"
"THOSE///TH-OH-Z"
"THOUGH// /TH-OH"
"THUS/ //TH-UH-S"
"TH///TH"
"TED/#:/ /T-IH-D"
"TI/S/#N/CH"
"TI//O/SH"
"TI//A/T"
"TIEN///SH-UH-N"
"TUR//#/CH-AE-R"
"TU//A/CH-OO"
"TWO/ //T-OO"
"T/ / /T-EE"
"T/T//"
"T///T"
"UN/ /I/Y-OO-N"
"UN/ //UH-N"
"UPON/ //UH-P-AW-N"
"UR/@/#/AE-R"
"UR//#/Y-AE-R"
"UR///AE-R"
"U//^ /UH"
"U//^^/UH"
"UY///AH-EE"
"U/ G/#/"
"U/G/%/"
"U/G/#/W"
"U/#N//Y-OO"
"UI/@//OO"
"U/@//UH"
"U///Y-OO"
"VIEW///V-Y-OO"
"V/ / /V-EE"
"V///V"
"WHERE/ //W-AE-R"
"WA//S/W-AH"
"WA//T/W-AH"
"WHERE///WH-EH-R"
"WHAT///WH-AH-T"
"WHOL///H-OH-L"
"WHO///H-OO"
"WH///WH"
"WAR///W-AH-R"
"WOR///W-AE-R"
"WR///R"
"W/ / /D-AH-B-L-Y-OO"
"W///W"
"X//^/EH-K-S"
"X/ / /EH-K-S"
"X/ /#/Z-EH"
"X///K-S"
"YOUNG///Y-UH-N"
"YOU/ //Y-OO"
"YES/ //Y-EH-S"
"Y/ / /WH-UH-Y"
"Y/ //Y"
"Y/#^:/ /EE"
"Y/#^:/I/EE"
"Y/ :/ /AH-EE"
"Y/ :/#/AH-EE"
"Y/ :/^+:#/IH"
"Y/ :/^#/AH-EE"
"Y///IH"
"ZZ///T-Z"
"Z/ / /Z-EH-D"
"Z///Z"

事实证明,lookbehind 要求模式具有固定大小,这不符合您的规则,因此我们必须稍微复杂一些。

首先让我们定义语法和正则表达式之间的转换:

# Translation table from the wildcard symbols of the rule notation to regular
# expression fragments.  Any character that is not a key here stands for
# itself when a rule context is converted.
rule_syntax = {
    '#': r'[AEIOUY]+',                  # one or more vowels
    '+': r'[EIY]',                      # one front vowel
    ':': r'[BCDFGHJKLMNPQRSTVWXZ]*',    # zero or more consonants
    '^': r'[BCDFGHJKLMNPQRSTVWXZ]',     # exactly one consonant
    '.': r'[BVDGJLMNRWZ]',              # one voiced consonant
    # A suffix.  The rule legend lists ELY (not EL) as the last alternative.
    '%': r'(?:ER|E|ES|ED|ING|ELY)',
    '&': r'(?:[SCGZXJ]|CH|SH)',         # a sibilant
    # A consonant influencing a following U.
    '@': r'(?:[TSRDLZNJ]|TH|CH|SH)',
}

以及从该映射创建正则表达式片段的函数:

def mkregex(rule):
    """Translate a rule context string into a regular expression fragment.

    Every character of ``rule`` that is a key of ``rule_syntax`` is replaced
    by the corresponding regex fragment; every other character is copied
    through unchanged.  An empty ``rule`` yields an empty string.
    """
    return r"".join(rule_syntax.get(symbol, symbol) for symbol in rule)

我不确定你想如何处理带空格的规则,我已经注释掉了 ' /// ' 规则以获得下面的结果。

现在我们实现一个函数,将您的规则语法转换为 "interesting" 元组:

def mkrule(ruletxt):
    """Compile one rule in ``text/before/after/phoneme`` notation.

    Returns a 3-tuple of:
      * the regular expression built from the rule,
      * the lower-cased phonemes wrapped in dashes (``-...-``),
      * the original rule text (kept for explaining and debugging).

    Raises ValueError if ``ruletxt`` does not split into exactly four
    slash-separated parts.
    """
    target, left_ctx, right_ctx, phoneme = ruletxt.split('/')
    pieces = []

    # Text that must precede the target goes into a non-capturing group.
    if left_ctx:
        pieces.append(r'(?:' + mkregex(left_ctx) + ')')

    # The text under consideration is captured under the name 'found'.
    pieces.append(r'(?P<found>' + target + ')')

    # Text that must follow the target is checked with a lookahead.
    if right_ctx:
        pieces.append(r'(?=' + mkregex(right_ctx) + ')')

    replacement = "-%s-" % phoneme.lower()
    return r"".join(pieces), replacement, ruletxt

我们将采用的方法是迭代地用音素替换与规则匹配的文本。为了确保不会替换已经转换过的文本(即音素),我们将输入字符串转为大写,而音素使用小写。为了防止相邻的音素粘连在一起,我们在每个音素的两侧各添加一个 -(最后必须把多余的 - 清理掉)。

将所有规则转换为有趣的元组:

# Rule table: each rule string is converted by mkrule() into a
# (regex, phonemes, original rule text) tuple.
# Order matters, because phonemes() applies the rules in list order.
rules = [mkrule(r) for r in [
    #" /// ",          # this rule creates problems
    "A// /UH",
    "ARE/ / /AH-R",
    "AR/ /O/UH-R",
    "AR//#/EH-R",
    "AS/ ^/#/AE-A-S",
    "A//WA/UH",
    "AW///AW",
    ...  # listing abbreviated here; presumably the remaining rule strings follow
]]

我们快完成了,只是一个从单个规则替换找到的文本的函数:

def match_and_replace(word, rule, phonemes):
    """Replace every occurrence of ``rule`` in ``word`` with ``phonemes``.

    ``rule`` is a regular expression.  If it defines a named group
    ``found``, only the text captured by that group is replaced, so context
    matched in front of it stays in place.  Without such a group the whole
    match is replaced.

    Returns the resulting string; ``word`` is returned unchanged when the
    rule does not match.
    """
    pattern = re.compile(rule)
    # Target the 'found' group when present; group 0 is the whole match.
    group = 'found' if 'found' in pattern.groupindex else 0

    # A rule can match several times; collect all target spans first.
    spans = [m.span(group) for m in pattern.finditer(word)]

    chars = list(word)  # strings are immutable, so edit a list of chars
    # Replace from the back so earlier offsets stay valid.
    for start, end in reversed(spans):
        if start < 0:
            # The group did not take part in this match; nothing to replace.
            continue
        chars[start:end] = phonemes
    return ''.join(chars)

最后,从单词中提取'phonemes'的函数:

def phonemes(word, explain=False):
    """Convert ``word`` to a dash-separated phoneme string.

    Every entry of the module-level ``rules`` table is applied in order via
    ``match_and_replace``.  When ``explain`` is true, the input word, each
    rule application that changed the intermediate string, and the final
    result are printed.

    Returns the phoneme string with spaces removed, leading and trailing
    dashes stripped, and runs of dashes collapsed to one.
    """
    # Single-argument print(...) calls give the same output on Python 2
    # and Python 3.
    if explain:
        print("word  : %s" % word)

    # Pad with spaces so rules whose context contains a space have
    # something to match against at the word boundaries.
    result = " %s " % word.upper()
    step = 0

    for rule, phoneme, ruletxt in rules:
        # tmp is result with all matches of this rule replaced by phoneme.
        tmp = match_and_replace(result, rule, phoneme)
        if explain and tmp != result:
            step += 1
            print('step %d: %r ---> %r  [rule: %r (%r)]' % (
                step, result, tmp, ruletxt, rule
            ))
        result = tmp

    # Remove artifacts: padding spaces, outer dashes, doubled dashes.
    res = re.sub(r'-+', '-', result.replace(' ', '').strip('-'))
    if explain:
        print("result: %s" % res)
        print("")
    return res

有了这个我得到以下结果:

>>> phonemes('abort', explain=True)
word  : abort
step 1: ' ABORT ' ---> ' -ae-BORT '  [rule: 'A///AE' ('(?P<found>A)')]
step 2: ' -ae-BORT ' ---> ' -ae--b-ORT '  [rule: 'B///B' ('(?P<found>B)')]
step 3: ' -ae--b-ORT ' ---> ' -ae--b--aw-r-T '  [rule: 'OR///AW-R' ('(?P<found>OR)')]
step 4: ' -ae--b--aw-r-T ' ---> ' -ae--b--aw-r--t- '  [rule: 'T///T' ('(?P<found>T)')]
result: ae-b-aw-r-t

您需要对规则进行合理排序以获得所需结果,或者使用更复杂的算法来找到所有可能匹配的规则排列,然后找到最佳排列。