Python 正则表达式捕获组扩展

Python regex capturing group extension

我正在使用命名捕获组,使用 (?P<name>) 以及与冠状病毒大流行相关的动词和词干列表。

import regex
import pandas as pd


# Sample sentences, one per row, each paired with a numeric id.
data = {
    'id': [1, 2, 3, 4, 5],
    'text': [
        'The pandemy is spreading',
        'He is fighting Covid-19',
        'The pandemic virus spreads',
        'This sentence is about a different topic',
        'How do we stop the virus ?',
    ],
}
df = pd.DataFrame(data)

def covid_lang(text):
    """Collect predicate/subject pairs found in ``text``.

    Two searches are run one after the other. The first looks for a
    predicate stem followed later by a subject; the second looks for a
    subject stem followed later by a predicate. In both, the text between
    the two words may not contain a ``.``.

    Returns:
        A list of two-item lists: ``[predicate, subject]`` for every hit of
        the first search, followed by ``[subject, predicate]`` for every hit
        of the second search.
    """
    predicates = ['avoid', 'contain', 'track', 'spread', 'contact', 'stop', 'combat', 'fight']
    subjects = ['Corona', 'corona', 'Covid-19', 'epidem', 'infect', 'virus', 'pandem', 'disease', 'outbreak']

    predicate_alternatives = "|".join(predicates)
    subject_alternatives = "|".join(subjects)

    # Predicate first: the predicate is only checked by a look-behind, so the
    # match itself starts right after the predicate stem.
    # NOTE: the trailing [a-z]* sits inside the alternation, so it attaches
    # only to the last stem ('outbreak'), not to every subject.
    p1 = fr'(?<=\b(?P<predicate>{predicate_alternatives}))[^\.]*(?P<subject>{subject_alternatives}[a-z]*)'

    # Subject first: the match must end right after a predicate stem.
    p2 = fr'\b(?P<subject>{subject_alternatives})[^\.]*(?<=\b(?P<predicate>{predicate_alternatives}))'

    predicate_first = [
        [m.group('predicate'), m.group('subject')]
        for m in regex.finditer(p1, text, regex.S)
    ]
    subject_first = [
        [m.group('subject'), m.group('predicate')]
        for m in regex.finditer(p2, text, regex.S)
    ]
    return predicate_first + subject_first

# Run covid_lang on every row's text and keep the returned list in a new 'result' column.
df['result'] = df['text'].apply(covid_lang)

当有匹配时,我希望返回的主语不只是词干,而是完整的单词(即 'pandemic' 和 'pandemy',而不是 'pandem')。我试过在单词列表之后添加 [a-z]*,想让捕获组一直匹配到单词结束,但没有任何效果。

另外,是否可以把这两个查询(谓词在主语之前,以及主语在谓词之前)合并成一个查询?我试过使用 (p1)|(p2),但它不适用于命名的捕获组。

最后,是否可以用同一个词同时匹配大写和小写的形式(例如 Corona 和 corona)?

下面的代码应该可以同时做到这三点:

from xml.etree.ElementPath import prepare_descendant

import regex
import pandas as pd

# Sample sentences, one per row, each paired with a numeric id.
data = {
    'id': [1, 2, 3, 4, 5],
    'text': [
        'The pandemy is spreading',
        'He is fighting Covid-19',
        'The pandemic virus spreads',
        'This sentence is about a different topic',
        'How do we stop the virus ?',
    ],
}
df = pd.DataFrame(data)

def expand_word(word):
    """Return a regex fragment matching ``word`` plus any trailing letters.

    ``word`` is treated as literal text: it is escaped before being placed
    in the pattern, so a stem containing regex metacharacters (``.``, ``+``,
    ``(`` ...) cannot alter or break the surrounding expression.

    Args:
        word: The stem to match, e.g. ``'pandem'``.

    Returns:
        A parenthesised pattern string: the escaped stem followed by
        ``[a-z]*``.
    """
    # Function-scope import so the helper does not depend on the file's
    # import block.
    import re

    return f'({re.escape(word)}[a-z]*)'


def construct_named_group_from_list_of_words(word_type, word_list):
    """Build one named group that matches any expanded stem in ``word_list``.

    Args:
        word_type: Name given to the capturing group (``(?P<word_type>...)``).
        word_list: Stems to offer as alternatives; each one is expanded with
            ``expand_word`` so the group captures the whole word.

    Returns:
        The pattern string ``(?P<word_type>alt1|alt2|...)``.
    """
    expanded_word_regex_list = [expand_word(stem) for stem in word_list]
    # The [a-z]* suffix lives inside each alternative, so every stem gets
    # extended, not just the last one.
    word_in_named_group = fr'(?P<{word_type}>{"|".join(expanded_word_regex_list)})'
    return word_in_named_group

def covid_lang(text):
    """Find predicate/subject pairs in ``text`` with a single combined pattern.

    The pattern is an alternation of two orderings: predicate followed by
    subject (``p1``) and subject followed by predicate (``p2``). Each
    alternative declares both the ``predicate`` and ``subject`` named groups.
    Matching ignores case, and the text between the two words may not
    contain a ``.``.

    Returns:
        A list of ``[predicate, subject]`` lists, one per match, always in
        that order regardless of which alternative matched.
    """
    predicates = ['avoid', 'contain', 'track', 'spread', 'contact', 'stop', 'combat', 'fight']
    subjects = ['corona', 'covid-19', 'epidem', 'infect', 'virus', 'pandem', 'disease', 'outbreak']

    predicate_group = construct_named_group_from_list_of_words("predicate", predicates)
    subject_group = construct_named_group_from_list_of_words("subject", subjects)

    # p1: a predicate (checked by look-behind) followed later by a subject.
    p1 = fr'(?<=\b{predicate_group})[^\.]*{subject_group}'
    # p2: a subject followed later by text that ends right after a predicate.
    p2 = fr'\b{subject_group}[^\.]*(?<=\b{predicate_group})'
    p = fr'({p1})|({p2})'

    matches = regex.finditer(p, text, regex.S | regex.IGNORECASE)
    return [[m.group('predicate'), m.group('subject')] for m in matches]


# Run covid_lang on every row's text and keep the returned list in a new 'result' column.
df['result'] = df['text'].apply(covid_lang)

# Show the frame, including the new column.
print(df)

输出:

   id                                      text                  result
0   1                  The pandemy is spreading  [[spreading, pandemy]]
1   2                   He is fighting Covid-19     [[fight, Covid-19]]
2   3                The pandemic virus spreads   [[spreads, pandemic]]
3   4  This sentence is about a different topic                      []
4   5                How do we stop the virus ?         [[stop, virus]]

但我不确定你是否总是想先输出谓词?如果不是,应该这样做:

from xml.etree.ElementPath import prepare_descendant

import regex
import pandas as pd


# Sample sentences, one per row, each paired with a numeric id.
data = {
    'id': [1, 2, 3, 4, 5],
    'text': [
        'The pandemy is spreading',
        'He is fighting Covid-19',
        'The pandemic virus spreads',
        'This sentence is about a different topic',
        'How do we stop the virus ?',
    ],
}
df = pd.DataFrame(data)

def expand_word(word):
    """Return a regex fragment matching ``word`` plus any trailing letters.

    The stem is escaped before interpolation so that it is always matched
    literally; without this, a stem containing regex metacharacters
    (``.``, ``+``, ``(`` ...) would change or invalidate the pattern.

    Args:
        word: The stem to match, e.g. ``'pandem'``.

    Returns:
        A parenthesised pattern string: the escaped stem followed by
        ``[a-z]*``.
    """
    # Function-scope import so the helper does not depend on the file's
    # import block.
    import re

    literal_stem = re.escape(word)
    return f'({literal_stem}[a-z]*)'


def construct_named_group_from_list_of_words(word_type, word_list):
    """Build one named group that matches any expanded stem in ``word_list``.

    Args:
        word_type: Name given to the capturing group (``(?P<word_type>...)``).
        word_list: Stems to offer as alternatives; each one is expanded with
            ``expand_word`` so the group captures the whole word.

    Returns:
        The pattern string ``(?P<word_type>alt1|alt2|...)``.
    """
    alternatives = "|".join(expand_word(stem) for stem in word_list)
    return fr'(?P<{word_type}>{alternatives})'

def covid_lang(text):
    """Find predicate/subject pairs in ``text``, reported in order of appearance.

    Two case-insensitive searches are run: predicate followed by subject
    (``p1``), then subject followed by predicate (``p2``). The text between
    the two words may not contain a ``.``.

    Returns:
        A list of two-item lists: ``[predicate, subject]`` for every ``p1``
        match, followed by ``[subject, predicate]`` for every ``p2`` match.
    """
    predicates = ['avoid', 'contain', 'track', 'spread', 'contact', 'stop', 'combat', 'fight']
    subjects = ['corona', 'covid-19', 'epidem', 'infect', 'virus', 'pandem', 'disease', 'outbreak']

    predicate_group = construct_named_group_from_list_of_words("predicate", predicates)
    subject_group = construct_named_group_from_list_of_words("subject", subjects)

    # p1: a predicate (checked by look-behind) followed later by a subject.
    p1 = fr'(?<=\b{predicate_group})[^\.]*{subject_group}'
    # p2: a subject followed later by text that ends right after a predicate.
    p2 = fr'\b{subject_group}[^\.]*(?<=\b{predicate_group})'

    flags = regex.S | regex.IGNORECASE
    predicate_first = [
        [m.group('predicate'), m.group('subject')]
        for m in regex.finditer(p1, text, flags)
    ]
    subject_first = [
        [m.group('subject'), m.group('predicate')]
        for m in regex.finditer(p2, text, flags)
    ]
    return predicate_first + subject_first


# Run covid_lang on every row's text and keep the returned list in a new 'result' column.
df['result'] = df['text'].apply(covid_lang)

# Show the frame, including the new column.
print(df)

输出:

   id                                      text                  result
0   1                  The pandemy is spreading  [[pandemy, spreading]]
1   2                   He is fighting Covid-19     [[fight, Covid-19]]
2   3                The pandemic virus spreads   [[pandemic, spreads]]
3   4  This sentence is about a different topic                      []
4   5                How do we stop the virus ?         [[stop, virus]]