在 Python 中替换匹配正则表达式的单词
Replacing words matching regular expressions in Python
import re
# Ordered (regex, replacement) pairs for expanding English contractions.
# replace() below walks this list in order, so the specific irregular forms
# listed first are rewritten before the generic suffix rules are tried.
replacement_patterns = [
(r'won\'t', 'will not'),
(r'can\'t', 'cannot'),
(r'i\'m', 'i am'),
(r'ain\'t', 'is not'),
# In the replacements below, \g<1> re-inserts the text captured by (\w+).
(r'(\w+)\'ll', '\g<1> will'),
(r'(\w+)n\'t', '\g<1> not'),
(r'(\w+)\'ve', '\g<1> have'),
(r'(\w+)\'s', '\g<1> is'),
(r'(\w+)\'re', '\g<1> are'),
(r'(\w+)\'d', '\g<1> would')
]
# Applies a list of (regex, replacement) pairs to a text, one after another.
class RegexpReplacer(object):
# Pre-compiles every regex in `patterns` and stores (compiled, replacement) pairs.
def __init__(self, patterns=replacement_patterns):
# NOTE(review): the comprehension iterates over `pattern`, but the parameter
# is named `patterns`; no name `pattern` is defined in the visible code at
# this point, so this presumably raises NameError when the class is instantiated.
self.patterns = [(re.compile(regex), repl) for (regex, repl)
in pattern]
# Returns `text` after substituting each stored pattern in turn.
def replace(self, text):
s = text
for (pattern, repl) in self.patterns:
# `count` (number of substitutions made) is never used.
(s, count) = re.subn(pattern, repl, s)
# NOTE(review): indentation was lost in this listing; `return s` has to sit
# after the loop (not inside it) for every pattern to be applied.
return s
rep=RegexpReplacer()
# Python 2 print statement.
print rep.replace("can't is a contradicton")
我从 Jacob Perkins 的《Python Text Processing with NLTK 2.0 Cookbook》一书中复制了这段代码
但是我的预期输出是:
cannot is a contradicton
实际输出为:
can't is a contradicton
我无法找出代码中的错误。
要么使用原始字符串,要么转义引号,但不要两者同时使用。
>>> print r'won\'t'
won\'t
>>> print 'won\'t'
won't
或者,如果您更喜欢原始字符串:
>>> print r"won't"
won't
您的代码存在一些缩进问题和拼写错误 - 我不太确定解释器是如何为您提供任何输出的。在我修复这些问题后,我得到了您预期的输出。
import re
# Ordered (regex, replacement) pairs for expanding English contractions.
# Irregular forms come first so they are rewritten before the generic suffix
# rules are tried. Every string is a raw literal: the patterns are quoted
# with double quotes so the apostrophe needs no escaping, and the
# replacement templates keep "\g<1>" (a reference to the captured word)
# without relying on Python passing an unknown escape sequence through.
replacement_patterns = [
    (r"won't", r'will not'),
    (r"can't", r'cannot'),
    (r"i'm", r'i am'),
    (r"ain't", r'is not'),
    (r"(\w+)'ll", r'\g<1> will'),
    (r"(\w+)n't", r'\g<1> not'),
    (r"(\w+)'ve", r'\g<1> have'),
    (r"(\w+)'s", r'\g<1> is'),
    (r"(\w+)'re", r'\g<1> are'),
    (r"(\w+)'d", r'\g<1> would'),
]


class RegexpReplacer(object):
    """Rewrite text by applying a sequence of regex substitutions.

    Args:
        patterns: iterable of ``(regex, replacement)`` string pairs.
            Defaults to ``replacement_patterns``. The regexes are compiled
            once at construction time and applied in the given order.
    """

    def __init__(self, patterns=replacement_patterns):
        # Fixed this line - "patterns", not "pattern"
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        """Return ``text`` with every pattern substituted, in order.

        Each substitution runs on the output of the previous one, so an
        earlier rule can remove text that a later rule would have matched.
        """
        s = text
        for (pattern, repl) in self.patterns:
            s = pattern.sub(repl, s)
        # Fixed indentation here: return only after all patterns were applied
        return s


rep = RegexpReplacer()
# Parenthesised form works as a Python 2 statement and a Python 3 call.
print(rep.replace("can't is a contradicton"))
import re
# Ordered (regex, replacement) pairs for expanding English contractions.
# replace() below walks this list in order, so the specific irregular forms
# listed first are rewritten before the generic suffix rules are tried.
replacement_patterns = [
(r'won\'t', 'will not'),
(r'can\'t', 'cannot'),
(r'i\'m', 'i am'),
(r'ain\'t', 'is not'),
# In the replacements below, \g<1> re-inserts the text captured by (\w+).
(r'(\w+)\'ll', '\g<1> will'),
(r'(\w+)n\'t', '\g<1> not'),
(r'(\w+)\'ve', '\g<1> have'),
(r'(\w+)\'s', '\g<1> is'),
(r'(\w+)\'re', '\g<1> are'),
(r'(\w+)\'d', '\g<1> would')
]
# Applies a list of (regex, replacement) pairs to a text, one after another.
class RegexpReplacer(object):
# Pre-compiles every regex in `patterns` and stores (compiled, replacement) pairs.
def __init__(self, patterns=replacement_patterns):
# NOTE(review): the comprehension iterates over `pattern`, but the parameter
# is named `patterns`; no name `pattern` is defined in the visible code at
# this point, so this presumably raises NameError when the class is instantiated.
self.patterns = [(re.compile(regex), repl) for (regex, repl)
in pattern]
# Returns `text` after substituting each stored pattern in turn.
def replace(self, text):
s = text
for (pattern, repl) in self.patterns:
# `count` (number of substitutions made) is never used.
(s, count) = re.subn(pattern, repl, s)
# NOTE(review): indentation was lost in this listing; `return s` has to sit
# after the loop (not inside it) for every pattern to be applied.
return s
rep=RegexpReplacer()
# Python 2 print statement.
print rep.replace("can't is a contradicton")
我从 Jacob Perkins 的《Python Text Processing with NLTK 2.0 Cookbook》一书中复制了这段代码
但是我的预期输出是: cannot is a contradicton
实际输出为: can't is a contradicton
我无法找出代码中的错误。
要么使用原始字符串,要么转义引号,但不要两者同时使用。
>>> print r'won\'t'
won\'t
>>> print 'won\'t'
won't
或者,如果您更喜欢原始字符串:
>>> print r"won't"
won't
您的代码存在一些缩进问题和拼写错误 - 我不太确定解释器是如何为您提供任何输出的。在我修复这些问题后,我得到了您预期的输出。
import re
# Ordered (regex, replacement) pairs for expanding English contractions.
# Irregular forms come first so they are rewritten before the generic suffix
# rules are tried. Every string is a raw literal: the patterns are quoted
# with double quotes so the apostrophe needs no escaping, and the
# replacement templates keep "\g<1>" (a reference to the captured word)
# without relying on Python passing an unknown escape sequence through.
replacement_patterns = [
    (r"won't", r'will not'),
    (r"can't", r'cannot'),
    (r"i'm", r'i am'),
    (r"ain't", r'is not'),
    (r"(\w+)'ll", r'\g<1> will'),
    (r"(\w+)n't", r'\g<1> not'),
    (r"(\w+)'ve", r'\g<1> have'),
    (r"(\w+)'s", r'\g<1> is'),
    (r"(\w+)'re", r'\g<1> are'),
    (r"(\w+)'d", r'\g<1> would'),
]


class RegexpReplacer(object):
    """Rewrite text by applying a sequence of regex substitutions.

    Args:
        patterns: iterable of ``(regex, replacement)`` string pairs.
            Defaults to ``replacement_patterns``. The regexes are compiled
            once at construction time and applied in the given order.
    """

    def __init__(self, patterns=replacement_patterns):
        # Fixed this line - "patterns", not "pattern"
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        """Return ``text`` with every pattern substituted, in order.

        Each substitution runs on the output of the previous one, so an
        earlier rule can remove text that a later rule would have matched.
        """
        s = text
        for (pattern, repl) in self.patterns:
            s = pattern.sub(repl, s)
        # Fixed indentation here: return only after all patterns were applied
        return s


rep = RegexpReplacer()
# Parenthesised form works as a Python 2 statement and a Python 3 call.
print(rep.replace("can't is a contradicton"))