在函数中使用关键字参数使 n-gram 的生成成为可选的
Using keyword arguments in function to make generation of n-grams optional
我的 xml 文件的外观示例:
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="folia.xsl"?>
<FoLiA xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://ilk.uvt.nl/folia" xml:id="untitled" generator="libfolia-v0.10">
<metadata type="native">
<annotations>
<token-annotation annotator="ucto" annotatortype="auto" datetime="2017-04-17T14:50:04" set="tokconfig-nl"/>
<pos-annotation annotator="frog-mbpos-1.0" annotatortype="auto" datetime="2017-04-17T14:50:04" set="http://ilk.uvt.nl/folia/sets/frog-mbpos-cgn"/>
<lemma-annotation annotator="frog-mblem-1.1" annotatortype="auto" datetime="2017-04-17T14:50:04" set="http://ilk.uvt.nl/folia/sets/frog-mblem-nl"/>
<chunking-annotation annotator="frog-chunker-1.0" annotatortype="auto" datetime="2017-04-17T14:50:04" set="http://ilk.uvt.nl/folia/sets/frog-chunker-nl"/>
<entity-annotation annotator="frog-mwu-1.0" annotatortype="auto" datetime="2017-04-17T14:50:04" set="http://ilk.uvt.nl/folia/sets/frog-mwu-nl"/>
<entity-annotation annotator="frog-ner-1.0" annotatortype="auto" datetime="2017-04-17T14:50:04" set="http://ilk.uvt.nl/folia/sets/frog-ner-nl"/>
<morphological-annotation annotator="frog-mbma-1.0" annotatortype="auto" datetime="2017-04-17T14:50:04" set="http://ilk.uvt.nl/folia/sets/frog-mbma-nl"/>
<dependency-annotation annotator="frog-depparse-1.0" annotatortype="auto" set="http://ilk.uvt.nl/folia/sets/frog-depparse-nl"/>
</annotations>
</metadata>
<text xml:id="untitled.text">
<p xml:id="untitled.p.1">
<s xml:id="untitled.p.1.s.1">
<w xml:id="untitled.p.1.s.1.w.1" class="WORD">
<t>De</t>
<pos class="LID(bep,stan,rest)" confidence="0.999701" head="LID">
<feat class="bep" subset="lwtype"/>
<feat class="stan" subset="naamval"/>
<feat class="rest" subset="npagr"/>
</pos>
<lemma class="de"/>
<morphology>
<morpheme>
<t offset="0">de</t>
</morpheme>
</morphology>
</w>
我正在编写一个函数,从 xml 文件中生成 unigram、bigram 和 trigram(一元、二元和三元词组)。我想让各种 n-gram 成为可选项,这样您就可以选择是要所有 n-gram 还是只要 unigram。函数的结果是词 n-gram 的向量化相对频率。我尝试通过在参数中使用关键字参数(取值 True 和 False)来实现这一点。但我得到的是一个空字典,所以我一定做错了什么。下面是我目前的代码。有人可以告诉我哪里做错了吗?
import re
import xml.etree.ElementTree as ET
# Build a dict that maps keys of the form 'w<n>+<gram>' to count/total for each
# word n-gram collected from the given XML file; the three boolean keyword
# arguments select which n-gram sizes (1, 2, 3) are generated.
# NOTE(review): the indentation of this snippet was lost when it was pasted, so
# the nesting of the statements below (in particular the `for gram, count`
# loops) cannot be confirmed from here.
# NOTE(review): `ngrams` and `FreqDist` are neither imported nor defined in the
# visible code.
def word_ngrams(frogged_xmlfile, unigrams=True, bigrams=True, trigrams=True):
vector = {}
tree = ET.parse(frogged_xmlfile) #enter the xml tree
root = tree.getroot()
tokens = []
words = []
# Pattern used further down to decide which tokens are copied into `words`.
regex = re.compile(r'[^0-9] |[^(\.|\,|\?|\:|\;|\!)]')
# The tag names 'w' and 't' are given without a namespace. The sample document
# declares a default namespace on its root element, so ElementTree exposes the
# tags as '{http://ilk.uvt.nl/folia}w' and these lookups presumably match
# nothing, which would explain the empty result dict.
for node in root.iter('w'):
for w in node.findall('t'):
tokens.append(w.text)
for word in tokens:
if regex.search(word):
words.append(word)
# Unigrams are built from the filtered `words` list ...
if (unigrams):
for n in [1]: #unigrams
grams = ngrams(words, n)
fdist = FreqDist(grams)
total = sum(c for g,c in fdist.items())
for gram, count in fdist.items():
vector['w'+str(n)+'+'+' '.join(gram)] = count/total
# ... whereas bigrams and trigrams are built from the unfiltered `tokens` list.
if (bigrams):
for n in [2]: #bigrams
grams = ngrams(tokens, n)
fdist = FreqDist(grams)
total = sum(c for g,c in fdist.items())
for gram, count in fdist.items():
vector['w'+str(n)+'+'+' '.join(gram)] = count/total
if (trigrams):
for n in [3]: #trigrams
grams = ngrams(tokens, n)
fdist = FreqDist(grams)
total = sum(c for g,c in fdist.items())
for gram, count in fdist.items():
vector['w'+str(n)+'+'+' '.join(gram)] = count/total
return vector
# Example call: request unigrams only.
print(word_ngrams('romanfragment_frogged.xml', unigrams = True, bigrams = False, trigrams = False))
您的搜索忽略了文档的默认命名空间(namespace),因此它永远找不到匹配的标签。
你的正则表达式真的很糟糕 -
"[^0-9] " # not-a-digit, followed by space
"|" # OR
"[^(\.|\,|\?|\:|\;|\!)]" # bad syntax, but I think you mean not one of .,?:;!
它会接受任何后跟空格的标点符号(因为标点符号属于非数字),也会接受任何数字、其他字符或空白字符(因为它们属于非标点符号)!基本上唯一不匹配的是 "a string consisting entirely of punctuation characters"。
我猜你真正想要的是 "a string containing at least one letter and no non-letter characters",但请随时纠正我。
您的代码不包含 ngrams()
或 FreqDist()
,因此我无法对其进行测试。
for gram, count ...
的缩进看起来不正确 - 我认为应该再缩进一层。
你有很多不必要的重复代码。
试试这个:
# import re
import xml.etree.ElementTree as ET
# Prefix -> namespace-URI map passed to ElementTree lookups so that paths can
# be written as 'default:w' instead of '{http://ilk.uvt.nl/folia}w'.
# The URIs are the xmlns / xmlns:xlink values declared on the <FoLiA> root
# element of the sample document.
FOLIA_NAMESPACE = dict(
    default="http://ilk.uvt.nl/folia",
    xlink="http://www.w3.org/1999/xlink",
)
def is_word(s):
    """Return True if *s* is non-empty and consists only of alphabetic characters.

    ``str.isalpha`` is Unicode-aware, so accented letters count as letters.
    """
    # NOTE: an ASCII-only variant would be
    #     re.match("[A-Za-z]+$", s) is not None
    # which is stricter than isalpha(): it rejects letters outside A-Z/a-z.
    return s.isalpha()
def load_words(folia_xml_file, is_word=is_word, namespace=FOLIA_NAMESPACE):
    """Return the text of every word token in a FoLiA XML file.

    Args:
        folia_xml_file: Path or file object accepted by ``ET.parse``.
        is_word: Predicate called with each token's text; only tokens for
            which it returns a true value are kept.
        namespace: Prefix -> URI mapping used to resolve the ``default:``
            prefix in the search path.

    Returns:
        A list of token strings in document order.
    """
    root = ET.parse(folia_xml_file).getroot()
    # Only <t> elements that are direct children of a <w> are selected, so the
    # <t> nested inside <morphology>/<morpheme> is not picked up.
    tokens = root.findall(".//default:w/default:t", namespace)
    # An empty <t/> element has text None; skip it instead of handing None to
    # the predicate (the default predicate would raise AttributeError).
    return [t.text for t in tokens if t.text is not None and is_word(t.text)]
def make_ngram_vectors(words, n_values=(1, 2, 3)):
    """Return the relative frequency of every word n-gram in *words*.

    Args:
        words: Iterable of word strings, in text order.
        n_values: Iterable of n-gram sizes to generate (default 1, 2 and 3).

    Returns:
        A dict mapping ``"w<n>+<words joined by spaces>"`` to the n-gram's
        count divided by the total number of n-grams of that size. Sizes for
        which the input is too short contribute no entries.
    """
    # Local import keeps the function self-contained; Counter stands in for
    # the FreqDist helper that was never imported.
    from collections import Counter

    words = list(words)  # allow any iterable; slicing below needs a sequence
    vectors = {}
    for n in n_values:
        # Sliding window of width n: zip n views of the list, each shifted by
        # one position (stands in for the missing ngrams() helper).
        grams = zip(*(words[offset:] for offset in range(n)))
        fdist = Counter(grams)
        total = sum(fdist.values())
        for gram, count in fdist.items():
            key = "w{}+{}".format(n, " ".join(gram))
            vectors[key] = count / total
    return vectors
def main():
    """Load the sample file and print its unigram relative-frequency vector."""
    words = load_words("romanfragment_frogged.xml")
    vectors = make_ngram_vectors(words, [1])
    print(vectors)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
编辑:
如果您查看 xml 文件顶部的 <FoLiA>
标签,您将看到 xmlns=
(它定义了文档的默认命名空间,即有哪些标签可用)和 xmlns:xlink=
(备用的 XLink 命名空间,它定义了 xlink:href
和 xlink:show
等属性 - 请参阅 https://www.w3schools.com/xml/xml_xlink.asp)。
ElementTree 会把命名空间内联展开,因此您的标签看起来像 {http://ilk.uvt.nl/folia}w
。传入命名空间字典后,我们就可以使用更易读的格式,例如 default:w
。
要获得与原始函数相同的 input/output 格式,您可以使用包装函数,例如:
def word_ngrams(folia_xml_file, unigrams=True, bigrams=True, trigrams=True):
    """Return relative n-gram frequencies for a FoLiA XML file.

    Args:
        folia_xml_file: File passed on to ``load_words``.
        unigrams: Include 1-grams when true.
        bigrams: Include 2-grams when true.
        trigrams: Include 3-grams when true.

    Returns:
        The dict produced by ``make_ngram_vectors`` for the selected sizes.
    """
    # Condense the boolean switches into the list of n-gram sizes to generate.
    switches = ((1, unigrams), (2, bigrams), (3, trigrams))
    n_values = [n for n, wanted in switches if wanted]
    words = load_words(folia_xml_file)
    return make_ngram_vectors(words, n_values)
我的 xml 文件的外观示例:
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="folia.xsl"?>
<FoLiA xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://ilk.uvt.nl/folia" xml:id="untitled" generator="libfolia-v0.10">
<metadata type="native">
<annotations>
<token-annotation annotator="ucto" annotatortype="auto" datetime="2017-04-17T14:50:04" set="tokconfig-nl"/>
<pos-annotation annotator="frog-mbpos-1.0" annotatortype="auto" datetime="2017-04-17T14:50:04" set="http://ilk.uvt.nl/folia/sets/frog-mbpos-cgn"/>
<lemma-annotation annotator="frog-mblem-1.1" annotatortype="auto" datetime="2017-04-17T14:50:04" set="http://ilk.uvt.nl/folia/sets/frog-mblem-nl"/>
<chunking-annotation annotator="frog-chunker-1.0" annotatortype="auto" datetime="2017-04-17T14:50:04" set="http://ilk.uvt.nl/folia/sets/frog-chunker-nl"/>
<entity-annotation annotator="frog-mwu-1.0" annotatortype="auto" datetime="2017-04-17T14:50:04" set="http://ilk.uvt.nl/folia/sets/frog-mwu-nl"/>
<entity-annotation annotator="frog-ner-1.0" annotatortype="auto" datetime="2017-04-17T14:50:04" set="http://ilk.uvt.nl/folia/sets/frog-ner-nl"/>
<morphological-annotation annotator="frog-mbma-1.0" annotatortype="auto" datetime="2017-04-17T14:50:04" set="http://ilk.uvt.nl/folia/sets/frog-mbma-nl"/>
<dependency-annotation annotator="frog-depparse-1.0" annotatortype="auto" set="http://ilk.uvt.nl/folia/sets/frog-depparse-nl"/>
</annotations>
</metadata>
<text xml:id="untitled.text">
<p xml:id="untitled.p.1">
<s xml:id="untitled.p.1.s.1">
<w xml:id="untitled.p.1.s.1.w.1" class="WORD">
<t>De</t>
<pos class="LID(bep,stan,rest)" confidence="0.999701" head="LID">
<feat class="bep" subset="lwtype"/>
<feat class="stan" subset="naamval"/>
<feat class="rest" subset="npagr"/>
</pos>
<lemma class="de"/>
<morphology>
<morpheme>
<t offset="0">de</t>
</morpheme>
</morphology>
</w>
我正在编写一个函数,从 xml 文件中生成 unigram、bigram 和 trigram(一元、二元和三元词组)。我想让各种 n-gram 成为可选项,这样您就可以选择是要所有 n-gram 还是只要 unigram。函数的结果是词 n-gram 的向量化相对频率。我尝试通过在参数中使用关键字参数(取值 True 和 False)来实现这一点。但我得到的是一个空字典,所以我一定做错了什么。下面是我目前的代码。有人可以告诉我哪里做错了吗?
import re
import xml.etree.ElementTree as ET
# Build a dict that maps keys of the form 'w<n>+<gram>' to count/total for each
# word n-gram collected from the given XML file; the three boolean keyword
# arguments select which n-gram sizes (1, 2, 3) are generated.
# NOTE(review): the indentation of this snippet was lost when it was pasted, so
# the nesting of the statements below (in particular the `for gram, count`
# loops) cannot be confirmed from here.
# NOTE(review): `ngrams` and `FreqDist` are neither imported nor defined in the
# visible code.
def word_ngrams(frogged_xmlfile, unigrams=True, bigrams=True, trigrams=True):
vector = {}
tree = ET.parse(frogged_xmlfile) #enter the xml tree
root = tree.getroot()
tokens = []
words = []
# Pattern used further down to decide which tokens are copied into `words`.
regex = re.compile(r'[^0-9] |[^(\.|\,|\?|\:|\;|\!)]')
# The tag names 'w' and 't' are given without a namespace. The sample document
# declares a default namespace on its root element, so ElementTree exposes the
# tags as '{http://ilk.uvt.nl/folia}w' and these lookups presumably match
# nothing, which would explain the empty result dict.
for node in root.iter('w'):
for w in node.findall('t'):
tokens.append(w.text)
for word in tokens:
if regex.search(word):
words.append(word)
# Unigrams are built from the filtered `words` list ...
if (unigrams):
for n in [1]: #unigrams
grams = ngrams(words, n)
fdist = FreqDist(grams)
total = sum(c for g,c in fdist.items())
for gram, count in fdist.items():
vector['w'+str(n)+'+'+' '.join(gram)] = count/total
# ... whereas bigrams and trigrams are built from the unfiltered `tokens` list.
if (bigrams):
for n in [2]: #bigrams
grams = ngrams(tokens, n)
fdist = FreqDist(grams)
total = sum(c for g,c in fdist.items())
for gram, count in fdist.items():
vector['w'+str(n)+'+'+' '.join(gram)] = count/total
if (trigrams):
for n in [3]: #trigrams
grams = ngrams(tokens, n)
fdist = FreqDist(grams)
total = sum(c for g,c in fdist.items())
for gram, count in fdist.items():
vector['w'+str(n)+'+'+' '.join(gram)] = count/total
return vector
# Example call: request unigrams only.
print(word_ngrams('romanfragment_frogged.xml', unigrams = True, bigrams = False, trigrams = False))
您的搜索忽略了文档的默认命名空间(namespace),因此它永远找不到匹配的标签。
你的正则表达式真的很糟糕 -
"[^0-9] " # not-a-digit, followed by space "|" # OR "[^(\.|\,|\?|\:|\;|\!)]" # bad syntax, but I think you mean not one of .,?:;!
它会接受任何后跟空格的标点符号(因为标点符号属于非数字),也会接受任何数字、其他字符或空白字符(因为它们属于非标点符号)!基本上唯一不匹配的是 "a string consisting entirely of punctuation characters"。
我猜你真正想要的是 "a string containing at least one letter and no non-letter characters",但请随时纠正我。
您的代码不包含
ngrams()
或FreqDist()
,因此我无法对其进行测试。for gram, count ...
的缩进看起来不正确 - 我认为应该再缩进一层。你有很多不必要的重复代码。
试试这个:
# import re
import xml.etree.ElementTree as ET
# Prefix -> namespace-URI map passed to ElementTree lookups so that paths can
# be written as 'default:w' instead of '{http://ilk.uvt.nl/folia}w'.
# The URIs are the xmlns / xmlns:xlink values declared on the <FoLiA> root
# element of the sample document.
FOLIA_NAMESPACE = dict(
    default="http://ilk.uvt.nl/folia",
    xlink="http://www.w3.org/1999/xlink",
)
def is_word(s):
    """Return True if *s* is non-empty and consists only of alphabetic characters.

    ``str.isalpha`` is Unicode-aware, so accented letters count as letters.
    """
    # NOTE: an ASCII-only variant would be
    #     re.match("[A-Za-z]+$", s) is not None
    # which is stricter than isalpha(): it rejects letters outside A-Z/a-z.
    return s.isalpha()
def load_words(folia_xml_file, is_word=is_word, namespace=FOLIA_NAMESPACE):
    """Return the text of every word token in a FoLiA XML file.

    Args:
        folia_xml_file: Path or file object accepted by ``ET.parse``.
        is_word: Predicate called with each token's text; only tokens for
            which it returns a true value are kept.
        namespace: Prefix -> URI mapping used to resolve the ``default:``
            prefix in the search path.

    Returns:
        A list of token strings in document order.
    """
    root = ET.parse(folia_xml_file).getroot()
    # Only <t> elements that are direct children of a <w> are selected, so the
    # <t> nested inside <morphology>/<morpheme> is not picked up.
    tokens = root.findall(".//default:w/default:t", namespace)
    # An empty <t/> element has text None; skip it instead of handing None to
    # the predicate (the default predicate would raise AttributeError).
    return [t.text for t in tokens if t.text is not None and is_word(t.text)]
def make_ngram_vectors(words, n_values=(1, 2, 3)):
    """Return the relative frequency of every word n-gram in *words*.

    Args:
        words: Iterable of word strings, in text order.
        n_values: Iterable of n-gram sizes to generate (default 1, 2 and 3).

    Returns:
        A dict mapping ``"w<n>+<words joined by spaces>"`` to the n-gram's
        count divided by the total number of n-grams of that size. Sizes for
        which the input is too short contribute no entries.
    """
    # Local import keeps the function self-contained; Counter stands in for
    # the FreqDist helper that was never imported.
    from collections import Counter

    words = list(words)  # allow any iterable; slicing below needs a sequence
    vectors = {}
    for n in n_values:
        # Sliding window of width n: zip n views of the list, each shifted by
        # one position (stands in for the missing ngrams() helper).
        grams = zip(*(words[offset:] for offset in range(n)))
        fdist = Counter(grams)
        total = sum(fdist.values())
        for gram, count in fdist.items():
            key = "w{}+{}".format(n, " ".join(gram))
            vectors[key] = count / total
    return vectors
def main():
    """Load the sample file and print its unigram relative-frequency vector."""
    words = load_words("romanfragment_frogged.xml")
    vectors = make_ngram_vectors(words, [1])
    print(vectors)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
编辑:
如果您查看 xml 文件顶部的 <FoLiA>
标签,您将看到 xmlns=
(它定义了文档的默认命名空间,即有哪些标签可用)和 xmlns:xlink=
(备用的 XLink 命名空间,它定义了 xlink:href
和 xlink:show
等属性 - 请参阅 https://www.w3schools.com/xml/xml_xlink.asp)。
ElementTree 会把命名空间内联展开,因此您的标签看起来像 {http://ilk.uvt.nl/folia}w
。传入命名空间字典后,我们就可以使用更易读的格式,例如 default:w
。
要获得与原始函数相同的 input/output 格式,您可以使用包装函数,例如:
def word_ngrams(folia_xml_file, unigrams=True, bigrams=True, trigrams=True):
    """Return relative n-gram frequencies for a FoLiA XML file.

    Args:
        folia_xml_file: File passed on to ``load_words``.
        unigrams: Include 1-grams when true.
        bigrams: Include 2-grams when true.
        trigrams: Include 3-grams when true.

    Returns:
        The dict produced by ``make_ngram_vectors`` for the selected sizes.
    """
    # Condense the boolean switches into the list of n-gram sizes to generate.
    switches = ((1, unigrams), (2, bigrams), (3, trigrams))
    n_values = [n for n, wanted in switches if wanted]
    words = load_words(folia_xml_file)
    return make_ngram_vectors(words, n_values)