python 语料库分析
Corpus analysis with python
我是自然语言处理的新生,有一个简单的语料分析任务。给定一个输入文件 (MovieCorpus.txt),我们被分配计算以下统计数据:
- 句子、标记(token)和类型(词元,lemma)的数量
- 句子长度、类型、词性的分布
import nltk
import spacy as sp
from nltk import word_tokenize
# Set up the spaCy model. Named entities (doc.ents) are never read below, so
# the NER component is disabled to avoid paying for it on every sentence.
nlp = sp.load('en_core_web_sm', disable=['ner'])

# Movie corpus: every line of the file is treated as one sentence.
with open('MovieCorpus.txt', 'r') as f:
    read_data = f.read().splitlines()

# Tokenize, POS, Lemma.
# Each list holds one entry per input line: a list of strings for a parsed
# doc, or None otherwise, so the three lists stay index-aligned with
# read_data.
tokens = []
lemma = []
pos = []
for doc in nlp.pipe(read_data):
    # NOTE(review): Doc.is_parsed is deprecated in spaCy v3 in favour of
    # Doc.has_annotation("DEP") -- confirm against the installed version.
    if doc.is_parsed:
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        tokens.append(None)
        lemma.append(None)
        pos.append(None)

ls = len(read_data)
print("The amount of sentences is %d:" % ls)

# `tokens` is a list of per-sentence lists, so len(tokens) would just repeat
# the sentence count; sum the inner lengths to count actual tokens.
lt = sum(len(sent) for sent in tokens if sent is not None)
print("The amount of tokens is %d:" % lt)

# Types are the distinct lemmas across the whole corpus, hence the set.
ll = len({lem for sent in lemma if sent is not None for lem in sent})
print("The amount of lemmas is %d:" % ll)
以上是我为回答这些问题所做的尝试,但由于文件非常大(超过 300,000 个句子),分析需要很长时间。我做错了什么吗?我应该使用 NLTK 而不是 spaCy 吗?
import pandas as pd
import nltk
from nltk import word_tokenize
# Movie corpus: read the whole file and split it into lines.
with open('MovieCorpus.txt', 'r') as f:
    read_data = f.read().splitlines()

df = pd.DataFrame({"text": read_data})  # Assuming your data has no header

# Work on the first 10 rows only. The sample must come from `df`: the name
# `data` does not exist yet at this point, so `data.head(10)` raised
# NameError. `.copy()` makes `data` an independent frame so that adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
data = df.head(10).copy()

# Shared helpers used by lemmatize_text.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    """Return the lemma of every whitespace-separated piece of *text*.

    Uses the module-level ``w_tokenizer`` to split the string and the
    module-level ``lemmatizer`` to lemmatize each piece, preserving order.
    """
    lemmas = []
    for word in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas
# Lemmas computed from the whitespace-split text of each row.
data['lemma'] = data["text"].apply(lemmatize_text)

# NLTK word tokens, then the (word, tag) pairs produced by the POS tagger.
data["tokens"] = data["text"].apply(nltk.word_tokenize)
data["posR"] = data["tokens"].apply(nltk.pos_tag)

# Keep only the tag from every (word, tag) pair, one list per row.
tags = []
for tagged_sentence in data["posR"].to_list():
    tags.append([tag for _word, tag in tagged_sentence])
data["pos"] = tags

print(data)
从现在开始,您应该能够自己完成所有其他任务。
我是自然语言处理的新生,有一个简单的语料分析任务。给定一个输入文件 (MovieCorpus.txt),我们被分配计算以下统计数据:
- 句子、标记(token)和类型(词元,lemma)的数量
- 句子长度、类型、词性的分布
import nltk
import spacy as sp
from nltk import word_tokenize
# Set up the spaCy model. Named entities (doc.ents) are never read below, so
# the NER component is disabled to avoid paying for it on every sentence.
nlp = sp.load('en_core_web_sm', disable=['ner'])

# Movie corpus: every line of the file is treated as one sentence.
with open('MovieCorpus.txt', 'r') as f:
    read_data = f.read().splitlines()

# Tokenize, POS, Lemma.
# Each list holds one entry per input line: a list of strings for a parsed
# doc, or None otherwise, so the three lists stay index-aligned with
# read_data.
tokens = []
lemma = []
pos = []
for doc in nlp.pipe(read_data):
    # NOTE(review): Doc.is_parsed is deprecated in spaCy v3 in favour of
    # Doc.has_annotation("DEP") -- confirm against the installed version.
    if doc.is_parsed:
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        tokens.append(None)
        lemma.append(None)
        pos.append(None)

ls = len(read_data)
print("The amount of sentences is %d:" % ls)

# `tokens` is a list of per-sentence lists, so len(tokens) would just repeat
# the sentence count; sum the inner lengths to count actual tokens.
lt = sum(len(sent) for sent in tokens if sent is not None)
print("The amount of tokens is %d:" % lt)

# Types are the distinct lemmas across the whole corpus, hence the set.
ll = len({lem for sent in lemma if sent is not None for lem in sent})
print("The amount of lemmas is %d:" % ll)
以上是我为回答这些问题所做的尝试,但由于文件非常大(超过 300,000 个句子),分析需要很长时间。我做错了什么吗?我应该使用 NLTK 而不是 spaCy 吗?
import pandas as pd
import nltk
from nltk import word_tokenize
# Movie corpus: read the whole file and split it into lines.
with open('MovieCorpus.txt', 'r') as f:
    read_data = f.read().splitlines()

df = pd.DataFrame({"text": read_data})  # Assuming your data has no header

# Work on the first 10 rows only. The sample must come from `df`: the name
# `data` does not exist yet at this point, so `data.head(10)` raised
# NameError. `.copy()` makes `data` an independent frame so that adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
data = df.head(10).copy()

# Shared helpers used by lemmatize_text.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    """Return the lemma of every whitespace-separated piece of *text*.

    Uses the module-level ``w_tokenizer`` to split the string and the
    module-level ``lemmatizer`` to lemmatize each piece, preserving order.
    """
    lemmas = []
    for word in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas
# Lemmas computed from the whitespace-split text of each row.
data['lemma'] = data["text"].apply(lemmatize_text)

# NLTK word tokens, then the (word, tag) pairs produced by the POS tagger.
data["tokens"] = data["text"].apply(nltk.word_tokenize)
data["posR"] = data["tokens"].apply(nltk.pos_tag)

# Keep only the tag from every (word, tag) pair, one list per row.
tags = []
for tagged_sentence in data["posR"].to_list():
    tags.append([tag for _word, tag in tagged_sentence])
data["pos"] = tags

print(data)
从现在开始,您应该能够自己完成所有其他任务。