使用 NLTK,如何在文本中搜索概念
Using NLTK, how to search for concepts in a text
我对 Python 和 NLTK 都是新手。我想用 NLTK 查看某些概念在文本中的出现情况。我有一个 CSV 文件,看起来像这样(见图片)。
我想统计各个概念(例如自由、勇气以及所有其他概念)在文本中出现的频率。我还想知道如何让代码同时查找二元组(bigram)和三元组(trigram)。但是,我下面的代码只能在文本中查找单个单词列表(即 preps.txt 这样的文件)中的词。
我期望的输出是这样的:
概念 = 文本中的频率,例如:自由 = 10,勇气 = 20
# Asker's original script: build a frequency distribution over corpus tokens and
# print the count of every token listed in preps.txt.
# NOTE(review): the indentation of this snippet was lost, so which statements
# belong to the two `for` loops below cannot be determined from here; confirm
# against the original post before running.
import nltk
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/Users/Muhsa/Myfolder/Concepts' #this is where the texts I want to study are located
# Build a corpus reader over corpus_root; '.*' is the file-id pattern.
Concepts= PlaintextCorpusReader(corpus_root, '.*')
# The return value of this call is discarded.
Concepts.fileids()
for fileid in Concepts.fileids():
text3 = Concepts.words(fileid)
from nltk import word_tokenize
from nltk import FreqDist
# text3 is assigned a second time here with the same expression as above.
text3 = Concepts.words(fileid)
# The file handle is opened without a `with` block and is never closed in this snippet.
preps = open('preps.txt', encoding="utf-8")
rawpreps = preps.read() #preps refer to the file that has the list of words
tokens = word_tokenize(rawpreps)
texty = nltk.Text(tokens)
# Corpus tokens are lower-cased before counting, while the preps.txt tokens are
# looked up as-is below, so the lookup is case-sensitive on the preps.txt side.
fdist = nltk.FreqDist(w.lower() for w in text3)
for m in texty:
# end=' ' keeps all "word: count" pairs on a single line separated by spaces.
print(m + ':', fdist[m], end=' ')
我稍微重新组织了您的代码。我假设每个概念各对应一个词表文件,并且 'preps.txt' 只包含与“勇气”相关的词,而不包含其他概念的词。
希望通俗易懂
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk import word_tokenize
from nltk import FreqDist

# Load the courage vocabulary; preps.txt is assumed to hold one word per line.
with open('preps.txt', encoding="utf-8") as file:
    content = file.read()

# Keep the words in a set so every membership test below is O(1), and skip
# blank lines / surrounding whitespace so that a trailing newline in the file
# does not add an empty entry to the vocabulary.
courage_words = {line.strip() for line in content.splitlines() if line.strip()}
# Load the freedom and development words in the same fashion.

# Load the corpus from the folder that contains the texts to study.
corpus_root = '/Users/Muhsa/Myfolder/Concepts'
corpus = PlaintextCorpusReader(corpus_root, '.*')

# Count the tokens of the whole corpus that are also in the courage vocabulary.
# The comparison is case-sensitive: tokens must match the vocabulary exactly.
courage_freq = sum(1 for w in corpus.words() if w in courage_words)
print('Corpus contains {} courage words'.format(courage_freq))

# Repeat the same count separately for each file of the corpus.
for file_id in corpus.fileids():
    file_freq = sum(1 for w in corpus.words(file_id) if w in courage_words)
    print(file_id, file_freq)
或更好
from collections import Counter

import pandas as pd

# Option 1: load each concept vocabulary from its own text file (one word per
# line) into a dictionary keyed by the concept name.
concept_voc = {}
for file_path in ['courage.txt', 'freedom.txt', 'development.txt']:
    concept_name = file_path.replace('.txt', '')
    with open(file_path, encoding="utf-8") as f:
        # Skip blank lines and surrounding whitespace so a trailing newline
        # does not add an empty entry.
        voc = [line.strip() for line in f.read().splitlines() if line.strip()]
    concept_voc[concept_name] = voc

# Option 2 (alternative to option 1): load the vocabularies from a CSV file in
# which each column is one vocabulary and the header row holds the concept name.
# Keep only one of the two options; as written, the CSV vocabularies replace
# the ones loaded from the text files above.
df = pd.read_csv('to_dict.csv')
# Empty cells are read as NaN (columns of different lengths), so drop them.
concept_voc = {name: column.dropna().tolist() for name, column in df.items()}
# concept_voc['courage'] returns the list of courage words

# Tally every token of the corpus once instead of re-reading the corpus for
# each concept. `corpus` is the PlaintextCorpusReader built in the previous snippet.
word_counts = Counter(corpus.words())

# For each concept, add up the occurrences of its vocabulary words. set(voc)
# prevents a word listed twice from being counted twice; a Counter returns 0
# for words that never occur in the corpus.
for concept, voc in concept_voc.items():
    corpus_freq = sum(word_counts[w] for w in set(voc))
    print(concept, '=', corpus_freq)
我对 Python 和 NLTK 都是新手。我想用 NLTK 查看某些概念在文本中的出现情况。我有一个 CSV 文件,看起来像这样(见图片)。
我想统计各个概念(例如自由、勇气以及所有其他概念)在文本中出现的频率。我还想知道如何让代码同时查找二元组(bigram)和三元组(trigram)。但是,我下面的代码只能在文本中查找单个单词列表(即 preps.txt 这样的文件)中的词。
我期望的输出是这样的: 概念 = 文本中的频率,例如:自由 = 10,勇气 = 20
# Asker's original script: build a frequency distribution over corpus tokens and
# print the count of every token listed in preps.txt.
# NOTE(review): the indentation of this snippet was lost, so which statements
# belong to the two `for` loops below cannot be determined from here; confirm
# against the original post before running.
import nltk
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/Users/Muhsa/Myfolder/Concepts' #this is where the texts I want to study are located
# Build a corpus reader over corpus_root; '.*' is the file-id pattern.
Concepts= PlaintextCorpusReader(corpus_root, '.*')
# The return value of this call is discarded.
Concepts.fileids()
for fileid in Concepts.fileids():
text3 = Concepts.words(fileid)
from nltk import word_tokenize
from nltk import FreqDist
# text3 is assigned a second time here with the same expression as above.
text3 = Concepts.words(fileid)
# The file handle is opened without a `with` block and is never closed in this snippet.
preps = open('preps.txt', encoding="utf-8")
rawpreps = preps.read() #preps refer to the file that has the list of words
tokens = word_tokenize(rawpreps)
texty = nltk.Text(tokens)
# Corpus tokens are lower-cased before counting, while the preps.txt tokens are
# looked up as-is below, so the lookup is case-sensitive on the preps.txt side.
fdist = nltk.FreqDist(w.lower() for w in text3)
for m in texty:
# end=' ' keeps all "word: count" pairs on a single line separated by spaces.
print(m + ':', fdist[m], end=' ')
我稍微重新组织了您的代码。我假设每个概念各对应一个词表文件,并且 'preps.txt' 只包含与“勇气”相关的词,而不包含其他概念的词。
希望通俗易懂
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk import word_tokenize
from nltk import FreqDist

# Load the courage vocabulary; preps.txt is assumed to hold one word per line.
with open('preps.txt', encoding="utf-8") as file:
    content = file.read()

# Keep the words in a set so every membership test below is O(1), and skip
# blank lines / surrounding whitespace so that a trailing newline in the file
# does not add an empty entry to the vocabulary.
courage_words = {line.strip() for line in content.splitlines() if line.strip()}
# Load the freedom and development words in the same fashion.

# Load the corpus from the folder that contains the texts to study.
corpus_root = '/Users/Muhsa/Myfolder/Concepts'
corpus = PlaintextCorpusReader(corpus_root, '.*')

# Count the tokens of the whole corpus that are also in the courage vocabulary.
# The comparison is case-sensitive: tokens must match the vocabulary exactly.
courage_freq = sum(1 for w in corpus.words() if w in courage_words)
print('Corpus contains {} courage words'.format(courage_freq))

# Repeat the same count separately for each file of the corpus.
for file_id in corpus.fileids():
    file_freq = sum(1 for w in corpus.words(file_id) if w in courage_words)
    print(file_id, file_freq)
或更好
from collections import Counter

import pandas as pd

# Option 1: load each concept vocabulary from its own text file (one word per
# line) into a dictionary keyed by the concept name.
concept_voc = {}
for file_path in ['courage.txt', 'freedom.txt', 'development.txt']:
    concept_name = file_path.replace('.txt', '')
    with open(file_path, encoding="utf-8") as f:
        # Skip blank lines and surrounding whitespace so a trailing newline
        # does not add an empty entry.
        voc = [line.strip() for line in f.read().splitlines() if line.strip()]
    concept_voc[concept_name] = voc

# Option 2 (alternative to option 1): load the vocabularies from a CSV file in
# which each column is one vocabulary and the header row holds the concept name.
# Keep only one of the two options; as written, the CSV vocabularies replace
# the ones loaded from the text files above.
df = pd.read_csv('to_dict.csv')
# Empty cells are read as NaN (columns of different lengths), so drop them.
concept_voc = {name: column.dropna().tolist() for name, column in df.items()}
# concept_voc['courage'] returns the list of courage words

# Tally every token of the corpus once instead of re-reading the corpus for
# each concept. `corpus` is the PlaintextCorpusReader built in the previous snippet.
word_counts = Counter(corpus.words())

# For each concept, add up the occurrences of its vocabulary words. set(voc)
# prevents a word listed twice from being counted twice; a Counter returns 0
# for words that never occur in the corpus.
for concept, voc in concept_voc.items():
    corpus_freq = sum(word_counts[w] for w in set(voc))
    print(concept, '=', corpus_freq)