使用目录作为带有 python `textblob` 的 tf-idf 的输入
Using directory as input for tf-idf with python `textblob`
我正在尝试修改此代码(找到源代码 here)以遍历文件目录,而不是对输入进行硬编码。
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division, unicode_literals
import math
from textblob import TextBlob as tb
def tf(word, blob):
    """Term frequency: occurrences of `word` in `blob`, normalised by document length."""
    words = blob.words
    return words.count(word) / len(words)

def n_containing(word, bloblist):
    """Number of documents whose text contains `word`.

    NOTE(review): `word in document` is a substring test on the raw text,
    not a whole-word match — kept as-is so the scores match the original.
    """
    return sum(1 for document in bloblist if word in document)

def idf(word, bloblist):
    """Inverse document frequency, with +1 smoothing to avoid division by zero."""
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    """tf-idf score of `word` in `blob` relative to the corpus `bloblist`."""
    return tf(word, blob) * idf(word, bloblist)
# Corpus: three hard-coded example documents.
document1 = tb("""Today, the weather is 30 degrees in Celcius. It is really hot""")
document2 = tb("""I can't believe the traffic headed to the beach. It is really a circus out there.'""")
document3 = tb("""There are so many tolls on this road. I recommend taking the interstate.""")
bloblist = [document1, document2, document3]

# Score every word of every document against the corpus and print the
# words ranked from most to least distinctive.
for index, document in enumerate(bloblist, start=1):
    print("Document {}".format(index))
    word_scores = {w: tfidf(w, document, bloblist) for w in document.words}
    ranking = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
    for w, s in ranking:
        print("\t{}, {}".format(w, round(s * 100, 5)))
我想在一个目录中使用一个输入的txt文件,而不是每个都硬编码document
。
例如,假设我有一个目录 foo
,其中包含三个文件 file1
、file2
、file3
.
文件 1 包含 document1
包含的内容,即
文件 1:
Today, the weather is 30 degrees in Celcius. It is really hot
文件 2 包含 document2
包含的内容,即
I can't believe the traffic headed to the beach. It is really a circus out there.
文件 3 包含 document3
包含的内容,即
There are so many tolls on this road. I recommend taking the interstate.
我不得不使用 glob
来实现我想要的结果,我想出了以下代码改编,它正确识别了文件,但不像原始代码那样单独处理它们:
# Read every file in the directory into one string each. (Python 2 idiom:
# under Python 3, `map` returns a one-shot iterator, so `files` is already
# exhausted by the time the close-comprehension runs.)
file_names = glob.glob("/path/to/foo/*")
files = map(open,file_names)
documents = [file.read() for file in files]
[file.close() for file in files]
# BUG (the asker's problem): this wraps the list of strings in ANOTHER list,
# so the loop below sees a single "document" (a plain list), not one
# TextBlob per file. The fix is: bloblist = [tb(doc) for doc in documents]
bloblist = [documents]
for i, blob in enumerate(bloblist):
    print("Document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words:
        score_weight = score * 100
        print("\t{}, {}".format(word, round(score_weight, 5)))
如何使用 glob
维护每个文件的分数?
使用目录中的文件作为输入后的预期结果将与原始代码相同,此处的结果被截断为前 3 位:
Document 1
Celcius, 3.37888
30, 3.37888
hot, 3.37888
Document 2
there, 2.38509
out, 2.38509
headed, 2.38509
Document 3
on, 3.11896
this, 3.11896
many, 3.11896
一个类似的问题here没有完全解决问题。我想知道如何调用文件来计算 idf
但单独维护它们以计算完整的 tf-idf
?
在您的第一个代码示例中,您使用 tb()
的结果填充 bloblist
,在您的第二个示例中 - 输入 tb()
(仅字符串)。
尝试将 bloblist = [documents]
替换为 bloblist = map(tb, documents)
。
您也可以像这样 file_names = sorted(glob.glob("/path/to/foo/*"))
对文件名列表进行排序以使两个版本的输出匹配。
我不确定你到底想达到什么目的。
您可以有一个数组并将结果附加到该数组:
# Sketch only — not runnable as written (contains pseudocode and a
# Python 2 print statement). Idea: accumulate per-document weights.
scores = []
bloblist = [documents]  # NOTE(review): same list-wrapping bug as in the question
for i, blob in enumerate(bloblist):
    ... do your evaluation ..
    scores.append(score_weight)
print scores
@AnnaBonazzi 在这里提供了一个代码片段,https://gist.github.com/sloria/6407257,
# Build the corpus: one TextBlob per *.txt file in `folder`.
# NOTE(review): Python 2 only — `str.decode` does not exist on Python 3
# strings, and `os.chdir` permanently changes the working directory.
import os, glob
folder = "/path/to/folder/"
os.chdir(folder)
files = glob.glob("*.txt") # Makes a list of all files in folder
bloblist = []
for file1 in files:
    with open (file1, 'r') as f:
        data = f.read() # Reads document content into a string
        document = tb(data.decode("utf-8")) # Makes TextBlob object
        bloblist.append(document)
我修改了它供我使用 (Python 3):
import os, glob

bloblist = []

def make_corpus(input_dir):
    """Append one TextBlob per file found in `input_dir` to the global `bloblist`.

    Based on the snippet from https://gist.github.com/sloria/6407257.

    Fixes over the original: the `input_dir` argument is actually used
    (it was previously ignored in favour of a hard-coded "input"), the
    process working directory is no longer changed (`os.chdir` removed),
    and the file list is sorted so output order is deterministic.
    """
    global doc  ## NOTE(review): leaks the last filename for use outside; kept for compatibility
    pattern = os.path.join(input_dir, "*.*")  ## or "*.txt", etc.
    for doc in sorted(glob.glob(pattern)):
        with open(doc, 'r') as f:
            data = f.read()  ## read document content into a string
        document = tb(data)  ## make TextBlob object
        bloblist.append(document)
    print('len(bloblist):', len(bloblist))

make_corpus('input')  ## input directory 'input'
更新 1:
我个人在使用 Python glob 模块时遇到困难,因为我经常 (i) 文件名没有扩展名(例如 01),并且 (ii) 想要递归嵌套目录。
乍一看,"glob" 方法似乎是一个简单的解决方案。但是,当尝试遍历 glob 返回的文件时,我经常遇到错误(例如)
IsADirectoryError: [Errno 21] Is a directory: ...
当循环遇到 glob 返回的目录(不是文件)名称时。
在我看来,只要稍加努力,下面的方法就更可靠了:
import os

bloblist = []

def make_corpus(input_dir):
    """Walk `input_dir` recursively and append every line of every file to `bloblist`."""
    for dirpath, _subdirs, filenames in os.walk(input_dir):
        for name in filenames:
            path = os.path.join(dirpath, name)
            print('file:', path)
            with open(path) as handle:
                bloblist.extend(handle)  # one entry per line, newline kept
    print('len(bloblist):', len(bloblist), '\n')

make_corpus('input')  ## 'input' = input dir
更新 2:
最后一个方法(Linux shell find
命令,适用于 Python 3):
import sh ## pip install sh
def make_corpus(input_dir):
    '''Build a corpus from every file under `input_dir`, using the shell
    `find` command (via the third-party `sh` module).

    find (here) matches filenames, excludes directory names.
    '''
    corpus = []
    file_list = []
    #FILES = sh.find(input_dir, '-type', 'f', '-iname', '*.txt') ## find all .txt files
    FILES = sh.find(input_dir, '-type', 'f', '-iname', '*') ## find any file
    print('FILES:', FILES) ## caveat: files in FILES are '\n'-terminated ...
    for filename in FILES:
        #print(filename, end='')
        # file_list.append(filename) ## when printed, each filename ends with '\n'
        filename = filename.rstrip('\n') ## ... this addresses that issue
        file_list.append(filename)
        with open(filename) as f:
            #print('file:', filename)
            # ----------------------------------------
            # for general use:
            #for line in f:
                #print(line)
                #corpus.append(line)
            # ----------------------------------------
            # for this particular example (Question, above):
            data = f.read()
            document = tb(data)
            corpus.append(document)
    print('file_list:', file_list)
    print('corpus length (lines):', len(corpus))
    # NOTE(review): `corpus` holds TextBlob objects at this point, so
    # f.write(line) below presumably raises TypeError (write() expects str);
    # the commented-out line-based branch above seems to be what this
    # write was intended for — verify before use.
    with open('output/corpus', 'w') as f: ## write to file
        for line in corpus:
            f.write(line)
我正在尝试修改此代码(找到源代码 here)以遍历文件目录,而不是对输入进行硬编码。
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division, unicode_literals
import math
from textblob import TextBlob as tb
def tf(word, blob):
    """Term frequency: occurrences of `word` in `blob`, normalised by document length."""
    words = blob.words
    return words.count(word) / len(words)

def n_containing(word, bloblist):
    """Number of documents whose text contains `word`.

    NOTE(review): `word in document` is a substring test on the raw text,
    not a whole-word match — kept as-is so the scores match the original.
    """
    return sum(1 for document in bloblist if word in document)

def idf(word, bloblist):
    """Inverse document frequency, with +1 smoothing to avoid division by zero."""
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    """tf-idf score of `word` in `blob` relative to the corpus `bloblist`."""
    return tf(word, blob) * idf(word, bloblist)
# Corpus: three hard-coded example documents.
document1 = tb("""Today, the weather is 30 degrees in Celcius. It is really hot""")
document2 = tb("""I can't believe the traffic headed to the beach. It is really a circus out there.'""")
document3 = tb("""There are so many tolls on this road. I recommend taking the interstate.""")
bloblist = [document1, document2, document3]

# Score every word of every document against the corpus and print the
# words ranked from most to least distinctive.
for index, document in enumerate(bloblist, start=1):
    print("Document {}".format(index))
    word_scores = {w: tfidf(w, document, bloblist) for w in document.words}
    ranking = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
    for w, s in ranking:
        print("\t{}, {}".format(w, round(s * 100, 5)))
我想在一个目录中使用一个输入的txt文件,而不是每个都硬编码document
。
例如,假设我有一个目录 foo
,其中包含三个文件 file1
、file2
、file3
.
文件 1 包含 document1
包含的内容,即
文件 1:
Today, the weather is 30 degrees in Celcius. It is really hot
文件 2 包含 document2
包含的内容,即
I can't believe the traffic headed to the beach. It is really a circus out there.
文件 3 包含 document3
包含的内容,即
There are so many tolls on this road. I recommend taking the interstate.
我不得不使用 glob
来实现我想要的结果,我想出了以下代码改编,它正确识别了文件,但不像原始代码那样单独处理它们:
# Read every file in the directory into one string each. (Python 2 idiom:
# under Python 3, `map` returns a one-shot iterator, so `files` is already
# exhausted by the time the close-comprehension runs.)
file_names = glob.glob("/path/to/foo/*")
files = map(open,file_names)
documents = [file.read() for file in files]
[file.close() for file in files]
# BUG (the asker's problem): this wraps the list of strings in ANOTHER list,
# so the loop below sees a single "document" (a plain list), not one
# TextBlob per file. The fix is: bloblist = [tb(doc) for doc in documents]
bloblist = [documents]
for i, blob in enumerate(bloblist):
    print("Document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words:
        score_weight = score * 100
        print("\t{}, {}".format(word, round(score_weight, 5)))
如何使用 glob
维护每个文件的分数?
使用目录中的文件作为输入后的预期结果将与原始代码相同,此处的结果被截断为前 3 位:
Document 1
Celcius, 3.37888
30, 3.37888
hot, 3.37888
Document 2
there, 2.38509
out, 2.38509
headed, 2.38509
Document 3
on, 3.11896
this, 3.11896
many, 3.11896
一个类似的问题here没有完全解决问题。我想知道如何调用文件来计算 idf
但单独维护它们以计算完整的 tf-idf
?
在您的第一个代码示例中,您使用 tb()
的结果填充 bloblist
,在您的第二个示例中 - 输入 tb()
(仅字符串)。
尝试将 bloblist = [documents]
替换为 bloblist = map(tb, documents)
。
您也可以像这样 file_names = sorted(glob.glob("/path/to/foo/*"))
对文件名列表进行排序以使两个版本的输出匹配。
我不确定你到底想达到什么目的。 您可以有一个数组并将结果附加到该数组:
# Sketch only — not runnable as written (contains pseudocode and a
# Python 2 print statement). Idea: accumulate per-document weights.
scores = []
bloblist = [documents]  # NOTE(review): same list-wrapping bug as in the question
for i, blob in enumerate(bloblist):
    ... do your evaluation ..
    scores.append(score_weight)
print scores
@AnnaBonazzi 在这里提供了一个代码片段,https://gist.github.com/sloria/6407257,
# Build the corpus: one TextBlob per *.txt file in `folder`.
# NOTE(review): Python 2 only — `str.decode` does not exist on Python 3
# strings, and `os.chdir` permanently changes the working directory.
import os, glob
folder = "/path/to/folder/"
os.chdir(folder)
files = glob.glob("*.txt") # Makes a list of all files in folder
bloblist = []
for file1 in files:
    with open (file1, 'r') as f:
        data = f.read() # Reads document content into a string
        document = tb(data.decode("utf-8")) # Makes TextBlob object
        bloblist.append(document)
我修改了它供我使用 (Python 3):
import os, glob

bloblist = []

def make_corpus(input_dir):
    """Append one TextBlob per file found in `input_dir` to the global `bloblist`.

    Based on the snippet from https://gist.github.com/sloria/6407257.

    Fixes over the original: the `input_dir` argument is actually used
    (it was previously ignored in favour of a hard-coded "input"), the
    process working directory is no longer changed (`os.chdir` removed),
    and the file list is sorted so output order is deterministic.
    """
    global doc  ## NOTE(review): leaks the last filename for use outside; kept for compatibility
    pattern = os.path.join(input_dir, "*.*")  ## or "*.txt", etc.
    for doc in sorted(glob.glob(pattern)):
        with open(doc, 'r') as f:
            data = f.read()  ## read document content into a string
        document = tb(data)  ## make TextBlob object
        bloblist.append(document)
    print('len(bloblist):', len(bloblist))

make_corpus('input')  ## input directory 'input'
更新 1:
我个人在使用 Python glob 模块时遇到困难,因为我经常 (i) 文件名没有扩展名(例如 01),并且 (ii) 想要递归嵌套目录。
乍一看,"glob" 方法似乎是一个简单的解决方案。但是,当尝试遍历 glob 返回的文件时,我经常遇到错误(例如)
IsADirectoryError: [Errno 21] Is a directory: ...
当循环遇到 glob 返回的目录(不是文件)名称时。
在我看来,只要稍加努力,下面的方法就更可靠了:
import os

bloblist = []

def make_corpus(input_dir):
    """Walk `input_dir` recursively and append every line of every file to `bloblist`."""
    for dirpath, _subdirs, filenames in os.walk(input_dir):
        for name in filenames:
            path = os.path.join(dirpath, name)
            print('file:', path)
            with open(path) as handle:
                bloblist.extend(handle)  # one entry per line, newline kept
    print('len(bloblist):', len(bloblist), '\n')

make_corpus('input')  ## 'input' = input dir
更新 2:
最后一个方法(Linux shell find
命令,适用于 Python 3):
import sh ## pip install sh
def make_corpus(input_dir):
    '''Build a corpus from every file under `input_dir`, using the shell
    `find` command (via the third-party `sh` module).

    find (here) matches filenames, excludes directory names.
    '''
    corpus = []
    file_list = []
    #FILES = sh.find(input_dir, '-type', 'f', '-iname', '*.txt') ## find all .txt files
    FILES = sh.find(input_dir, '-type', 'f', '-iname', '*') ## find any file
    print('FILES:', FILES) ## caveat: files in FILES are '\n'-terminated ...
    for filename in FILES:
        #print(filename, end='')
        # file_list.append(filename) ## when printed, each filename ends with '\n'
        filename = filename.rstrip('\n') ## ... this addresses that issue
        file_list.append(filename)
        with open(filename) as f:
            #print('file:', filename)
            # ----------------------------------------
            # for general use:
            #for line in f:
                #print(line)
                #corpus.append(line)
            # ----------------------------------------
            # for this particular example (Question, above):
            data = f.read()
            document = tb(data)
            corpus.append(document)
    print('file_list:', file_list)
    print('corpus length (lines):', len(corpus))
    # NOTE(review): `corpus` holds TextBlob objects at this point, so
    # f.write(line) below presumably raises TypeError (write() expects str);
    # the commented-out line-based branch above seems to be what this
    # write was intended for — verify before use.
    with open('output/corpus', 'w') as f: ## write to file
        for line in corpus:
            f.write(line)