比较、删除和计算文本文件中的单词
Compare, remove, and count words in Text file
我想比较两个文本文件 f1.txt 和 f2.txt,从 f2.txt 中删除两个文件中共有的常用词,并将新的 f2.txt 按词频降序排序。
我的做法:
- 列出 f1.txt 和 f2.txt 的单词。
- 从文本输入中删除不需要的字符。
- 比较两个列表并从 f2.txt 生成的列表中删除常用词
- 按频率对 f2.txt 生成的列表中的单词进行排序
# Question's snippet as posted (indentation was lost in extraction; it also
# needs `import sys`, `import re` and `from collections import Counter`).
with open(sys.argv[1]) as f1,open(sys.argv[2]) as f2:
# f2 holds the passage to count; f1 holds the common words to remove.
passage = f2.read()
common = f1.read()
words = re.findall(r'\w+', passage)
common_words = re.findall(r'\w+', common)
# NOTE(review): the comprehension's loop variable shadows the list `words`.
passage_text = [words.lower() for words in words]
# BUG (the subject of the question): `final` is a *set*, so each word
# occurs exactly once in it...
final = set(passage_text) - set(common_words)
# ...therefore Counter over that set reports every frequency as 1.
word_count = Counter(final)
for word, count in word_count.items():
print(word, ":", count)
我希望输出是这样的:
Foo: 12
Bar: 11
Baz: 3
Longword: 1
但我得到每个单词的计数频率为 1
您的 final 值是一个集合,仅包含去重后的单词(每个单词只出现一次),这就是 Counter 对每个单词都只显示 1 次的原因。您需要用这个集合来过滤 passage_text,并把过滤后的列表传给 Counter:
import re
from collections import Counter

# Sample passage to count and the common words to exclude from it.
passage = '''
Foo and Bar and Baz or Longword
Bar or Baz
Foo foo foo
'''
common = '''and or'''

words = re.findall(r'\w+', passage)
common_words = re.findall(r'\w+', common)
# Lower-case every token.  (The original comprehension used `words` as the
# loop variable, shadowing the list it iterates over.)
passage_text = [w.lower() for w in words]
# Unique passage words minus the common ones -- used only as a filter.
final_set = set(passage_text) - set(common_words)
# Count the *filtered list*, not the set itself, so real frequencies survive.
word_count = Counter([w for w in passage_text if w in final_set])
for word, count in sorted(word_count.items(), key=lambda k: -k[1]):  # or word_count.most_common()
    print(word, ":", count)
打印:
foo : 4
bar : 2
baz : 2
longword : 1
这里有两种计算文本文件中字数的方法。
from re import split
def process_line(words, word_dict):
    """Tally each word in *words* into *word_dict* (mutated in place)."""
    for word in words:
        # dict.get collapses the if/else membership check into one lookup.
        word_dict[word] = word_dict.get(word, 0) + 1
def process_dict(word_dict):
    """Return a list of (count, word) pairs sorted ascending by count, then word."""
    return sorted((count, word) for word, count in word_dict.items())
def format_print(input_list, reverse, word_num):
    """Print *input_list* of (count, word) pairs as a table.

    When *reverse* is True the list is first sorted in place, highest
    counts first.  *word_num* is shown in the table header.
    """
    if reverse:
        input_list.sort(reverse=True)
    header = "[Unique Words: %s]" % word_num
    rule = "-" * 35
    print("\n", header.center(35, "="))
    print(rule + "\n", "%-16s %s %16s" % ("Word", "|", "Count"), "\n", rule)
    for freq, term in input_list:
        print("%-16s %s %16d" % (term, "|", freq))
def word_count(_file, max_to_min=False):
    """Count words in the text file *_file* and print a frequency table.

    _file: path of the file to read.
    max_to_min: when True, highest counts are printed first.
    """
    word_dict = {}
    # `with` closes the handle even on error.  The original "rU" mode was
    # deprecated and is rejected since Python 3.11; universal newlines are
    # the default in text mode anyway.
    with open(_file) as txt:
        for line in txt:
            # The original test `!= ("\n" or None)` only ever compared to
            # "\n" (the `or` evaluates first); skipping whitespace-only
            # lines is the actual intent.
            if line.strip():
                process_line(filter(None, split("[^a-zA-Z']+", line.lower())), word_dict)
    final_list = process_dict(word_dict)
    format_print(final_list, max_to_min, len(word_dict))

# Raw string: "\y" was an invalid escape sequence in the original literal.
word_count(r"C:\your_path_here\Test.txt", True)
#########################################################
from collections import Counter
import re
def openfile(filename):
    """Return the entire text content of *filename*."""
    # "r", not "r+": the file is only read.  `with` guarantees the handle
    # is closed; the original also shadowed the builtin `str`.
    with open(filename, "r") as fh:
        return fh.read()
def removegarbage(text):
    """Lower-case *text* after collapsing runs of non-word chars to a space.

    The original parameter was named `str`, shadowing the builtin.
    """
    # Replace one or more non-word (non-alphanumeric) chars with a space.
    return re.sub(r'\W+', ' ', text).lower()
def getwordbins(words):
    """Return a Counter mapping each word in *words* to its frequency."""
    # Counter's constructor performs the counting loop at C speed;
    # the hand-rolled `for word in words: cnt[word] += 1` is redundant.
    return Counter(words)
def main(filename, topwords):
    """Print the *topwords* most common words of *filename*, one per line."""
    txt = openfile(filename)
    txt = removegarbage(txt)
    # split() with no argument drops empty strings; the original
    # split(' ') counted a bogus '' token whenever the cleaned text had
    # leading/trailing spaces.
    words = txt.split()
    bins = getwordbins(words)
    for key, value in bins.most_common(topwords):
        print(key, value)

# Raw string: "\y" was an invalid escape sequence in the original literal.
main(r'C:\your_path_here\Test.txt', 500)
这是一种比较两个文本文件并保留共同元素的方法。
# Keep the lines common to both files.  Raw strings are required: in the
# original literals, "\t" in '...\text1.txt' silently embedded a TAB
# character, so the paths were wrong.
with open(r'C:\your_path_here\text1.txt', 'r') as file1, \
        open(r'C:\your_path_here\text2.txt', 'r') as file2:
    same = set(file1).intersection(file2)
same.discard('\n')
with open(r'C:\your_path_here\some_output_file.txt', 'w') as file_out:
    file_out.writelines(same)

# For differences, use the code below:
with open(r'C:\your_path_here\text1.txt', 'r') as file1, \
        open(r'C:\your_path_here\text2.txt', 'r') as file2:
    diff = set(file1).symmetric_difference(file2)
diff.discard('\n')
with open(r'C:\your_path_here\some_output_file.txt', 'w') as file_out:
    file_out.writelines(diff)
我想比较两个文本文件 f1.txt 和 f2.txt,从 f2.txt 中删除两个文件中共有的常用词,并将新的 f2.txt 按词频降序排序。
我的做法:
- 列出 f1.txt 和 f2.txt 的单词。
- 从文本输入中删除不需要的字符。
- 比较两个列表并从 f2.txt 生成的列表中删除常用词
- 按频率 对f2.txt生成的列表中的单词进行排序
# Question's snippet as posted (indentation was lost in extraction; it also
# needs `import sys`, `import re` and `from collections import Counter`).
with open(sys.argv[1]) as f1,open(sys.argv[2]) as f2:
# f2 holds the passage to count; f1 holds the common words to remove.
passage = f2.read()
common = f1.read()
words = re.findall(r'\w+', passage)
common_words = re.findall(r'\w+', common)
# NOTE(review): the comprehension's loop variable shadows the list `words`.
passage_text = [words.lower() for words in words]
# BUG (the subject of the question): `final` is a *set*, so each word
# occurs exactly once in it...
final = set(passage_text) - set(common_words)
# ...therefore Counter over that set reports every frequency as 1.
word_count = Counter(final)
for word, count in word_count.items():
print(word, ":", count)
我希望输出是这样的:
Foo: 12
Bar: 11
Baz: 3
Longword: 1
但我得到每个单词的计数频率为 1
您的 final 值是一个集合,仅包含去重后的单词(每个单词只出现一次),这就是 Counter 对每个单词都只显示 1 次的原因。您需要用这个集合来过滤 passage_text,并把过滤后的列表传给 Counter:
import re
from collections import Counter

# Sample passage to count and the common words to exclude from it.
passage = '''
Foo and Bar and Baz or Longword
Bar or Baz
Foo foo foo
'''
common = '''and or'''

words = re.findall(r'\w+', passage)
common_words = re.findall(r'\w+', common)
# Lower-case every token.  (The original comprehension used `words` as the
# loop variable, shadowing the list it iterates over.)
passage_text = [w.lower() for w in words]
# Unique passage words minus the common ones -- used only as a filter.
final_set = set(passage_text) - set(common_words)
# Count the *filtered list*, not the set itself, so real frequencies survive.
word_count = Counter([w for w in passage_text if w in final_set])
for word, count in sorted(word_count.items(), key=lambda k: -k[1]):  # or word_count.most_common()
    print(word, ":", count)
打印:
foo : 4
bar : 2
baz : 2
longword : 1
这里有两种计算文本文件中字数的方法。
from re import split
def process_line(words, word_dict):
    """Tally each word in *words* into *word_dict* (mutated in place)."""
    for word in words:
        # dict.get collapses the if/else membership check into one lookup.
        word_dict[word] = word_dict.get(word, 0) + 1
def process_dict(word_dict):
    """Return a list of (count, word) pairs sorted ascending by count, then word."""
    return sorted((count, word) for word, count in word_dict.items())
def format_print(input_list, reverse, word_num):
    """Print *input_list* of (count, word) pairs as a table.

    When *reverse* is True the list is first sorted in place, highest
    counts first.  *word_num* is shown in the table header.
    """
    if reverse:
        input_list.sort(reverse=True)
    header = "[Unique Words: %s]" % word_num
    rule = "-" * 35
    print("\n", header.center(35, "="))
    print(rule + "\n", "%-16s %s %16s" % ("Word", "|", "Count"), "\n", rule)
    for freq, term in input_list:
        print("%-16s %s %16d" % (term, "|", freq))
def word_count(_file, max_to_min=False):
    """Count words in the text file *_file* and print a frequency table.

    _file: path of the file to read.
    max_to_min: when True, highest counts are printed first.
    """
    word_dict = {}
    # `with` closes the handle even on error.  The original "rU" mode was
    # deprecated and is rejected since Python 3.11; universal newlines are
    # the default in text mode anyway.
    with open(_file) as txt:
        for line in txt:
            # The original test `!= ("\n" or None)` only ever compared to
            # "\n" (the `or` evaluates first); skipping whitespace-only
            # lines is the actual intent.
            if line.strip():
                process_line(filter(None, split("[^a-zA-Z']+", line.lower())), word_dict)
    final_list = process_dict(word_dict)
    format_print(final_list, max_to_min, len(word_dict))

# Raw string: "\y" was an invalid escape sequence in the original literal.
word_count(r"C:\your_path_here\Test.txt", True)
#########################################################
from collections import Counter
import re
def openfile(filename):
    """Return the entire text content of *filename*."""
    # "r", not "r+": the file is only read.  `with` guarantees the handle
    # is closed; the original also shadowed the builtin `str`.
    with open(filename, "r") as fh:
        return fh.read()
def removegarbage(text):
    """Lower-case *text* after collapsing runs of non-word chars to a space.

    The original parameter was named `str`, shadowing the builtin.
    """
    # Replace one or more non-word (non-alphanumeric) chars with a space.
    return re.sub(r'\W+', ' ', text).lower()
def getwordbins(words):
    """Return a Counter mapping each word in *words* to its frequency."""
    # Counter's constructor performs the counting loop at C speed;
    # the hand-rolled `for word in words: cnt[word] += 1` is redundant.
    return Counter(words)
def main(filename, topwords):
    """Print the *topwords* most common words of *filename*, one per line."""
    txt = openfile(filename)
    txt = removegarbage(txt)
    # split() with no argument drops empty strings; the original
    # split(' ') counted a bogus '' token whenever the cleaned text had
    # leading/trailing spaces.
    words = txt.split()
    bins = getwordbins(words)
    for key, value in bins.most_common(topwords):
        print(key, value)

# Raw string: "\y" was an invalid escape sequence in the original literal.
main(r'C:\your_path_here\Test.txt', 500)
这是一种比较两个文本文件并保留共同元素的方法。
# Keep the lines common to both files.  Raw strings are required: in the
# original literals, "\t" in '...\text1.txt' silently embedded a TAB
# character, so the paths were wrong.
with open(r'C:\your_path_here\text1.txt', 'r') as file1, \
        open(r'C:\your_path_here\text2.txt', 'r') as file2:
    same = set(file1).intersection(file2)
same.discard('\n')
with open(r'C:\your_path_here\some_output_file.txt', 'w') as file_out:
    file_out.writelines(same)

# For differences, use the code below:
with open(r'C:\your_path_here\text1.txt', 'r') as file1, \
        open(r'C:\your_path_here\text2.txt', 'r') as file2:
    diff = set(file1).symmetric_difference(file2)
diff.discard('\n')
with open(r'C:\your_path_here\some_output_file.txt', 'w') as file_out:
    file_out.writelines(diff)