在 python 中创建词长频率 table
Creating a word length frequency table in python
我有以下代码:
import re
def get_filename():
    """Prompt the user for the name of the file to analyse and return it."""
    return input("Please enter filename: ")
def get_words_from_file(filename):
    """Return the lower-cased words found after the start marker of a file.

    The file is read as UTF-8. Lines are ignored until one starts with
    "*** START OF". The function returns as soon as any line starting with
    "*** END" is seen (even before the start marker), or at end of file.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of words in file order. A word is a run of a-z letters,
        optionally joined to a second run by one hyphen or apostrophe, or
        followed by a single trailing apostrophe.
    """
    # The context manager guarantees the handle is closed; previously the
    # file was opened and never closed.
    with open(filename, 'r', encoding='utf-8') as infile:
        lines = infile.read().splitlines()

    words = []
    reading = False
    for line in lines:
        if line.startswith("*** START OF") and not reading:
            reading = True
        elif line.startswith("*** END"):
            # "*** END" is a prefix of every end marker, so one check covers
            # them all.
            return words
        elif reading:
            words.extend(
                re.findall(r"[a-z]+[-'][a-z]+|[a-z]+[']?|[a-z]+", line.lower())
            )
    return words
def calculate(words):
    """Compute summary statistics for a list of words.

    Args:
        words: List of word strings.

    Returns:
        A tuple (number_of_words, average, max_word_length, max_frequency):
        the word count, the mean word length, the length of the longest
        word, and the highest number of times any single word occurs.
        For an empty list every field is zero.
    """
    number_of_words = len(words)
    if number_of_words == 0:
        # Without this guard the average divides by zero and max() fails
        # on an empty sequence.
        return (0, 0.0, 0, 0)

    lengths = [len(word) for word in words]
    average = sum(lengths) / number_of_words
    max_word_length = max(lengths)

    # Occurrences of each distinct word.
    frequency = {}
    for word in words:
        frequency[word] = frequency.get(word, 0) + 1
    max_frequency = max(frequency.values())

    return (number_of_words, average, max_word_length, max_frequency)
def get_frequency(words):
    """Count how often each word and each word length occurs.

    Args:
        words: List of word strings.

    Returns:
        A tuple (frequency_1, len_count). frequency_1 maps each word to its
        number of occurrences; len_count maps each word length to the number
        of words having that length.
    """
    frequency_1 = {}
    # len_count used to be an empty list that was never filled; it is now
    # the length -> count mapping that callers look up with .get().
    len_count = {}
    for word in words:
        frequency_1[word] = frequency_1.get(word, 0) + 1
        word_len = len(word)
        len_count[word_len] = len_count.get(word_len, 0) + 1
    return (frequency_1, len_count)
def print_results(stats_tuple, lengthy):
    """Print the word summary followed by the length/frequency table.

    Args:
        stats_tuple: (number_of_words, average, max_word_length,
            max_frequency).
        lengthy: (frequency_1, len_count). Only len_count is used here; it
            is queried by word length to obtain the count for each row.
    """
    (_, len_count) = lengthy
    (number_of_words, average, max_word_length, max_frequency) = stats_tuple
    print("")
    print("Word summary (all words):")
    print(f" Number of words = {number_of_words}")
    print(f" Average word length = {average:.2f}")
    print(f" Maximum word length = {max_word_length}")
    print(f" Maximum frequency = {max_frequency}")
    print("")
    print(" Len Freq")
    # With no lengths recorded, print the header only instead of letting
    # max() raise ValueError.
    longest = max(len_count) if len_count else 0
    for word_len in range(1, longest + 1):
        # Each row shows the length and how many words have it (0 for gaps);
        # previously the whole word-frequency dict was printed and used as
        # the lookup key.
        print(f'{word_len}\t{len_count.get(word_len, 0)}')
def main():
    """Ask for a filename, analyse that file and print the report."""
    filename = get_filename()
    data = get_words_from_file(filename)
    stats = calculate(data)
    lengthy = get_frequency(data)
    print_results(stats, lengthy)


# Run only when executed as a script, so importing the module does not
# prompt for input.
if __name__ == "__main__":
    main()
在不导入任何其他内容的情况下,我如何制作一个 table 打印长度然后打印频率。
例如:
在文本为“a blah ba ba”的文件中,它将打印:
Len Freq
1 1
2 2
3 0
4 1
让我困惑的是如何统计每种长度的单词数量:我应该把所有相同长度的单词放进一个新列表,然后计算该列表的长度,还是有更好的方法?
您可以使用 collections 模块中的 Counter。然后从 1 循环到最大的单词长度:如果计数字典中不存在某个长度,则输出 0,否则输出相应的计数。
from collections import Counter
filename = 'test.txt'

# Read the whole file and split on whitespace to get the words.
with open(filename) as fin:
    words = fin.read().split()

# Map each word length to the number of words having that length.
counts = Counter(len(word) for word in words)

print('Len Freq')
# default=0 makes an empty file print just the header instead of raising
# ValueError from max() on an empty Counter.
for length in range(1, max(counts, default=0) + 1):
    print(f'{length:4d} {counts.get(length, 0):5d}')
如果输入文件包含:
a blah ba ba
然后这给出:
Len Freq
1 1
2 2
3 0
4 1
请注意,为了清楚起见,这里写成了 max(counts.keys()),但也可以用 max(counts) 代替(因为 max 会遍历传入的对象,而 Counter 对象本质上是一个字典,遍历字典就是遍历它的键)。
此示例假设文件相对较小,可以整个读入内存。如果文件更大,可以改用下面这种逐行读取的做法,避免一次性把整个文件载入内存:
from collections import Counter
filename = 'test.txt'
def iter_lengths(filename):
    """Lazily yield the length of every whitespace-separated word in a file.

    The file is consumed one line at a time, so the whole text is never
    held in memory at once.
    """
    with open(filename) as source:
        for row in source:
            yield from map(len, row.split())
# Count word lengths straight from the generator; no word list is built.
counts = Counter(iter_lengths(filename))

print('Len Freq')
# default=0: an empty file yields an empty Counter, and max() on that would
# otherwise raise ValueError.
for length in range(1, max(counts, default=0) + 1):
    print(f'{length:4d} {counts.get(length, 0):5d}')
# Tally how many words there are of each length, reading line by line.
len_count = {}
with open(filename, "r") as file:
    for line in file:
        for word in line.split():
            word_len = len(word)
            # A length not seen before starts from 0.
            len_count[word_len] = len_count.get(word_len, 0) + 1
然后你可以打印两列:
# Header, then one row per length from 1 up to the longest length seen;
# lengths with no words are shown with a count of 0.
print("Len\tFreq")
upper = max(len_count) + 1
for word_len in range(1, upper):
    print(f'{word_len}\t{len_count.get(word_len, 0)}')
编辑:处理空文件:
# With no words at all max() would fail, so fall back to printing zero rows.
max_len_count = max(len_count) if len_count else 0
for word_len in range(1, max_len_count + 1):
    print(f'{word_len}\t{len_count.get(word_len, 0)}')
我有以下代码:
import re
def get_filename():
    """Prompt the user for the name of the file to analyse and return it."""
    return input("Please enter filename: ")
def get_words_from_file(filename):
    """Return the lower-cased words found after the start marker of a file.

    The file is read as UTF-8. Lines are ignored until one starts with
    "*** START OF". The function returns as soon as any line starting with
    "*** END" is seen (even before the start marker), or at end of file.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of words in file order. A word is a run of a-z letters,
        optionally joined to a second run by one hyphen or apostrophe, or
        followed by a single trailing apostrophe.
    """
    # The context manager guarantees the handle is closed; previously the
    # file was opened and never closed.
    with open(filename, 'r', encoding='utf-8') as infile:
        lines = infile.read().splitlines()

    words = []
    reading = False
    for line in lines:
        if line.startswith("*** START OF") and not reading:
            reading = True
        elif line.startswith("*** END"):
            # "*** END" is a prefix of every end marker, so one check covers
            # them all.
            return words
        elif reading:
            words.extend(
                re.findall(r"[a-z]+[-'][a-z]+|[a-z]+[']?|[a-z]+", line.lower())
            )
    return words
def calculate(words):
    """Compute summary statistics for a list of words.

    Args:
        words: List of word strings.

    Returns:
        A tuple (number_of_words, average, max_word_length, max_frequency):
        the word count, the mean word length, the length of the longest
        word, and the highest number of times any single word occurs.
        For an empty list every field is zero.
    """
    number_of_words = len(words)
    if number_of_words == 0:
        # Without this guard the average divides by zero and max() fails
        # on an empty sequence.
        return (0, 0.0, 0, 0)

    lengths = [len(word) for word in words]
    average = sum(lengths) / number_of_words
    max_word_length = max(lengths)

    # Occurrences of each distinct word.
    frequency = {}
    for word in words:
        frequency[word] = frequency.get(word, 0) + 1
    max_frequency = max(frequency.values())

    return (number_of_words, average, max_word_length, max_frequency)
def get_frequency(words):
    """Count how often each word and each word length occurs.

    Args:
        words: List of word strings.

    Returns:
        A tuple (frequency_1, len_count). frequency_1 maps each word to its
        number of occurrences; len_count maps each word length to the number
        of words having that length.
    """
    frequency_1 = {}
    # len_count used to be an empty list that was never filled; it is now
    # the length -> count mapping that callers look up with .get().
    len_count = {}
    for word in words:
        frequency_1[word] = frequency_1.get(word, 0) + 1
        word_len = len(word)
        len_count[word_len] = len_count.get(word_len, 0) + 1
    return (frequency_1, len_count)
def print_results(stats_tuple, lengthy):
    """Print the word summary followed by the length/frequency table.

    Args:
        stats_tuple: (number_of_words, average, max_word_length,
            max_frequency).
        lengthy: (frequency_1, len_count). Only len_count is used here; it
            is queried by word length to obtain the count for each row.
    """
    (_, len_count) = lengthy
    (number_of_words, average, max_word_length, max_frequency) = stats_tuple
    print("")
    print("Word summary (all words):")
    print(f" Number of words = {number_of_words}")
    print(f" Average word length = {average:.2f}")
    print(f" Maximum word length = {max_word_length}")
    print(f" Maximum frequency = {max_frequency}")
    print("")
    print(" Len Freq")
    # With no lengths recorded, print the header only instead of letting
    # max() raise ValueError.
    longest = max(len_count) if len_count else 0
    for word_len in range(1, longest + 1):
        # Each row shows the length and how many words have it (0 for gaps);
        # previously the whole word-frequency dict was printed and used as
        # the lookup key.
        print(f'{word_len}\t{len_count.get(word_len, 0)}')
def main():
    """Ask for a filename, analyse that file and print the report."""
    filename = get_filename()
    data = get_words_from_file(filename)
    stats = calculate(data)
    lengthy = get_frequency(data)
    print_results(stats, lengthy)


# Run only when executed as a script, so importing the module does not
# prompt for input.
if __name__ == "__main__":
    main()
在不导入任何其他内容的情况下,我如何制作一个 table 打印长度然后打印频率。
例如: 在文本为“a blah ba ba”的文件中,它将打印:
Len Freq
1 1
2 2
3 0
4 1
让我困惑的是如何统计每种长度的单词数量:我应该把所有相同长度的单词放进一个新列表,然后计算该列表的长度,还是有更好的方法?
您可以使用 collections 模块中的 Counter。然后从 1 循环到最大的单词长度:如果计数字典中不存在某个长度,则输出 0,否则输出相应的计数。
from collections import Counter
filename = 'test.txt'

# Read the whole file and split on whitespace to get the words.
with open(filename) as fin:
    words = fin.read().split()

# Map each word length to the number of words having that length.
counts = Counter(len(word) for word in words)

print('Len Freq')
# default=0 makes an empty file print just the header instead of raising
# ValueError from max() on an empty Counter.
for length in range(1, max(counts, default=0) + 1):
    print(f'{length:4d} {counts.get(length, 0):5d}')
如果输入文件包含:
a blah ba ba
然后这给出:
Len Freq
1 1
2 2
3 0
4 1
请注意,为了清楚起见,这里写成了 max(counts.keys()),但也可以用 max(counts) 代替(因为 max 会遍历传入的对象,而 Counter 对象本质上是一个字典,遍历字典就是遍历它的键)。
此示例假设文件相对较小,可以整个读入内存。如果文件更大,可以改用下面这种逐行读取的做法,避免一次性把整个文件载入内存:
from collections import Counter
filename = 'test.txt'
def iter_lengths(filename):
    """Lazily yield the length of every whitespace-separated word in a file.

    The file is consumed one line at a time, so the whole text is never
    held in memory at once.
    """
    with open(filename) as source:
        for row in source:
            yield from map(len, row.split())
# Count word lengths straight from the generator; no word list is built.
counts = Counter(iter_lengths(filename))

print('Len Freq')
# default=0: an empty file yields an empty Counter, and max() on that would
# otherwise raise ValueError.
for length in range(1, max(counts, default=0) + 1):
    print(f'{length:4d} {counts.get(length, 0):5d}')
# Tally how many words there are of each length, reading line by line.
len_count = {}
with open(filename, "r") as file:
    for line in file:
        for word in line.split():
            word_len = len(word)
            # A length not seen before starts from 0.
            len_count[word_len] = len_count.get(word_len, 0) + 1
然后你可以打印两列:
# Header, then one row per length from 1 up to the longest length seen;
# lengths with no words are shown with a count of 0.
print("Len\tFreq")
upper = max(len_count) + 1
for word_len in range(1, upper):
    print(f'{word_len}\t{len_count.get(word_len, 0)}')
编辑:处理空文件:
# With no words at all max() would fail, so fall back to printing zero rows.
max_len_count = max(len_count) if len_count else 0
for word_len in range(1, max_len_count + 1):
    print(f'{word_len}\t{len_count.get(word_len, 0)}')