如果不导入,处理文件的时间很长
Time to process a file is very long without import
我的代码如下:
import re
def get_filename():
"""gets the file"""
filename = input("Please enter filename: ")
return filename
def get_words_from_file(filename):
"""getting the data and printing it word by word"""
infile = open(filename, 'r', encoding='utf-8')
outfile = infile.read().splitlines()
words = []
reading = False
for let in outfile:
if let.startswith("*** START OF")and reading == False:
reading = True
elif let.startswith("*** END OF SYNTHETIC TEST CASE ***") or let.startswith("*** END"):
return words
elif reading:
let = let.lower()
words.extend(re.findall("[a-z]+[-'][a-z]+|[a-z]+[']?|[a-z]+", let))
return words
def calculate(words):
"""gjhwjghwg2"""
all_times = []
max_word_length = 0
number_of_words = len(words)
average = sum(len(word) for word in words) / number_of_words
for word in words:
if len(word)>max_word_length:
max_word_length=len(word)
for word in words:
total =words.count(word)
all_times.append(total)
max_frequency = max(all_times)
result = (number_of_words, average, max_word_length, max_frequency)
return result
def print_results(stats_tuple):
"""calculate the goods"""
(number_of_words, average, max_word_length, max_frequency) = stats_tuple
print("")
print("Word summary (all words):")
print(" Number of words = {0}".format(number_of_words))
print(" Average word length = {:.2f}".format(average))
print(" Maximum word length = {0}".format(max_word_length))
print(" Maximum frequency = {0}".format(max_frequency))
def main():
"""ghkghwgjkwhgw"""
filename = get_filename()
data = get_words_from_file(filename)
stats = calculate(data)
print_results(stats)
main()
我有一个非常大的文本文件,所以当我尝试 运行 它时,需要很长时间。只是想知道我是否需要更改某些内容以减少花费的时间。该代码在其他地方运行良好,但此文本文件有 75,000 个单词。
据我所见,我认为
for word in words:
total =words.count(word)
all_times.append(total)
是问题所在,因为它的运行时间是 O(len(words)**2)。把这个改成
怎么样
frequency = {word: 0 for word in words}
for word in words:
frequency[word] += 1
max_frequency = max(frequency.values())
注意:我没有测试这段代码。
在get_words_from_file
中:
- 不要读取整个文件然后拆分行 - 只需遍历行
- 编译您的正则表达式模式一次并使用它
- 你真的需要那个
lower()
电话吗?
您有一个包含 N-words 的文本文件。您正在迭代它 5 次
- get_words_from_file
- average = sum(len(word) for word in words) / number_of_words
3) for word in words:
if len(word)>max_word_length:
max_word_length=len(word)
4) for word in words:
total =words.count(word) # here is the fifth time
all_times.append(total)
总而言之,你的时间复杂度是2N+N^2; O(N^2).
你可以通过只做两次迭代来节省很多时间。
在单词的第一次迭代中,制作一个字典 key=the word value=number of appearances
dict[str,int]
第二次迭代将计算所有其他度量。
在最坏的情况下(如果所有的词都不同),时间复杂度将只有2N。
大多数时候,由于所有的单词重复,它会快得多。
我的代码如下:
import re
def get_filename():
"""gets the file"""
filename = input("Please enter filename: ")
return filename
def get_words_from_file(filename):
"""getting the data and printing it word by word"""
infile = open(filename, 'r', encoding='utf-8')
outfile = infile.read().splitlines()
words = []
reading = False
for let in outfile:
if let.startswith("*** START OF")and reading == False:
reading = True
elif let.startswith("*** END OF SYNTHETIC TEST CASE ***") or let.startswith("*** END"):
return words
elif reading:
let = let.lower()
words.extend(re.findall("[a-z]+[-'][a-z]+|[a-z]+[']?|[a-z]+", let))
return words
def calculate(words):
"""gjhwjghwg2"""
all_times = []
max_word_length = 0
number_of_words = len(words)
average = sum(len(word) for word in words) / number_of_words
for word in words:
if len(word)>max_word_length:
max_word_length=len(word)
for word in words:
total =words.count(word)
all_times.append(total)
max_frequency = max(all_times)
result = (number_of_words, average, max_word_length, max_frequency)
return result
def print_results(stats_tuple):
"""calculate the goods"""
(number_of_words, average, max_word_length, max_frequency) = stats_tuple
print("")
print("Word summary (all words):")
print(" Number of words = {0}".format(number_of_words))
print(" Average word length = {:.2f}".format(average))
print(" Maximum word length = {0}".format(max_word_length))
print(" Maximum frequency = {0}".format(max_frequency))
def main():
"""ghkghwgjkwhgw"""
filename = get_filename()
data = get_words_from_file(filename)
stats = calculate(data)
print_results(stats)
main()
我有一个非常大的文本文件,所以当我尝试 运行 它时,需要很长时间。只是想知道我是否需要更改某些内容以减少花费的时间。该代码在其他地方运行良好,但此文本文件有 75,000 个单词。
据我所见,我认为
for word in words:
total =words.count(word)
all_times.append(total)
是问题所在,因为它的运行时间是 O(len(words)**2)。把这个改成
怎么样 frequency = {word: 0 for word in words}
for word in words:
frequency[word] += 1
max_frequency = max(frequency.values())
注意:我没有测试这段代码。
在get_words_from_file
中:
- 不要读取整个文件然后拆分行 - 只需遍历行
- 编译您的正则表达式模式一次并使用它
- 你真的需要那个
lower()
电话吗?
您有一个包含 N-words 的文本文件。您正在迭代它 5 次
- get_words_from_file
- average = sum(len(word) for word in words) / number_of_words
3) for word in words:
if len(word)>max_word_length:
max_word_length=len(word)
4) for word in words:
total =words.count(word) # here is the fifth time
all_times.append(total)
总而言之,你的时间复杂度是2N+N^2; O(N^2).
你可以通过只做两次迭代来节省很多时间。 在单词的第一次迭代中,制作一个字典 key=the word value=number of appearances
dict[str,int]
第二次迭代将计算所有其他度量。
在最坏的情况下(如果所有的词都不同),时间复杂度将只有2N。
大多数时候,由于所有的单词重复,它会快得多。