优化代码以绘制字数统计图
Optimizing code to graph word counts
我刚刚完成了一个程序,该程序从书籍中读取文本并绘制字数统计图,x 轴是一本书的字数,y 轴是第二本书的字数。它可以工作,但速度出奇地慢,我希望得到一些关于如何优化它的提示。我认为我最关心的是为两本书之间的相似词创建一个字典,以及为一本书中而不是另一本书中的词创建一个字典。这个实现给程序增加了很多运行时间,我想找到一种 pythonic 的方法来改进它。下面是代码:
"""Read two books, count word occurrences, and scatter-plot the counts.

Original (slow) version under review: builds the word-count dicts and the
common/unique word dicts with explicit loops, then plots one point per
plt.plot() call.
"""
import re # regular expressions
import io
import collections
from matplotlib import pyplot as plt
# xs=[x1,x2,...,xn]
# Number of occurences of the word in book 1
# use
# ys=[y1.y2,...,yn]
# Number of occurences of the word in book 2
# plt.plot(xs,ys)
# save as svg or pdf files
word_pattern = re.compile(r'\w+')  # any run of word characters (letters, digits, _)
# with version ensures closing even if there are failures
with io.open("swannsway.txt") as f:
    text = f.read() # read as a single large string
book1 = word_pattern.findall(text) # pull out words
# lowercase each word and drop words shorter than 3 characters
book1 = [w.lower() for w in book1 if len(w)>=3]
with io.open("moby_dick.txt") as f:
    text = f.read() # read as a single large string
book2 = word_pattern.findall(text) # pull out words
book2 = [w.lower() for w in book2 if len(w)>=3]
#Convert these into relative percentages/total book length
# word -> number of occurrences in book 1
wordcount_book1 = {}
for word in book1:
    if word in wordcount_book1:
        wordcount_book1[word]+=1
    else:
        wordcount_book1[word]=1
'''
for word in wordcount_book1:
wordcount_book1[word] /= len(wordcount_book1)
for word in wordcount_book2:
wordcount_book2[word] /= len(wordcount_book2)
'''
# word -> number of occurrences in book 2
wordcount_book2 = {}
for word in book2:
    if word in wordcount_book2:
        wordcount_book2[word]+=1
    else:
        wordcount_book2[word]=1
# words present in both books: word -> [count in book 1, count in book 2]
# NOTE(review): this nested loop is O(n*m) over the two vocabularies and
# is the dominant cost the question complains about.
common_words = {}
for i in wordcount_book1:
    for j in wordcount_book2:
        if i == j:
            common_words[i] = [wordcount_book1[i], wordcount_book2[j]]
            break
# words present in only one book; 0 stands in for the missing count
book_singles= {}
for i in wordcount_book1:
    if i not in common_words:
        book_singles[i] = [wordcount_book1[i], 0]
for i in wordcount_book2:
    if i not in common_words:
        book_singles[i] = [0, wordcount_book2[i]]
# Counter recomputes the same counts built manually above
wordcount_book1 = collections.Counter(book1)
wordcount_book2 = collections.Counter(book2)
# how many words of different lengths?
word_length_book1 = collections.Counter([len(word) for word in book1])
word_length_book2 = collections.Counter([len(word) for word in book2])
print(wordcount_book1)
#plt.plot(list(word_length_book1.keys()),list(word_length_book1.values()), list(word_length_book2.keys()), list(word_length_book2.values()), 'bo')
# one plt.plot() call per point, and list(...) is rebuilt on every iteration
for i in range(len(common_words)):
    plt.plot(list(common_words.values())[i][0], list(common_words.values())[i][1], 'bo', alpha = 0.2)
for i in range(len(book_singles)):
    plt.plot(list(book_singles.values())[i][0], list(book_singles.values())[i][1], 'ro', alpha = 0.2)
plt.ylabel('Swannsway')
plt.xlabel('Moby Dick')
plt.show()
#key:value
这里有一些优化代码的技巧。
计算单词的出现次数。
使用 collections
库中的 Counter
class(参见 this post):
from collections import Counter
wordcount_book1 = Counter(book1)
wordcount_book2 = Counter(book2)
寻找常见和独特的词。
使用 set
class。所有词是两个集合的并集,两本书共有的词是交集,各自独有的词是差集。
book1_words = set(wordcount_book1.keys())
book2_words = set(wordcount_book2.keys())
all_words = book1_words | book2_words
common_words = book1_words & book2_words
book_singles = [book1_words - common_words, book2_words - common_words]
计算单词长度。
先计算所有单词的长度,再乘以每本书的字数:
word_length = {w: len(w) for w in all_words}
word_length_book1 = {w: word_length[w]*wordcount_book1[w] for w in book1_words}
word_length_book2 = {w: word_length[w]*wordcount_book2[w] for w in book2_words}
也许这些情节应该可以在没有循环的情况下完成,但不幸的是我不明白你在试图描绘什么。
您的大部分代码只有轻微的低效问题,我已尝试解决这些问题。您最大的延迟是在绘制 book_singles
时,我相信我已经修复了。详情:我换了这个:
word_pattern = re.compile(r'\w+')
至:
word_pattern = re.compile(r'[a-zA-Z]{3,}')
因为 book_singles
足够大,不包括数字!通过在模式中包含最小尺寸,我们消除了对这个循环的需要:
book1 = [w.lower() for w in book1 if len(w)>=3]
还有与 book2 匹配的一本。这里:
book1 = word_pattern.findall(text) # pull out words
book1 = [w.lower() for w in book1 if len(w)>=3]
我移动了 .lower()
所以我们只做一次,而不是每个字:
book1 = word_pattern.findall(text.lower()) # pull out words
book1 = [w for w in book1 if len(w) >= 3]
因为它很可能在 C 中实现,所以这是一个胜利。这个:
wordcount_book1 = {}
for word in book1:
if word in wordcount_book1:
wordcount_book1[word]+=1
else:
wordcount_book1[word]=1
我转而使用 defaultdict
,因为您已经导入了集合:
wordcount_book1 = collections.defaultdict(int)
for word in book1:
wordcount_book1[word] += 1
对于这些循环:
common_words = {}
for i in wordcount_book1:
for j in wordcount_book2:
if i == j:
common_words[i] = [wordcount_book1[i], wordcount_book2[j]]
break
book_singles= {}
for i in wordcount_book1:
if i not in common_words:
book_singles[i] = [wordcount_book1[i], 0]
for i in wordcount_book2:
if i not in common_words:
book_singles[i] = [0, wordcount_book2[i]]
我重写了第一个循环,这是一场灾难,然后让它承担双重任务,因为它已经完成了第二个循环的工作:
common_words = {}
book_singles = {}
for i in wordcount_book1:
if i in wordcount_book2:
common_words[i] = [wordcount_book1[i], wordcount_book2[i]]
else:
book_singles[i] = [wordcount_book1[i], 0]
for i in wordcount_book2:
if i not in common_words:
book_singles[i] = [0, wordcount_book2[i]]
最后,这些绘图循环在它们一遍又一遍地走 common_words.values()
和 book_singles.values()
的方式以及它们一次绘制一个点的方式上都非常低效:
for i in range(len(common_words)):
plt.plot(list(common_words.values())[i][0], list(common_words.values())[i][1], 'bo', alpha = 0.2)
for i in range(len(book_singles)):
plt.plot(list(book_singles.values())[i][0], list(book_singles.values())[i][1], 'ro', alpha = 0.2)
我将它们简单地更改为:
counts1, counts2 = zip(*common_words.values())
plt.plot(counts1, counts2, 'bo', alpha=0.2)
counts1, counts2 = zip(*book_singles.values())
plt.plot(counts1, counts2, 'ro', alpha=0.2)
完整的返工代码,省略了您计算但未在示例中使用的内容:
"""Compare word frequencies between two books and scatter-plot them.

The x coordinate of each point is the word's count in book 1
(swannsway.txt) and the y coordinate is its count in book 2
(moby_dick.txt).  Blue dots are words common to both books; red dots
are words that appear in only one of them.
"""
import re  # regular expressions
import collections
from matplotlib import pyplot as plt
# xs=[x1,x2,...,xn]
# Number of occurrences of the word in book 1
# use
# ys=[y1.y2,...,yn]
# Number of occurrences of the word in book 2
# plt.plot(xs,ys)
# save as svg or pdf files

# Words are runs of >=3 ASCII letters: the {3,} quantifier replaces the
# old per-word len(w) >= 3 filter and digits are excluded entirely.
word_pattern = re.compile(r'[a-zA-Z]{3,}')
# with ensures closing of file even if there are failures
with open("swannsway.txt") as f:
    text = f.read()  # read as a single large string
# lowercase the whole text once instead of once per word
book1 = word_pattern.findall(text.lower())  # pull out words
with open("moby_dick.txt") as f:
    text = f.read()  # read as a single large string
book2 = word_pattern.findall(text.lower())  # pull out words

# word -> number of occurrences, one dict per book
wordcount_book1 = collections.defaultdict(int)
for word in book1:
    wordcount_book1[word] += 1
wordcount_book2 = collections.defaultdict(int)
for word in book2:
    wordcount_book2[word] += 1

# Partition the vocabulary in one pass per dict:
# common_words maps word -> [count in book 1, count in book 2];
# book_singles holds words found in only one book (0 for the other).
common_words = {}
book_singles = {}
for i in wordcount_book1:
    if i in wordcount_book2:
        common_words[i] = [wordcount_book1[i], wordcount_book2[i]]
    else:
        book_singles[i] = [wordcount_book1[i], 0]
for i in wordcount_book2:
    if i not in common_words:
        book_singles[i] = [0, wordcount_book2[i]]

# Plot each group as a single series instead of one point per call.
# Guard against empty groups: zip(*{}.values()) would raise ValueError.
if common_words:
    counts1, counts2 = zip(*common_words.values())
    plt.plot(counts1, counts2, 'bo', alpha=0.2)
if book_singles:
    counts1, counts2 = zip(*book_singles.values())
    plt.plot(counts1, counts2, 'ro', alpha=0.2)
# x carries the book-1 (Swann's Way) counts and y the book-2 (Moby Dick)
# counts, so label the axes accordingly (they were swapped before).
plt.xlabel('Swannsway')
plt.ylabel('Moby Dick')
plt.show()
输出
您可以删除 stop words 以减少高分词并显示有趣的数据。
我刚刚完成了一个程序,该程序从书籍中读取文本并绘制字数统计图,x 轴是一本书的字数,y 轴是第二本书的字数。它可以工作,但速度出奇地慢,我希望得到一些关于如何优化它的提示。我认为我最关心的是为两本书之间的相似词创建一个字典,以及为一本书中而不是另一本书中的词创建一个字典。这个实现给程序增加了很多运行时间,我想找到一种 pythonic 的方法来改进它。下面是代码:
"""Read two books, count word occurrences, and scatter-plot the counts.

Original (slow) version under review: builds the word-count dicts and the
common/unique word dicts with explicit loops, then plots one point per
plt.plot() call.
"""
import re # regular expressions
import io
import collections
from matplotlib import pyplot as plt
# xs=[x1,x2,...,xn]
# Number of occurences of the word in book 1
# use
# ys=[y1.y2,...,yn]
# Number of occurences of the word in book 2
# plt.plot(xs,ys)
# save as svg or pdf files
word_pattern = re.compile(r'\w+')  # any run of word characters (letters, digits, _)
# with version ensures closing even if there are failures
with io.open("swannsway.txt") as f:
    text = f.read() # read as a single large string
book1 = word_pattern.findall(text) # pull out words
# lowercase each word and drop words shorter than 3 characters
book1 = [w.lower() for w in book1 if len(w)>=3]
with io.open("moby_dick.txt") as f:
    text = f.read() # read as a single large string
book2 = word_pattern.findall(text) # pull out words
book2 = [w.lower() for w in book2 if len(w)>=3]
#Convert these into relative percentages/total book length
# word -> number of occurrences in book 1
wordcount_book1 = {}
for word in book1:
    if word in wordcount_book1:
        wordcount_book1[word]+=1
    else:
        wordcount_book1[word]=1
'''
for word in wordcount_book1:
wordcount_book1[word] /= len(wordcount_book1)
for word in wordcount_book2:
wordcount_book2[word] /= len(wordcount_book2)
'''
# word -> number of occurrences in book 2
wordcount_book2 = {}
for word in book2:
    if word in wordcount_book2:
        wordcount_book2[word]+=1
    else:
        wordcount_book2[word]=1
# words present in both books: word -> [count in book 1, count in book 2]
# NOTE(review): this nested loop is O(n*m) over the two vocabularies and
# is the dominant cost the question complains about.
common_words = {}
for i in wordcount_book1:
    for j in wordcount_book2:
        if i == j:
            common_words[i] = [wordcount_book1[i], wordcount_book2[j]]
            break
# words present in only one book; 0 stands in for the missing count
book_singles= {}
for i in wordcount_book1:
    if i not in common_words:
        book_singles[i] = [wordcount_book1[i], 0]
for i in wordcount_book2:
    if i not in common_words:
        book_singles[i] = [0, wordcount_book2[i]]
# Counter recomputes the same counts built manually above
wordcount_book1 = collections.Counter(book1)
wordcount_book2 = collections.Counter(book2)
# how many words of different lengths?
word_length_book1 = collections.Counter([len(word) for word in book1])
word_length_book2 = collections.Counter([len(word) for word in book2])
print(wordcount_book1)
#plt.plot(list(word_length_book1.keys()),list(word_length_book1.values()), list(word_length_book2.keys()), list(word_length_book2.values()), 'bo')
# one plt.plot() call per point, and list(...) is rebuilt on every iteration
for i in range(len(common_words)):
    plt.plot(list(common_words.values())[i][0], list(common_words.values())[i][1], 'bo', alpha = 0.2)
for i in range(len(book_singles)):
    plt.plot(list(book_singles.values())[i][0], list(book_singles.values())[i][1], 'ro', alpha = 0.2)
plt.ylabel('Swannsway')
plt.xlabel('Moby Dick')
plt.show()
#key:value
这里有一些优化代码的技巧。
计算单词的出现次数。
使用 collections
库中的 Counter
class(参见 this post):
from collections import Counter
wordcount_book1 = Counter(book1)
wordcount_book2 = Counter(book2)
寻找常见和独特的词。
使用 set
class。所有词是两个集合的并集,两本书共有的词是交集,各自独有的词是差集。
book1_words = set(wordcount_book1.keys())
book2_words = set(wordcount_book2.keys())
all_words = book1_words | book2_words
common_words = book1_words & book2_words
book_singles = [book1_words - common_words, book2_words - common_words]
计算单词长度。 先计算所有单词的长度,再乘以每本书的字数:
word_length = {w: len(w) for w in all_words}
word_length_book1 = {w: word_length[w]*wordcount_book1[w] for w in book1_words}
word_length_book2 = {w: word_length[w]*wordcount_book2[w] for w in book2_words}
也许这些情节应该可以在没有循环的情况下完成,但不幸的是我不明白你在试图描绘什么。
您的大部分代码只有轻微的低效问题,我已尝试解决这些问题。您最大的延迟是在绘制 book_singles
时,我相信我已经修复了。详情:我换了这个:
word_pattern = re.compile(r'\w+')
至:
word_pattern = re.compile(r'[a-zA-Z]{3,}')
因为 book_singles
足够大,不包括数字!通过在模式中包含最小尺寸,我们消除了对这个循环的需要:
book1 = [w.lower() for w in book1 if len(w)>=3]
还有与 book2 匹配的一本。这里:
book1 = word_pattern.findall(text) # pull out words
book1 = [w.lower() for w in book1 if len(w)>=3]
我移动了 .lower()
所以我们只做一次,而不是每个字:
book1 = word_pattern.findall(text.lower()) # pull out words
book1 = [w for w in book1 if len(w) >= 3]
因为它很可能在 C 中实现,所以这是一个胜利。这个:
wordcount_book1 = {}
for word in book1:
if word in wordcount_book1:
wordcount_book1[word]+=1
else:
wordcount_book1[word]=1
我转而使用 defaultdict
,因为您已经导入了集合:
wordcount_book1 = collections.defaultdict(int)
for word in book1:
wordcount_book1[word] += 1
对于这些循环:
common_words = {}
for i in wordcount_book1:
for j in wordcount_book2:
if i == j:
common_words[i] = [wordcount_book1[i], wordcount_book2[j]]
break
book_singles= {}
for i in wordcount_book1:
if i not in common_words:
book_singles[i] = [wordcount_book1[i], 0]
for i in wordcount_book2:
if i not in common_words:
book_singles[i] = [0, wordcount_book2[i]]
我重写了第一个循环,这是一场灾难,然后让它承担双重任务,因为它已经完成了第二个循环的工作:
common_words = {}
book_singles = {}
for i in wordcount_book1:
if i in wordcount_book2:
common_words[i] = [wordcount_book1[i], wordcount_book2[i]]
else:
book_singles[i] = [wordcount_book1[i], 0]
for i in wordcount_book2:
if i not in common_words:
book_singles[i] = [0, wordcount_book2[i]]
最后,这些绘图循环在它们一遍又一遍地走 common_words.values()
和 book_singles.values()
的方式以及它们一次绘制一个点的方式上都非常低效:
for i in range(len(common_words)):
plt.plot(list(common_words.values())[i][0], list(common_words.values())[i][1], 'bo', alpha = 0.2)
for i in range(len(book_singles)):
plt.plot(list(book_singles.values())[i][0], list(book_singles.values())[i][1], 'ro', alpha = 0.2)
我将它们简单地更改为:
counts1, counts2 = zip(*common_words.values())
plt.plot(counts1, counts2, 'bo', alpha=0.2)
counts1, counts2 = zip(*book_singles.values())
plt.plot(counts1, counts2, 'ro', alpha=0.2)
完整的返工代码,省略了您计算但未在示例中使用的内容:
"""Compare word frequencies between two books and scatter-plot them.

The x coordinate of each point is the word's count in book 1
(swannsway.txt) and the y coordinate is its count in book 2
(moby_dick.txt).  Blue dots are words common to both books; red dots
are words that appear in only one of them.
"""
import re  # regular expressions
import collections
from matplotlib import pyplot as plt
# xs=[x1,x2,...,xn]
# Number of occurrences of the word in book 1
# use
# ys=[y1.y2,...,yn]
# Number of occurrences of the word in book 2
# plt.plot(xs,ys)
# save as svg or pdf files

# Words are runs of >=3 ASCII letters: the {3,} quantifier replaces the
# old per-word len(w) >= 3 filter and digits are excluded entirely.
word_pattern = re.compile(r'[a-zA-Z]{3,}')
# with ensures closing of file even if there are failures
with open("swannsway.txt") as f:
    text = f.read()  # read as a single large string
# lowercase the whole text once instead of once per word
book1 = word_pattern.findall(text.lower())  # pull out words
with open("moby_dick.txt") as f:
    text = f.read()  # read as a single large string
book2 = word_pattern.findall(text.lower())  # pull out words

# word -> number of occurrences, one dict per book
wordcount_book1 = collections.defaultdict(int)
for word in book1:
    wordcount_book1[word] += 1
wordcount_book2 = collections.defaultdict(int)
for word in book2:
    wordcount_book2[word] += 1

# Partition the vocabulary in one pass per dict:
# common_words maps word -> [count in book 1, count in book 2];
# book_singles holds words found in only one book (0 for the other).
common_words = {}
book_singles = {}
for i in wordcount_book1:
    if i in wordcount_book2:
        common_words[i] = [wordcount_book1[i], wordcount_book2[i]]
    else:
        book_singles[i] = [wordcount_book1[i], 0]
for i in wordcount_book2:
    if i not in common_words:
        book_singles[i] = [0, wordcount_book2[i]]

# Plot each group as a single series instead of one point per call.
# Guard against empty groups: zip(*{}.values()) would raise ValueError.
if common_words:
    counts1, counts2 = zip(*common_words.values())
    plt.plot(counts1, counts2, 'bo', alpha=0.2)
if book_singles:
    counts1, counts2 = zip(*book_singles.values())
    plt.plot(counts1, counts2, 'ro', alpha=0.2)
# x carries the book-1 (Swann's Way) counts and y the book-2 (Moby Dick)
# counts, so label the axes accordingly (they were swapped before).
plt.xlabel('Swannsway')
plt.ylabel('Moby Dick')
plt.show()
输出
您可以删除 stop words 以减少高分词并显示有趣的数据。