如何将函数集成到《Web Scraping with Python》一书给出的代码中

Question

我正在阅读 "Web Scraping with Python"。在第 8 章中，作者通过一个 ngrams 示例展示了以下代码：

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator

def cleanInput(input):
    """Normalize raw text and split it into a list of lowercase words.

    Runs of newlines become a single space, the text is lowercased,
    bracketed digit runs such as ``[12]`` are removed, repeated spaces are
    collapsed and every non-ASCII character is dropped. Each
    space-separated token then has leading and trailing punctuation
    stripped.

    Args:
        input: The text to clean. The name shadows the builtin but is kept
            so that existing keyword callers continue to work.

    Returns:
        A list of cleaned words. Tokens of one character or fewer are
        discarded unless they are "a" or "i".
    """
    # Raw strings stop the regex escapes from being read as (invalid)
    # string escapes, which newer Python versions warn about.
    text = re.sub(r'\n+', " ", input).lower()
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Round-trip through bytes to drop anything outside ASCII.
    text = text.encode("utf-8").decode("ascii", "ignore")
    words = []
    for token in text.split(' '):
        token = token.strip(string.punctuation)
        # The text is already lowercase, so a direct comparison suffices.
        if len(token) > 1 or token in ("a", "i"):
            words.append(token)
    return words

def ngrams(input, n):
    """Count every run of ``n`` consecutive cleaned words in ``input``.

    The text is tokenized with ``cleanInput``; the returned dict maps each
    space-joined n-gram to the number of times it occurs.
    """
    words = cleanInput(input)
    counts = {}
    for start in range(len(words) - n + 1):
        phrase = " ".join(words[start:start + n])
        counts[phrase] = counts.get(phrase, 0) + 1
    return counts

# Download the speech; the context manager closes the HTTP response once
# the body has been read.
with urlopen(
        "http://pythonscraping.com/files/inaugurationSpeech.txt") as response:
    content = str(response.read(), 'utf-8')
# Keep the counts under their own name so the ngrams() function is not
# overwritten by its own result.
ngramCounts = ngrams(content, 2)
# Most frequent 2-grams first.
sortedNGrams = sorted(ngramCounts.items(), key=operator.itemgetter(1),
                      reverse=True)
print(sortedNGrams)

这段代码运行正常，但结果中包含大量没有意义的常用词。为了改进，作者说可以使用一个新函数：

# Built once at import time; a frozenset gives constant-time membership
# tests, and the few words listed twice simply collapse into one entry.
_COMMON_WORDS = frozenset([
    "the", "be", "and", "of", "a", "in", "to", "have", "it",
    "i", "that", "for", "you", "he", "with", "on", "do", "say",
    "this", "they", "is", "an", "at", "but", "we", "his",
    "from", "that", "not", "by", "she", "or", "as", "what",
    "go", "their", "can", "who", "get", "if", "would", "her",
    "all", "my", "make", "about", "know", "will", "as", "up",
    "one", "time", "has", "been", "there", "year", "so",
    "think", "when", "which", "them", "some", "me", "people",
    "take", "out", "into", "just", "see", "him", "your", "come",
    "could", "now", "than", "like", "other", "how", "then",
    "its", "our", "two", "more", "these", "want", "way", "look",
    "first", "also", "new", "because", "day", "more", "use",
    "no", "man", "find", "here", "thing", "give", "many",
    "well",
])


def isCommon(ngram):
    """Report whether any word of an n-gram is a common English word.

    Args:
        ngram: An iterable of word strings, e.g. the list of words that
            make up one n-gram. Matching is case-sensitive and the word
            list is all lowercase.

    Returns:
        True if at least one word is in the common-word list, else False.
    """
    return any(word in _COMMON_WORDS for word in ngram)

但是作者没有说明如何应用这个函数来得到书中所示的结果：

('united states', 10), ('executive department', 4), ('general government', 4), ('called upon', 3), ('government should', 3), ('whole country', 3), ('mr jefferson', 3), ('chief magistrate', 3), ('same causes', 3), ('legislative body', 3)

有什么办法吗？

提前致谢。

Answer 1

下面的代码似乎可以产生你想要的输出：

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator

def cleanInput(input):
    """Normalize raw text and split it into a list of lowercase words.

    Runs of newlines become a single space, the text is lowercased,
    bracketed digit runs such as ``[12]`` are removed, repeated spaces are
    collapsed and every non-ASCII character is dropped. Each
    space-separated token then has leading and trailing punctuation
    stripped.

    Args:
        input: The text to clean. The name shadows the builtin but is kept
            so that existing keyword callers continue to work.

    Returns:
        A list of cleaned words. Tokens of one character or fewer are
        discarded unless they are "a" or "i".
    """
    # Raw strings stop the regex escapes from being read as (invalid)
    # string escapes, which newer Python versions warn about.
    text = re.sub(r'\n+', " ", input).lower()
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Round-trip through bytes to drop anything outside ASCII.
    text = text.encode("utf-8").decode("ascii", "ignore")
    words = []
    for token in text.split(' '):
        token = token.strip(string.punctuation)
        # The text is already lowercase, so a direct comparison suffices.
        if len(token) > 1 or token in ("a", "i"):
            words.append(token)
    return words

def ngrams(input, n):
    """Count the n-grams of ``input`` that contain no common word.

    The text is tokenized with ``cleanInput``. Every window of ``n``
    consecutive words for which ``isCommon`` returns a true value is
    skipped; the remaining windows are space-joined and tallied in the
    returned dict.
    """
    tokens = cleanInput(input)
    counts = {}
    for start in range(len(tokens) - n + 1):
        window = tokens[start:start + n]
        # Only tally windows in which no word is flagged as "common".
        if not isCommon(window):
            phrase = " ".join(window)
            counts[phrase] = counts.get(phrase, 0) + 1
    return counts

# Download the speech; the context manager closes the HTTP response once
# the body has been read.
with urlopen(
        "http://pythonscraping.com/files/inaugurationSpeech.txt") as response:
    content = str(response.read(), 'utf-8')
# Keep the counts under their own name so the ngrams() function is not
# overwritten by its own result.
ngramCounts = ngrams(content, 2)
# Most frequent 2-grams first.
sortedNGrams = sorted(ngramCounts.items(), key=operator.itemgetter(1),
                      reverse=True)

# Report only the 2-grams that occur at least three times.
for ngram, cnt in sortedNGrams:
    if cnt >= 3:
        print(ngram, cnt)

给出：

united states 10
executive department 4
general government 4
same causes 3
legislative body 3
chief magistrate 3
called upon 3
whole country 3
government should 3
mr jefferson 3

如何将函数集成到《Web Scraping with Python》一书给出的代码中

How to integrate function into a piece of code proposed in the book "Web Scraping with Python"

python

n-gram

web-scraping