如何将函数集成到《Web Scraping with Python》一书给出的代码中

How to integrate a function into a piece of code from the book "Web Scraping with Python"

我正在阅读 "Web Scraping with Python"。在第 8 章中,作者通过一个 ngrams 示例展示了以下代码:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator

def cleanInput(input):
    """Normalize raw text and split it into a list of cleaned words.

    The text is lower-cased, runs of newlines and runs of spaces are each
    collapsed to a single space, bracketed numeric markers such as ``[12]``
    (or an empty ``[]``) are removed, and every non-ASCII character is
    dropped. Each remaining token has leading/trailing punctuation stripped;
    tokens of one character or fewer are discarded unless they are the
    words "a" or "i".

    Args:
        input: The text to clean. The name shadows the builtin but is kept
            so that existing callers (including keyword callers) still work.

    Returns:
        A list of cleaned, lower-case words in their original order.
    """
    # Raw strings keep regex escapes such as \[ from being read as (invalid)
    # string escapes, which newer Python versions warn about.
    text = re.sub(r'\n+', " ", input).lower()
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Round-trip through bytes to silently drop every non-ASCII character.
    text = text.encode("UTF-8").decode("ascii", "ignore")
    words = []
    for token in text.split(' '):
        word = token.strip(string.punctuation)
        # The text is already lower-case, so a plain comparison is enough.
        if len(word) > 1 or word in ('a', 'i'):
            words.append(word)
    return words

def ngrams(input, n):
    """Count every run of ``n`` consecutive cleaned words in ``input``.

    The text is first passed through cleanInput(); each window of ``n``
    adjacent words is then joined with single spaces and tallied.

    Args:
        input: The raw text to analyse.
        n: Number of words per n-gram.

    Returns:
        A dict mapping each n-gram string to the number of times it occurs.
        It is empty when the cleaned text has fewer than ``n`` words.
    """
    tokens = cleanInput(input)
    counts = {}
    windowCount = len(tokens) - n + 1
    for start in range(windowCount):
        phrase = " ".join(tokens[start:start + n])
        counts[phrase] = counts.get(phrase, 0) + 1
    return counts

# Download the speech; the context manager closes the HTTP response once
# the body has been read.
with urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt") as response:
    content = str(response.read(), 'utf-8')

# Keep the counts under their own name: rebinding `ngrams` to the result
# would make the ngrams() function uncallable afterwards.
bigramCounts = ngrams(content, 2)
# Sort the (ngram, count) pairs by count, most frequent first.
sortedNGrams = sorted(bigramCounts.items(), key=operator.itemgetter(1),
                      reverse=True)
print(sortedNGrams)

效果很好,但结果中包含一堆没有意义的词。为了改进这一点,作者说可以使用一个新函数:

# Common English words; an n-gram containing any of them is treated as noise.
# Stored as a frozenset so it is built once and membership tests are O(1).
COMMON_WORDS = frozenset([
    "the", "be", "and", "of", "a", "in", "to", "have", "it",
    "i", "that", "for", "you", "he", "with", "on", "do", "say",
    "this", "they", "is", "an", "at", "but", "we", "his",
    "from", "that", "not", "by", "she", "or", "as", "what",
    "go", "their", "can", "who", "get", "if", "would", "her",
    "all", "my", "make", "about", "know", "will", "as", "up",
    "one", "time", "has", "been", "there", "year", "so",
    "think", "when", "which", "them", "some", "me", "people",
    "take", "out", "into", "just", "see", "him", "your", "come",
    "could", "now", "than", "like", "other", "how", "then",
    "its", "our", "two", "more", "these", "want", "way", "look",
    "first", "also", "new", "because", "day", "more", "use",
    "no", "man", "find", "here", "thing", "give", "many",
    "well",
])


def isCommon(ngram):
    """Return True if any word of ``ngram`` is a common English word.

    Args:
        ngram: An iterable of word strings (for example the list of words
            that make up one n-gram). Words are compared exactly as given,
            so they should already be lower-case.

    Returns:
        True when at least one word is in COMMON_WORDS, otherwise False
        (including for an empty ``ngram``).
    """
    return any(word in COMMON_WORDS for word in ngram)

但是作者没有说明如何应用这个函数来得到书中所示的结果:

('united states', 10), ('executive department', 4), ('general government', 4),
('called upon', 3), ('government should', 3), ('whole country', 3),
('mr jefferson', 3), ('chief magistrate', 3), ('same causes', 3),
('legislative body', 3)

有什么办法吗?

提前致谢。

下面的代码(需要配合问题中给出的 isCommon 函数一起使用)似乎可以产生你想要的输出:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator

def cleanInput(input):
    """Normalize raw text and split it into a list of cleaned words.

    Processing steps, in order: collapse runs of newlines to one space and
    lower-case the text; remove bracketed numeric markers such as ``[12]``
    (or an empty ``[]``); collapse runs of spaces; drop all non-ASCII
    characters; split on single spaces; strip leading/trailing punctuation
    from each token. Tokens of one character or fewer are discarded unless
    they are the words "a" or "i".

    Args:
        input: The text to clean. The name shadows the builtin but is kept
            so that existing callers (including keyword callers) still work.

    Returns:
        A list of cleaned, lower-case words in their original order.
    """
    # Raw strings keep regex escapes such as \[ from being read as (invalid)
    # string escapes, which newer Python versions warn about.
    text = re.sub(r'\n+', " ", input).lower()
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Encode then decode with "ignore" to discard every non-ASCII character.
    text = text.encode("UTF-8").decode("ascii", "ignore")
    stripped = (token.strip(string.punctuation) for token in text.split(' '))
    # The text is already lower-case, so a plain comparison is enough.
    return [word for word in stripped if len(word) > 1 or word in ('a', 'i')]

def ngrams(input, n):
    """Count the n-grams of ``input`` that contain no common word.

    The text is cleaned with cleanInput(); every window of ``n`` adjacent
    words is then examined, and windows for which isCommon() returns True
    are skipped. The remaining windows are joined with single spaces and
    tallied.

    Args:
        input: The raw text to analyse.
        n: Number of words per n-gram.

    Returns:
        A dict mapping each retained n-gram string to its occurrence count.
    """
    tokens = cleanInput(input)
    counts = {}
    for start in range(len(tokens) - n + 1):
        window = tokens[start:start + n]
        # Only tally n-grams in which none of the words is "common".
        if not isCommon(window):
            phrase = " ".join(window)
            counts[phrase] = counts.get(phrase, 0) + 1
    return counts

# Download the speech. The original line had unbalanced parentheses
# (`.read))`), which is a SyntaxError; the context manager also closes the
# HTTP response once the body has been read.
with urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt") as response:
    content = str(response.read(), 'utf-8')

# Keep the counts under their own name: rebinding `ngrams` to the result
# would make the ngrams() function uncallable afterwards.
bigramCounts = ngrams(content, 2)
# Sort the (ngram, count) pairs by count, most frequent first.
sortedNGrams = sorted(bigramCounts.items(), key=operator.itemgetter(1),
                      reverse=True)

# Print only the 2-grams that occur at least three times.
for ngram, cnt in sortedNGrams:
    if cnt >= 3:
        print(ngram, cnt)

给出:

united states 10
executive department 4
general government 4
same causes 3
legislative body 3
chief magistrate 3
called upon 3
whole country 3
government should 3
mr jefferson 3