如何将一个函数集成到《Web Scraping with Python》一书给出的代码中
How to integrate a function into a piece of code proposed in the book "Web Scraping with Python"
我正在阅读 "Web Scraping with Python"。在第 8 章中,作者通过一个 ngrams 示例展示了以下代码:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator
def cleanInput(input):
    """Normalize raw text and split it into a list of lowercase words.

    Steps: collapse newline runs into a space and lowercase the text, delete
    bracketed digit runs such as ``[12]``, squeeze repeated spaces, drop every
    non-ASCII character, then split on spaces and strip leading/trailing
    punctuation from each token.

    Args:
        input: The text to clean, as a ``str``. (The name shadows the builtin
            ``input``; it is kept so existing keyword callers keep working.)

    Returns:
        A list of words. Tokens of one character or less are discarded,
        except for the words "a" and "i".
    """
    # Raw strings keep "\[" and "\n" as regex escapes instead of (invalid)
    # string escapes.
    text = re.sub(r'\n+', " ", input).lower()
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Round-trip through bytes to silently drop anything outside ASCII.
    text = text.encode("utf-8").decode("ascii", "ignore")
    words = []
    for item in text.split(' '):
        item = item.strip(string.punctuation)
        # The text is already lowercase, so a plain comparison is enough.
        if len(item) > 1 or item in ("a", "i"):
            words.append(item)
    return words
def ngrams(input, n):
    """Count every n-gram of consecutive words in a text.

    Args:
        input: Raw text; it is tokenized with ``cleanInput`` first.
        n: Number of consecutive words per n-gram.

    Returns:
        A dict mapping each n-gram (its words joined by single spaces) to the
        number of times it occurs. The dict is empty when the text has fewer
        than ``n`` words.
    """
    words = cleanInput(input)
    output = {}
    # Slide a window of n words over the token list.
    for i in range(len(words) - n + 1):
        ngramTemp = " ".join(words[i:i + n])
        output[ngramTemp] = output.get(ngramTemp, 0) + 1
    return output
# Download the speech, count its 2-grams and print them, most frequent first.
# The ``with`` block closes the HTTP response once the body has been read.
with urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt") as response:
    content = str(response.read(), 'utf-8')
# Use a separate name for the result: assigning it to ``ngrams`` would replace
# the function and make it impossible to call again.
bigrams = ngrams(content, 2)
sortedNGrams = sorted(bigrams.items(), key=operator.itemgetter(1),
                      reverse=True)
print(sortedNGrams)
效果很好,但结果包含一堆没有意义的词。为了改进它,作者说可以使用一个新函数:
# Common English words used to filter out uninformative n-grams.
# Built once at import time; a frozenset gives constant-time membership tests.
_COMMON_WORDS = frozenset({
    "the", "be", "and", "of", "a", "in", "to", "have", "it",
    "i", "that", "for", "you", "he", "with", "on", "do", "say",
    "this", "they", "is", "an", "at", "but", "we", "his",
    "from", "not", "by", "she", "or", "as", "what",
    "go", "their", "can", "who", "get", "if", "would", "her",
    "all", "my", "make", "about", "know", "will", "up",
    "one", "time", "has", "been", "there", "year", "so",
    "think", "when", "which", "them", "some", "me", "people",
    "take", "out", "into", "just", "see", "him", "your", "come",
    "could", "now", "than", "like", "other", "how", "then",
    "its", "our", "two", "more", "these", "want", "way", "look",
    "first", "also", "new", "because", "day", "use",
    "no", "man", "find", "here", "thing", "give", "many",
    "well",
})


def isCommon(ngram):
    """Tell whether an n-gram contains at least one common word.

    Args:
        ngram: An iterable of words (for example the list of words that make
            up one n-gram). Words are compared as-is, so they are expected to
            be lowercase already.

    Returns:
        True if any word of ``ngram`` is in the common-word set, else False
        (including for an empty ``ngram``).
    """
    return any(word in _COMMON_WORDS for word in ngram)
但是作者没有说的是如何应用函数得到书中所示的结果:
('united states', 10), ('executive department', 4), ('general government', 4),
('called upon', 3), ('government should', 3), ('whole country', 3),
('mr jefferson', 3), ('chief magistrate', 3), ('same causes', 3),
('legislative body', 3)
有什么办法吗?
提前致谢。
下面的代码似乎可以产生你想要的输出:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator
def cleanInput(input):
    """Normalize raw text and split it into a list of lowercase words.

    Steps: collapse newline runs into a space and lowercase the text, delete
    bracketed digit runs such as ``[12]``, squeeze repeated spaces, drop every
    non-ASCII character, then split on spaces and strip leading/trailing
    punctuation from each token.

    Args:
        input: The text to clean, as a ``str``. (The name shadows the builtin
            ``input``; it is kept so existing keyword callers keep working.)

    Returns:
        A list of words. Tokens of one character or less are discarded,
        except for the words "a" and "i".
    """
    # Raw strings keep "\[" and "\n" as regex escapes instead of (invalid)
    # string escapes.
    text = re.sub(r'\n+', " ", input).lower()
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Round-trip through bytes to silently drop anything outside ASCII.
    text = text.encode("utf-8").decode("ascii", "ignore")
    words = []
    for item in text.split(' '):
        item = item.strip(string.punctuation)
        # The text is already lowercase, so a plain comparison is enough.
        if len(item) > 1 or item in ("a", "i"):
            words.append(item)
    return words
def ngrams(input, n):
    """Count the n-grams of a text, skipping those that contain a common word.

    Args:
        input: Raw text; it is tokenized with ``cleanInput`` first.
        n: Number of consecutive words per n-gram.

    Returns:
        A dict mapping each kept n-gram (its words joined by single spaces)
        to the number of times it occurs. N-grams for which ``isCommon``
        returns True are left out entirely.
    """
    tokens = cleanInput(input)
    output = {}
    # Slide a window of n words over the token list.
    for i in range(len(tokens) - n + 1):
        words = tokens[i:i + n]
        # Skip the n-gram if any of the words forming it is "common".
        if isCommon(words):
            continue
        ngramTemp = " ".join(words)
        output[ngramTemp] = output.get(ngramTemp, 0) + 1
    return output
# Download the speech, count its filtered 2-grams and print the frequent ones.
# The ``with`` block closes the HTTP response once the body has been read.
with urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt") as response:
    content = str(response.read(), 'utf-8')
# Use a separate name for the result: assigning it to ``ngrams`` would replace
# the function and make it impossible to call again.
bigrams = ngrams(content, 2)
sortedNGrams = sorted(bigrams.items(), key=operator.itemgetter(1),
                      reverse=True)
# Only report n-grams that occur at least three times.
for ngram, cnt in sortedNGrams:
    if cnt >= 3:
        print(ngram, cnt)
给出:
united states 10
executive department 4
general government 4
same causes 3
legislative body 3
chief magistrate 3
called upon 3
whole country 3
government should 3
mr jefferson 3
我正在阅读 "Web Scraping with Python"。在第 8 章中,作者通过一个 ngrams 示例展示了以下代码:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator
def cleanInput(input):
    """Normalize raw text and split it into a list of lowercase words.

    Steps: collapse newline runs into a space and lowercase the text, delete
    bracketed digit runs such as ``[12]``, squeeze repeated spaces, drop every
    non-ASCII character, then split on spaces and strip leading/trailing
    punctuation from each token.

    Args:
        input: The text to clean, as a ``str``. (The name shadows the builtin
            ``input``; it is kept so existing keyword callers keep working.)

    Returns:
        A list of words. Tokens of one character or less are discarded,
        except for the words "a" and "i".
    """
    # Raw strings keep "\[" and "\n" as regex escapes instead of (invalid)
    # string escapes.
    text = re.sub(r'\n+', " ", input).lower()
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Round-trip through bytes to silently drop anything outside ASCII.
    text = text.encode("utf-8").decode("ascii", "ignore")
    words = []
    for item in text.split(' '):
        item = item.strip(string.punctuation)
        # The text is already lowercase, so a plain comparison is enough.
        if len(item) > 1 or item in ("a", "i"):
            words.append(item)
    return words
def ngrams(input, n):
    """Count every n-gram of consecutive words in a text.

    Args:
        input: Raw text; it is tokenized with ``cleanInput`` first.
        n: Number of consecutive words per n-gram.

    Returns:
        A dict mapping each n-gram (its words joined by single spaces) to the
        number of times it occurs. The dict is empty when the text has fewer
        than ``n`` words.
    """
    words = cleanInput(input)
    output = {}
    # Slide a window of n words over the token list.
    for i in range(len(words) - n + 1):
        ngramTemp = " ".join(words[i:i + n])
        output[ngramTemp] = output.get(ngramTemp, 0) + 1
    return output
# Download the speech, count its 2-grams and print them, most frequent first.
# The ``with`` block closes the HTTP response once the body has been read.
with urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt") as response:
    content = str(response.read(), 'utf-8')
# Use a separate name for the result: assigning it to ``ngrams`` would replace
# the function and make it impossible to call again.
bigrams = ngrams(content, 2)
sortedNGrams = sorted(bigrams.items(), key=operator.itemgetter(1),
                      reverse=True)
print(sortedNGrams)
效果很好,但结果包含一堆没有意义的词。为了改进它,作者说可以使用一个新函数:
# Common English words used to filter out uninformative n-grams.
# Built once at import time; a frozenset gives constant-time membership tests.
_COMMON_WORDS = frozenset({
    "the", "be", "and", "of", "a", "in", "to", "have", "it",
    "i", "that", "for", "you", "he", "with", "on", "do", "say",
    "this", "they", "is", "an", "at", "but", "we", "his",
    "from", "not", "by", "she", "or", "as", "what",
    "go", "their", "can", "who", "get", "if", "would", "her",
    "all", "my", "make", "about", "know", "will", "up",
    "one", "time", "has", "been", "there", "year", "so",
    "think", "when", "which", "them", "some", "me", "people",
    "take", "out", "into", "just", "see", "him", "your", "come",
    "could", "now", "than", "like", "other", "how", "then",
    "its", "our", "two", "more", "these", "want", "way", "look",
    "first", "also", "new", "because", "day", "use",
    "no", "man", "find", "here", "thing", "give", "many",
    "well",
})


def isCommon(ngram):
    """Tell whether an n-gram contains at least one common word.

    Args:
        ngram: An iterable of words (for example the list of words that make
            up one n-gram). Words are compared as-is, so they are expected to
            be lowercase already.

    Returns:
        True if any word of ``ngram`` is in the common-word set, else False
        (including for an empty ``ngram``).
    """
    return any(word in _COMMON_WORDS for word in ngram)
但是作者没有说的是如何应用函数得到书中所示的结果:
('united states', 10), ('executive department', 4), ('general government', 4),
('called upon', 3), ('government should', 3), ('whole country', 3),
('mr jefferson', 3), ('chief magistrate', 3), ('same causes', 3),
('legislative body', 3)
有什么办法吗?
提前致谢。
下面的代码似乎可以产生你想要的输出:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator
def cleanInput(input):
    """Normalize raw text and split it into a list of lowercase words.

    Steps: collapse newline runs into a space and lowercase the text, delete
    bracketed digit runs such as ``[12]``, squeeze repeated spaces, drop every
    non-ASCII character, then split on spaces and strip leading/trailing
    punctuation from each token.

    Args:
        input: The text to clean, as a ``str``. (The name shadows the builtin
            ``input``; it is kept so existing keyword callers keep working.)

    Returns:
        A list of words. Tokens of one character or less are discarded,
        except for the words "a" and "i".
    """
    # Raw strings keep "\[" and "\n" as regex escapes instead of (invalid)
    # string escapes.
    text = re.sub(r'\n+', " ", input).lower()
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Round-trip through bytes to silently drop anything outside ASCII.
    text = text.encode("utf-8").decode("ascii", "ignore")
    words = []
    for item in text.split(' '):
        item = item.strip(string.punctuation)
        # The text is already lowercase, so a plain comparison is enough.
        if len(item) > 1 or item in ("a", "i"):
            words.append(item)
    return words
def ngrams(input, n):
    """Count the n-grams of a text, skipping those that contain a common word.

    Args:
        input: Raw text; it is tokenized with ``cleanInput`` first.
        n: Number of consecutive words per n-gram.

    Returns:
        A dict mapping each kept n-gram (its words joined by single spaces)
        to the number of times it occurs. N-grams for which ``isCommon``
        returns True are left out entirely.
    """
    tokens = cleanInput(input)
    output = {}
    # Slide a window of n words over the token list.
    for i in range(len(tokens) - n + 1):
        words = tokens[i:i + n]
        # Skip the n-gram if any of the words forming it is "common".
        if isCommon(words):
            continue
        ngramTemp = " ".join(words)
        output[ngramTemp] = output.get(ngramTemp, 0) + 1
    return output
# Download the speech, count its filtered 2-grams and print the frequent ones.
# The ``with`` block closes the HTTP response once the body has been read.
with urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt") as response:
    content = str(response.read(), 'utf-8')
# Use a separate name for the result: assigning it to ``ngrams`` would replace
# the function and make it impossible to call again.
bigrams = ngrams(content, 2)
sortedNGrams = sorted(bigrams.items(), key=operator.itemgetter(1),
                      reverse=True)
# Only report n-grams that occur at least three times.
for ngram, cnt in sortedNGrams:
    if cnt >= 3:
        print(ngram, cnt)
给出:
united states 10
executive department 4
general government 4
same causes 3
legislative body 3
chief magistrate 3
called upon 3
whole country 3
government should 3
mr jefferson 3