URLError: <urlopen error unknown url type: pubmed>
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

class FrequencySummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        Initialize the text summarizer.
        Words with a normalized frequency lower than min_cut
        or higher than max_cut will be ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """
        Compute the frequency of each word.
        Input:
          word_sent, a list of sentences already tokenized.
        Output:
          freq, a dictionary where freq[w] is the frequency of w.
        """
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        # normalize frequencies and filter out the extremes
        m = float(max(freq.values()))
        # iterate over a copy of the keys so entries can be deleted in the loop
        for w in list(freq):
            freq[w] = freq[w] / m
            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
                del freq[w]
        return freq

    def summarize(self, text, n):
        """
        Return a list of n sentences
        which represent the summary of text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i, sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]

    def _rank(self, ranking, n):
        """Return the indices of the n sentences with the highest ranking."""
        return nlargest(n, ranking, key=ranking.get)
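Before wiring the class to a live feed, here is a quick sanity check on a hard-coded string (a minimal sketch; it assumes the NLTK 'punkt' and 'stopwords' data have already been fetched via nltk.download):

# Sanity check: summarize a short hard-coded text down to 2 sentences.
sample = (
    "The cat sat on the mat. The cat chased a mouse across the mat. "
    "A dog watched the cat and the mouse. The mouse escaped under the mat."
)
fs = FrequencySummarizer()
for sentence in fs.summarize(sample, 2):
    print('*', sentence)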
from urllib.request import urlopen
from bs4 import BeautifulSoup

def get_only_text(url):
    """
    Return the title and the text of the article
    at the specified url.
    """
    page = urlopen(url).read().decode('utf8')
    soup = BeautifulSoup(page, 'html.parser')
    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    return soup.title.text, text

feed_xml = urlopen('http://feeds.bbci.co.uk/news/rss.xml').read()
# parse the RSS as XML (the 'xml' parser requires lxml to be installed)
feed = BeautifulSoup(feed_xml.decode('utf8'), 'xml')
# a list (unlike a map object) can be sliced, e.g. to_summarize[:5]
to_summarize = [guid.text for guid in feed.find_all('guid')]

fs = FrequencySummarizer()
for article_url in to_summarize:
    title, text = get_only_text(article_url)
    print('----------------------------------')
    print(title)
    for s in fs.summarize(text, 2):
        print('*', s)
The code above works for the RSS feed http://feeds.bbci.co.uk/news/rss.xml, but for this RSS feed it gives me an error: https://pubmed.ncbi.nlm.nih.gov/rss/search/1jq74NZspErHZpX3J2B97GZMF7znbt0391VdUGxV1hA6J5hMMP/?limit=15&utm_campaign=pubmed-2&fc=20200625034601
The error is:
URLError: <urlopen error unknown url type: pubmed>
What is wrong with the second RSS feed, or do I have to handle it differently?
The problem is the following: the command
feed.find_all('guid')
gives me
[<guid ispermalink="false">pubmed:32475840</guid>,
<guid ispermalink="false">pubmed:32461484</guid>,
<guid ispermalink="false">pubmed:32461442</guid>,
<guid ispermalink="false">pubmed:32355441</guid>,
<guid ispermalink="false">pubmed:32343707</guid>,
<guid ispermalink="false">pubmed:32317470</guid>,
<guid ispermalink="false">pubmed:32317460</guid>,
<guid ispermalink="false">pubmed:32317451</guid>,
<guid ispermalink="false">pubmed:32312587</guid>,
<guid ispermalink="false">pubmed:32298984</guid>,
<guid ispermalink="false">pubmed:32292851</guid>,
<guid ispermalink="false">pubmed:32280309</guid>,
<guid ispermalink="false">pubmed:32210693</guid>,
<guid ispermalink="false">pubmed:32117292</guid>,
<guid ispermalink="false">pubmed:32116486</guid>]
Of course pubmed:32116486 is not a website link: urlopen treats the pubmed: prefix as a URL scheme it does not recognize, which is exactly what "unknown url type: pubmed" means. So I had to change the search criterion from
feed.find_all('guid')
to
list_of_sites = []  # must be initialized before the loop
for link in feed.find_all('a'):
    list_of_sites.append(link.get('href'))
to_summarize = list_of_sites  # map(lambda k: k, ...) was a redundant identity map
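An alternative that avoids scraping <a> tags altogether: each guid has the form pubmed:<PMID>, so the article URLs can be rebuilt from the IDs directly. This is a minimal sketch; it assumes PubMed serves article pages at https://pubmed.ncbi.nlm.nih.gov/<PMID>/, which the feed itself does not guarantee:

from urllib.request import urlopen
from bs4 import BeautifulSoup

PUBMED_FEED = ('https://pubmed.ncbi.nlm.nih.gov/rss/search/'
               '1jq74NZspErHZpX3J2B97GZMF7znbt0391VdUGxV1hA6J5hMMP/'
               '?limit=15&utm_campaign=pubmed-2&fc=20200625034601')

feed_xml = urlopen(PUBMED_FEED).read()
feed = BeautifulSoup(feed_xml.decode('utf8'), 'xml')

to_summarize = []
for guid in feed.find_all('guid'):
    # assumed URL scheme: "pubmed:32475840" -> "https://pubmed.ncbi.nlm.nih.gov/32475840/"
    pmid = guid.text.split(':', 1)[1]
    to_summarize.append('https://pubmed.ncbi.nlm.nih.gov/%s/' % pmid)

Either way, once to_summarize contains real http(s) URLs, the original loop over get_only_text and fs.summarize runs unchanged.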