URLError: <urlopen error unknown url type: pubmed>
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

class FrequencySummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        Initialize the text summarizer.
        Words with a normalized frequency lower than min_cut
        or higher than max_cut will be ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """
        Compute the frequency of each word.
        Input:
          word_sent, a list of sentences already tokenized.
        Output:
          freq, a dictionary where freq[w] is the frequency of w.
        """
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        # normalize frequencies and filter out the extremes
        m = float(max(freq.values()))
        # iterate over a copy of the keys so entries can be deleted in the loop
        for w in list(freq):
            freq[w] = freq[w] / m
            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
                del freq[w]
        return freq

    def summarize(self, text, n):
        """
        Return a list of n sentences
        which represent the summary of text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i, sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]

    def _rank(self, ranking, n):
        """Return the indices of the n sentences with the highest ranking."""
        return nlargest(n, ranking, key=ranking.get)
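Before wiring the class to a live feed, here is a quick sanity check on a hard-coded string (a minimal sketch; it assumes the NLTK 'punkt' and 'stopwords' data have already been fetched via nltk.download):

# Sanity check: summarize a short hard-coded text down to 2 sentences.
sample = (
    "The cat sat on the mat. The cat chased a mouse across the mat. "
    "A dog watched the cat and the mouse. The mouse escaped under the mat."
)
fs = FrequencySummarizer()
for sentence in fs.summarize(sample, 2):
    print('*', sentence)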
from urllib.request import urlopen
from bs4 import BeautifulSoup

def get_only_text(url):
    """
    Return the title and the text of the article
    at the specified url.
    """
    page = urlopen(url).read().decode('utf8')
    soup = BeautifulSoup(page, 'html.parser')
    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    return soup.title.text, text

feed_xml = urlopen('http://feeds.bbci.co.uk/news/rss.xml').read()
# parse the RSS as XML (the 'xml' parser requires lxml to be installed)
feed = BeautifulSoup(feed_xml.decode('utf8'), 'xml')
# a list (unlike a map object) can be sliced, e.g. to_summarize[:5]
to_summarize = [guid.text for guid in feed.find_all('guid')]

fs = FrequencySummarizer()
for article_url in to_summarize:
    title, text = get_only_text(article_url)
    print('----------------------------------')
    print(title)
    for s in fs.summarize(text, 2):
        print('*', s)
The code above works for the RSS feed http://feeds.bbci.co.uk/news/rss.xml, but for this RSS feed it gives me an error: https://pubmed.ncbi.nlm.nih.gov/rss/search/1jq74NZspErHZpX3J2B97GZMF7znbt0391VdUGxV1hA6J5hMMP/?limit=15&utm_campaign=pubmed-2&fc=20200625034601
The error is:
URLError: <urlopen error unknown url type: pubmed>
What is wrong with the second RSS feed, or do I have to handle it differently?
The problem is the following: the command
feed.find_all('guid')
gives me
[<guid ispermalink="false">pubmed:32475840</guid>,
<guid ispermalink="false">pubmed:32461484</guid>,
<guid ispermalink="false">pubmed:32461442</guid>,
<guid ispermalink="false">pubmed:32355441</guid>,
<guid ispermalink="false">pubmed:32343707</guid>,
<guid ispermalink="false">pubmed:32317470</guid>,
<guid ispermalink="false">pubmed:32317460</guid>,
<guid ispermalink="false">pubmed:32317451</guid>,
<guid ispermalink="false">pubmed:32312587</guid>,
<guid ispermalink="false">pubmed:32298984</guid>,
<guid ispermalink="false">pubmed:32292851</guid>,
<guid ispermalink="false">pubmed:32280309</guid>,
<guid ispermalink="false">pubmed:32210693</guid>,
<guid ispermalink="false">pubmed:32117292</guid>,
<guid ispermalink="false">pubmed:32116486</guid>]
Of course pubmed:32116486 is not a website link: urlopen treats the pubmed: prefix as a URL scheme it does not recognize, which is exactly what "unknown url type: pubmed" means. So I had to change the search criterion from
feed.find_all('guid')
to
list_of_sites = []  # must be initialized before the loop
for link in feed.find_all('a'):
    list_of_sites.append(link.get('href'))
to_summarize = list_of_sites  # map(lambda k: k, ...) was a redundant identity map
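An alternative that avoids scraping <a> tags altogether: each guid has the form pubmed:<PMID>, so the article URLs can be rebuilt from the IDs directly. This is a minimal sketch; it assumes PubMed serves article pages at https://pubmed.ncbi.nlm.nih.gov/<PMID>/, which the feed itself does not guarantee:

from urllib.request import urlopen
from bs4 import BeautifulSoup

PUBMED_FEED = ('https://pubmed.ncbi.nlm.nih.gov/rss/search/'
               '1jq74NZspErHZpX3J2B97GZMF7znbt0391VdUGxV1hA6J5hMMP/'
               '?limit=15&utm_campaign=pubmed-2&fc=20200625034601')

feed_xml = urlopen(PUBMED_FEED).read()
feed = BeautifulSoup(feed_xml.decode('utf8'), 'xml')

to_summarize = []
for guid in feed.find_all('guid'):
    # assumed URL scheme: "pubmed:32475840" -> "https://pubmed.ncbi.nlm.nih.gov/32475840/"
    pmid = guid.text.split(':', 1)[1]
    to_summarize.append('https://pubmed.ncbi.nlm.nih.gov/%s/' % pmid)

Either way, once to_summarize contains real http(s) URLs, the original loop over get_only_text and fs.summarize runs unchanged.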