如何使用 feedparser 检测 RSS 频道中是否有新项目?

How to detect with feedparser if there are new items in an RSS channel?

我有以下代码。看懂代码之后,请留意其中两条用大写字母写的注释。我可以用 insert or ignore 来检测频道中是否有新项目,但我想改用一种基于 feed.updated_parsed 属性的更好机制。为什么它没有按预期工作?

from __future__ import unicode_literals
import feedparser
from sqlite3  import dbapi2 as sqlite
import sys, os
from datetime import datetime
from time import mktime
from daeutils import *
import re
import random
import optparse
import curses
import socket

def getActiveChannels():
  """Return every row of the channels table.

  Opens a fresh connection using the module-level ``connectionString``
  (not defined in this snippet; presumably supplied by
  ``from daeutils import *``) and closes it again before returning.

  Note: despite the name, the query has no ``where`` clause, so all
  channels are returned, not only active ones.

  Returns:
    A list of ``(id, title, xmlurl, updated)`` tuples, one per channel.
  """
  con = sqlite.connect(connectionString)
  try:
    cur = con.cursor()
    try:
      cur.execute("select id, title, xmlurl, updated from channels")
      return cur.fetchall()
    finally:
      cur.close()
  finally:
    # Release the connection even when the query raises.
    con.close()

def getItemsForChannel(xmlUrl, lastUpdate):
  """Fetch a feed and return its entries unless the feed is known to be stale.

  Args:
    xmlUrl: URL of the feed, passed to ``feedparser.parse``.
    lastUpdate: Local time of the previous update as a string in
      ``%Y-%m-%dT%H:%M:%S.%f`` format, or ``None``/empty when the channel
      has never been updated.

  Returns:
    ``[]`` when the feed-level ``updated`` timestamp is older than
    ``lastUpdate``; otherwise all entries of the parsed feed. When the feed
    publishes no feed-level timestamp, or ``lastUpdate`` is missing, all
    entries are returned and the caller is expected to de-duplicate.
  """
  import calendar  # local import: only needed for the UTC conversion below

  socket.setdefaulttimeout(60)  # default timeout for sockets created from now on
  feedparserDictionary = feedparser.parse(xmlUrl)
  items = feedparserDictionary.entries

  # Not every feed publishes a feed-level timestamp; .get() avoids the
  # AttributeError that attribute access raises for a missing key.
  feedUpdated = feedparserDictionary.feed.get("updated_parsed")
  if feedUpdated and lastUpdate:
    # feedparser reports *_parsed values in UTC. mktime() would interpret the
    # struct_time as local time and shift it by the UTC offset, which made
    # the comparison with the locally generated lastUpdate wrong. timegm()
    # reads it as UTC; fromtimestamp() then yields naive local time, the
    # same convention as lastUpdate.
    updatedTime = datetime.fromtimestamp(calendar.timegm(feedUpdated))
    lst = datetime.strptime(lastUpdate, "%Y-%m-%dT%H:%M:%S.%f")
    if updatedTime < lst:
      return []

  print("There are new %d items" % len(items))
  return items

def setChannelUpdateTime(xmlUrl, tm):
  """Store ``tm`` as the ``updated`` value of the channel whose URL is ``xmlUrl``.

  Opens its own connection using the module-level ``connectionString``,
  commits the change and prints a confirmation line.

  Args:
    xmlUrl: Value matched against the ``xmlurl`` column of ``channels``.
    tm: New value for the ``updated`` column.
  """
  con = sqlite.connect(connectionString)
  try:
    cur = con.cursor()
    try:
      # Bind exactly the two named parameters instead of passing locals(),
      # which also exposed the connection and cursor objects to the binder.
      cur.execute("update channels set updated = :tm where xmlurl = :xmlUrl",
                  {"tm": tm, "xmlUrl": xmlUrl})
      con.commit()
      print("updated successfully")
    finally:
      cur.close()
  finally:
    # Release the connection even when the update raises.
    con.close()

# Script entry point: for every channel, fetch its feed, insert entries not
# yet stored (the unique link column makes "insert or ignore" skip known
# ones), and stamp the channel with the current time when something was added.
if __name__ == "__main__":  # was "_main__", so the body never ran
  con = sqlite.connect(connectionString)
  try:
    for channel in getActiveChannels():
      channelId, channelTitle, channelXmlUrl, lastChannelUpdate = channel
      countOfNewItems = 0
      items = getItemsForChannel(channelXmlUrl, lastChannelUpdate)

      cur = con.cursor()
      try:
        for item in items:
          # Entries are dict-like. Tuple-unpacking one iterates its keys, so
          # fields are read by name; .get() yields None for absent fields.
          cur.execute("insert or ignore into feeds \
              (title, link, description, read, updated, channelid) \
              values (?, ?, ?, ?, ?, ?)", \
              (item.get("title"), item.get("link"), item.get("description"),
               0, item.get("updated"), channelId))

          # rowcount is 1 for an inserted row and 0 for an ignored duplicate.
          countOfNewItems += cur.rowcount
        # One commit per channel; it must happen before setChannelUpdateTime
        # writes through its own connection.
        con.commit()
      finally:
        cur.close()

      if countOfNewItems:
        print("Found new items")
        now = datetime.now().isoformat()
        # isoformat() omits the fraction when microseconds are 0, but the
        # stored value is later parsed with a format that requires "%f".
        if "." not in now:
          now = now + ".000000"
        setChannelUpdateTime(channelXmlUrl, now)
  finally:
    con.close()

这是sqlite中的两个表:

-- Text columns are declared "text": in SQLite a declared type of "string"
-- gets NUMERIC affinity, so numeric-looking text would be stored as a number.

-- One row per subscribed channel; xmlurl is unique per channel.
CREATE TABLE channels (
  id integer primary key,
  title text,
  text text,
  description text,
  type text,
  xmlurl text unique,
  htmlurl text,
  priority integer,
  active integer,
  deactivated integer,
  updated text
);

-- One row per feed entry; the unique link lets "insert or ignore" skip
-- entries that are already stored.
CREATE TABLE feeds (
  id integer primary key,
  title text,
  link text unique,
  description text,
  read integer,
  priority integer,
  updated text,
  channelid integer,
  foreign key (channelid) references channels(id)
);

我认为问题可能出在您比较的是提要(feed)级别的 updated 字段,而提要的发布者不一定能很好地维护这个字段;也可能是由于使用 isoformat 等方式导致的时区或格式差异。

无论如何,我认为比较每个条目(per entry)的 updated 属性,要比比较提要级别的 updated 属性好得多,后者主要用于使提要缓存失效。

这是一个可运行的示例,其中函数只返回新条目。

import socket
from datetime import datetime, timedelta
from time import mktime

import feedparser
from pprint import pprint


def getItemsForChannel(xmlUrl, lastUpdate):
    """Return the entries of a feed that are newer than ``lastUpdate``.

    Args:
        xmlUrl: URL of the feed, passed to ``feedparser.parse``.
        lastUpdate: ISO-8601 string of a naive local timestamp, e.g. the
            result of ``datetime.now().isoformat()``.

    Returns:
        A list of the parsed entries whose ``updated_parsed`` (falling back
        to ``published_parsed``) is later than ``lastUpdate``. Entries that
        carry no parsable date are included rather than silently dropped.
    """
    from calendar import timegm  # local import: only needed here

    lst = datetime.fromisoformat(lastUpdate)

    socket.setdefaulttimeout(60)

    parsed = feedparser.parse(xmlUrl)

    def is_new(entry):
        # A date field may be absent or None when it could not be parsed.
        stamp = entry.get("updated_parsed") or entry.get("published_parsed")
        if stamp is None:
            return True
        # feedparser reports *_parsed values in UTC. mktime() would treat the
        # struct_time as local time and skew it by the UTC offset; timegm()
        # reads it as UTC, and fromtimestamp() converts to naive local time
        # so it is comparable with lst.
        return datetime.fromtimestamp(timegm(stamp)) > lst

    items = [entry for entry in parsed.entries if is_new(entry)]
    print("There are new {} items".format(len(items)))
    return items


# Demo: pretty-print the entries of a Server Fault tag feed that are newer
# than three hours before now.
pprint(
    getItemsForChannel(
        xmlUrl='http://serverfault.com/feeds/tag/+or+linux+or+ubuntu+or+vim+or+rsync+or+gnome',
        lastUpdate=(datetime.now() - timedelta(hours=3)).isoformat(),
    )
)

它使用 ISO 格式(fromisoformat / isoformat)来读写数据库中保存的上次解析时间,并逐个条目比较其 updated 属性,而不是依据提要级别的 updated 属性。