如果 RSS 频道中有新项目,如何使用 feedparser 进行检测?
How to detect with feedparser if there are new items in an RSS channel?
我有以下代码。读懂代码后,请留意其中两条用大写字母写的注释。
我可以用 insert or ignore 来检测频道中是否有新项目,
但我想尝试一种更好的机制:利用 feed.updated_parsed 属性。
为什么它没有按预期工作?
from __future__ import unicode_literals
import feedparser
from sqlite3 import dbapi2 as sqlite
import sys, os
from datetime import datetime
from time import mktime
from daeutils import *
import re
import random
import optparse
import curses
import socket
def getActiveChannels():
    """Return the RSS channels stored in the ``channels`` table.

    NOTE(review): despite the name, the query does not filter on an
    ``active`` flag -- confirm whether inactive channels should be skipped.

    Returns:
        A list of ``(id, title, xmlurl, updated)`` tuples, one per row.
    """
    con = sqlite.connect(connectionString)
    try:
        cur = con.cursor()
        try:
            cur.execute("select id, title, xmlurl, updated from channels")
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        # Close even when the query raises, so the connection is not leaked.
        con.close()
def getItemsForChannel(xmlUrl, lastUpdate):
    """Fetch a feed and return its entries unless the feed looks unchanged.

    The feed-level ``updated_parsed`` value is compared with ``lastUpdate``;
    when the feed claims to be older than the last recorded update, no
    entries are returned.

    Args:
        xmlUrl: URL of the feed to download.
        lastUpdate: naive local timestamp string in the form
            ``%Y-%m-%dT%H:%M:%S.%f``, or ``None``/empty when the channel
            has never been updated.

    Returns:
        A list of feedparser entries, or ``[]`` when the feed reports no
        update since ``lastUpdate``.
    """
    import calendar  # local import: only this function needs it

    socket.setdefaulttimeout(60)
    feedparserDictionary = feedparser.parse(xmlUrl)
    items = feedparserDictionary.entries

    # Not every feed carries a feed-level timestamp; without one (or without
    # a stored lastUpdate) there is nothing to compare, so return everything.
    feedStamp = getattr(feedparserDictionary.feed, "updated_parsed", None)
    if lastUpdate and feedStamp is not None:
        # feedparser's *_parsed values are UTC struct_time tuples. timegm()
        # converts UTC to an epoch value; mktime() would wrongly read the
        # tuple as local time and shift the result by the UTC offset.
        updatedTime = datetime.fromtimestamp(calendar.timegm(feedStamp))
        lst = datetime.strptime(lastUpdate, "%Y-%m-%dT%H:%M:%S.%f")
        if updatedTime < lst:
            return []  # HERE NOT BEHAVING CORRECTLY, WHEN I COMMENT THIS LINE, THERE MAY BE A FEW ITEMS
    print("There are new %d items" % len(items))
    return items
def setChannelUpdateTime(xmlUrl, tm):
    """Store ``tm`` as the ``updated`` value of the channel with ``xmlUrl``.

    Args:
        xmlUrl: feed URL identifying the row in ``channels``.
        tm: timestamp string to store; it is written exactly as given.
    """
    con = sqlite.connect(connectionString)
    try:
        cur = con.cursor()
        try:
            # Bind only the two values the statement names instead of
            # passing locals(), which would also hand over con and cur.
            cur.execute(
                "update channels set updated = :tm where xmlurl = :xmlUrl",
                {"tm": tm, "xmlUrl": xmlUrl},
            )
            con.commit()
            print("updated successfully")
        finally:
            cur.close()
    finally:
        # Release the connection even if the update fails.
        con.close()
# Script entry point: fetch every channel, store unseen entries in ``feeds``
# and record the time of the run for channels that produced new rows.
if __name__ == "__main__":  # was "_main__", which never matched
    con = sqlite.connect(connectionString)
    try:
        for channel in getActiveChannels():
            channelId, channelTitle, channelXmlUrl, lastChannelUpdate = channel
            countOfNewItems = 0
            items = getItemsForChannel(channelXmlUrl, lastChannelUpdate)
            cur = con.cursor()
            try:
                for item in items:
                    # feedparser entries are dict-like, so fields are read by
                    # key; tuple-unpacking an entry would iterate its keys.
                    # feedparser exposes the RSS <description> as "summary".
                    cur.execute(
                        "insert or ignore into feeds "
                        "(title, link, description, read, updated, channelid) "
                        "values (?, ?, ?, ?, ?, ?)",
                        (item.get("title"), item.get("link"),
                         item.get("summary"), 0, item.get("updated"),
                         channelId))
                    countOfNewItems += cur.rowcount  # WHICH ARE INSERTED HERE
                con.commit()
            finally:
                cur.close()
            if countOfNewItems:
                print("Found new items")
                # %f always emits microseconds, matching the format that
                # getItemsForChannel parses the stored value with.
                now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
                setChannelUpdateTime(channelXmlUrl, now)
    finally:
        con.close()
这是sqlite中的两个表:
-- One row per subscribed feed; xmlurl is unique and `updated` holds the text timestamp the script writes after it finds new items.
CREATE TABLE channels (id integer primary key, title string, text string, description string, type string, xmlurl string unique, htmlurl string, priority integer, active integer, deactivated integer, updated text);
-- One row per feed entry; the unique `link` column is what lets "insert or ignore" skip entries that are already stored.
CREATE TABLE feeds (id integer primary key, title string, link string unique, description string, read integer, priority integer, updated string, channelid integer, foreign key (channelid) references channels(id));
我认为可能的原因是:您比较的是提要(feed)级别的 updated 字段,
而提要的发布者未必会正确维护这个字段;也可能是使用 isoformat 等方式带来的时区格式问题。
无论如何,我认为比较每个条目(entry)自身的 updated 属性要好得多,
因为提要级别的 updated 属性主要用于让提要缓存失效。
下面是一个可运行的示例,其中函数只返回新条目。
import socket
from datetime import datetime, timedelta
from time import mktime
import feedparser
from pprint import pprint
def getItemsForChannel(xmlUrl, lastUpdate):
    """Return the entries of a feed that were updated after ``lastUpdate``.

    Each entry is judged by its own ``updated_parsed`` timestamp rather than
    by the feed-level one.

    Args:
        xmlUrl: URL of the feed to download.
        lastUpdate: ISO-8601 string of a naive local time, such as the
            output of ``datetime.now().isoformat()``.

    Returns:
        A list of feedparser entries newer than ``lastUpdate``. Entries
        without a parsed timestamp are included, because they cannot be
        shown to be old.
    """
    import calendar  # local import keeps this snippet self-contained

    lst = datetime.fromisoformat(lastUpdate)
    socket.setdefaulttimeout(60)
    parsed = feedparser.parse(xmlUrl)

    def is_new(entry):
        stamp = entry.get("updated_parsed")
        if stamp is None:
            return True
        # updated_parsed is a UTC struct_time. timegm() converts UTC to an
        # epoch value; mktime() would misread it as local time and shift
        # the comparison by the machine's UTC offset.
        return datetime.fromtimestamp(calendar.timegm(stamp)) > lst

    items = [entry for entry in parsed.entries if is_new(entry)]
    print("There are new {} items".format(len(items)))
    return items
# Demo: list the entries published on a sample feed in the last three hours.
feed_url = 'http://serverfault.com/feeds/tag/+or+linux+or+ubuntu+or+vim+or+rsync+or+gnome'
three_hours_ago = datetime.now() - timedelta(hours=3)
pprint(getItemsForChannel(feed_url, three_hours_ago.isoformat()))
它用 ISO 格式读写数据库中保存的上次解析时间,并逐个条目进行比较,
而不是基于提要级别的 updated 属性做整体比较。
我有以下代码。读懂代码后,请留意其中两条用大写字母写的注释。
我可以用 insert or ignore 来检测频道中是否有新项目,
但我想尝试一种更好的机制:利用 feed.updated_parsed 属性。
为什么它没有按预期工作?
from __future__ import unicode_literals
import feedparser
from sqlite3 import dbapi2 as sqlite
import sys, os
from datetime import datetime
from time import mktime
from daeutils import *
import re
import random
import optparse
import curses
import socket
def getActiveChannels():
    """Return the RSS channels stored in the ``channels`` table.

    NOTE(review): despite the name, the query does not filter on an
    ``active`` flag -- confirm whether inactive channels should be skipped.

    Returns:
        A list of ``(id, title, xmlurl, updated)`` tuples, one per row.
    """
    con = sqlite.connect(connectionString)
    try:
        cur = con.cursor()
        try:
            cur.execute("select id, title, xmlurl, updated from channels")
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        # Close even when the query raises, so the connection is not leaked.
        con.close()
def getItemsForChannel(xmlUrl, lastUpdate):
    """Fetch a feed and return its entries unless the feed looks unchanged.

    The feed-level ``updated_parsed`` value is compared with ``lastUpdate``;
    when the feed claims to be older than the last recorded update, no
    entries are returned.

    Args:
        xmlUrl: URL of the feed to download.
        lastUpdate: naive local timestamp string in the form
            ``%Y-%m-%dT%H:%M:%S.%f``, or ``None``/empty when the channel
            has never been updated.

    Returns:
        A list of feedparser entries, or ``[]`` when the feed reports no
        update since ``lastUpdate``.
    """
    import calendar  # local import: only this function needs it

    socket.setdefaulttimeout(60)
    feedparserDictionary = feedparser.parse(xmlUrl)
    items = feedparserDictionary.entries

    # Not every feed carries a feed-level timestamp; without one (or without
    # a stored lastUpdate) there is nothing to compare, so return everything.
    feedStamp = getattr(feedparserDictionary.feed, "updated_parsed", None)
    if lastUpdate and feedStamp is not None:
        # feedparser's *_parsed values are UTC struct_time tuples. timegm()
        # converts UTC to an epoch value; mktime() would wrongly read the
        # tuple as local time and shift the result by the UTC offset.
        updatedTime = datetime.fromtimestamp(calendar.timegm(feedStamp))
        lst = datetime.strptime(lastUpdate, "%Y-%m-%dT%H:%M:%S.%f")
        if updatedTime < lst:
            return []  # HERE NOT BEHAVING CORRECTLY, WHEN I COMMENT THIS LINE, THERE MAY BE A FEW ITEMS
    print("There are new %d items" % len(items))
    return items
def setChannelUpdateTime(xmlUrl, tm):
    """Store ``tm`` as the ``updated`` value of the channel with ``xmlUrl``.

    Args:
        xmlUrl: feed URL identifying the row in ``channels``.
        tm: timestamp string to store; it is written exactly as given.
    """
    con = sqlite.connect(connectionString)
    try:
        cur = con.cursor()
        try:
            # Bind only the two values the statement names instead of
            # passing locals(), which would also hand over con and cur.
            cur.execute(
                "update channels set updated = :tm where xmlurl = :xmlUrl",
                {"tm": tm, "xmlUrl": xmlUrl},
            )
            con.commit()
            print("updated successfully")
        finally:
            cur.close()
    finally:
        # Release the connection even if the update fails.
        con.close()
# Script entry point: fetch every channel, store unseen entries in ``feeds``
# and record the time of the run for channels that produced new rows.
if __name__ == "__main__":  # was "_main__", which never matched
    con = sqlite.connect(connectionString)
    try:
        for channel in getActiveChannels():
            channelId, channelTitle, channelXmlUrl, lastChannelUpdate = channel
            countOfNewItems = 0
            items = getItemsForChannel(channelXmlUrl, lastChannelUpdate)
            cur = con.cursor()
            try:
                for item in items:
                    # feedparser entries are dict-like, so fields are read by
                    # key; tuple-unpacking an entry would iterate its keys.
                    # feedparser exposes the RSS <description> as "summary".
                    cur.execute(
                        "insert or ignore into feeds "
                        "(title, link, description, read, updated, channelid) "
                        "values (?, ?, ?, ?, ?, ?)",
                        (item.get("title"), item.get("link"),
                         item.get("summary"), 0, item.get("updated"),
                         channelId))
                    countOfNewItems += cur.rowcount  # WHICH ARE INSERTED HERE
                con.commit()
            finally:
                cur.close()
            if countOfNewItems:
                print("Found new items")
                # %f always emits microseconds, matching the format that
                # getItemsForChannel parses the stored value with.
                now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
                setChannelUpdateTime(channelXmlUrl, now)
    finally:
        con.close()
这是sqlite中的两个表:
-- One row per subscribed feed; xmlurl is unique and `updated` holds the text timestamp the script writes after it finds new items.
CREATE TABLE channels (id integer primary key, title string, text string, description string, type string, xmlurl string unique, htmlurl string, priority integer, active integer, deactivated integer, updated text);
-- One row per feed entry; the unique `link` column is what lets "insert or ignore" skip entries that are already stored.
CREATE TABLE feeds (id integer primary key, title string, link string unique, description string, read integer, priority integer, updated string, channelid integer, foreign key (channelid) references channels(id));
我认为可能的原因是:您比较的是提要(feed)级别的 updated 字段,
而提要的发布者未必会正确维护这个字段;也可能是使用 isoformat 等方式带来的时区格式问题。
无论如何,我认为比较每个条目(entry)自身的 updated 属性要好得多,
因为提要级别的 updated 属性主要用于让提要缓存失效。
下面是一个可运行的示例,其中函数只返回新条目。
import socket
from datetime import datetime, timedelta
from time import mktime
import feedparser
from pprint import pprint
def getItemsForChannel(xmlUrl, lastUpdate):
    """Return the entries of a feed that were updated after ``lastUpdate``.

    Each entry is judged by its own ``updated_parsed`` timestamp rather than
    by the feed-level one.

    Args:
        xmlUrl: URL of the feed to download.
        lastUpdate: ISO-8601 string of a naive local time, such as the
            output of ``datetime.now().isoformat()``.

    Returns:
        A list of feedparser entries newer than ``lastUpdate``. Entries
        without a parsed timestamp are included, because they cannot be
        shown to be old.
    """
    import calendar  # local import keeps this snippet self-contained

    lst = datetime.fromisoformat(lastUpdate)
    socket.setdefaulttimeout(60)
    parsed = feedparser.parse(xmlUrl)

    def is_new(entry):
        stamp = entry.get("updated_parsed")
        if stamp is None:
            return True
        # updated_parsed is a UTC struct_time. timegm() converts UTC to an
        # epoch value; mktime() would misread it as local time and shift
        # the comparison by the machine's UTC offset.
        return datetime.fromtimestamp(calendar.timegm(stamp)) > lst

    items = [entry for entry in parsed.entries if is_new(entry)]
    print("There are new {} items".format(len(items)))
    return items
# Demo: list the entries published on a sample feed in the last three hours.
feed_url = 'http://serverfault.com/feeds/tag/+or+linux+or+ubuntu+or+vim+or+rsync+or+gnome'
three_hours_ago = datetime.now() - timedelta(hours=3)
pprint(getItemsForChannel(feed_url, three_hours_ago.isoformat()))
它用 ISO 格式读写数据库中保存的上次解析时间,并逐个条目进行比较,
而不是基于提要级别的 updated 属性做整体比较。