使用 python django 解析 RSS XML
Parsing RSS XML with python django
我正在尝试解析 3 个不同的 RSS 源,这些是源。
https://www.nba.com/bucks/rss.xml
http://www.espn.com/espn/rss/ncb/news
http://rss.nytimes.com/services/xml/rss/nyt/ProBasketball.xml
除了 url 之外,这三个源的结构大体相似。
我正在尝试将它们解析为以下 Feed 对象:
class Feed(Base):
    """One parsed RSS item, stored with a reference to the source it came from."""

    # Item headline; unique across all stored feeds and indexed for lookups.
    title = models.CharField(max_length=255, unique=True, db_index=True)
    # Link taken from the item; indexed but not unique.
    link = models.CharField(max_length=255, db_index=True)
    # Optional free text (the parser stores the item's <description> here).
    summary = models.TextField(null=True)
    # Optional author name.
    author = models.CharField(max_length=255, null=True)
    # Optional extra URL (the parser fills it from <enclosure> / <image>).
    url = models.CharField(max_length=512, null=True)
    # Publication timestamp; required.
    published = models.DateTimeField()
    # Owning source; deleting the Source cascades to its feeds.
    source = models.ForeignKey(Source, null=True, on_delete=models.CASCADE)
这是 Source 对象:
class Source(Base):
    """An RSS source: a display name plus the address of its feed."""

    # Human-readable name of the source; indexed.
    name = models.CharField(max_length=255, db_index=True)
    # Address of the feed; unique, so each source is registered once.
    link = models.CharField(max_length=255, unique=True, db_index=True)
这是我用来解析的代码,
import logging
import xml.etree.ElementTree as ET
import requests
import maya
from django.utils import timezone
from aggregator.models import Feed
class ParseFeeds:
    """Fetch an RSS source and store its items as ``Feed`` rows."""

    @staticmethod
    def parse(source):
        """Download ``source.link``, walk every ``<item>`` and save unseen ones.

        Args:
            source: Object exposing ``name`` and ``link``; it is also stored
                as the ``source`` of every ``Feed`` created here.

        An item is skipped when a ``Feed`` with the same title already exists.
        """
        log = logging.getLogger(__name__)
        log.info("Starting {}".format(source.name))

        response = requests.get(source.link)
        root = ET.fromstring(response.text)
        items = root.findall(".//item")

        for item in items:
            # NOTE(review): every check below tests the truthiness of the
            # Element returned by find() rather than comparing it with None.
            # Kept exactly as posted, because this is the code being asked about.
            title = ''
            if item.find('title'):
                title = item.find('title').text

            link = ''
            if item.find('link'):
                link = item.find('link').text

            description = ''
            if item.find('description'):
                description = item.find('description').text

            author = ''
            if item.find('author'):
                author = item.find('author').text

            # Fall back to "now" when the item carries no usable pubDate.
            published = timezone.now()
            if item.find('pubDate'):
                published = maya.parse(item.find('pubDate').text).datetime()

            url = ''
            if item.find('enclosure'):
                url = item.find('enclosure').attrib['url']
            if item.find('image'):
                url = item.find('image')

            # Titles act as the de-duplication key: skip anything already stored.
            if Feed.objects.filter(title=title).exists():
                continue

            log.info(
                "Title:{} Link:{} Summary:{} Author:{} Published:{} Url:{}".format(
                    title, link, description, author, published, url
                )
            )
            feed = Feed(
                title=title,
                link=link,
                summary=description,
                author=author,
                published=published,
                url=url,
                source=source,
            )
            feed.save()
            log.info("Adding {} from {}".format(feed.title, feed.source.name))

        log.info("Finished {}".format(source.name))
虽然我可以在 Python 控制台中逐个解析这些源,但这里创建的 Feed 对象的所有字段最终都是 None 或默认值。
我哪里做错了?
你应该使用
for item in items:
    title = ''
    title_node = item.find('title')
    # The explicit "is not None" comparison is the critical part here.
    if title_node is not None:
        title = title_node.text
    # And so on ...
如果您在终端中尝试以下两个表达式,就能看到区别:
bool(item.find('title')) # Evaluates to False, even though the element was found
item.find('title') is not None # Evaluates to True: the element does exist
每当需要检查某个对象是否为 None 时,请使用 if something is None 这种写法。
我正在尝试解析 3 个不同的 RSS 源,这些是源。
https://www.nba.com/bucks/rss.xml
http://www.espn.com/espn/rss/ncb/news
http://rss.nytimes.com/services/xml/rss/nyt/ProBasketball.xml
除了 url 之外,这三个源的结构大体相似。
我正在尝试将它们解析为以下 Feed 对象:
class Feed(Base):
    """One parsed RSS item, stored with a reference to the source it came from."""

    # Item headline; unique across all stored feeds and indexed for lookups.
    title = models.CharField(max_length=255, unique=True, db_index=True)
    # Link taken from the item; indexed but not unique.
    link = models.CharField(max_length=255, db_index=True)
    # Optional free text (the parser stores the item's <description> here).
    summary = models.TextField(null=True)
    # Optional author name.
    author = models.CharField(max_length=255, null=True)
    # Optional extra URL (the parser fills it from <enclosure> / <image>).
    url = models.CharField(max_length=512, null=True)
    # Publication timestamp; required.
    published = models.DateTimeField()
    # Owning source; deleting the Source cascades to its feeds.
    source = models.ForeignKey(Source, null=True, on_delete=models.CASCADE)
这是 Source 对象:
class Source(Base):
    """An RSS source: a display name plus the address of its feed."""

    # Human-readable name of the source; indexed.
    name = models.CharField(max_length=255, db_index=True)
    # Address of the feed; unique, so each source is registered once.
    link = models.CharField(max_length=255, unique=True, db_index=True)
这是我用来解析的代码,
import logging
import xml.etree.ElementTree as ET
import requests
import maya
from django.utils import timezone
from aggregator.models import Feed
class ParseFeeds:
    """Fetch an RSS source and store its items as ``Feed`` rows."""

    @staticmethod
    def parse(source):
        """Download ``source.link``, walk every ``<item>`` and save unseen ones.

        Args:
            source: Object exposing ``name`` and ``link``; it is also stored
                as the ``source`` of every ``Feed`` created here.

        An item is skipped when a ``Feed`` with the same title already exists.
        """
        log = logging.getLogger(__name__)
        log.info("Starting {}".format(source.name))

        response = requests.get(source.link)
        root = ET.fromstring(response.text)
        items = root.findall(".//item")

        for item in items:
            # NOTE(review): every check below tests the truthiness of the
            # Element returned by find() rather than comparing it with None.
            # Kept exactly as posted, because this is the code being asked about.
            title = ''
            if item.find('title'):
                title = item.find('title').text

            link = ''
            if item.find('link'):
                link = item.find('link').text

            description = ''
            if item.find('description'):
                description = item.find('description').text

            author = ''
            if item.find('author'):
                author = item.find('author').text

            # Fall back to "now" when the item carries no usable pubDate.
            published = timezone.now()
            if item.find('pubDate'):
                published = maya.parse(item.find('pubDate').text).datetime()

            url = ''
            if item.find('enclosure'):
                url = item.find('enclosure').attrib['url']
            if item.find('image'):
                url = item.find('image')

            # Titles act as the de-duplication key: skip anything already stored.
            if Feed.objects.filter(title=title).exists():
                continue

            log.info(
                "Title:{} Link:{} Summary:{} Author:{} Published:{} Url:{}".format(
                    title, link, description, author, published, url
                )
            )
            feed = Feed(
                title=title,
                link=link,
                summary=description,
                author=author,
                published=published,
                url=url,
                source=source,
            )
            feed.save()
            log.info("Adding {} from {}".format(feed.title, feed.source.name))

        log.info("Finished {}".format(source.name))
虽然我可以在 Python 控制台中逐个解析这些源,但这里创建的 Feed 对象的所有字段最终都是 None 或默认值。
我哪里做错了?
你应该使用
for item in items:
    title = ''
    title_node = item.find('title')
    # The explicit "is not None" comparison is the critical part here.
    if title_node is not None:
        title = title_node.text
    # And so on ...
如果您在终端中尝试以下两个表达式,就能看到区别:
bool(item.find('title')) # Evaluates to False, even though the element was found
item.find('title') is not None # Evaluates to True: the element does exist
每当需要检查某个对象是否为 None 时,请使用 if something is None 这种写法。