Scrapy not scraping all HTML tags
I am trying to scrape information from a website using Scrapy. The general structure is as follows:
<item>
    <title>........</title>
    <link>.........</link>
    <category>......</category>
    <category>.......</category>
    <pubdate>.........</pubdate>
</item>
The website's XML has 26 such items. I want to scrape the link, title, categories, and publication date of each item and store them in a CSV file. My spider class is as follows:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from testscraper.items import testscraperItem

class MySpider(BaseSpider):
    name = "Test_scraper"
    allowed_domains = ["http://nytimes.com/feed/"]
    start_urls = ["http://nytimes.com/feed/"]

    def parse(self, response):
        data = []
        hxs = HtmlXPathSelector(response)
        items = hxs.select('//item')
        for item in items:
            struct = testscraperItem()
            title = item.select('./title/text()').extract()
            link = item.select('./link/@href').extract()
            pubdate = item.select('./pubDate/text()').extract()
            topics = item.select('./category/text()').extract()
            struct["title"] = title
            struct["link"] = link
            struct["pubdate"] = pubdate
            struct["topics"] = topics
            data.append(struct)
        return data
Everything works fine except the publication date tag, which I am unable to scrape (I get an empty value). A sample value of this tag is:
<pubDate>Thu, 19 Feb 2015 19:29:08 GMT</pubDate>
I tried the following code using response.xpath, and with it I was able to extract the pubDate tag:
def parse(self, response):
    items = []
    pubdates = response.xpath('//item//pubDate/text()')
    for pubdate in pubdates:
        item = testscraperItem()  # was referenced without being created
        item["pubdate"] = pubdate.extract()
        items.append(item)
    return items
Why am I unable to extract the pubdate tag contents when I iterate over the items, yet able to extract them when I query the whole page at once? I am really confused and would appreciate help with this. Thanks!! For other purposes I have to iterate over each item, so snippet 2 is not an option - I have to follow the structure of the first snippet I wrote.
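The likely explanation (my assumption; nothing in the post confirms it) is that HtmlXPathSelector runs the response through an HTML parser, and HTML parsers lowercase tag names, so './pubDate/text()' matches nothing inside the loop, whereas response.xpath() detects the XML content type and preserves case. A minimal sketch of a case-preserving variant, using the same old-style selector API as the snippets above:

from scrapy.selector import XmlXPathSelector

def parse(self, response):
    # Parse the response as XML so mixed-case tags such as pubDate
    # keep their names and './pubDate/text()' can match.
    xxs = XmlXPathSelector(response)
    for item in xxs.select('//item'):
        pubdate = item.select('./pubDate/text()').extract()

Equivalently, keeping HtmlXPathSelector and querying the lowercased name './pubdate/text()' inside the loop should also match.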
It looks very much like an XML feed. If that is the case, you need to use XMLFeedSpider:
from scrapy.contrib.spiders import XMLFeedSpider
from scrapy.utils.spider import iterate_spider_output
from testscraper.items import testscraperItem

class MySpider(XMLFeedSpider):
    name = "Test_scraper"
    itertag = 'item'
    allowed_domains = ["dealbook.nytimes.com"]
    start_urls = ["http://dealbook.nytimes.com/feed/"]

    def parse_nodes(self, response, nodes):
        for index, selector in enumerate(nodes, start=1):
            ret = iterate_spider_output(self.parse_node(response, selector))
            for result_item in self.process_results(response, ret):
                result_item['index'] = index
                yield result_item

    def parse_node(self, response, selector):
        struct = testscraperItem()
        title = selector.select('./title/text()').extract()
        link = selector.select('./link/@href').extract()
        pubdate = selector.select('./pubDate/text()').extract()
        topics = selector.select('./category/text()').extract()
        struct["title"] = title
        struct["link"] = link
        struct["pubdate"] = pubdate
        struct["topics"] = topics
        yield struct
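This parse_nodes override mirrors what XMLFeedSpider does by default, with one addition: each yielded item is stamped with its 1-based position in the feed via the 'index' key (which presumably has to be declared as a field on testscraperItem, since Scrapy items reject unknown keys).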
Output:
{'link': [],
 'pubdate': [u'Fri, 20 Feb 2015 18:02:28 GMT'],
 'title': [u'Currency\u2019s Weakness Troubles China\u2019s Policy Makers'],
 'topics': [u'China',
            u'Renminbi (Currency)',
            u'Economic Conditions and Trends',
            u"People's Bank of China",
            u'Xi Jinping']}
{'link': [],
 'pubdate': [u'Thu, 19 Feb 2015 15:58:15 GMT'],
 'title': [u'New Rules Spur a Humbling Overhaul of Wall St. Banks'],
 'topics': [u'Banking and Financial Institutions',
            u'Dodd-Frank Wall Street Reform and Consumer Protection Act (2010)',
            u'Executive Compensation',
            u'Regulation and Deregulation of Industry',
            u'Goldman Sachs Group Inc',
            u'JPMorgan Chase & Company',
            u'Federal Reserve System',
            u'Federal Deposit Insurance Corp']}
...
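Note that 'link' comes back empty in the items above. In an RSS feed the URL is the text content of the <link> element rather than an href attribute, so (an assumption on my part, not verified against this feed) the selector in parse_node should likely be:

link = selector.select('./link/text()').extract()

To get the CSV file mentioned in the question, Scrapy's feed exports can then write the items out directly, e.g. scrapy crawl Test_scraper -o items.csv.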