Scrapy recursive website crawl after login
I have written a spider to crawl a website after logging in:
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import FormRequest, Request
from scrapy.selector import Selector
from scrapy.loader import ItemLoader
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors import LinkExtractor
class LoginSpider(scrapy.Spider):
    name = "login"
    allowed_domains = ["mydomain.com"]
    start_urls = ['https://login.mydomain.com/login']

    rules = [Rule(LinkExtractor(allow=('//a[contains(text(), "Next")]'), restrict_xpaths=('//a[contains(text(), "Previous")]',)), 'parse_info')]

    def parse(self, response):
        return [FormRequest.from_response(response,
                    formdata={"username":"myemail","password":"mypassword"},
                    callback=self.parse_info, dont_filter=True)]

    def parse_info(self, response):
        items = []
        for tr in range(1, 5):
            xpath = "/html/body/table/tbody/tr[%s]/td[1]/text()" % tr
            td1 = Selector(response=response).xpath(xpath).extract()
            item = MyItem()
            item['col1'] = td1
            items.append(item)
        return items
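Here MyItem is just a simple Scrapy item holding the one field the spider fills in (col1); a minimal definition in the project's items.py would look something like this:

# items.py -- minimal item with the single field used by the spider
import scrapy

class MyItem(scrapy.Item):
    col1 = scrapy.Field()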
And the HTML:
<html>
<table>
<tbody>
<tr><td>Row 1</td></tr>
<tr><td>Row 2</td></tr>
</tbody>
</table>
<div><a href="?page=2">Next</a></div>
<div><a href="#">Previous</a></div>
</html>
So what the spider does is automatically log the user in from the login page and redirect to the home page, which contains the HTML above.
Now what I want to achieve is to crawl the next page after the first one using the Python script above.
I have read the Scrapy documentation about Rules implementation, but I had no success making it work. Please help me, I have been stuck on this for more than a day. Thank you.
The rules in your code do not work because you are using a standard Spider (scrapy.Spider) rather than a CrawlSpider.
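Note also that the Rule itself would need fixing even in a CrawlSpider: LinkExtractor's allow= argument expects URL regex patterns, not XPath expressions; XPaths go in restrict_xpaths. A corrected rule would look roughly like:

rules = [Rule(LinkExtractor(restrict_xpaths='//a[contains(text(), "Next")]'),
              callback='parse_info', follow=True)]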
That said, rather than switching to CrawlSpider, you can keep the standard Spider and implement the pagination manually. Do something like this:
def parse_info(self, response):
    # items = []
    for tr in range(1, 5):
        xpath = "/html/body/table/tbody/tr[%s]/td[1]/text()" % tr
        td1 = Selector(response=response).xpath(xpath).extract()
        item = MyItem()
        item['col1'] = td1
        # items.append(item)
        yield item
    # return items

    # If there is a next page, extract its href, build the request
    # and send it to the server
    next_page = response.xpath('//a[contains(text(), "Next")]/@href')
    if next_page:
        next_page_href = next_page.extract()[0]
        next_page_url = response.urljoin(next_page_href)
        request = scrapy.Request(next_page_url, callback=self.parse_info)
        yield request
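If you would rather keep the Rules approach, the spider has to inherit from CrawlSpider, and the login step has to stay out of its built-in parse() method, since that method is what applies the rules. A rough, untested sketch of that route (class and callback names like LoginCrawlSpider, login and after_login are just placeholders, and it assumes the newer scrapy.spiders / scrapy.linkextractors import paths plus your project's MyItem):

import scrapy
from scrapy.http import FormRequest
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from myproject.items import MyItem  # adjust to your project's items module

class LoginCrawlSpider(CrawlSpider):
    name = "login_crawl"
    allowed_domains = ["mydomain.com"]

    # Follow every "Next" link and hand the resulting pages to parse_info
    rules = [Rule(LinkExtractor(restrict_xpaths='//a[contains(text(), "Next")]'),
                  callback='parse_info', follow=True)]

    def start_requests(self):
        # Fetch the login page outside of CrawlSpider's rule machinery
        yield scrapy.Request('https://login.mydomain.com/login', callback=self.login)

    def login(self, response):
        return FormRequest.from_response(
            response,
            formdata={"username": "myemail", "password": "mypassword"},
            callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        # The login redirect already landed on the first content page; re-queue it
        # with callback=self.parse so CrawlSpider applies the rules (and
        # parse_start_url below) to it
        yield scrapy.Request(response.url, callback=self.parse, dont_filter=True)

    def parse_start_url(self, response):
        # Invoked by CrawlSpider.parse for the page re-queued above
        return self.parse_info(response)

    def parse_info(self, response):
        for tr in range(1, 5):
            xpath = "/html/body/table/tbody/tr[%s]/td[1]/text()" % tr
            item = MyItem()
            item['col1'] = response.xpath(xpath).extract()
            yield item

The second request to response.url looks redundant, but it is the simplest way to push the already-logged-in page through CrawlSpider's own parse() so the rules get evaluated against it; dont_filter=True keeps the duplicate filter from dropping it. For a simple "Next" link like yours, the manual pagination shown above remains the more straightforward option.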