如何在scrapy中抓取url的url?
How to crawl the url of url in scrapy?
从主要 url 中提取 url = [http://www.amazon.in/Lenovo-Ideapad-15-6-inch-Integrated-Graphics/dp/B01EN6RA7W?ie=UTF8&keywords=lenovo%20laptop&qid=1479811190&ref_=sr_1_1&s=computers&sr=1-1]
import scrapy
from product.items import ProductItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class amazonSpider(scrapy.Spider):
    """Spider that follows product links found on a listing page.

    BUG in the original question code: the ``scrapy.Request`` object was
    assigned to ``item['product_Rating']`` and the *item* was yielded, so
    the request was never scheduled and ``page2_parse`` never executed.
    The fix is to yield the Request itself and carry the item along in
    ``request.meta`` so the second callback can finish filling it.
    (The unquoted placeholders were also quoted so the snippet parses.)
    """

    name = "amazon"
    allowed_domains = ["amazon.in"]
    # NOTE(review): placeholder — replace with the real listing-page URL.
    start_urls = ["main url here"]

    def parse(self, response):
        # Placeholder XPath: one iteration per matched listing entry.
        for content in response.xpath("sample xpath"):
            item = ProductItem()
            urls = content.xpath("a/@href").extract()
            if not urls:
                # Entry without a link — nothing to follow.
                continue
            request = scrapy.Request(str(urls[0]), callback=self.page2_parse)
            # Hand the partially-filled item to the second callback.
            request.meta['item'] = item
            yield request

    def page2_parse(self, response):
        # Retrieve the item created in parse() and finish filling it.
        item = response.meta['item']
        item['product_Rating'] = response.xpath("sample xpath").extract()
        yield item
这里第二个回调函数 page2_parse 没有被执行。请帮帮我。
我终于解决了这个问题,请参照下面的代码来实现“从 URL 中再抓取 URL”的取值。
def parse(self, response):
    """Collect every detail-page link on the listing page, record them on
    the item, and schedule a request per link that carries the item along
    in ``meta`` for ``page2_parse`` to complete."""
    item = ProductItem()
    url_list = [href for href in
                response.xpath("//div[@class='listing']/div/a/@href").extract()]
    item['product_DetailUrl'] = url_list
    for detail_url in url_list:
        req = Request(str(detail_url), callback=self.page2_parse)
        # Same item instance travels with every scheduled request.
        req.meta['item'] = item
        yield req
def page2_parse(self, response):
    """Second-level callback: finish the item started in ``parse``.

    Fix over the original: the ``item = ProductItem()`` instantiation was
    dead code — the variable was immediately rebound to the item carried
    in ``response.meta`` — so the useless construction is removed.

    Yields the shared item with ``product_ColorAvailability`` filled from
    the detail page.
    """
    # The item created in parse() rides along in the request's meta dict.
    item = response.meta['item']
    colors = response.xpath(
        "//div[@id='templateOption']//ul/li//img/@color").extract()
    item['product_ColorAvailability'] = [color for color in colors]
    yield item
从主要 url 中提取 url = [http://www.amazon.in/Lenovo-Ideapad-15-6-inch-Integrated-Graphics/dp/B01EN6RA7W?ie=UTF8&keywords=lenovo%20laptop&qid=1479811190&ref_=sr_1_1&s=computers&sr=1-1]
import scrapy
from product.items import ProductItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class amazonSpider(scrapy.Spider):
# Spider intended to follow product links found on an Amazon listing page.
# NOTE(review): this is the broken snippet from the question — it is not
# valid Python as written (unquoted placeholders below), kept verbatim.
name = "amazon"
allowed_domains = ["amazon.in"]
start_urls = [ main url here]
def parse(self, response):
item=ProductItem()
for content in response.xpath("sample xpath"):
url = content.xpath("a/@href").extract()
request = scrapy.Request(str(url[0]),callback=self.page2_parse)
# url is extracted from the main (listing) url
# BUG: the Request object is stored on the item and the item is yielded;
# the Request itself is never yielded, so Scrapy never schedules it and
# page2_parse below is never invoked.
item['product_Rating'] = request
yield item
def page2_parse(self,response):
# Here the response for the second url's content never arrives — see the
# BUG note in parse() above: this callback is never reached.
for content in response.xpath(sample xpath):
yield content.xpath(sample xpath).extract()
这里第二个回调函数 page2_parse 没有被执行。请帮帮我。
我终于解决了这个问题,请参照下面的代码来实现“从 URL 中再抓取 URL”的取值。
def parse(self, response):
# First-level callback: extract every detail-page link from the listing
# page, store the list on the item, and schedule one request per link.
item=ProductItem()
url_list = [content for content in response.xpath("//div[@class='listing']/div/a/@href").extract()]
item['product_DetailUrl'] = url_list
for url in url_list:
request = Request(str(url),callback=self.page2_parse)
# The same item instance is passed to page2_parse via the request's
# meta dict so the second callback can keep filling it.
request.meta['item'] = item
yield request
def page2_parse(self,response):
# Second-level callback: receives the detail page's response.
item=ProductItem()
# NOTE(review): the line above is dead code — item is immediately rebound
# to the instance carried over from parse() in response.meta.
item = response.meta['item']
item['product_ColorAvailability'] = [content for content in response.xpath("//div[@id='templateOption']//ul/li//img/@color").extract()]
yield item