Trouble importing scrapy into json
I am trying to scrape some information from Craigslist and store it in a JSON file, but the data ends up stored the wrong way. Instead of one array per listing containing [title, link, location, time], I get one array with all the titles, another with all the links, and so on. Is the problem with my XPath for the titles, or with the for loop itself?
from scrapy.spiders import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from craigslist_sample.items import CraigslistSampleItem

class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["https://pittsburgh.craigslist.org/search/ccc"]

    def parse(self, response):
        titles = response.selector.xpath("//p[@class='row']")
        items = []
        for titles in titles:
            item = CraigslistSampleItem()
            item["title"] = titles.xpath("//span[@id='titletextonly']").extract()
            item["link"] = titles.xpath("a/@href").extract()
            item["location"] = titles.xpath("//small").extract()
            item["time"] = titles.xpath('//time').extract()
            items.append(item)
        return items
That happens because your inner XPath expressions match elements starting from the root of the tree, so every iteration collects results from the whole page. Instead, you need to force them to work in the context of each listing by prefixing them with a dot:
for title in titles:
    item = CraigslistSampleItem()
    item["title"] = title.xpath(".//span[@id='titletextonly']").extract()
    item["link"] = title.xpath("a/@href").extract()
    item["location"] = title.xpath(".//small").extract()
    item["time"] = title.xpath('.//time').extract()
    yield item
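To see why the dot matters, here is a minimal standalone sketch using Scrapy's Selector on a made-up two-row HTML snippet (not the real Craigslist markup): an XPath that starts with // restarts the search from the document root, so each row "sees" every span on the page, while .// stays inside the current row.

from scrapy.selector import Selector

html = """
<div class="row"><span class="t">first</span></div>
<div class="row"><span class="t">second</span></div>
"""

rows = Selector(text=html).xpath("//div[@class='row']")

# "//span" restarts from the document root: every row returns both spans
print(rows[0].xpath("//span/text()").extract())   # ['first', 'second']

# ".//span" is evaluated relative to the current row: only its own span
print(rows[0].xpath(".//span/text()").extract())  # ['first']

Since parse now yields items one at a time instead of returning a list, the feed export still works as before: running scrapy crawl craig -o items.json writes one JSON object per listing, each with its own title, link, location and time.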