Scrapy only returning a single item from list
I think my xpaths are written incorrectly, because I only get one result per URL. However, each URL has 25 job postings in total (not counting the ones on the next page). How can I correct my xpaths to get all of the results?
Here is my scraper:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict


class CvItem(scrapy.Item):
    category = Field(output_processor = TakeFirst())
    salary = Field(output_processor = TakeFirst())
    title = Field(output_processor = TakeFirst())
    organisation = Field(output_processor = TakeFirst())


class CvSpider(scrapy.Spider):
    name = 'cv'
    start_urls = {'Accountancy_finance': ['https://www.cv-library.co.uk/Degree-Finance-jobs?us=1',
                                          'https://www.cv-library.co.uk/Degree-Accounting-jobs?us=1'],
                  'Aeronautical_Engineering': ['https://www.cv-library.co.uk/Degree-Aeronautical-Engineering-jobs?us=1'],
                  'Manufacturing_Engineering': ['https://www.cv-library.co.uk/Degree-Manufacturing-Engineering-jobs?us=1'],
                  'Agriculture_and_Forestry': ['https://www.cv-library.co.uk/Degree-Forestry-jobs?us=1']}

    def start_requests(self):
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url = url,
                    callback = self.parse,
                    cb_kwargs = {
                        'items': items
                    }
                )

    def parse(self, response, items):
        container = response.xpath('//ol[@id="searchResults"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector = lists)
            loader.add_value('category', items)
            loader.add_xpath('title', '//article[@id]//a[@title]/@title')
            loader.add_xpath('salary', '//article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', '//article[@id]/div//div/p/a//text()')
            yield loader.load_item()
I have fixed a small error in the requests, for anyone who looked at this in the first 15 minutes or so after I posted it.
The problem is in the xpath for the container. You were only selecting the container itself, not the items inside it, so your loop ran just once over the container rather than over the individual results you want to scrape.
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict


class CvItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())


class CvSpider(scrapy.Spider):
    name = 'cv'
    start_urls = {'Accountancy_finance': ['https://www.cv-library.co.uk/Degree-Finance-jobs?us=1',
                                          'https://www.cv-library.co.uk/Degree-Accounting-jobs?us=1'],
                  'Aeronautical_Engineering': ['https://www.cv-library.co.uk/Degree-Aeronautical-Engineering-jobs?us=1'],
                  'Manufacturing_Engineering': ['https://www.cv-library.co.uk/Degree-Manufacturing-Engineering-jobs?us=1'],
                  'Agriculture_and_Forestry': ['https://www.cv-library.co.uk/Degree-Forestry-jobs?us=1']}

    def start_requests(self):
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    cb_kwargs={
                        'items': items
                    }
                )

    def parse(self, response, items):
        container = response.xpath('//ol[@id="searchResults"]//li[@class="results__item"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_value('category', items)
            loader.add_xpath('title', '//article[@id]//a[@title]/@title')
            loader.add_xpath('salary', '//article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', '//article[@id]/div//div/p/a//text()')
            yield loader.load_item()
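One related pitfall worth mentioning: even when an ItemLoader is given a per-item selector, an XPath that starts with // is still evaluated against the whole document, so every item can end up with the first match on the page. Prefixing the expressions with .// keeps the search inside the current result node. The sketch below shows only the parse method rewritten that way; it assumes the same element classes and structure as the selectors above, which may need adjusting against the live page.

    def parse(self, response, items):
        # One selector per job listing in the results list.
        for job in response.xpath('//ol[@id="searchResults"]//li[@class="results__item"]'):
            loader = ItemLoader(CvItem(), selector=job)
            loader.add_value('category', items)
            # ".//" restricts the search to the current <li>,
            # whereas "//" would search the entire page.
            loader.add_xpath('title', './/article[@id]//a[@title]/@title')
            loader.add_xpath('salary', './/dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', './/article[@id]/div//div/p/a//text()')
            yield loader.load_item()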