Scrapy Result not being written
I am scraping the following site: https://graphics.stltoday.com/apps/payrolls/salaries/teachers/
I want to scrape all of the data for each individual. That means following a link to each district, then to each job category within that district, and finally to each employee. I think the problem may be with my URL regexes, but I'm not sure. On each employee's page, I believe I have identified the XPaths correctly:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class Spider2(CrawlSpider):
    #name of the spider
    name = 'stltoday'
    #list of allowed domains
    allowed_domains = ['graphics.stltoday.com']
    #starting url for scraping
    start_urls = ['https://graphics.stltoday.com/apps/payrolls/salaries/teachers']
    rules = [
        Rule(LinkExtractor(
            allow=['/[0-9]+/$']),
            follow=True),
        Rule(LinkExtractor(
            allow=['/[0-9]+/position/[0-9]+/$']),
            follow=True),
        Rule(LinkExtractor(
            allow=['/detail/[0-9]+/$']),
            callback='parse_item',
            follow=True),
    ]
    #setting the location of the output csv file
    custom_settings = {
        'FEED_FORMAT': "csv",
        'FEED_URI': 'tmp/stltoday1.csv'
    }

    def parse_item(self, response):
        #Remove XML namespaces
        response.selector.remove_namespaces()
        url = response.url
        #Extract article information
        fullname = response.xpath('//p[@class="table__title"]./text()').extract_first()
        for row in response.xpath('//th[@scope="row"]'):
            yield {
                "url": url,
                "fullname": fullname,
                "district": row.xpath('./text()').extract_first(),
                "school": row.xpath('./following-sibling::*[1]/text()').extract_first(),
                "degree": row.xpath('./following-sibling::*[2]/text()').extract_first(),
                "salary": row.xpath('./following-sibling::*[3]/text()').extract_first(),
                "extcontractpay": row.xpath('./following-sibling::*[4]/text()').extract_first(),
                "extraduty": row.xpath('./following-sibling::*[5]/text()').extract_first(),
                "totalpay": row.xpath('./following-sibling::*[6]/text()').extract_first(),
                "yearsindistrict": row.xpath('./following-sibling::*[7]/text()').extract_first(),
                "yearsinmoschools": row.xpath('./following-sibling::*[8]/text()').extract_first(),
            }
        for item in zip(url,fullname,district,school,degree,salary,extcontractpay,extraduty,totalpay,yearsindistrict,yearsinmoschools):
            yield {
                'url': url,
                'fullname': fullname,
                'district': district,
                'school': school,
                'degree': degree,
                'salary': salary,
                'extcontractpay': extcontractpay,
                'extraduty': extraduty,
                'totalpay': totalpay,
                'yearsindistrict': yearsindistrict,
                'yearsinmoschools': yearsinmoschools
            }
The spider runs (for several minutes before I pause it), but nothing gets written to the .csv file.
So I went down a rabbit hole and rebuilt the spider as a basic Spider instead of a CrawlSpider. I never figured out why the LinkExtractor rule set wasn't invoking the callback parser.
Anyway, I created a csv_exporter module to better manage the output. Added it and its arguments to the settings, and voilà.
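In hindsight, here is one plausible explanation for the silent CrawlSpider (an educated guess I never verified on this crawl): when several rules match the same link, Scrapy applies only the first one, in the order they are defined. The allow patterns are unanchored regexes, so '/[0-9]+/$' also matches position URLs like .../1234/position/5/ and detail URLs like .../detail/678/, meaning every link was claimed by the first, callback-less rule and parse_item never fired. A minimal sketch of the fix under that reading, with the most specific pattern first:

#hypothetical reordering of the original rules: most specific first
rules = [
    #detail pages: the only rule with the item callback
    Rule(LinkExtractor(allow=[r'/detail/[0-9]+/$']),
         callback='parse_item'),
    #position listings within a district
    Rule(LinkExtractor(allow=[r'/[0-9]+/position/[0-9]+/$']),
         follow=True),
    #district listings: the broadest pattern goes last
    Rule(LinkExtractor(allow=[r'/[0-9]+/$']),
         follow=True),
]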
The spider traverses the site with the same logic as the CrawlSpider version, but targets the specific URLs instead of doing a broad crawl: from parse to "parse_district" to "parse_position", and finally to "parse_person", where the items you want to scrape live.
#stlSpider.py
import scrapy
from stltoday.items import StltodayItem

class StlspiderSpider(scrapy.Spider):
    name = 'stlSpider'
    allowed_domains = ['graphics.stltoday.com']
    start_urls = ['http://graphics.stltoday.com/apps/payrolls/salaries/teachers/']

    def parse(self, response):
        #follow each district link
        for href in response.xpath("//th/a/@href").re(".*/teachers/[0-9]+/"):
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_district)

    def parse_district(self, response):
        #follow each job-category (position) link within the district
        for href in response.xpath("//th/a/@href").re(".*position.*"):
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_position)

    def parse_position(self, response):
        #follow each employee detail link
        for href in response.xpath("//td/a/@href").extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_person)

    def parse_person(self, response):
        item = StltodayItem()
        name = response.xpath('//p[@class="table__title"]/text()').extract_first()
        row = response.xpath('//th[@scope="row"]')
        item["url"] = response.url
        item["fullname"] = name
        #note: these queries start with "//", so they search the whole document
        #rather than being relative to "row"
        item["district"] = row.xpath('//th[contains(., "District")]/following-sibling::td/text()').extract_first()
        item["school"] = row.xpath('//th[contains(., "School")]/following-sibling::td/text()').extract_first()
        item["degree"] = row.xpath('//th[contains(., "Degree")]/following-sibling::td/text()').extract_first()
        item["salary"] = row.xpath('//th[contains(., "Salary")]/following-sibling::td/text()').extract_first()
        item["extcontractpay"] = row.xpath('//th[contains(., "Extended")]/following-sibling::td/text()').extract_first()
        item["extraduty"] = row.xpath('//th[contains(., "Extra")]/following-sibling::td/text()').extract_first()
        item["totalpay"] = row.xpath('//th[contains(., "Total")]/following-sibling::td/text()').extract_first()
        item["yearsindistrict"] = row.xpath('//th[contains(., "Years in district")]/following-sibling::td/text()').extract_first()
        item["yearsinmoschools"] = row.xpath('//th[contains(., "Years in MO")]/following-sibling::td/text()').extract_first()
        yield item
Itemized the... items lol
#items.py
import scrapy

class StltodayItem(scrapy.Item):
    url = scrapy.Field()
    fullname = scrapy.Field()
    district = scrapy.Field()
    school = scrapy.Field()
    degree = scrapy.Field()
    salary = scrapy.Field()
    extcontractpay = scrapy.Field()
    extraduty = scrapy.Field()
    totalpay = scrapy.Field()
    yearsindistrict = scrapy.Field()
    yearsinmoschools = scrapy.Field()
Created a "csv_exporter" module you can call to adjust how your file is output, including setting the delimiter and the order of the items to export:
#csv_exporter.py
__author__ = 'Erick'
from scrapy.conf import settings
from scrapy.contrib.exporter import CsvItemExporter

class MyProjectCsvItemExporter(CsvItemExporter):
    def __init__(self, *args, **kwargs):
        delimiter = settings.get('CSV_DELIMITER', ',')
        kwargs['delimiter'] = delimiter
        fields_to_export = settings.get('FIELDS_TO_EXPORT', [])
        if fields_to_export:
            kwargs['fields_to_export'] = fields_to_export
        super(MyProjectCsvItemExporter, self).__init__(*args, **kwargs)
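A side note for anyone reading this on a newer Scrapy: the scrapy.conf and scrapy.contrib modules were later removed, so the imports above will fail there. A rough equivalent (an untested sketch assuming Scrapy >= 1.0, where the exporter lives in scrapy.exporters; newer versions also offer FEED_EXPORT_FIELDS for column ordering out of the box):

#csv_exporter.py (sketch for newer Scrapy versions)
from scrapy.exporters import CsvItemExporter
from scrapy.utils.project import get_project_settings

class MyProjectCsvItemExporter(CsvItemExporter):
    def __init__(self, *args, **kwargs):
        settings = get_project_settings()
        #setdefault so we do not clobber kwargs Scrapy already passes in
        kwargs.setdefault('delimiter', settings.get('CSV_DELIMITER', ','))
        fields_to_export = settings.get('FIELDS_TO_EXPORT', [])
        if fields_to_export:
            kwargs.setdefault('fields_to_export', fields_to_export)
        super().__init__(*args, **kwargs)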
Include the exporter in your settings.py file. Here you also set the args read by "csv_exporter": the delimiter you wish to use and the order of the fields (items) to export:
#settings.py
BOT_NAME = 'stltoday'
SPIDER_MODULES = ['stltoday.spiders']
NEWSPIDER_MODULE = 'stltoday.spiders'
FEED_FORMAT = 'csv'
FEED_URI = 'tmp/stltoday1.csv'
FIELDS_TO_EXPORT = ["url", "fullname", "district", "school", "degree", "salary", "extcontractpay", "extraduty", "totalpay", "yearsindistrict", "yearsinmoschools"]
FEED_EXPORTERS = {
    'csv': 'stltoday.csv_exporter.MyProjectCsvItemExporter',
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'stltoday (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
...
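With all of that in place, run the spider from the project root; the relative FEED_URI resolves against the working directory:

scrapy crawl stlSpider

and the items land in tmp/stltoday1.csv, with the columns ordered as listed in FIELDS_TO_EXPORT.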