What is the simplest way to save scraped data to a JSON file?
I am scraping some data from a website and storing it, one record at a time, in an item dictionary. How do I store all of the data in JSON format, say in a geramny_startup_jobs.json file? My code is here:
import scrapy
import json
import re
import textwrap

from JobsItem import JobsItem


class GermanyStartupJobs(scrapy.Spider):

    name = 'JobsItem'
    # start_urls = ['https://www.germanystartupjobs.com/jm-ajax/get_listings/' + str(i) for i in range(1, 5)]
    start_urls = ['https://www.germanystartupjobs.com/jm-ajax/get_listings/']

    def parse(self, response):
        # The endpoint returns JSON; the listing markup sits in the 'html' key.
        data = json.loads(response.body)
        html = data['html']
        selector = scrapy.Selector(text=html, type="html")
        hrefs = selector.xpath('//a/@href').extract()
        for href in hrefs:
            yield scrapy.Request(href, callback=self.parse_detail)

    def parse_detail(self, response):
        try:
            full_d = str(response.xpath(
                '//div[@class="col-sm-5 justify-text"]//*/text()').extract())
            full_des_li = full_d.split(',')
            full_des_lis = []
            for f in full_des_li:
                ff = "".join((f.strip().replace('\n', '')).split())
                if len(ff) < 3:
                    continue
                full_des_lis.append(f)
            full = 'u' + str(full_des_lis)
            length = len(full)
            full_des_list = textwrap.wrap(full, length / 3)[:-1]
            full_des_list.reverse()

            # get the job title
            try:
                title = response.css('.job-title').xpath('./text()').extract_first().strip()
            except:
                print "No title"
                title = ''

            # get the company name
            try:
                company_name = response.css('.company-title').xpath('./normal/text()').extract_first().strip()
            except:
                print "No company name"
                company_name = ''

            # get the company location
            try:
                company_location = response.xpath('//a[@class="google_map_link"]/text()').extract_first().strip()
            except:
                print 'No company location'
                company_location = ''

            # get the job poster email (if available)
            try:
                pattern = re.compile(r"(\w(?:[-.+]?\w+)+\@(?:[a-z0-9](?:[-+]?\w+)*\.)+[a-z]{2,})", re.I)
                for text in full_des_list:
                    email = pattern.findall(text)[-1]
                    if email is not None:
                        break
            except:
                print 'No email'
                email = ''

            # get the job poster phone number (if available)
            try:
                r = re.compile(r".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", re.S)
                phone = r.findall(full_des_list[0])[-1]
                if phone is not None:
                    phone = '+49-' + phone
            except:
                print 'no phone'
                phone = ''

            # get the name of the poster (if available)
            try:
                for text in full_des_list:
                    names = get_human_names(text)
                    if len(names) != 0:
                        name = names[-1]
                        print name
                        break
            except:
                print 'no name found'
                name = ''

            item = {
                'title': title,
                'company name': company_name,
                'company_location': company_location,
                # 'poster name': name,
                'email': email,
                'phone': phone,
                'source': u"Germany Startup Job"
            }
            yield item

        except:
            print 'Not valid'
            # raise Exception("Think better!!")
I created another file for the item model, similar to the example on the scrapy website, and imported it into the file above.
import scrapy


class JobsItem(scrapy.Item):
    title = scrapy.Field()
    company_name = scrapy.Field()
    company_location = scrapy.Field()
    email = scrapy.Field()
    phone = scrapy.Field()
    source = scrapy.Field()
Then I run the command scrapy crawl JobsItem -o geramny_startup_jobs.json, and it doesn't seem to work. I get the output Scrapy 1.2.2 - no active project. Does this mean I need to create a project to run this command? I wasn't planning to do that.
Update: I found that the command scrapy runspider file_name.py -o item.json runs, but it returns the output in an uncleaned format. I still need to get clean output.
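(For context: scrapy crawl looks a spider up by name inside a Scrapy project, that is, a directory containing a scrapy.cfg file, which is why it reports no active project here; scrapy runspider loads the spider class directly from the given file and therefore works without a project.)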
You are not using the JobsItem class in your spider. Replace this code:
item = {
    'title': title,
    'company name': company_name,
    'company_location': company_location,
    # 'poster name': name,
    'email': email,
    'phone': phone,
    'source': u"Germany Startup Job"
}
with this code:
item = JobsItem()
item['title'] = title
item['company_name'] = company_name
item['company_location'] = company_location
item['email'] = email
item['phone'] = phone
item['source'] = u"Germany Startup Job"
This way, your spider returns an item class instead of a plain dict, which lets Scrapy write the items to disk when you use the flag -o geramny_startup_jobs.json.
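On the update about the uncleaned output: below is a minimal sketch, assuming Scrapy 1.2 or later, of configuring the feed export on the spider itself via custom_settings, so that scrapy runspider file_name.py writes the JSON file even without the -o flag. The FEED_EXPORT_ENCODING setting (added in Scrapy 1.2) replaces the \uXXXX escapes in JSON output with real UTF-8; the spider name and the placeholder parse body are illustrative only.

import scrapy

from JobsItem import JobsItem


class GermanyStartupJobs(scrapy.Spider):

    # Placeholder name; scrapy crawl would refer to the spider by it.
    name = 'germany_startup_jobs'
    start_urls = ['https://www.germanystartupjobs.com/jm-ajax/get_listings/']

    # Per-spider settings: the feed export is configured here, so
    # `scrapy runspider file_name.py` writes the file even without -o.
    custom_settings = {
        'FEED_FORMAT': 'json',
        'FEED_URI': 'geramny_startup_jobs.json',
        # Added in Scrapy 1.2: emit real UTF-8 instead of \uXXXX escapes.
        'FEED_EXPORT_ENCODING': 'utf-8',
    }

    def parse(self, response):
        # Placeholder: yield one JobsItem so the feed has something to
        # write; the real parsing logic from the question goes here.
        item = JobsItem()
        item['title'] = 'example'
        yield item

A side benefit of the JobsItem class from the answer: assigning a key that was not declared as a scrapy.Field, such as the original 'company name' with a space, raises a KeyError immediately, so field-name typos surface at scrape time instead of producing inconsistent JSON.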