为什么我的脚本只保存我插入的 40 个对象中的一个?
Why does my script only save one out of the 40 objects that I insert?
我正在抓取数据并将其放入数据库中。问题是只保存了一个对象,而我知道收集了大约 40 个对象。
如何让我的脚本保存所有对象?
class PresstvPipeline(object):
    """Pipeline that writes scraped entries to the database via a SQLAlchemy session.

    NOTE: this is the code from the question, with its indentation restored.
    It intentionally still shows the behaviour being asked about: only one
    object ends up saved per call to ``process_item``.
    """

    def __init__(self):
        """Build the engine, call ``create_presstv_table`` on it and keep a session factory."""
        engine = db_connect()
        create_presstv_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, items, spider):
        """Try to store the entries found in ``items``.

        ``items`` is indexed with the keys ``'title'``, ``'link'`` and
        ``'date'``; the three values are zipped together, so they are
        expected to be parallel sequences. ``spider`` is not used.
        """
        session = self.Session()
        for title, link, date in zip(items['title'], items['link'], items['date']):
            print(title, link, date)
            item = Presstv(title = title, link = link, date = date)
            # Only insert when no row with the same link exists yet.
            if session.query(Presstv).filter_by(link=item.link).first() == None:
                try:
                    session.add(item)
                    session.commit()
                    logger.info('Item saved')
                except:
                    session.rollback()
                    raise
                finally:
                    # PROBLEM (the subject of the question): the session is
                    # closed inside the loop, after the very first insert.
                    session.close()
            # PROBLEM (the subject of the question): returning from inside
            # the loop ends it after the first iteration, so the remaining
            # entries are never processed.
            return item
# Address of the page to scrape.
presstv_url = "http://www.url.ir/Default/Section/1"
# XPath selecting the node(s) that spider_html passes to the extract function.
presstv_xpath = '//html/body/section/div/div/section/div[2]/section/ul'
# Pipeline instance that receives every extracted dict.
presstv_pipeline = PresstvPipeline()
def presstv_extract_item(element):
    """Collect titles, links and dates found below ``element``.

    ``element`` must provide an ``xpath`` method. Each value in the returned
    dict is whatever that method returns for the corresponding relative
    expression (for lxml elements, a list of matches).
    """
    return {
        'title': element.xpath('li/div/div/p/text()'),
        'link': element.xpath('li/div/div/a/@href'),
        'date': element.xpath('li/div/div/div/text()'),
    }
def spider_html(input_url, extract_function, input_xpath, pipeline):
    """Parse ``input_url`` and feed every node matching ``input_xpath`` to ``pipeline``.

    For each matching node, ``extract_function(node)`` is called and its
    result is passed to ``pipeline.process_item`` together with ``None`` as
    the spider argument. Nothing is returned.
    """
    tree = lxml.html.parse(input_url)
    for element in tree.xpath(input_xpath):
        pipeline.process_item(extract_function(element), None)
# Run the scrape. spider_html contains no return statement, so `presstv` is bound to None.
presstv = spider_html(presstv_url, presstv_extract_item, presstv_xpath, presstv_pipeline)
您在 for 循环内关闭了会话,因此后续迭代中不会发生任何事情。实际上情况更糟:您在循环内就返回(return)了 item,这意味着循环根本不会执行其余的迭代。请将 rollback/close 的处理移到循环之外,并把 return 也移到循环之外。您不需要回滚,因为无论如何会话都会被关闭。
def process_item(self, items, spider):
    """Store every entry of ``items`` that is not in the database yet.

    ``items`` is indexed with the keys ``'title'``, ``'link'`` and ``'date'``;
    the three values are zipped together, so they are expected to be
    parallel sequences. ``spider`` is not used. Returns ``items`` unchanged.

    One session is used for the whole batch and is closed exactly once, in
    ``finally``, after the loop has finished or an exception has escaped.
    """
    session = self.Session()
    try:
        for title, link, date in zip(items['title'], items['link'], items['date']):
            print(title, link, date)
            item = Presstv(title = title, link = link, date = date)
            # Skip entries whose link is already stored.
            if session.query(Presstv).filter_by(link=item.link).first() is None:
                session.add(item)
                session.commit()
                logger.info('Item saved')
    finally:
        # No explicit rollback: the session is closed either way.
        session.close()
    return items
我正在抓取数据并将其放入数据库中。问题是只保存了一个对象,而我知道收集了大约 40 个对象。
如何让我的脚本保存所有对象?
class PresstvPipeline(object):
    """Pipeline that writes scraped entries to the database via a SQLAlchemy session.

    NOTE: this is the code from the question, with its indentation restored.
    It intentionally still shows the behaviour being asked about: only one
    object ends up saved per call to ``process_item``.
    """

    def __init__(self):
        """Build the engine, call ``create_presstv_table`` on it and keep a session factory."""
        engine = db_connect()
        create_presstv_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, items, spider):
        """Try to store the entries found in ``items``.

        ``items`` is indexed with the keys ``'title'``, ``'link'`` and
        ``'date'``; the three values are zipped together, so they are
        expected to be parallel sequences. ``spider`` is not used.
        """
        session = self.Session()
        for title, link, date in zip(items['title'], items['link'], items['date']):
            print(title, link, date)
            item = Presstv(title = title, link = link, date = date)
            # Only insert when no row with the same link exists yet.
            if session.query(Presstv).filter_by(link=item.link).first() == None:
                try:
                    session.add(item)
                    session.commit()
                    logger.info('Item saved')
                except:
                    session.rollback()
                    raise
                finally:
                    # PROBLEM (the subject of the question): the session is
                    # closed inside the loop, after the very first insert.
                    session.close()
            # PROBLEM (the subject of the question): returning from inside
            # the loop ends it after the first iteration, so the remaining
            # entries are never processed.
            return item
# Address of the page to scrape.
presstv_url = "http://www.url.ir/Default/Section/1"
# XPath selecting the node(s) that spider_html passes to the extract function.
presstv_xpath = '//html/body/section/div/div/section/div[2]/section/ul'
# Pipeline instance that receives every extracted dict.
presstv_pipeline = PresstvPipeline()
def presstv_extract_item(element):
    """Collect titles, links and dates found below ``element``.

    ``element`` must provide an ``xpath`` method. Each value in the returned
    dict is whatever that method returns for the corresponding relative
    expression (for lxml elements, a list of matches).
    """
    return {
        'title': element.xpath('li/div/div/p/text()'),
        'link': element.xpath('li/div/div/a/@href'),
        'date': element.xpath('li/div/div/div/text()'),
    }
def spider_html(input_url, extract_function, input_xpath, pipeline):
    """Parse ``input_url`` and feed every node matching ``input_xpath`` to ``pipeline``.

    For each matching node, ``extract_function(node)`` is called and its
    result is passed to ``pipeline.process_item`` together with ``None`` as
    the spider argument. Nothing is returned.
    """
    tree = lxml.html.parse(input_url)
    for element in tree.xpath(input_xpath):
        pipeline.process_item(extract_function(element), None)
# Run the scrape. spider_html contains no return statement, so `presstv` is bound to None.
presstv = spider_html(presstv_url, presstv_extract_item, presstv_xpath, presstv_pipeline)
您在 for 循环内关闭了会话,因此后续迭代中不会发生任何事情。实际上情况更糟:您在循环内就返回(return)了 item,这意味着循环根本不会执行其余的迭代。请将 rollback/close 的处理移到循环之外,并把 return 也移到循环之外。您不需要回滚,因为无论如何会话都会被关闭。
def process_item(self, items, spider):
    """Store every entry of ``items`` that is not in the database yet.

    ``items`` is indexed with the keys ``'title'``, ``'link'`` and ``'date'``;
    the three values are zipped together, so they are expected to be
    parallel sequences. ``spider`` is not used. Returns ``items`` unchanged.

    One session is used for the whole batch and is closed exactly once, in
    ``finally``, after the loop has finished or an exception has escaped.
    """
    session = self.Session()
    try:
        for title, link, date in zip(items['title'], items['link'], items['date']):
            print(title, link, date)
            item = Presstv(title = title, link = link, date = date)
            # Skip entries whose link is already stored.
            if session.query(Presstv).filter_by(link=item.link).first() is None:
                session.add(item)
                session.commit()
                logger.info('Item saved')
    finally:
        # No explicit rollback: the session is closed either way.
        session.close()
    return items