Saving Scrapy data to the corresponding URL in MySQL
I am currently using Scrapy.

I have a list of URLs stored in a MySQL database. The spider visits these URLs and captures two pieces of target information (score and count). My goal is for Scrapy to fill in the corresponding columns as it finishes scraping each page, before moving on to the next URL.

I'm a beginner and I can't seem to get the saving part working. The score and count are successfully passed to the database, but they are saved as new rows instead of being associated with the source URL.

Here is my code:
amazon_spider.py
import scrapy
from whatoplaybot.items import crawledScore
import MySQLdb

class amazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ["amazon.com"]
    start_urls = []

    def start_requests(self):
        # Read the start URLs from the database instead of hard-coding them.
        conn = MySQLdb.connect(
            user='root',
            passwd='',
            db='scraper',
            host='127.0.0.1',
            charset="utf8",
            use_unicode=True
        )
        cursor = conn.cursor()
        cursor.execute(
            'SELECT url FROM scraped;'
        )
        rows = cursor.fetchall()
        for row in rows:
            yield self.make_requests_from_url(row[0])
        conn.close()

    def parse(self, response):
        item = crawledScore()
        item['reviewScore'] = response.xpath('//*[@id="avgRating"]/span/a/span/text()').re("[0-9,.]+")[0]
        item['reviewCount'] = response.xpath('//*[@id="summaryStars"]/a/text()').re("[0-9,]+")
        yield item
pipelines.py
import sys
import MySQLdb

class storeScore(object):
    def __init__(self):
        self.conn = MySQLdb.connect(
            user='root',
            passwd='',
            db='scraper',
            host='127.0.0.1',
            charset="utf8",
            use_unicode=True
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""INSERT INTO scraped(score, count) VALUES (%s, %s)""", (item['reviewScore'], item['reviewCount']))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
Any help and guidance would be greatly appreciated.
Thank you all.
Follow these steps:
Add a reviewURL field to your crawledScore item:
import scrapy

class crawledScore(scrapy.Item):
    reviewScore = scrapy.Field()
    reviewCount = scrapy.Field()
    reviewURL = scrapy.Field()
Save the response URL into item['reviewURL']:
def parse(self, response):
    item = crawledScore()
    item['reviewScore'] = response.xpath('//*[@id="avgRating"]/span/a/span/text()').re("[0-9,.]+")[0]
    item['reviewCount'] = response.xpath('//*[@id="summaryStars"]/a/text()').re("[0-9,]+")
    item['reviewURL'] = response.url
    yield item
Finally, in your pipelines file, insert or update depending on your logic:
Insert:
def process_item(self, item, spider):
    try:
        self.cursor.execute("""INSERT INTO scraped(score, count, url) VALUES (%s, %s, %s)""", (item['reviewScore'], item['reviewCount'], item['reviewURL']))
        self.conn.commit()
    except MySQLdb.Error, e:
        print "Error %d: %s" % (e.args[0], e.args[1])
    return item
Update:
def process_item(self, item, spider):
    try:
        self.cursor.execute("""UPDATE scraped SET score=%s, count=%s WHERE url=%s""", (item['reviewScore'], item['reviewCount'], item['reviewURL']))
        self.conn.commit()
    except MySQLdb.Error, e:
        print "Error %d: %s" % (e.args[0], e.args[1])
    return item
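Since your URLs already live in the scraped table, the UPDATE variant is the one that matches your goal. If you ever need both behaviors at once, here is a minimal sketch of a combined upsert, assuming a UNIQUE index exists on scraped.url (e.g. ALTER TABLE scraped ADD UNIQUE KEY (url);):

def process_item(self, item, spider):
    try:
        # INSERT ... ON DUPLICATE KEY UPDATE inserts a new row for an
        # unknown URL and updates the existing row for a known one.
        self.cursor.execute(
            """INSERT INTO scraped(score, count, url)
               VALUES (%s, %s, %s)
               ON DUPLICATE KEY UPDATE score=VALUES(score), count=VALUES(count)""",
            (item['reviewScore'], item['reviewCount'], item['reviewURL']))
        self.conn.commit()
    except MySQLdb.Error, e:
        print "Error %d: %s" % (e.args[0], e.args[1])
    return item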