Scrapy找不到项目
Scrapy can't find items
我目前仍在学习 Scrapy 并尝试使用管道和 ItemLoader。
但是,我目前遇到的问题是蜘蛛显示 Item.py
不存在。我到底做错了什么,为什么我没有从蜘蛛获取任何数据到我的管道中?
运行 没有导入项目的 Spider 工作正常。管道也在 settings.py
.
中激活
我的错误日志如下:
Traceback (most recent call last):
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
exec(code, run_globals)
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\Scripts\scrapy.exe\__main__.py", line 7, in <module>
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\cmdline.py", line 144, in execute
cmd.crawler_process = CrawlerProcess(settings)
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 280, in __init__
super().__init__(settings)
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 152, in __init__
self.spider_loader = self._get_spider_loader(settings)
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 146, in _get_spider_loader
return loader_cls.from_settings(settings.frozencopy())
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spiderloader.py", line 67, in from_settings
return cls(settings)
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spiderloader.py", line 24, in __init__
self._load_all_spiders()
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spiderloader.py", line 51, in _load_all_spiders
for module in walk_modules(name):
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\misc.py", line 88, in
walk_modules
submod = import_module(fullpath)
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\importlib\__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "C:\Users\Syrix\WebCrawler\watches\watches\spiders\watchbot.py", line 5, in <module>
from watches.watches.items import WatchesItem
ModuleNotFoundError: No module named 'watches.watches'
我的蜘蛛看起来像这样:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from watches.watches.items import WatchesItem
from scrapy.exceptions import DropItem
class WatchbotSpider(scrapy.Spider):
name = 'watchbot'
start_urls = ['https://www.watch.de/english/rolex.html']
def parse(self, response, **kwargs):
for link in response.css('div.product-item-link a::attr(href)'):
url = link.get()
yield scrapy.Request(url, callback=self.parse_categories)
def parse_categories(self, response):
for product in response.xpath('//*[@id="main"]/div[2]/div[1]'):
l = ItemLoader(item=WatchesItem(), selector=product)
l.add_xpath('name', '//span[@itemprop="sku"]/text()')
l.add_xpath('reference', '//span[@itemprop="sku"]/text()')
l.add_xpath('year', '//div[@class="product-option baujahr"]/div[@class="product-option-value"]/text()')
yield l.load_item()
items.py:
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags
class WatchesItem(scrapy.Item):
# define the fields for your item here like:
name = scrapy.Field(input_processor = MapCompose(remove_tags, output_processor = TakeFirst()))
reference = scrapy.Field(input_processor = MapCompose(remove_tags, output_processor = TakeFirst()))
year = scrapy.Field(input_processor = MapCompose(remove_tags, output_processor = TakeFirst()))
最后但同样重要的是我的管道:
import mysql
import mysql.connector
from watches.watches.spiders import watchbot
class WatchesPipeline(object):
def __init__(self):
self.conn = mysql.connector.connect(
host = '',
user = '',
passwd = '',
database = ''
)
self.curr = self.conn.cursor()
def process_item(self, item, spider):
self.store_db(item)
print("Pipleline = " + item['name'] + " " + item['reference'] + " " + item['year'])
return item
def store_db(self, item):
self.curr.execute("""insert into test.watch values (%s, %s, %s)""", (
item['name'][0],
item['reference'][0],
item['year'][0],
))
self.conn.commit()
编辑:
PS E:\semester\webcrawler_watches\watches\Crawler> scrapy crawl watchbot
Traceback (most recent call last):
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
exec(code, run_globals)
File "E:\semester\webcrawler_watches\venv\Scripts\scrapy.exe\__main__.py", line 7, in <module>
File "E:\semester\webcrawler_watches\venv\lib\site-packages\scrapy\cmdline.py", line 114, in execute
settings = get_project_settings()
File "E:\semester\webcrawler_watches\venv\lib\site-packages\scrapy\utils\project.py", line 68, in get_project_settings
settings.setmodule(settings_module_path, priority='project')
File "E:\semester. semester\webcrawler_watches\venv\lib\site-packages\scrapy\settings\__init__.py", line 287, in setmodule
module = import_module(module)
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\importlib\__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1004, in _find_and_load_unlocked
ModuleNotFoundError: No module named 'watches'
PS E:\semester\webcrawler_watches\watches\Crawler>
这是我的作品。请关注这个。
import mysql
import mysql.connector
# from watches.watches.spiders import watchbot
class WatchesPipeline(object):
def __init__(self):
self.conn = mysql.connector.connect(
host = 'localhost',
user = 'root',
passwd = '', # your password
database = '', # your databse
)
self.curr = self.conn.cursor()
# def create_table(self):
# self.curr.execute("""DROP TABLE IF EXISTS scrapy_tb """)
# self.curr.execute("""create table scrapy_tb (name text, reference text, year text)""")
def process_item(self, item, spider):
self.store_db(item)
# print("Pipleline = " + item['name'] + " " + item['reference'] + " " + item['year'])
return item
def store_db(self, item):
self.curr.execute("""insert into scrapy_tb values (%s, %s, %s)""",
(
item['name'][0],
item['reference'][0],
item['year'][0]
))
self.conn.commit()
return item
self.conn.close()
我目前仍在学习 Scrapy 并尝试使用管道和 ItemLoader。
但是,我目前遇到的问题是蜘蛛显示 Item.py
不存在。我到底做错了什么,为什么我没有从蜘蛛获取任何数据到我的管道中?
运行 没有导入项目的 Spider 工作正常。管道也在 settings.py
.
我的错误日志如下:
Traceback (most recent call last):
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
exec(code, run_globals)
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\Scripts\scrapy.exe\__main__.py", line 7, in <module>
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\cmdline.py", line 144, in execute
cmd.crawler_process = CrawlerProcess(settings)
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 280, in __init__
super().__init__(settings)
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 152, in __init__
self.spider_loader = self._get_spider_loader(settings)
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 146, in _get_spider_loader
return loader_cls.from_settings(settings.frozencopy())
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spiderloader.py", line 67, in from_settings
return cls(settings)
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spiderloader.py", line 24, in __init__
self._load_all_spiders()
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spiderloader.py", line 51, in _load_all_spiders
for module in walk_modules(name):
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\misc.py", line 88, in
walk_modules
submod = import_module(fullpath)
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\importlib\__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "C:\Users\Syrix\WebCrawler\watches\watches\spiders\watchbot.py", line 5, in <module>
from watches.watches.items import WatchesItem
ModuleNotFoundError: No module named 'watches.watches'
我的蜘蛛看起来像这样:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from watches.watches.items import WatchesItem
from scrapy.exceptions import DropItem
class WatchbotSpider(scrapy.Spider):
name = 'watchbot'
start_urls = ['https://www.watch.de/english/rolex.html']
def parse(self, response, **kwargs):
for link in response.css('div.product-item-link a::attr(href)'):
url = link.get()
yield scrapy.Request(url, callback=self.parse_categories)
def parse_categories(self, response):
for product in response.xpath('//*[@id="main"]/div[2]/div[1]'):
l = ItemLoader(item=WatchesItem(), selector=product)
l.add_xpath('name', '//span[@itemprop="sku"]/text()')
l.add_xpath('reference', '//span[@itemprop="sku"]/text()')
l.add_xpath('year', '//div[@class="product-option baujahr"]/div[@class="product-option-value"]/text()')
yield l.load_item()
items.py:
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags
class WatchesItem(scrapy.Item):
# define the fields for your item here like:
name = scrapy.Field(input_processor = MapCompose(remove_tags, output_processor = TakeFirst()))
reference = scrapy.Field(input_processor = MapCompose(remove_tags, output_processor = TakeFirst()))
year = scrapy.Field(input_processor = MapCompose(remove_tags, output_processor = TakeFirst()))
最后但同样重要的是我的管道:
import mysql
import mysql.connector
from watches.watches.spiders import watchbot
class WatchesPipeline(object):
def __init__(self):
self.conn = mysql.connector.connect(
host = '',
user = '',
passwd = '',
database = ''
)
self.curr = self.conn.cursor()
def process_item(self, item, spider):
self.store_db(item)
print("Pipleline = " + item['name'] + " " + item['reference'] + " " + item['year'])
return item
def store_db(self, item):
self.curr.execute("""insert into test.watch values (%s, %s, %s)""", (
item['name'][0],
item['reference'][0],
item['year'][0],
))
self.conn.commit()
编辑:
PS E:\semester\webcrawler_watches\watches\Crawler> scrapy crawl watchbot
Traceback (most recent call last):
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
exec(code, run_globals)
File "E:\semester\webcrawler_watches\venv\Scripts\scrapy.exe\__main__.py", line 7, in <module>
File "E:\semester\webcrawler_watches\venv\lib\site-packages\scrapy\cmdline.py", line 114, in execute
settings = get_project_settings()
File "E:\semester\webcrawler_watches\venv\lib\site-packages\scrapy\utils\project.py", line 68, in get_project_settings
settings.setmodule(settings_module_path, priority='project')
File "E:\semester. semester\webcrawler_watches\venv\lib\site-packages\scrapy\settings\__init__.py", line 287, in setmodule
module = import_module(module)
File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\importlib\__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1004, in _find_and_load_unlocked
ModuleNotFoundError: No module named 'watches'
PS E:\semester\webcrawler_watches\watches\Crawler>
这是我的作品。请关注这个。
import mysql
import mysql.connector
# from watches.watches.spiders import watchbot
class WatchesPipeline(object):
def __init__(self):
self.conn = mysql.connector.connect(
host = 'localhost',
user = 'root',
passwd = '', # your password
database = '', # your databse
)
self.curr = self.conn.cursor()
# def create_table(self):
# self.curr.execute("""DROP TABLE IF EXISTS scrapy_tb """)
# self.curr.execute("""create table scrapy_tb (name text, reference text, year text)""")
def process_item(self, item, spider):
self.store_db(item)
# print("Pipleline = " + item['name'] + " " + item['reference'] + " " + item['year'])
return item
def store_db(self, item):
self.curr.execute("""insert into scrapy_tb values (%s, %s, %s)""",
(
item['name'][0],
item['reference'][0],
item['year'][0]
))
self.conn.commit()
return item
self.conn.close()