Object of type Selector is not JSON serializable
I'm trying to scrape a dynamic website, and for that I need Selenium.
The links I want to scrape only open when I click on that specific element. They are opened by jQuery, so my only option is to click them, because there is no href attribute or anything else that would give me the URL.
My approach is this:
# -*- coding: utf-8 -*-
import scrapy
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest


class AnofmSpider(scrapy.Spider):

    name = 'anofm'

    def start_requests(self):
        yield SeleniumRequest(
            url='https://www.anofm.ro/lmvw.html?agentie=Covasna&categ=3&subcateg=1',
            callback=self.parse
        )

    def parse(self, response):
        driver = response.meta['driver']
        try:
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "tableRepeat2"))
            )
        finally:
            html = driver.page_source
            response_obj = Selector(text=html)
            links = response_obj.xpath("//tbody[@id='tableRepeat2']")
            for link in links:
                driver.execute_script("arguments[0].click();", link)
                yield {
                    'Ocupatia': response_obj.xpath("//div[@id='print']/p/text()[1]")
                }
But it doesn't work.
On the line where I want to click the element, I get this error:

TypeError: Object of type Selector is not JSON serializable

I sort of understand this error, but I don't know how to fix it. I somehow need to convert that object from a Selector into a clickable element.
I've searched online for solutions and read the documentation, but I couldn't find anything useful.
Can anyone help me understand this error better and how I should fix it?
Thanks.
Actually, the data is also generated from the JSON response of an API call, so you can easily grab it from the API instead. Here is a working solution with pagination. Each page contains 8 items out of 32 items in total.

Code:
import scrapy
import json


class AnofmSpider(scrapy.Spider):

    name = 'anofm'

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.anofm.ro/dmxConnect/api/oferte_bos/oferte_bos_query2L_Test.php?offset=8&cauta=&select=Covasna&limit=8&localitate=',
            method='GET',
            callback=self.parse,
            meta={'limit': 8}
        )

    def parse(self, response):
        resp = json.loads(response.body)
        hits = resp.get('lmv').get('data')
        for h in hits:
            yield {
                'Ocupatia': h.get('OCCUPATION')
            }

        total_limit = resp.get('lmv').get('total')
        next_limit = response.meta['limit'] + 8
        if next_limit <= total_limit:
            yield scrapy.Request(
                url=f'https://www.anofm.ro/dmxConnect/api/oferte_bos/oferte_bos_query2L_Test.php?offset=8&cauta=&select=Covasna&limit={next_limit}&localitate=',
                method='GET',
                callback=self.parse,
                meta={'limit': next_limit}
            )
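Before wiring the endpoint into a spider, you can hit it once by hand and inspect the payload. A minimal sketch using requests (the lmv/data/total/OCCUPATION field names are the ones used in the spider above; any response fields beyond those are an assumption):

import requests

# same endpoint as the spider above; `limit` drives the pagination
url = ('https://www.anofm.ro/dmxConnect/api/oferte_bos/oferte_bos_query2L_Test.php'
       '?offset=8&cauta=&select=Covasna&limit=8&localitate=')
resp = requests.get(url).json()

print(resp['lmv']['total'])        # e.g. 32 -> four pages of 8 items
for row in resp['lmv']['data']:
    print(row.get('OCCUPATION'))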
You are mixing Scrapy objects with Selenium functions, and that creates the problem. I don't know how to convert the objects, but I would simply use Selenium alone for this:
finally:
    links = driver.find_elements_by_xpath("//tbody[@id='tableRepeat2']/tr")
    print('len(links):', len(links))

    for link in links:
        # these don't work for me:
        #driver.execute_script("arguments[0].scrollIntoView();", link)
        #link.click()

        # open information
        driver.execute_script("arguments[0].click();", link)

        # javascript may need some time to display it
        time.sleep(1)

        # get data
        ocupatia = driver.find_element_by_xpath(".//div[@id='print']/p").text
        ocupatia = ocupatia.split('\n', 1)[0]          # first line
        ocupatia = ocupatia.split(':', 1)[1].strip()   # text after first `:`
        print('Ocupatia -->', ocupatia)

        # close information
        driver.find_element_by_xpath('//button[text()="Inchide"]').click()

        yield {
            'Ocupatia': ocupatia
        }
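As for why the original code fails: execute_script() serializes its extra arguments to JSON before sending them to the browser, and it only knows how to encode JSON primitives and live WebElements. A Scrapy Selector is a detached lxml wrapper around the page-source string, not a handle into the browser, so it cannot be passed to a script or clicked. A minimal sketch of the distinction (the driver and XPaths are assumed from the question's code):

from scrapy.selector import Selector

# Scrapy Selector: parsed from a static HTML snapshot, detached from the browser
row_sel = Selector(text=driver.page_source).xpath("//tbody[@id='tableRepeat2']/tr")[0]
#driver.execute_script("arguments[0].click();", row_sel)  # TypeError: not JSON serializable

# Selenium WebElement: a live reference into the browser, valid as a script argument
row_el = driver.find_element_by_xpath("//tbody[@id='tableRepeat2']/tr")
driver.execute_script("arguments[0].click();", row_el)    # works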
Full working code.
Anyone can put it in a single file and run it as python script.py without creating a project in scrapy.
You have to change SELENIUM_DRIVER_EXECUTABLE_PATH to the correct path.
import scrapy
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest
import time


class AnofmSpider(scrapy.Spider):

    name = 'anofm'

    def start_requests(self):
        yield SeleniumRequest(
            url='https://www.anofm.ro/lmvw.html?agentie=Covasna&categ=3&subcateg=1',
            #callback=self.parse
        )

    def parse(self, response):
        driver = response.meta['driver']

        try:
            print("try")
            element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, "//tbody[@id='tableRepeat2']/tr/td"))
            )
        finally:
            print("finally")
            links = driver.find_elements_by_xpath("//tbody[@id='tableRepeat2']/tr")
            print('len(links):', len(links))

            for link in links:
                #driver.execute_script("arguments[0].scrollIntoView();", link)
                #link.click()

                # open information
                driver.execute_script("arguments[0].click();", link)

                # javascript may need some time to display it
                time.sleep(1)

                # get data
                ocupatia = driver.find_element_by_xpath(".//div[@id='print']/p").text
                ocupatia = ocupatia.split('\n', 1)[0]          # first line
                ocupatia = ocupatia.split(':', 1)[1].strip()   # text after first `:`
                print('Ocupatia -->', ocupatia)

                # close information
                driver.find_element_by_xpath('//button[text()="Inchide"]').click()

                yield {
                    'Ocupatia': ocupatia
                }


# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    'DOWNLOADER_MIDDLEWARES': {'scrapy_selenium.SeleniumMiddleware': 800},
    'SELENIUM_DRIVER_NAME': 'firefox',
    'SELENIUM_DRIVER_EXECUTABLE_PATH': '/home/furas/bin/geckodriver',
    'SELENIUM_DRIVER_ARGUMENTS': [],  # ['-headless']
})
c.crawl(AnofmSpider)
c.start()
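For reference, if you later move this into a regular Scrapy project instead of a single file, the same inline settings would live in the project's settings.py (a sketch mirroring the dict above; the geckodriver path is mine and must be adapted):

# settings.py -- equivalent of the inline CrawlerProcess settings above
USER_AGENT = 'Mozilla/5.0'

FEEDS = {'output.csv': {'format': 'csv'}}

DOWNLOADER_MIDDLEWARES = {'scrapy_selenium.SeleniumMiddleware': 800}

SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH = '/home/furas/bin/geckodriver'  # change to your path
SELENIUM_DRIVER_ARGUMENTS = []  # add '-headless' to run without a browser window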