为什么 Scrapy selenium 同时提供 'Null' 和重复值?
Why Scrapy selenium providing the 'Null' and duplicate value at the same time?
我正在尝试执行此脚本,但我不知道为什么它会同时抛出 'Null' 和重复值!我的目标是输入必要的值并单击搜索按钮,从页面获取所有 'href' 并收集数据,这工作正常但同时提供 'Null' 和重复值! .我不知道我在这里到底错过了什么。
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
class RightMove2Spider(scrapy.Spider):
name = 'rightmove2'
start_urls = ["https://www.rightmove.co.uk/property-for-sale/search.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale"]
def __init__(self, name=None, **kwargs):
chrome_options = Options()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.set_window_size(1920, 1080)
driver.get("https://www.rightmove.co.uk/property-for-sale/search.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale")
price_range = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "(//option[@value='2000000'])[2]")))
price_range.click()
time.sleep(1)
bedroom_range = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "(//option[@value='5'])[1]")))
bedroom_range.click()
time.sleep(1)
tick_box = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//span[@class='tickbox--indicator']")))
tick_box.click()
time.sleep(1)
find_properties_btn = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//button[@id='submit']")))
find_properties_btn.click()
time.sleep(3)
self.property_xpath = driver.find_elements(By.XPATH, "//*[@class='l-searchResult is-list']/div/div/div[4]/div[1]/div[2]/a")
# driver.close()
super().__init__(name, **kwargs)
def parse(self, response):
for el in self.property_xpath:
href= el.get_attribute('href')
time.sleep(1)
yield SeleniumRequest(
url=href,
wait_time=3)
yield {
'Title': response.xpath("//h1[@itemprop='streetAddress']/text()").get(),
'Price': response.xpath("//div[@class='_1gfnqJ3Vtd1z40MlC0MzXu']/span/text()").get(),
'Agent Name': response.xpath("//div[@class='RPNfwwZBarvBLs58-mdN8']/a/text()").get(),
'Agent Address': response.xpath("//div[@class='OojFk4MTxFDKIfqreGNt0']/text()").get(),
'Agent Telephone': response.xpath("//a[@class='_3E1fAHUmQ27HFUFIBdrW0u']/text()").get(),
'Added on': response.xpath("//div[@class='_2nk2x6QhNB1UrxdI5KpvaF']/text()").get(),
'Links': response.url
}
for x in range(24, 1008, 24):
abs_url = f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E87490&minBedrooms=5&maxPrice=2000000&index={x}&propertyTypes=&includeSSTC=true&mustHave=&dontShow=&furnishTypes=&keywords='
yield SeleniumRequest(
url= abs_url,
callback=self.parse
)
输出
{"Title": null, "Price": null, "Agent Name": null, "Agent Address": null, "Agent Telephone": null, "Added on": null, "Links": "https://www.rightmove.co.uk/property-for-sale/search.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale"},
{"Title": "Combwell Crescent, Abbey Wood, London", "Price": "£450,000", "Agent Name": "Anthony Martin Estate Agents, Bexleyheath", "Agent Address": "2 Pickford Lane,\r\nBexleyheath,\r\nDA7 4QW", "Agent Telephone": "020 8012 7475", "Added on": "Added on 30/11/2021", "Links": "https://www.rightmove.co.uk/properties/117050312"},
{"Title": null, "Price": null, "Agent Name": null, "Agent Address": null, "Agent Telephone": null, "Added on": null, "Links": "https://www.rightmove.co.uk/property-for-sale/search.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale"},
{"Title": null, "Price": null, "Agent Name": null, "Agent Address": null, "Agent Telephone": null, "Added on": null, "Links": "https://www.rightmove.co.uk/property-for-sale/search.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale"},
{"Title": "Combwell Crescent, Abbey Wood, London", "Price": "£450,000", "Agent Name": "Anthony Martin Estate Agents, Bexleyheath", "Agent Address": "2 Pickford Lane,\r\nBexleyheath,\r\nDA7 4QW", "Agent Telephone": "020 8012 7475", "Added on": "Added on 30/11/2021", "Links": "https://www.rightmove.co.uk/properties/117050312"},
{"Title": null, "Price": null, "Agent Name": null, "Agent Address": null, "Agent Telephone": null, "Added on": null, "Links": "https://www.rightmove.co.uk/property-for-sale/search.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale"},
{"Title": "Combwell Crescent, Abbey Wood, London", "Price": "£450,000", "Agent Name": "Anthony Martin Estate Agents, Bexleyheath", "Agent Address": "2 Pickford Lane,\r\nBexleyheath,\r\nDA7 4QW", "Agent Telephone": "020 8012 7475", "Added on": "Added on 30/11/2021", "Links": "https://www.rightmove.co.uk/properties/117050312"},
在开始 webscraping 项目之前,成功取决于在正确的 way.Data 中选择正确的工具,也从 api 调用 json 响应中生成。为什么使用 selenium 使 web scraping 如此复杂,您可以轻松地从 api?
获取数据
脚本:
import scrapy
#import json
class PropertySpider(scrapy.Spider):
name = 'property'
def start_requests(self):
headers= {
"Content-Type": "application/x-www-form-urlencoded",
"X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36"
}
yield scrapy.Request(
url='https://www.rightmove.co.uk/api/_search?locationIdentifier=STATION%5E9662&numberOfPropertiesPerPage=24&radius=0.5&sortType=2&index=24&includeSSTC=false&viewType=LIST&channel=BUY&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false',
method="GET",
headers=headers,
callback=self.parse
)
def parse(self,response):
resp=response.json()
for item in resp['properties']:
yield {
"title":item['summary'],
'price':item['price']['amount'],
'url':'https://www.rightmove.co.uk' + item['propertyUrl']
}
输出:
{'title': "A stunning two bedroom, two bathroom apartment on the 11th floor set over approx 1,645 sq ft, located in St George's brilliant new river fronted development, One Blackfriars, SE1.", 'price': 2000000, 'url': 'https://www.rightmove.co.uk/properties/118739888#/?channel=RES_BUY'}
2022-03-29 15:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.rightmove.co.uk/api/_search?locationIdentifier=STATION%5E9662&numberOfPropertiesPerPage=24&radius=0.5&sortType=2&index=24&includeSSTC=false&viewType=LIST&channel=BUY&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false>
{'title': 'An immaculate four bedroom townhouse arranged over three floors nestled along a peaceful row of pretty houses.', 'price': 2000000, 'url': 'https://www.rightmove.co.uk/properties/118772936#/?channel=RES_BUY'}
2022-03-29 15:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.rightmove.co.uk/api/_search?locationIdentifier=STATION%5E9662&numberOfPropertiesPerPage=24&radius=0.5&sortType=2&index=24&includeSSTC=false&viewType=LIST&channel=BUY&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false>
{'title': "With a great view of the River Thames, The Shard and the City of London, this bright and ideally located two bedroom apartment is 'as new' and is available for chain free sale through Prime London. The bright and clean living space, coming in at 1,210 sq ft / 112 sq m presents exceptionally well...", 'price': 2000000, 'url': 'https://www.rightmove.co.uk/properties/113289182#/?channel=RES_BUY'}
2022-03-29 15:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.rightmove.co.uk/api/_search?locationIdentifier=STATION%5E9662&numberOfPropertiesPerPage=24&radius=0.5&sortType=2&index=24&includeSSTC=false&viewType=LIST&channel=BUY&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false>
{'title': 'With incredible views from the 19th and 20th floors, this 1,678 sq ft (155.9 sqm) penthouse apartment at The Perspective Building is available for sale exclusively through Prime London. The property features two large double bedrooms (both with en suite), occasional/guest bedroom, large open-pla...', 'price': 1999950, 'url': 'https://www.rightmove.co.uk/properties/73980120#/?channel=RES_BUY'}
2022-03-29 15:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.rightmove.co.uk/api/_search?locationIdentifier=STATION%5E9662&numberOfPropertiesPerPage=24&radius=0.5&sortType=2&index=24&includeSSTC=false&viewType=LIST&channel=BUY&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false>
{'title': 'Set in one of London’s most desirable riverside locations, adjacent to Westminster and next to the London Eye, 8 Casson Square celebrates the rich history and heritage of its surroundings. The combination of the intricate architectural design and the impressive location will together create so...', 'price': 1965000, 'url': 'https://www.rightmove.co.uk/properties/79565985#/?channel=RES_BUY'}
2022-03-29 15:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.rightmove.co.uk/api/_search?locationIdentifier=STATION%5E9662&numberOfPropertiesPerPage=24&radius=0.5&sortType=2&index=24&includeSSTC=false&viewType=LIST&channel=BUY&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false>
{'title': 'Newly refurbished two bedroom, two bathroom apartment in Whitehall Court, Westminster.', 'price': 1950000, 'url': 'https://www.rightmove.co.uk/properties/116568074#/?channel=RES_BUY'}
2022-03-29 15:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.rightmove.co.uk/api/_search?locationIdentifier=STATION%5E9662&numberOfPropertiesPerPage=24&radius=0.5&sortType=2&index=24&includeSSTC=false&viewType=LIST&channel=BUY&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false>
...等等
我正在尝试执行此脚本,但我不知道为什么它会同时抛出 'Null' 和重复值!我的目标是输入必要的值并单击搜索按钮,从页面获取所有 'href' 并收集数据,这工作正常但同时提供 'Null' 和重复值! .我不知道我在这里到底错过了什么。
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
class RightMove2Spider(scrapy.Spider):
name = 'rightmove2'
start_urls = ["https://www.rightmove.co.uk/property-for-sale/search.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale"]
def __init__(self, name=None, **kwargs):
chrome_options = Options()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.set_window_size(1920, 1080)
driver.get("https://www.rightmove.co.uk/property-for-sale/search.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale")
price_range = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "(//option[@value='2000000'])[2]")))
price_range.click()
time.sleep(1)
bedroom_range = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "(//option[@value='5'])[1]")))
bedroom_range.click()
time.sleep(1)
tick_box = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//span[@class='tickbox--indicator']")))
tick_box.click()
time.sleep(1)
find_properties_btn = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//button[@id='submit']")))
find_properties_btn.click()
time.sleep(3)
self.property_xpath = driver.find_elements(By.XPATH, "//*[@class='l-searchResult is-list']/div/div/div[4]/div[1]/div[2]/a")
# driver.close()
super().__init__(name, **kwargs)
def parse(self, response):
for el in self.property_xpath:
href= el.get_attribute('href')
time.sleep(1)
yield SeleniumRequest(
url=href,
wait_time=3)
yield {
'Title': response.xpath("//h1[@itemprop='streetAddress']/text()").get(),
'Price': response.xpath("//div[@class='_1gfnqJ3Vtd1z40MlC0MzXu']/span/text()").get(),
'Agent Name': response.xpath("//div[@class='RPNfwwZBarvBLs58-mdN8']/a/text()").get(),
'Agent Address': response.xpath("//div[@class='OojFk4MTxFDKIfqreGNt0']/text()").get(),
'Agent Telephone': response.xpath("//a[@class='_3E1fAHUmQ27HFUFIBdrW0u']/text()").get(),
'Added on': response.xpath("//div[@class='_2nk2x6QhNB1UrxdI5KpvaF']/text()").get(),
'Links': response.url
}
for x in range(24, 1008, 24):
abs_url = f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E87490&minBedrooms=5&maxPrice=2000000&index={x}&propertyTypes=&includeSSTC=true&mustHave=&dontShow=&furnishTypes=&keywords='
yield SeleniumRequest(
url= abs_url,
callback=self.parse
)
输出
{"Title": null, "Price": null, "Agent Name": null, "Agent Address": null, "Agent Telephone": null, "Added on": null, "Links": "https://www.rightmove.co.uk/property-for-sale/search.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale"},
{"Title": "Combwell Crescent, Abbey Wood, London", "Price": "£450,000", "Agent Name": "Anthony Martin Estate Agents, Bexleyheath", "Agent Address": "2 Pickford Lane,\r\nBexleyheath,\r\nDA7 4QW", "Agent Telephone": "020 8012 7475", "Added on": "Added on 30/11/2021", "Links": "https://www.rightmove.co.uk/properties/117050312"},
{"Title": null, "Price": null, "Agent Name": null, "Agent Address": null, "Agent Telephone": null, "Added on": null, "Links": "https://www.rightmove.co.uk/property-for-sale/search.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale"},
{"Title": null, "Price": null, "Agent Name": null, "Agent Address": null, "Agent Telephone": null, "Added on": null, "Links": "https://www.rightmove.co.uk/property-for-sale/search.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale"},
{"Title": "Combwell Crescent, Abbey Wood, London", "Price": "£450,000", "Agent Name": "Anthony Martin Estate Agents, Bexleyheath", "Agent Address": "2 Pickford Lane,\r\nBexleyheath,\r\nDA7 4QW", "Agent Telephone": "020 8012 7475", "Added on": "Added on 30/11/2021", "Links": "https://www.rightmove.co.uk/properties/117050312"},
{"Title": null, "Price": null, "Agent Name": null, "Agent Address": null, "Agent Telephone": null, "Added on": null, "Links": "https://www.rightmove.co.uk/property-for-sale/search.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale"},
{"Title": "Combwell Crescent, Abbey Wood, London", "Price": "£450,000", "Agent Name": "Anthony Martin Estate Agents, Bexleyheath", "Agent Address": "2 Pickford Lane,\r\nBexleyheath,\r\nDA7 4QW", "Agent Telephone": "020 8012 7475", "Added on": "Added on 30/11/2021", "Links": "https://www.rightmove.co.uk/properties/117050312"},
在开始 webscraping 项目之前,成功取决于在正确的 way.Data 中选择正确的工具,也从 api 调用 json 响应中生成。为什么使用 selenium 使 web scraping 如此复杂,您可以轻松地从 api?
获取数据脚本:
import scrapy
#import json
class PropertySpider(scrapy.Spider):
name = 'property'
def start_requests(self):
headers= {
"Content-Type": "application/x-www-form-urlencoded",
"X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36"
}
yield scrapy.Request(
url='https://www.rightmove.co.uk/api/_search?locationIdentifier=STATION%5E9662&numberOfPropertiesPerPage=24&radius=0.5&sortType=2&index=24&includeSSTC=false&viewType=LIST&channel=BUY&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false',
method="GET",
headers=headers,
callback=self.parse
)
def parse(self,response):
resp=response.json()
for item in resp['properties']:
yield {
"title":item['summary'],
'price':item['price']['amount'],
'url':'https://www.rightmove.co.uk' + item['propertyUrl']
}
输出:
{'title': "A stunning two bedroom, two bathroom apartment on the 11th floor set over approx 1,645 sq ft, located in St George's brilliant new river fronted development, One Blackfriars, SE1.", 'price': 2000000, 'url': 'https://www.rightmove.co.uk/properties/118739888#/?channel=RES_BUY'}
2022-03-29 15:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.rightmove.co.uk/api/_search?locationIdentifier=STATION%5E9662&numberOfPropertiesPerPage=24&radius=0.5&sortType=2&index=24&includeSSTC=false&viewType=LIST&channel=BUY&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false>
{'title': 'An immaculate four bedroom townhouse arranged over three floors nestled along a peaceful row of pretty houses.', 'price': 2000000, 'url': 'https://www.rightmove.co.uk/properties/118772936#/?channel=RES_BUY'}
2022-03-29 15:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.rightmove.co.uk/api/_search?locationIdentifier=STATION%5E9662&numberOfPropertiesPerPage=24&radius=0.5&sortType=2&index=24&includeSSTC=false&viewType=LIST&channel=BUY&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false>
{'title': "With a great view of the River Thames, The Shard and the City of London, this bright and ideally located two bedroom apartment is 'as new' and is available for chain free sale through Prime London. The bright and clean living space, coming in at 1,210 sq ft / 112 sq m presents exceptionally well...", 'price': 2000000, 'url': 'https://www.rightmove.co.uk/properties/113289182#/?channel=RES_BUY'}
2022-03-29 15:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.rightmove.co.uk/api/_search?locationIdentifier=STATION%5E9662&numberOfPropertiesPerPage=24&radius=0.5&sortType=2&index=24&includeSSTC=false&viewType=LIST&channel=BUY&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false>
{'title': 'With incredible views from the 19th and 20th floors, this 1,678 sq ft (155.9 sqm) penthouse apartment at The Perspective Building is available for sale exclusively through Prime London. The property features two large double bedrooms (both with en suite), occasional/guest bedroom, large open-pla...', 'price': 1999950, 'url': 'https://www.rightmove.co.uk/properties/73980120#/?channel=RES_BUY'}
2022-03-29 15:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.rightmove.co.uk/api/_search?locationIdentifier=STATION%5E9662&numberOfPropertiesPerPage=24&radius=0.5&sortType=2&index=24&includeSSTC=false&viewType=LIST&channel=BUY&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false>
{'title': 'Set in one of London’s most desirable riverside locations, adjacent to Westminster and next to the London Eye, 8 Casson Square celebrates the rich history and heritage of its surroundings. The combination of the intricate architectural design and the impressive location will together create so...', 'price': 1965000, 'url': 'https://www.rightmove.co.uk/properties/79565985#/?channel=RES_BUY'}
2022-03-29 15:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.rightmove.co.uk/api/_search?locationIdentifier=STATION%5E9662&numberOfPropertiesPerPage=24&radius=0.5&sortType=2&index=24&includeSSTC=false&viewType=LIST&channel=BUY&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false>
{'title': 'Newly refurbished two bedroom, two bathroom apartment in Whitehall Court, Westminster.', 'price': 1950000, 'url': 'https://www.rightmove.co.uk/properties/116568074#/?channel=RES_BUY'}
2022-03-29 15:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.rightmove.co.uk/api/_search?locationIdentifier=STATION%5E9662&numberOfPropertiesPerPage=24&radius=0.5&sortType=2&index=24&includeSSTC=false&viewType=LIST&channel=BUY&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false>
...等等