无法从网站中提取 latitude/longitude
Cannot extract latitude/longitude from website
我是 Stack Overflow 的新手,这是我的第一个帖子,希望我能把问题解释清楚,也希望你们能帮到我!在此先感谢您的帮助!!
我正在使用 Scrapy 从我的祖国抓取一个流行的房地产网站。我想要的其他特征(例如价格、面积、卧室数等)都能顺利抓取,但是我没能得到房产的 latitude/longitude。在网站中,例如 https://www.portalinmobiliario.com/MLC-564988630-estilo-mariposa-_JM#position=2&type=item&tracking_id=ed337e69-9999-4ede-b393-ef378e1a5675 ,你可以找到如图所示的 Google 地图位置;在这个 HTML 元素里面可以得到 lat/long(以蓝色突出显示)。但是当我尝试在我的代码中访问此元素时,蜘蛛无法识别它。
使用此 css 选择器 crs_location = response.css('div.map-container img:nth-child(1)').getall()
我能够在 div 中获取第一个 img,得到以下输出 https://http2.mlstatic.com/resources/frontend/web-vip/ui-dist/images/pin-real-estate-d1ebb73e65.svg
,但是当我更改 nth-child 到: crs_location = response.css('div.map-container img:nth-child(2)').getall()
得到第二个 child (我想要的), crs_location 变量结果为空。
如果您能帮助弄清楚如何获得此 lat/long,我将不胜感激。
谢谢!
HTML elements
完整代码:
import scrapy
from scrapy import Selector
import requests
import pandas as pd
import numpy as np
# Import the CrawlerProcess
from scrapy.crawler import CrawlerProcess
# Create the Spider class
class Spider_Inmob(scrapy.Spider):
    """Crawl portalinmobiliario.com search results and record one row per listing.

    Each listing page handled by ``parse_propiedad`` is appended as a row to
    the module-level ``df3`` DataFrame, which the surrounding script exports
    once the crawl has finished.
    """

    name = 'spider_inmob'
    #download_delay = 3

    # Number of search-result pages requested by start_requests.
    pages = 1
    # Divisor applied to prices whose currency symbol is not "UF" so that
    # every price ends up expressed in UF.
    uf_rate = 28500
    # Spec label as shown on the page -> df3 column that stores its value.
    spec_columns = {
        "Superficie total": "Total Surface",
        "Superficie útil": "Useful Surface",
        "Dormitorios": "Bedroom",
        "Baños": "Restroom",
        "Estacionamientos": "Parking",
        "Bodegas": "Storage",
    }

    def start_requests(self):
        """Yield one request per search-result page."""
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
        for page_index in range(self.pages):
            # Offsets run 0, 51, 101, ...
            # NOTE(review): the previous counter produced 0, 101, 151 and
            # skipped 51 -- confirm against the site's pagination.
            offset = 0 if page_index == 0 else 1 + 50 * page_index
            url = 'http://portalinmobiliario.com/venta/departamento/propiedades-usadas/providencia-metropolitana/_Desde_' + str(offset)
            print("----------PRUEBA1--------------" + str(offset))
            yield scrapy.Request(url=url, callback=self.parse, headers=headers)

    def parse(self, response):
        """Follow every listing link found on a search-result page."""
        global aux3
        links_busqueda = response.css('ol.ui-search-layout > li.ui-search-layout__item a.ui-search-result__content.ui-search-link::attr(href)').getall()
        print(len(links_busqueda))
        for url in links_busqueda:
            aux3 += 1
            print(aux3)
            # Redirects are not followed; a 302 response is handed to the
            # callback as-is instead of being filtered out.
            yield response.follow(url=url, callback=self.parse_propiedad, meta={'dont_redirect': True, 'handle_httpstatus_list': [302]})

    def parse_propiedad(self, response):
        """Extract the fields of one listing page and append them to ``df3``.

        Listings without a readable price are skipped with a warning instead
        of raising, so one bad page does not abort the row collection.
        """
        import re  # local import so the class does not depend on a file-level import

        global aux2, df3
        aux2 += 1

        # Fresh defaults for every listing: a spec missing on this page must
        # not inherit the value scraped from the previous listing. 0 matches
        # the initial value the module-level variables used to carry.
        specs = {column: 0 for column in self.spec_columns.values()}
        # Walk the spec items themselves rather than probing nth-child(0..19),
        # so index 0 (which never matches) is not queried and longer lists work.
        for item in response.css('section.specs-container > ul.specs-list li.specs-item'):
            label = item.xpath('./strong/text()').get()
            column = self.spec_columns.get(label)
            if column is not None:
                specs[column] = item.xpath('./span/text()').getall()

        crs_location = response.css('div.map-container img:nth-child(2)').getall()
        if not crs_location:
            # The second <img> is not matched in the HTML the spider receives;
            # the coordinates are still present in the page text as
            # "center=<lat>%2C<lon>", so read them from there.
            match = re.search(r'center=(-?\d+\.\d+)%2C(-?\d+\.\d+)', response.text)
            if match:
                crs_location = list(match.groups())
        print("\n\n\n")
        print(crs_location)
        print("\n\n\n")

        # As we have two kinds of currency, we transform everything to UF.
        price_tag = 'fieldset.item-price span.price-tag > span.'
        symbols = response.css(price_tag + 'price-tag-symbol::text').getall()
        fractions = response.css(price_tag + 'price-tag-fraction::text').getall()
        if not symbols:
            self.logger.warning("No price symbol on %s; listing skipped", response.url)
            return
        if symbols[0] != "UF":
            if not fractions:
                self.logger.warning("No price amount on %s; listing skipped", response.url)
                return
            crs_Currency = "$"
            # "." is the thousands separator in the scraped amount.
            crs_Price = round(float(fractions[0].replace(".", "")) / self.uf_rate, 0)
        else:
            crs_Currency = symbols
            crs_Price = fractions

        # Values stay wrapped in one-element lists, as they were when the row
        # was added with DataFrame.append, so the exported cells keep their shape.
        df2 = {'Link': [response.url],
               'Currency': [crs_Currency],
               'Price': [crs_Price],
               'Total Surface': [specs['Total Surface']],
               'Useful Surface': [specs['Useful Surface']],
               'Location': [crs_location],
               'Bedroom': [specs['Bedroom']],
               'Restroom': [specs['Restroom']],
               'Parking': [specs['Parking']],
               'Storage': [specs['Storage']]}
        # DataFrame.append was removed in pandas 2.0; concat is its replacement.
        df3 = pd.concat([df3, pd.DataFrame([df2])], ignore_index=True)
#print(df3.head())
# Names of the listing specs taken into consideration
Nombre_variables = ["Superficie total", "Superficie útil", "Dormitorios", "Baños", "Estacionamientos", "Bodegas"]
Dict_Nombre_variables = {}
# Initialize the DataFrames; both share the same column layout
headers = ["Link", "Currency", "Price", "Total Surface", "Useful Surface", "Location", "Bedroom", "Restroom", "Parking", "Storage"]
df_data = pd.DataFrame(columns=headers)
df3 = pd.DataFrame(columns=headers)
# Initialize global variables used in the spider methods
aux2 = 0
crs_Link = 0
crs_Currency = 0
crs_Price = 0
crs_Total_Surface = 0
crs_Useful_Surface = 0
crs_location = 0
crs_Bedroom = 0
crs_Restroom = 0
crs_Parking = 0
crs_Storage = 0
aux3 = 0
# Run the Spider
process = CrawlerProcess({'USER_AGENT': 'hol'})
process.crawl(Spider_Inmob)
process.start()
# Raw string: in a normal literal "\0", "\7" and "\1" are octal escapes
# (NUL, BEL, 0x01), which silently corrupts this Windows path.
path = r"D:\0. Documentos\7. DataCamp\1. WebScraping\99. Ejemplos\PortalInmob.csv"
df3.to_csv(path)
print(df3.head())
print(df3)
print(df3['Location'])
使用 requests 和正则表达式就很简单,因为我们知道它是页面上唯一的 lat/lon,并且我们知道 url 的格式。我们可以用正则表达式捕获 url 中的 lat/lon 部分,然后将其拆分开。
import requests
import re
# Fetch one listing page and read the map coordinates out of its source.
url = 'https://www.portalinmobiliario.com/MLC-564988630-estilo-mariposa-_JM#position=2&type=item&tracking_id=ed337e69-9999-4ede-b393-ef378e1a5675'
# requests waits indefinitely unless a timeout is given.
r = requests.get(url, timeout=30).text
# The coordinates appear in the page as "center=<lat>%2C<lon>".
match = re.search(r'center=(-?\d+\.\d+)%2C(-?\d+\.\d+)', r)
if match is None:
    # Same exception type the bare [0] lookup raised, now with an explanation.
    raise IndexError('no "center=<lat>%2C<lon>" marker found in ' + url)
lat, lon = match.groups()
我是 Stack Overflow 的新手,这是我的第一个帖子,希望我能把问题解释清楚,也希望你们能帮到我!在此先感谢您的帮助!! 我正在使用 Scrapy 从我的祖国抓取一个流行的房地产网站。我想要的其他特征(例如价格、面积、卧室数等)都能顺利抓取,但是我没能得到房产的 latitude/longitude。在网站中,例如 https://www.portalinmobiliario.com/MLC-564988630-estilo-mariposa-_JM#position=2&type=item&tracking_id=ed337e69-9999-4ede-b393-ef378e1a5675 ,你可以找到如图所示的 Google 地图位置;在这个 HTML 元素里面可以得到 lat/long(以蓝色突出显示)。但是当我尝试在我的代码中访问此元素时,蜘蛛无法识别它。
使用此 css 选择器 crs_location = response.css('div.map-container img:nth-child(1)').getall()
我能够在 div 中获取第一个 img,得到以下输出 https://http2.mlstatic.com/resources/frontend/web-vip/ui-dist/images/pin-real-estate-d1ebb73e65.svg
,但是当我更改 nth-child 到: crs_location = response.css('div.map-container img:nth-child(2)').getall()
得到第二个 child (我想要的), crs_location 变量结果为空。
如果您能帮助弄清楚如何获得此 lat/long,我将不胜感激。 谢谢!
HTML elements
完整代码:
import scrapy
from scrapy import Selector
import requests
import pandas as pd
import numpy as np
# Import the CrawlerProcess
from scrapy.crawler import CrawlerProcess
# Create the Spider class
class Spider_Inmob(scrapy.Spider):
    """Crawl portalinmobiliario.com search results and record one row per listing.

    Each listing page handled by ``parse_propiedad`` is appended as a row to
    the module-level ``df3`` DataFrame, which the surrounding script exports
    once the crawl has finished.
    """

    name = 'spider_inmob'
    #download_delay = 3

    # Number of search-result pages requested by start_requests.
    pages = 1
    # Divisor applied to prices whose currency symbol is not "UF" so that
    # every price ends up expressed in UF.
    uf_rate = 28500
    # Spec label as shown on the page -> df3 column that stores its value.
    spec_columns = {
        "Superficie total": "Total Surface",
        "Superficie útil": "Useful Surface",
        "Dormitorios": "Bedroom",
        "Baños": "Restroom",
        "Estacionamientos": "Parking",
        "Bodegas": "Storage",
    }

    def start_requests(self):
        """Yield one request per search-result page."""
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
        for page_index in range(self.pages):
            # Offsets run 0, 51, 101, ...
            # NOTE(review): the previous counter produced 0, 101, 151 and
            # skipped 51 -- confirm against the site's pagination.
            offset = 0 if page_index == 0 else 1 + 50 * page_index
            url = 'http://portalinmobiliario.com/venta/departamento/propiedades-usadas/providencia-metropolitana/_Desde_' + str(offset)
            print("----------PRUEBA1--------------" + str(offset))
            yield scrapy.Request(url=url, callback=self.parse, headers=headers)

    def parse(self, response):
        """Follow every listing link found on a search-result page."""
        global aux3
        links_busqueda = response.css('ol.ui-search-layout > li.ui-search-layout__item a.ui-search-result__content.ui-search-link::attr(href)').getall()
        print(len(links_busqueda))
        for url in links_busqueda:
            aux3 += 1
            print(aux3)
            # Redirects are not followed; a 302 response is handed to the
            # callback as-is instead of being filtered out.
            yield response.follow(url=url, callback=self.parse_propiedad, meta={'dont_redirect': True, 'handle_httpstatus_list': [302]})

    def parse_propiedad(self, response):
        """Extract the fields of one listing page and append them to ``df3``.

        Listings without a readable price are skipped with a warning instead
        of raising, so one bad page does not abort the row collection.
        """
        import re  # local import so the class does not depend on a file-level import

        global aux2, df3
        aux2 += 1

        # Fresh defaults for every listing: a spec missing on this page must
        # not inherit the value scraped from the previous listing. 0 matches
        # the initial value the module-level variables used to carry.
        specs = {column: 0 for column in self.spec_columns.values()}
        # Walk the spec items themselves rather than probing nth-child(0..19),
        # so index 0 (which never matches) is not queried and longer lists work.
        for item in response.css('section.specs-container > ul.specs-list li.specs-item'):
            label = item.xpath('./strong/text()').get()
            column = self.spec_columns.get(label)
            if column is not None:
                specs[column] = item.xpath('./span/text()').getall()

        crs_location = response.css('div.map-container img:nth-child(2)').getall()
        if not crs_location:
            # The second <img> is not matched in the HTML the spider receives;
            # the coordinates are still present in the page text as
            # "center=<lat>%2C<lon>", so read them from there.
            match = re.search(r'center=(-?\d+\.\d+)%2C(-?\d+\.\d+)', response.text)
            if match:
                crs_location = list(match.groups())
        print("\n\n\n")
        print(crs_location)
        print("\n\n\n")

        # As we have two kinds of currency, we transform everything to UF.
        price_tag = 'fieldset.item-price span.price-tag > span.'
        symbols = response.css(price_tag + 'price-tag-symbol::text').getall()
        fractions = response.css(price_tag + 'price-tag-fraction::text').getall()
        if not symbols:
            self.logger.warning("No price symbol on %s; listing skipped", response.url)
            return
        if symbols[0] != "UF":
            if not fractions:
                self.logger.warning("No price amount on %s; listing skipped", response.url)
                return
            crs_Currency = "$"
            # "." is the thousands separator in the scraped amount.
            crs_Price = round(float(fractions[0].replace(".", "")) / self.uf_rate, 0)
        else:
            crs_Currency = symbols
            crs_Price = fractions

        # Values stay wrapped in one-element lists, as they were when the row
        # was added with DataFrame.append, so the exported cells keep their shape.
        df2 = {'Link': [response.url],
               'Currency': [crs_Currency],
               'Price': [crs_Price],
               'Total Surface': [specs['Total Surface']],
               'Useful Surface': [specs['Useful Surface']],
               'Location': [crs_location],
               'Bedroom': [specs['Bedroom']],
               'Restroom': [specs['Restroom']],
               'Parking': [specs['Parking']],
               'Storage': [specs['Storage']]}
        # DataFrame.append was removed in pandas 2.0; concat is its replacement.
        df3 = pd.concat([df3, pd.DataFrame([df2])], ignore_index=True)
#print(df3.head())
# Names of the listing specs taken into consideration
Nombre_variables = ["Superficie total", "Superficie útil", "Dormitorios", "Baños", "Estacionamientos", "Bodegas"]
Dict_Nombre_variables = {}
# Initialize the DataFrames; both share the same column layout
headers = ["Link", "Currency", "Price", "Total Surface", "Useful Surface", "Location", "Bedroom", "Restroom", "Parking", "Storage"]
df_data = pd.DataFrame(columns=headers)
df3 = pd.DataFrame(columns=headers)
# Initialize global variables used in the spider methods
aux2 = 0
crs_Link = 0
crs_Currency = 0
crs_Price = 0
crs_Total_Surface = 0
crs_Useful_Surface = 0
crs_location = 0
crs_Bedroom = 0
crs_Restroom = 0
crs_Parking = 0
crs_Storage = 0
aux3 = 0
# Run the Spider
process = CrawlerProcess({'USER_AGENT': 'hol'})
process.crawl(Spider_Inmob)
process.start()
# Raw string: in a normal literal "\0", "\7" and "\1" are octal escapes
# (NUL, BEL, 0x01), which silently corrupts this Windows path.
path = r"D:\0. Documentos\7. DataCamp\1. WebScraping\99. Ejemplos\PortalInmob.csv"
df3.to_csv(path)
print(df3.head())
print(df3)
print(df3['Location'])
使用 requests 和正则表达式就很简单,因为我们知道它是页面上唯一的 lat/lon,并且我们知道 url 的格式。我们可以用正则表达式捕获 url 中的 lat/lon 部分,然后将其拆分开。
import requests
import re
# Fetch one listing page and read the map coordinates out of its source.
url = 'https://www.portalinmobiliario.com/MLC-564988630-estilo-mariposa-_JM#position=2&type=item&tracking_id=ed337e69-9999-4ede-b393-ef378e1a5675'
# requests waits indefinitely unless a timeout is given.
r = requests.get(url, timeout=30).text
# The coordinates appear in the page as "center=<lat>%2C<lon>".
match = re.search(r'center=(-?\d+\.\d+)%2C(-?\d+\.\d+)', r)
if match is None:
    # Same exception type the bare [0] lookup raised, now with an explanation.
    raise IndexError('no "center=<lat>%2C<lon>" marker found in ' + url)
lat, lon = match.groups()