抓取网站中的表格,但得到的输出是错误的
Scraping a website table, but the output is wrong
我正在尝试提取页面中的 table。代码能够产生输出,但输出的内容是错误的。这是页面链接:
https://hoopshype.com/salaries/players/
from scrapy.http import Request
import scrapy
class PushpaSpider(scrapy.Spider):
    """Spider that reads the salary table at hoopshype.com/salaries/players/.

    This is the version posted in the question. Its flaw is kept on purpose
    (see the NOTE in ``parse``) because it is what the question asks about.
    """

    name = 'pushpa'
    page_number = 1
    start_urls = ['https://hoopshype.com/salaries/players/']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }

    def parse(self, response):
        """Yield one dict per table body row, keyed by the header cell texts."""
        header_rows = response.xpath("//table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']//thead//tr")
        # Strip every header cell and drop the ones that end up empty.
        stripped_cells = (text.strip() for text in header_rows.xpath(".//td/text()").getall())
        headers = [cell for cell in stripped_cells if cell]

        body_rows = response.xpath("//table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']//tbody//tr")
        for body_row in body_rows:
            # NOTE: joining the cell texts collapses the whole row into ONE
            # string, so zip() below pairs each header with a single
            # character of that string instead of with a whole cell value.
            row_text = ''.join(body_row.xpath('td//text()').getall()).strip()
            yield dict(zip(headers, row_text))
试试这个:
from scrapy.http import Request
import scrapy
class PushpaSpider(scrapy.Spider):
    """Spider that extracts the salary table at hoopshype.com/salaries/players/.

    Each yielded item is a dict mapping a header cell text to the matching
    value of one body row: the player name first, then the ``data-value``
    attributes of the row's cells.
    """

    name = 'pushpa'
    page_number = 1
    start_urls = ['https://hoopshype.com/salaries/players/']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }

    def parse(self, response):
        """Yield one dict per table body row that has a player name link.

        Args:
            response: the downloaded page; only its ``xpath`` method is used.

        Yields:
            dict: header text -> value. Assumes the non-empty header cells
            line up with the name followed by the ``data-value`` cells --
            TODO confirm against the live page.
        """
        rows = response.xpath("//table/thead/tr")
        keys = [i.strip() for i in rows.xpath(".//td/text()").getall()]
        # Drop header cells that are empty after stripping.
        keys = [i for i in keys if i]

        for column in response.xpath("//table/tbody/tr"):
            player_name = column.xpath('td[@class="name"]/a/text()').get()
            if player_name is None:
                # .get() returns None when the row has no name link; skip the
                # row rather than crash on None.strip().
                continue
            # data-value holds the cell values as attributes, which avoids
            # having to clean up the text() nodes.
            detail = column.xpath('td/@data-value').getall()
            yield dict(zip(keys, [player_name.strip()] + detail))
`data-value` 属性中同样保存了各单元格的数值,我们可以直接使用它,因为我在用 text() 提取文本时遇到了问题。
最后,我认为您不需要指定 table 的 class 名称(table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']),因为该页面只有一个 table。
我正在尝试提取页面中的 table。代码能够产生输出,但输出的内容是错误的。这是页面链接: https://hoopshype.com/salaries/players/
from scrapy.http import Request
import scrapy
class PushpaSpider(scrapy.Spider):
    """Spider that reads the salary table at hoopshype.com/salaries/players/.

    This is the version posted in the question. Its flaw is kept on purpose
    (see the NOTE in ``parse``) because it is what the question asks about.
    """

    name = 'pushpa'
    page_number = 1
    start_urls = ['https://hoopshype.com/salaries/players/']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }

    def parse(self, response):
        """Yield one dict per table body row, keyed by the header cell texts."""
        header_rows = response.xpath("//table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']//thead//tr")
        # Strip every header cell and drop the ones that end up empty.
        stripped_cells = (text.strip() for text in header_rows.xpath(".//td/text()").getall())
        headers = [cell for cell in stripped_cells if cell]

        body_rows = response.xpath("//table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']//tbody//tr")
        for body_row in body_rows:
            # NOTE: joining the cell texts collapses the whole row into ONE
            # string, so zip() below pairs each header with a single
            # character of that string instead of with a whole cell value.
            row_text = ''.join(body_row.xpath('td//text()').getall()).strip()
            yield dict(zip(headers, row_text))
试试这个:
from scrapy.http import Request
import scrapy
class PushpaSpider(scrapy.Spider):
    """Spider that extracts the salary table at hoopshype.com/salaries/players/.

    Each yielded item is a dict mapping a header cell text to the matching
    value of one body row: the player name first, then the ``data-value``
    attributes of the row's cells.
    """

    name = 'pushpa'
    page_number = 1
    start_urls = ['https://hoopshype.com/salaries/players/']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }

    def parse(self, response):
        """Yield one dict per table body row that has a player name link.

        Args:
            response: the downloaded page; only its ``xpath`` method is used.

        Yields:
            dict: header text -> value. Assumes the non-empty header cells
            line up with the name followed by the ``data-value`` cells --
            TODO confirm against the live page.
        """
        rows = response.xpath("//table/thead/tr")
        keys = [i.strip() for i in rows.xpath(".//td/text()").getall()]
        # Drop header cells that are empty after stripping.
        keys = [i for i in keys if i]

        for column in response.xpath("//table/tbody/tr"):
            player_name = column.xpath('td[@class="name"]/a/text()').get()
            if player_name is None:
                # .get() returns None when the row has no name link; skip the
                # row rather than crash on None.strip().
                continue
            # data-value holds the cell values as attributes, which avoids
            # having to clean up the text() nodes.
            detail = column.xpath('td/@data-value').getall()
            yield dict(zip(keys, [player_name.strip()] + detail))
`data-value` 属性中同样保存了各单元格的数值,我们可以直接使用它,因为我在用 text() 提取文本时遇到了问题。
最后,我认为您不需要指定 table 的 class 名称(table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']),因为该页面只有一个 table。