Scrapy 不打印任何东西
Scrapy doesn't print anything
有人可以告诉我我的代码中有什么错误吗?
我在 cmd 中写了“scrapy crawl provincia -o table_data_results.csv”,但 excel 是空的。我认为它没有抓取任何东西。
from scrapy import Spider
from scrapy.http import FormRequest
class ProvinciaSpider(Spider):
    """Spider that submits the manifest query form and reads the result table.

    ``parse`` posts a fixed set of form fields; ``parse_form_page`` walks the
    rows of the table selected from the reply and yields one dict per row.
    """

    name = 'provincia'
    allowed_domains = ['aduanet.gob.pe']
    start_urls = ['http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias?accion=cargaConsultaManifiesto&tipoConsulta=salidaProvincia']

    def parse(self, response):
        """POST the query form and route the reply to ``parse_form_page``."""
        form_url = 'http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias'
        payload = {
            'accion': 'consultaManifExpProvincia',
            'salidaPro': 'YES',
            'strMenu': '-',
            'strEmpTransTerrestre': '-',
            'CMc1_Anno': '2022',
            'CMc1_Numero': '96',
            'CG_cadu': '046',
            'viat': '1',
        }
        yield FormRequest(form_url, formdata=payload, callback=self.parse_form_page)

    def parse_form_page(self, response):
        """Yield one item for every row after the first of the selected table."""
        # Output field -> XPath of the cell text, relative to the row.
        # NOTE(review): XPath positions are 1-based, so `td[0]` selects nothing.
        cell_paths = (
            ('puerto_llegada', './/td[0]/text()'),
            ('pais', './/td[0]/text()'),
            ('bl', './/td[2]/text()'),
            ('peso', './/td[7]/text()'),
            ('bultos', './/td[8]/text()'),
            ('consignatario', './/td[11]/text()'),
            ('embarcador', './/td[12]/text()'),
        )
        result_table = response.xpath('/html/body/form[1]/table[5]/tbody/tr/td/table/tbody/tr[1]/td/table')
        # [1:] drops the first <tr> of the table.
        for row in result_table.xpath('.//tr')[1:]:
            yield {
                field: row.xpath(path).extract_first().strip()
                for field, path in cell_paths
            }
编辑:
如果我想把它放在我的代码中
links=tr.xpath('.//td[4]/text()')
# Follow the text of the 4th cell as a URL; the reply is handled by parse_categories.
yield response.follow(links.get(), callback= self.parse_categories)
def parse_categories(self, response):
# Table nested in the td with class "beta" inside a form on the page.
tabla_des= response.xpath('/html/body/form//td[@class="beta"]/table')
# [1:] skips the first row of the table.
trs3= tabla_des.xpath('.//tr')[1:]
for tr3 in trs3:
# NOTE(review): this reads from `tr`, which is not defined in this method; the loop variable is `tr3` - confirm which is intended.
# NOTE(review): `descripcion` is assigned but never yielded or returned in this snippet.
descripcion= tr.xpath('.//td[7]/text()').extract_first().strip()
在 yield 部分,我希望它是这样的:
yield {'puerto_llegada': puerto_llegada,
'pais': pais,
'bl': bl,
# peso / bultos: remove the ',' characters from the text, then convert to float.
'peso': float("".join(peso.split(','))),
'bultos': float("".join(bultos.split(','))),
'consignatario': consignatario,
'embarcador': embarcador,
'descripcion': descripcion}
我应该把它放在哪里?
我发现了两个问题:
1. 您的 xpath 找不到 table —— print(len(table)) 显示 0,所以我使用了不同的 xpath:
'/html/body/form[1]//td[@class="beta"]/table'
2. xpath 的索引从 1 开始,但您使用了 td[0],所以我改用 td[1],并相应更改了其他 td 的索引。
from scrapy import Spider
from scrapy.http import FormRequest
class ProvinciaSpider(Spider):
    """Spider that posts the manifest query form and scrapes the result table.

    ``parse`` receives the start page and submits the form with a fixed set of
    fields; ``parse_form_page`` yields one dict per table row that has cells.
    """

    name = 'provincia'
    allowed_domains = ['aduanet.gob.pe']
    start_urls = ['http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias?accion=cargaConsultaManifiesto&tipoConsulta=salidaProvincia']

    def parse(self, response):
        """Submit the query form; the reply is handled by ``parse_form_page``."""
        data = {
            'accion': 'consultaManifExpProvincia',
            'salidaPro': 'YES',
            'strMenu': '-',
            'strEmpTransTerrestre': '-',
            'CMc1_Anno': '2022',
            'CMc1_Numero': '96',
            'CG_cadu': '046',
            'viat': '1',
        }
        yield FormRequest('http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias', formdata=data, callback=self.parse_form_page)

    @staticmethod
    def _cell_text(tr, position):
        """Return the stripped first text node of ``td[position]`` in ``tr``.

        ``position`` is 1-based, as in XPath. A cell with no text node gives
        ``''`` instead of raising ``AttributeError`` on ``None.strip()``.
        """
        return tr.xpath('.//td[%d]/text()' % position).extract_first(default='').strip()

    def parse_form_page(self, response):
        """Yield one item per row of the result table, skipping the first row.

        Rows without any ``td`` are reported and skipped. The ``print`` calls
        are the diagnostics used to check that the selectors match anything.
        """
        table = response.xpath('/html/body/form[1]//td[@class="beta"]/table')
        print('table:', len(table))

        trs = table.xpath('.//tr')[1:]
        print('trs:', len(trs))

        for tr in trs:
            tds = tr.xpath('.//td')
            print('tds:', len(tds))

            if not tds:
                print('empty row')
                continue

            yield {
                'puerto_llegada': self._cell_text(tr, 1),
                # NOTE(review): `pais` reads the same cell as `puerto_llegada`,
                # so both fields always carry the same value - confirm the
                # intended column.
                'pais': self._cell_text(tr, 1),
                'bl': self._cell_text(tr, 3),
                'peso': self._cell_text(tr, 8),
                'bultos': self._cell_text(tr, 9),
                'consignatario': self._cell_text(tr, 12),
                'embarcador': self._cell_text(tr, 13),
            }
# --- standalone runner: no Scrapy project required; items are exported to `output.csv` ---
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess(settings={
    # Feed export: write the scraped items as CSV (the FEEDS setting is new in 2.1).
    'FEEDS': {'output.csv': {'format': 'csv'}},
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
})
# Schedule the spider, then run the crawl.
c.crawl(ProvinciaSpider)
c.start()
结果:
puerto_llegada,pais,bl,peso,bultos,consignatario,embarcador
JPYOK,JPYOK,MAEU1KT407500,"21,320.00",709,"HOWA SHOJI CO., LTD",GEALE AGROTRADING E.I.R.L.
BEANR,BEANR,MAEU216307459,"19,980.00",285,"Greencof B.V.,",COOPERATIVA AGRARIA RODRIGUEZ DE MENDOZA
NLRTM,NLRTM,MAEU216473104,"83,890.00",5280,AGROFAIR BENELUX BV.,TULIPAN NARANJA S.A.C.
BEANR,BEANR,MAEU216473141,"23,710.00",1080,AGROFAIR BENELUX BV.,TULIPAN NARANJA S.A.C.
BEANR,BEANR,MAEU216473186,"47,420.00",2160,AGROFAIR BENELUX BV,COOPERATIVA AGRARIA APPBOSA
NLRTM,NLRTM,MAEU216473211,"22,520.00",1080,AgroFair Benelux BV,COOPERATIVA AGRARIA DE USUARIOS RIO Y VALLE
BEANR,BEANR,MAEU216632137,"22,270.00",1080,FYFFES INTERNATIONAL,AGRO PACHA S.A.
KRPUS,KRPUS,MAEU913722041,"24,480.00",1175,TO THE ORDER,PERUPEZ S.A.C.
ITCVV,ITCVV,MAEU913779677,"66,950.00",3240,BATTAGLIO SPA,IREN PERU SOCIEDAD ANONIMA CERRADA - IREN PERU S.A
NLRTM,NLRTM,MAEU913798070,"24,700.00",5544,FRUTOS TROPICALES EUROPE B.V.,FRUTOS TROPICALES PERU EXPORT SOCIEDAD ANONIMA CER
有人可以告诉我我的代码中有什么错误吗?
我在 cmd 中写了“scrapy crawl provincia -o table_data_results.csv”,但 excel 是空的。我认为它没有抓取任何东西。
from scrapy import Spider
from scrapy.http import FormRequest
class ProvinciaSpider(Spider):
    """Spider that submits the manifest query form and reads the result table.

    ``parse`` posts a fixed set of form fields; ``parse_form_page`` walks the
    rows of the table selected from the reply and yields one dict per row.
    """

    name = 'provincia'
    allowed_domains = ['aduanet.gob.pe']
    start_urls = ['http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias?accion=cargaConsultaManifiesto&tipoConsulta=salidaProvincia']

    def parse(self, response):
        """POST the query form and route the reply to ``parse_form_page``."""
        form_url = 'http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias'
        payload = {
            'accion': 'consultaManifExpProvincia',
            'salidaPro': 'YES',
            'strMenu': '-',
            'strEmpTransTerrestre': '-',
            'CMc1_Anno': '2022',
            'CMc1_Numero': '96',
            'CG_cadu': '046',
            'viat': '1',
        }
        yield FormRequest(form_url, formdata=payload, callback=self.parse_form_page)

    def parse_form_page(self, response):
        """Yield one item for every row after the first of the selected table."""
        # Output field -> XPath of the cell text, relative to the row.
        # NOTE(review): XPath positions are 1-based, so `td[0]` selects nothing.
        cell_paths = (
            ('puerto_llegada', './/td[0]/text()'),
            ('pais', './/td[0]/text()'),
            ('bl', './/td[2]/text()'),
            ('peso', './/td[7]/text()'),
            ('bultos', './/td[8]/text()'),
            ('consignatario', './/td[11]/text()'),
            ('embarcador', './/td[12]/text()'),
        )
        result_table = response.xpath('/html/body/form[1]/table[5]/tbody/tr/td/table/tbody/tr[1]/td/table')
        # [1:] drops the first <tr> of the table.
        for row in result_table.xpath('.//tr')[1:]:
            yield {
                field: row.xpath(path).extract_first().strip()
                for field, path in cell_paths
            }
编辑: 如果我想把它放在我的代码中
links=tr.xpath('.//td[4]/text()')
# Follow the text of the 4th cell as a URL; the reply is handled by parse_categories.
yield response.follow(links.get(), callback= self.parse_categories)
def parse_categories(self, response):
# Table nested in the td with class "beta" inside a form on the page.
tabla_des= response.xpath('/html/body/form//td[@class="beta"]/table')
# [1:] skips the first row of the table.
trs3= tabla_des.xpath('.//tr')[1:]
for tr3 in trs3:
# NOTE(review): this reads from `tr`, which is not defined in this method; the loop variable is `tr3` - confirm which is intended.
# NOTE(review): `descripcion` is assigned but never yielded or returned in this snippet.
descripcion= tr.xpath('.//td[7]/text()').extract_first().strip()
在 yield 部分,我希望它是这样的:
yield {'puerto_llegada': puerto_llegada,
'pais': pais,
'bl': bl,
# peso / bultos: remove the ',' characters from the text, then convert to float.
'peso': float("".join(peso.split(','))),
'bultos': float("".join(bultos.split(','))),
'consignatario': consignatario,
'embarcador': embarcador,
'descripcion': descripcion}
我应该把它放在哪里?
我发现了两个问题:
1. 您的 xpath 找不到 table —— print(len(table)) 显示 0,所以我使用了不同的 xpath:
'/html/body/form[1]//td[@class="beta"]/table'
2. xpath 的索引从 1 开始,但您使用了 td[0],所以我改用 td[1],并相应更改了其他 td 的索引。
from scrapy import Spider
from scrapy.http import FormRequest
class ProvinciaSpider(Spider):
    """Spider that posts the manifest query form and scrapes the result table.

    ``parse`` receives the start page and submits the form with a fixed set of
    fields; ``parse_form_page`` yields one dict per table row that has cells.
    """

    name = 'provincia'
    allowed_domains = ['aduanet.gob.pe']
    start_urls = ['http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias?accion=cargaConsultaManifiesto&tipoConsulta=salidaProvincia']

    def parse(self, response):
        """Submit the query form; the reply is handled by ``parse_form_page``."""
        data = {
            'accion': 'consultaManifExpProvincia',
            'salidaPro': 'YES',
            'strMenu': '-',
            'strEmpTransTerrestre': '-',
            'CMc1_Anno': '2022',
            'CMc1_Numero': '96',
            'CG_cadu': '046',
            'viat': '1',
        }
        yield FormRequest('http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias', formdata=data, callback=self.parse_form_page)

    @staticmethod
    def _cell_text(tr, position):
        """Return the stripped first text node of ``td[position]`` in ``tr``.

        ``position`` is 1-based, as in XPath. A cell with no text node gives
        ``''`` instead of raising ``AttributeError`` on ``None.strip()``.
        """
        return tr.xpath('.//td[%d]/text()' % position).extract_first(default='').strip()

    def parse_form_page(self, response):
        """Yield one item per row of the result table, skipping the first row.

        Rows without any ``td`` are reported and skipped. The ``print`` calls
        are the diagnostics used to check that the selectors match anything.
        """
        table = response.xpath('/html/body/form[1]//td[@class="beta"]/table')
        print('table:', len(table))

        trs = table.xpath('.//tr')[1:]
        print('trs:', len(trs))

        for tr in trs:
            tds = tr.xpath('.//td')
            print('tds:', len(tds))

            if not tds:
                print('empty row')
                continue

            yield {
                'puerto_llegada': self._cell_text(tr, 1),
                # NOTE(review): `pais` reads the same cell as `puerto_llegada`,
                # so both fields always carry the same value - confirm the
                # intended column.
                'pais': self._cell_text(tr, 1),
                'bl': self._cell_text(tr, 3),
                'peso': self._cell_text(tr, 8),
                'bultos': self._cell_text(tr, 9),
                'consignatario': self._cell_text(tr, 12),
                'embarcador': self._cell_text(tr, 13),
            }
# --- standalone runner: no Scrapy project required; items are exported to `output.csv` ---
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess(settings={
    # Feed export: write the scraped items as CSV (the FEEDS setting is new in 2.1).
    'FEEDS': {'output.csv': {'format': 'csv'}},
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
})
# Schedule the spider, then run the crawl.
c.crawl(ProvinciaSpider)
c.start()
结果:
puerto_llegada,pais,bl,peso,bultos,consignatario,embarcador
JPYOK,JPYOK,MAEU1KT407500,"21,320.00",709,"HOWA SHOJI CO., LTD",GEALE AGROTRADING E.I.R.L.
BEANR,BEANR,MAEU216307459,"19,980.00",285,"Greencof B.V.,",COOPERATIVA AGRARIA RODRIGUEZ DE MENDOZA
NLRTM,NLRTM,MAEU216473104,"83,890.00",5280,AGROFAIR BENELUX BV.,TULIPAN NARANJA S.A.C.
BEANR,BEANR,MAEU216473141,"23,710.00",1080,AGROFAIR BENELUX BV.,TULIPAN NARANJA S.A.C.
BEANR,BEANR,MAEU216473186,"47,420.00",2160,AGROFAIR BENELUX BV,COOPERATIVA AGRARIA APPBOSA
NLRTM,NLRTM,MAEU216473211,"22,520.00",1080,AgroFair Benelux BV,COOPERATIVA AGRARIA DE USUARIOS RIO Y VALLE
BEANR,BEANR,MAEU216632137,"22,270.00",1080,FYFFES INTERNATIONAL,AGRO PACHA S.A.
KRPUS,KRPUS,MAEU913722041,"24,480.00",1175,TO THE ORDER,PERUPEZ S.A.C.
ITCVV,ITCVV,MAEU913779677,"66,950.00",3240,BATTAGLIO SPA,IREN PERU SOCIEDAD ANONIMA CERRADA - IREN PERU S.A
NLRTM,NLRTM,MAEU913798070,"24,700.00",5544,FRUTOS TROPICALES EUROPE B.V.,FRUTOS TROPICALES PERU EXPORT SOCIEDAD ANONIMA CER