链接不是 URL 格式，无法直接抓取

Question

这是我的代码：


import scrapy
from scrapy import Spider
from scrapy.http import FormRequest

class ProvinciaSpider(Spider):
    """Query the Aduanet manifest form, then try to follow each row's detail link.

    ``parse`` posts the search form, ``parse_form_page`` reads the listing
    table and follows the link found in the fourth column of every row, and
    ``parse_categories`` combines the listing fields with the description
    found on the detail page.
    """

    name = 'provincia'
    allowed_domains = ['aduanet.gob.pe']
    start_urls = ['http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias?accion=cargaConsultaManifiesto&tipoConsulta=salidaProvincia']

    def parse(self, response):
        """Post the manifest search form and route the answer to ``parse_form_page``."""
        data = {
            'accion': 'consultaManifExpProvincia',
            'salidaPro': 'YES',
            'strMenu': '-',
            'strEmpTransTerrestre': '-',
            'CMc1_Anno': '2022',
            'CMc1_Numero': '96',
            'CG_cadu': '046',
            'viat': '1',
        }

        yield FormRequest('http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias', formdata=data, callback=self.parse_form_page)

    def parse_form_page(self, response):
        """Read every data row of the listing table and follow its detail link.

        The listing fields travel to ``parse_categories`` through ``meta``.
        """
        table = response.xpath('/html/body/form[1]//td[@class="beta"]/table')
        trs = table.xpath('.//tr')[1:]  # drop the first row of the table
        for tr in trs:
            puerto_llegada = tr.xpath('.//td[1]/text()').extract_first().strip()
            # NOTE(review): reads the same column as `puerto_llegada` -- confirm
            # which column holds the country.
            pais = tr.xpath('.//td[1]/text()').extract_first().strip()
            bl = tr.xpath('.//td[3]/text()').extract_first().strip()
            peso = tr.xpath('.//td[8]/text()').extract_first().strip()
            bultos = tr.xpath('.//td[9]/text()').extract_first().strip()
            consignatario = tr.xpath('.//td[12]/text()').extract_first().strip()
            embarcador = tr.xpath('.//td[13]/text()').extract_first().strip()
            links = tr.xpath('.//td[4]/a/@href')

            # NOTE: on this page the href is a `javascript:jsDetalle2('...')`
            # pseudo-URL, not an absolute or relative URL, so this call is the
            # one that fails with "Missing scheme in request url".
            yield response.follow(links.get(),
                                  callback=self.parse_categories,
                                  meta={'puerto_llegada': puerto_llegada,
                                        'pais': pais,
                                        'bl': bl,
                                        # numbers use ',' as thousands separator
                                        'peso': float(peso.replace(',', '')),
                                        'bultos': float(bultos.replace(',', '')),
                                        'consignatario': consignatario,
                                        'embarcador': embarcador})

    def parse_categories(self, response):
        """Yield one item per detail-table row, merged with the listing fields from ``meta``."""
        puerto_llegada = response.meta['puerto_llegada']
        pais = response.meta['pais']
        bl = response.meta['bl']
        peso = response.meta['peso']
        bultos = response.meta['bultos']
        consignatario = response.meta['consignatario']
        embarcador = response.meta['embarcador']

        tabla_des = response.xpath('/html/body/form//td[@class="beta"]/table')
        trs3 = tabla_des.xpath('.//tr')[1:]  # drop the first row of the table
        for tr3 in trs3:
            # Read from the current row (`tr3`); `tr` does not exist in this method.
            descripcion = tr3.xpath('.//td[7]/text()').extract_first().strip()

            yield {'puerto_llegada': puerto_llegada,
                   'pais': pais,
                   'bl': bl,
                   'peso': peso,
                   'bultos': bultos,
                   'consignatario': consignatario,
                   'embarcador': embarcador,
                   'descripcion': descripcion}

我收到这个错误：

ValueError: Missing scheme in request url: javascript:jsDetalle2('154');

我想从中提取数据的每个 link 都具有该格式，因此我用于提取每个 link 中的数据的代码不起作用。

link 格式类似于 javascript:jsDetalle2('154')，只是数字有所变化。

问题在于它既不是 http://…… 形式，也不是 /manifiesto…… 形式。第一种情况只需直接跟随该链接即可；第二种情况需要把这个相对路径与第一个响应的 URL 拼接起来。但这里两种情况都不是，所以我不知道该如何让它工作。

如何编写才能工作？

Answer 1

我在浏览器中检查了这个链接：当我点击文本为 154 的链接时，浏览器会发送一个带有许多值的 POST 请求，其中之一是 'CMc2_NumDet': '154'——所以我可以从链接中取出这个数字，并在 POST 中使用它。

在浏览器中你会看到 'CMc2_Numero': "+++96"，但在代码中需要用空格代替 +，例如 "   96"（Scrapy 在编码时会把空格转换成 +）；或者也可以去掉所有 +，例如 "96"。

顺便说一句：我把所有值以 item: {...} 的形式放进 meta，这样之后只需一行 meta['item'] 就能取回所有值。

        number = tr.xpath('.//td[4]/a/text()').get()

        data = {
            'accion': "consultaManifExpProvinciaDetalle",
            'CMc2_Anno': "2022",
            'CMc2_Numero': "96",    # <--- without `+`
            'CG_cadu': "046",
            'CMc2_viatra': "1",
            'CMc2_numcon': "",
            'CMc2_NumDet': number,  # <---
            'tipo_archivo': "",
            'reporte': "ExpPro",
            'backPage': "ConsulManifExpPro",
        }

        yield FormRequest('http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias',
                          formdata=data,
                          callback=self.parse_categories,
                          meta={"item": {'puerto_llegada': puerto_llegada,
                                         'pais': pais,
                                         'bl': bl,
                                         'peso': float("".join(peso.split(','))),
                                         'bultos': float("".join(bultos.split(','))),
                                         'consignatario': consignatario,
                                         'embarcador': embarcador}})
    
def parse_categories(self, response):
    """Yield one item per row of the detail table.

    Each yielded dict holds the fields stored in ``response.meta['item']``
    plus a ``descripcion`` key read from the seventh cell of the row. A cell
    without text produces an empty ``descripcion``.
    """
    print('[parse_categories] url:', response.url)

    item = response.meta['item']

    tabla_des = response.xpath('/html/body/form//td[@class="beta"]/table')
    trs3 = tabla_des.xpath('.//tr')[1:]  # drop the first row of the table
    for tr3 in trs3:   # trs3[:1]: for single result
        descripcion = tr3.xpath('.//td[7]/text()').get(default='').strip()
        # Build a new dict per row: yielding the shared `item` after mutating
        # it would make every collected result show the last description.
        yield dict(item, descripcion=descripcion)

完整的工作代码。

包含类别的页面可能在 table 中有许多行（具有您不使用的不同 Peso Bruto），因此它可能在 CSV 中提供许多行。

如果您只需要一行，则使用 trs3[:1]: 而不是 trs3:

我使用不同的 xpath 来查找 table 和 "Descripcion" - 因为以前的版本没有检查 table 是否有 Descripcion 并且它可以得到 3 tables 而不是一个。

import scrapy
from scrapy import Spider
from scrapy.http import FormRequest

class ProvinciaSpider(Spider):
    """Scrape manifest rows from aduanet.gob.pe together with each row's description.

    Flow: ``parse`` posts the manifest search form, ``parse_form_page`` reads
    the listing table and posts one detail form per row (the page's links are
    ``javascript:`` pseudo-URLs, so they cannot be followed directly), and
    ``parse_categories`` yields one item per row of the detail table.
    """

    name = 'provincia'
    allowed_domains = ['aduanet.gob.pe']
    start_urls = ['http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias?accion=cargaConsultaManifiesto&tipoConsulta=salidaProvincia']

    # Endpoint that receives both the search form and the detail form.
    form_url = 'http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias'

    # Query values shared by the search payload and the detail payload. They
    # are class attributes so a subclass (or a spider argument of the same
    # name) can select another manifest without editing the payloads.
    manifest_year = '2022'
    manifest_number = '96'   # without `+` or use `space` instead of `+`
    cg_cadu = '046'          # sent as `CG_cadu`
    viat = '1'               # sent as `viat` / `CMc2_viatra`

    @staticmethod
    def _cell_text(row, index):
        """Return the stripped text of the ``index``-th ``td`` of ``row`` ('' when it has no text)."""
        return row.xpath('.//td[%d]/text()' % index).get(default='').strip()

    def parse(self, response):
        """Post the manifest search form and route the answer to ``parse_form_page``."""
        payload = {
            'accion': 'consultaManifExpProvincia',
            'salidaPro': 'YES',
            'strMenu': '-',
            'strEmpTransTerrestre': '-',
            'CMc1_Anno': self.manifest_year,
            'CMc1_Numero': self.manifest_number,
            'CG_cadu': self.cg_cadu,
            'viat': self.viat,
        }

        yield FormRequest(self.form_url,
                          formdata=payload,
                          callback=self.parse_form_page)

    def parse_form_page(self, response):
        """Build an item from every listing row and request that row's detail page.

        Rows whose fourth cell holds no link text are skipped, because the
        detail form cannot be posted without a detail number.
        """
        self.logger.debug('[parse_form_page] url: %s', response.url)

        table = response.xpath('/html/body/form[1]//td[@class="beta"]/table')
        for tr in table.xpath('.//tr')[1:]:  # drop the first row of the table
            # The link text is the number the detail form expects in `CMc2_NumDet`.
            number = tr.xpath('.//td[4]/a/text()').get(default='').strip()
            if not number:
                continue
            self.logger.debug('detail number: %s', number)

            item = {
                'puerto_llegada': self._cell_text(tr, 1),
                # NOTE(review): reads the same column as `puerto_llegada` --
                # confirm which column holds the country.
                'pais': self._cell_text(tr, 1),
                'bl': self._cell_text(tr, 3),
                # numbers use ',' as thousands separator
                'peso': self._cell_text(tr, 8).replace(',', ''),
                'bultos': self._cell_text(tr, 9).replace(',', ''),
                'consignatario': self._cell_text(tr, 12),
                'embarcador': self._cell_text(tr, 13),
            }

            payload = {
                'accion': "consultaManifExpProvinciaDetalle",
                'CMc2_Anno': self.manifest_year,
                'CMc2_Numero': self.manifest_number,
                'CG_cadu': self.cg_cadu,
                'CMc2_viatra': self.viat,
                'CMc2_numcon': "",
                'CMc2_NumDet': number,
                'tipo_archivo': "",
                'reporte': "ExpPro",
                'backPage': "ConsulManifExpPro",
            }

            yield FormRequest(self.form_url,
                              formdata=payload,
                              callback=self.parse_categories,
                              meta={"item": item})

    def parse_categories(self, response):
        """Yield the listing item once per detail row, extended with ``descripcion``."""
        self.logger.debug('[parse_categories] url: %s', response.url)

        item = response.meta['item']

        # Select only tables that have a "Descripcion" header cell.
        table = response.xpath('//table[./tr/th[contains(text(), "Descripcion")]]')
        self.logger.debug('len(table): %d', len(table))

        trs = table.xpath('.//tr')[1:]  # drop the header row
        self.logger.debug('len(trs): %d', len(trs))

        for tr in trs:   # trs[:1]: for single result
            # A fresh dict per row keeps earlier results from being overwritten
            # by the description of a later row.
            yield dict(item, descripcion=self._cell_text(tr, 7))

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

# Settings passed here are regular Scrapy settings; the user agent is
# configured with the `USER_AGENT` key (a 'User-Agent' key is not a setting
# Scrapy reads, so the default user agent would be sent instead).
c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ProvinciaSpider)
c.start()  # runs the crawl; the script continues only after it has finished

结果（trs[:1]）

puerto_llegada,pais,bl,peso,bultos,consignatario,embarcador,descripcion
BEANR,BEANR,MAEU216473186,47420.00,2160,AGROFAIR BENELUX BV,COOPERATIVA AGRARIA APPBOSA,YT GREEN ORGANIC FRESH BANANAS CARTON BOXES AND IN POLYETHYLENE BAGS.
NLRTM,NLRTM,MAEU216473104,83890.00,5280,AGROFAIR BENELUX BV.,TULIPAN NARANJA S.A.C.,FYT GREEN ORGANIC FRESH BANANAS CARTON BOXES AND IN POLYETHYLENE BAGS.
BEANR,BEANR,MAEU216307459,19980.00,285,"Greencof B.V.,",COOPERATIVA AGRARIA RODRIGUEZ DE MENDOZA,285 BAGS OF 69 KG NET OF PERU ORGANIC GREEN COFFEE FAIRTRADE CERTIFIED
JPYOK,JPYOK,MAEU1KT407500,21320.00,709,"HOWA SHOJI CO., LTD",GEALE AGROTRADING E.I.R.L.,GREEN ORGANIC FRESH BANANAS CARTON BOXES AND IN POLYETHYLENE BAGS. BAN
ITCVV,ITCVV,MAEU913779677,66950.00,3240,BATTAGLIO SPA,IREN PERU SOCIEDAD ANONIMA CERRADA - IREN PERU S.A,GREEN ORGANIC FRESH BANANAS CARTON BOXES AND IN POLYETHYLENE BAGS. BAN
NLRTM,NLRTM,MAEU913798070,24700.00,5544,FRUTOS TROPICALES EUROPE B.V.,FRUTOS TROPICALES PERU EXPORT SOCIEDAD ANONIMA CER,"FRESH MANGOES NET WEIGHT: 22,176.00 KG P.A.: 0804.50.20.00 TR.: JKXYA0"
BEANR,BEANR,MAEU216473141,23710.00,1080,AGROFAIR BENELUX BV.,TULIPAN NARANJA S.A.C.,FYT GREEN ORGANIC FRESH BANANAS CARTON BOXES AND IN POLYETHYLENE BAGS.
BEANR,BEANR,MAEU216632137,22270.00,1080,FYFFES INTERNATIONAL,AGRO PACHA S.A.,"GREEN FRESH ORGANIC BANANAS, PACKED IN CARTON BOXES AND POLYETHILENE B"
KRPUS,KRPUS,MAEU913722041,24480.00,1175,TO THE ORDER,PERUPEZ S.A.C.,"NET WEIGHT: 23,500 KG GROSS WEIGHT: 24,480 KG 1,175 SACKS 23,500 KG FR"
NLRTM,NLRTM,MAEU216473211,22520.00,1080,AgroFair Benelux BV,COOPERATIVA AGRARIA DE USUARIOS RIO Y VALLE,ORGANIC FAIRTRADE BANANAS GREEN FRESH CAVENDISH PACKED CARDBOARD BOXES

链接不是 URL 格式，无法直接抓取

Links doesn't have url format in order to scrape them scrapy

scrapy-shell