Scrapy/Form 请求到下一页，回调不到下一个函数

Question

最近开始研究scrapy和web scraping。我正在做我的第一个项目，但遇到了困难。如果有人能帮我解决这个问题，我将不胜感激:)

我正在抓取页面 http://esg.krx.co.kr/contents/02/02020000/ESG02020000.jsp

到目前为止，我的程序已经抓取了全部 77 页（我知道页数是硬编码的，稍后会尝试修改），并得到了 company_name 和 company_share_id。所以现在我想转到 company_page_url，并再次发送 POST 请求，以便从图表中获取数据（并非每个公司都有图表）。但是它似乎没有调用 parse_company_result。

下面我上传我的代码：

import scrapy
import json
from scrapy.http import Request


# Spider from the question: pages through the KRX ESG company listing and then
# tries to fetch per-company chart data.
# NOTE(review): as pasted, the class body below has lost its indentation; the
# author states that every function belongs to this class.
class EsgKrx1Spider(scrapy.Spider):
name = 'esg_krx1'
allowed_domains = ['esg.krx.co.kr']

# Entry point: a single form POST to the .jspx endpoint, handled by parse().
def start_requests(self):
    # Send the first listing request (marked with pageFirstCall='Y').
    return [scrapy.FormRequest("http://esg.krx.co.kr/contents/99/ESG99000001.jspx",
                               formdata={'sch_com_nm': '',
                                         'sch_yy': '2021',
                                         'pagePath': '/contents/02/02020000/ESG02020000.jsp',
                                         'code': '02/02020000/esg02020000',
                                         'pageFirstCall': 'Y'},
                               callback=self.parse)]

# Fans out one POST per listing page; the incoming response is not inspected.
def parse(self, response):
    url = "http://esg.krx.co.kr/contents/99/ESG99000001.jspx"

    # Page count is hard-coded; pages are requested as curPage = 1..77.
    total_pages = 77
    for page in range(total_pages):
        payload = {
            'sch_com_nm': '',
            'sch_yy': '2021',
            'pagePath': '/contents/02/02020000/ESG02020000.jsp',
            'code': '02/02020000/esg02020000',
            'curPage': str(page+1)
        }

        yield scrapy.FormRequest(url=url,
                                 method='POST',
                                 formdata=payload,
                                 callback=self.parse_result)

# Decodes one listing page (JSON) and, for every company in 'result', yields a
# request for the company page and a POST for its chart data.
def parse_result(self, response):
    dict_data = json.loads(response.text)

    # Loop over the result rows and read the company name and share id.
    for i in dict_data['result']:
        company_name = i['com_abbrv']
        compay_share_id = i['isu_cd']
        print(company_name, compay_share_id)

        company_page_url = f"http://esg.krx.co.kr/contents/02/02010000/ESG02010000.jsp?isu_cd={compay_share_id}"
        # No callback is given for the company page request.
        # NOTE(review): presumably Scrapy then routes this response to the
        # default parse() callback rather than to a company parser - confirm.
        yield Request(company_page_url)

        data_url = "http://esg.krx.co.kr/contents/99/ESG99000001.jspx"

        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
        }

        # NOTE(review): this POST carries no formdata and does not include
        # compay_share_id, so it is the same request for every company -
        # presumably why parse_company_result is not reached as expected;
        # verify against Scrapy's duplicate-request filtering.
        # yield response.follow(url=data_url, method='POST', callback=self.parse_company_result, headers=headers)
        yield scrapy.FormRequest(url=data_url,
                                 method='POST',
                                 headers=headers,
                                 callback=self.parse_company_result)


# Decodes the chart-data response as JSON and prints it.
def parse_company_result(self, response):
    graph_data = json.loads(response.text)
    print(graph_data)

当然，所有函数都在 class 里面，只是代码粘贴之后缩进没有按我预期的那样保留下来。

所以我的问题是：

如何进入公司页面url？

或者请求是正确的，但后来我做错了什么？

也许我没有收到 data_url 的回复？

我会感谢所有的帮助。

Answer 1

我已经更新了你的脚本，因为有很多错误，即：

在 parse_result 中，最好创建另一个函数来解析公司 url，而不是在同一个函数中解析它们。
您需要在请求中包含有效负载（payload），才能从请求 URL 返回的响应中解析出 JSON；同样，最好把这些步骤拆分到各自独立的解析器中，这样您就能清楚地看到每一步发生了什么。

我构建了一个以分层方式执行此操作的抓取工具，以便您可以自上而下（top-down）地了解整个流程。

补充说明：

cb_kwargs 允许您将变量从一个解析器传递到另一个解析器。因此，我可以从 parse_result 中获取公司 ID 和名称，并在最后一个解析器中生成它。注意 - 公司 ID 对于 parse_company 中的有效载荷很重要。因此，您应该习惯于了解 cb_kwargs 的工作原理。

import scrapy
import json
from scrapy.http import Request

# Optional request headers imitating a Firefox XMLHttpRequest to the ESG site.
# The spider does not currently send them; pass `headers=headers` to a request
# to use them.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:97.0) Gecko/20100101 Firefox/97.0",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "en-GB,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "X-Requested-With": "XMLHttpRequest",
    "Origin": "http://esg.krx.co.kr",
    "Connection": "keep-alive",
    "Referer": "http://esg.krx.co.kr/contents/02/02010000/ESG02010000.jsp?isu_cd=004710",
}

class EsgKrx1Spider(scrapy.Spider):
    """Crawl the KRX ESG portal and emit per-company chart data.

    The crawl runs top-down through four callbacks:

    1. ``parse`` fans out one POST per page of the company listing.
    2. ``parse_result`` reads a listing page and requests each company page.
    3. ``parse_company`` extracts the chart number from the company page and
       POSTs for the chart data.
    4. ``parse_company_result`` yields the decoded chart JSON together with
       the company name and share id carried along via ``cb_kwargs``.
    """

    name = 'esg_krx1'
    allowed_domains = ['esg.krx.co.kr']

    # The listing requests and the chart-data requests all POST to this URL.
    api_url = 'http://esg.krx.co.kr/contents/99/ESG99000001.jspx'
    # Number of listing pages to request (hard-coded; override in a subclass).
    total_pages = 77
    # Value sent as the ``sch_yy`` form field of every listing request.
    search_year = '2021'

    def _listing_formdata(self, **extra):
        """Return the form fields shared by all listing requests plus *extra*."""
        formdata = {
            'sch_com_nm': '',
            'sch_yy': self.search_year,
            'pagePath': '/contents/02/02020000/ESG02020000.jsp',
            'code': '02/02020000/esg02020000',
        }
        formdata.update(extra)
        return formdata

    def start_requests(self):
        """Return the initial listing POST, whose response is sent to ``parse``."""
        return [
            scrapy.FormRequest(
                self.api_url,
                formdata=self._listing_formdata(pageFirstCall='Y'),
                callback=self.parse,
            )
        ]

    def parse(self, response):
        """Yield one listing request per page, handled by ``parse_result``.

        The incoming response is not inspected; it only starts the fan-out.
        """
        for page in range(1, self.total_pages + 1):
            yield scrapy.FormRequest(
                url=self.api_url,
                method='POST',
                formdata=self._listing_formdata(curPage=str(page)),
                callback=self.parse_result,
            )

    def parse_result(self, response):
        """Yield a company-page request for every row of a listing page.

        The company name and share id are forwarded through ``cb_kwargs`` so
        the later callbacks can use them.
        """
        listing = json.loads(response.text)

        for row in listing['result']:
            company_name = row['com_abbrv']
            company_share_id = row['isu_cd']

            company_page_url = (
                'http://esg.krx.co.kr/contents/02/02010000/ESG02010000.jsp'
                f'?isu_cd={company_share_id}'
            )
            yield Request(
                company_page_url,
                callback=self.parse_company,
                cb_kwargs={
                    'company_share_id': company_share_id,
                    'company_name': company_name,
                },
            )

    def parse_company(self, response, company_share_id, company_name):
        """Grab the chart id from the company page and request its data.

        Yields nothing when the page contains no chart element.
        """
        chart_div_id = response.xpath(
            "(//div[@class='CHART-AREA'])[1]//div//@id"
        ).get()
        if chart_div_id is None:
            # Not every company has a chart; without this guard the .split()
            # below would raise AttributeError on None.
            self.logger.info(
                'No chart found for %s (%s)', company_name, company_share_id
            )
            return

        # Keep whatever follows the last "chart" in the element id.
        chart_no = chart_div_id.split('chart')[-1]

        # Notice that the number at the end of ``code`` in the payload changes
        # for each chart.
        for code_no in range(1, 3):
            yield scrapy.FormRequest(
                url=self.api_url,
                method='POST',
                formdata={
                    'url_isu_cd': str(company_share_id),
                    'isu_cd': '',
                    'sch_com_nm': '',
                    'pagePath': '/contents/02/02010000/ESG02010000.jsp',
                    'code': f'02/02010000/esg02010000_0{code_no}',
                    'chartNo': f'{chart_no}',
                },
                callback=self.parse_company_result,
                cb_kwargs={
                    'company_share_id': company_share_id,
                    'company_name': company_name,
                },
            )

    def parse_company_result(self, response, company_share_id, company_name):
        """Yield the decoded chart JSON with the company name and share id."""
        graph_data = json.loads(response.text)
        yield {
            'data': graph_data,
            'company_name': company_name,
            'company_share_id': company_share_id,
        }

输出：

{'data': {'block1': [{'yy': '2019', 'pnt0': '7', 'pnt1': '2', 'pnt2': 'null'}, {'yy': '2020', 'pnt0': '7', 'pnt1': '2', 'pnt2': 'null'}, {'yy': '2021', 'pnt0': '7', 'pnt1': '2', 'pnt2': 'null'}]}, 'company_name': '아남전자', 'company_share_id': '008700'}

...
...

Scrapy/Form 请求到下一页，回调不到下一个函数

Scrapy / Form request to the next page, callback does not go to the next function

callback

scrapy

web-scraping