Scrapy POST request form data not working

I want to get data from this website: https://livedragon.vdsc.com.vn/general/intradayBoard.rv. Here is the code I am using to inspect the response:

import scrapy
from scrapy import FormRequest
from scrapy.shell import inspect_response

class VdscSpider(scrapy.Spider):
    name = 'vdsc'
    start_urls = ['https://livedragon.vdsc.com.vn/general/intradayBoard.rv']

    def start_requests(self):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"
        }
        formdata = {
            "stockCode": "AAA",
            "boardDate": "12/11/2021"
        }
        yield FormRequest(url='https://livedragon.vdsc.com.vn/general/intradayBoard.rv',
                          headers=headers,
                          formdata=formdata,
                          callback=self.parse_item
                          )

    def parse_item(self, response):
        inspect_response(response, self)

When I inspect it in the scrapy shell via inspect_response, it doesn't show any of the data.

Please take a look and help me fix this. Thank you.

  1. You need to send the request to https://livedragon.vdsc.com.vn/general/intradaySearch.rv instead.
  2. Take the headers from the search page.
  3. You need to set the method to POST.
  4. Scrape the JSON response (see the full spider below).
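
Before wiring everything into the spider, a quick standalone check can confirm that intradaySearch.rv answers a plain POST with JSON. This is only a sketch, assuming the requests library is installed; the exact headers required may be a smaller set than what the spider sends:

import requests

resp = requests.post(
    'https://livedragon.vdsc.com.vn/general/intradaySearch.rv',
    data={'stockCode': 'AAA', 'boardDate': '12/11/2021'},  # same form fields the spider sends
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    },
)
print(resp.status_code, resp.json().get('success'))  # expect 200 and True when the endpoint accepts the form

The full spider then looks like this:
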
import scrapy
from scrapy import FormRequest


class VdscSpider(scrapy.Spider):
    name = 'vdsc'
    start_urls = ['https://livedragon.vdsc.com.vn/general/intradayBoard.rv']

    def parse(self, response):
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.5",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "DNT": "1",
            "Host": "livedragon.vdsc.com.vn",
            "Origin": "https://livedragon.vdsc.com.vn",
            "Pragma": "no-cache",
            "Referer": "https://livedragon.vdsc.com.vn/general/intradayBoard.rv",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "Sec-GPC": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
            "X-KL-Ajax-Request": "Ajax_Request",
            "X-Requested-With": "XMLHttpRequest"
        }

        search_page = 'https://livedragon.vdsc.com.vn/general/intradaySearch.rv'

        formdata = {
            "stockCode": "AAA",
            "boardDate": "12/11/2021"
        }

        yield FormRequest(url=search_page,
                          method='POST',
                          headers=headers,
                          formdata=formdata,
                          callback=self.parse_item
                          )

    def parse_item(self, response):
        json_data = response.json()
        if not json_data.get('success'):
            self.logger.error("Couldn't retrieve json")
            return

        for item in json_data['list']:
            item_data = {
                "MatchedPrice": item["MatchedPrice"],
                "OfferVol1": item["OfferVol1"],
                "OfferVol2": item["OfferVol2"],
                "FlrPrice": item["FlrPrice"],
                "OfferVol3": item["OfferVol3"],
                "BidVol1": item["BidVol1"],
                "BidVol2": item["BidVol2"],
                "FSellVol": item["FSellVol"],
                "BidVol3": item["BidVol3"],
                "OfferPrice3": item["OfferPrice3"],
                "OfferPrice1": item["OfferPrice1"],
                "OfferPrice2": item["OfferPrice2"],
                "MatchedChange": item["MatchedChange"],
                "AvgPrice": item["AvgPrice"],
                "FloorCode": item["FloorCode"],
                "HigPrice": item["HigPrice"],
                "TradeTime": item["TradeTime"],
                "MatchedVol": item["MatchedVol"],
                "BidPrice2": item["BidPrice2"],
                "BidPrice3": item["BidPrice3"],
                "Code": item["Code"],
                "FBuyVol": item["FBuyVol"],
                "CeiPrice": item["CeiPrice"],
                "BidPrice1": item["BidPrice1"],
                "LowPrice": item["LowPrice"],
                "RefPrice": item["RefPrice"],
                "AmPm": item["AmPm"],
                "MatchedTotalVol": item["MatchedTotalVol"],
            }
            yield item_data
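
Since item_data mirrors the JSON keys one-to-one, the loop could also simply yield item (or pick only the needed keys with a dict comprehension); the explicit mapping mainly documents which fields the response is expected to contain.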

Output

{'MatchedPrice': 18.5, 'OfferVol1': 461100, 'OfferVol2': 160100, 'FlrPrice': 16.75, 'OfferVol3': 339900, 'BidVol1': 73500, 'BidVol2': 41100, 'FSellVol': 209200, 'BidVol3': 54900, 'OfferPrice3': 18.6, 'OfferPrice1': 18.5, 'OfferPrice2': 18.55, 'MatchedChange': 0.5, 'AvgPrice': 18.31, 'FloorCode': 'HSX', 'HigPrice': 18.7, 'TradeTime': '13:54:37', 'MatchedVol': 100, 'BidPrice2': 18.4, 'BidPrice3': 18.35, 'Code': 'AAA', 'FBuyVol': 75000, 'CeiPrice': 19.25, 'BidPrice1': 18.45, 'LowPrice': 17.9, 'RefPrice': 18.0, 'AmPm': 'PM', 'MatchedTotalVol': 7234200}
[scrapy.core.scraper] DEBUG: Scraped from <200 https://livedragon.vdsc.com.vn/general/intradaySearch.rv>
{'MatchedPrice': 18.5, 'OfferVol1': 438600, 'OfferVol2': 160100, 'FlrPrice': 16.75, 'OfferVol3': 334900, 'BidVol1': 61800, 'BidVol2': 31800, 'FSellVol': 209200, 'BidVol3': 54500, 'OfferPrice3': 18.6, 'OfferPrice1': 18.5, 'OfferPrice2': 18.55, 'MatchedChange': 0.5, 'AvgPrice': 18.31, 'FloorCode': 'HSX', 'HigPrice': 18.7, 'TradeTime': '13:54:56', 'MatchedVol': 200, 'BidPrice2': 18.4, 'BidPrice3': 18.35, 'Code': 'AAA', 'FBuyVol': 75000, 'CeiPrice': 19.25, 'BidPrice1': 18.45, 'LowPrice': 17.9, 'RefPrice': 18.0, 'AmPm': 'PM', 'MatchedTotalVol': 7257600}
[scrapy.core.scraper] DEBUG: Scraped from <200 https://livedragon.vdsc.com.vn/general/intradaySearch.rv>
{'MatchedPrice': 18.5, 'OfferVol1': 437400, 'OfferVol2': 160100, 'FlrPrice': 16.75, 'OfferVol3': 338000, 'BidVol1': 61900, 'BidVol2': 31800, 'FSellVol': 209200, 'BidVol3': 54500, 'OfferPrice3': 18.6, 'OfferPrice1': 18.5, 'OfferPrice2': 18.55, 'MatchedChange': 0.5, 'AvgPrice': 18.31, 'FloorCode': 'HSX', 'HigPrice': 18.7, 'TradeTime': '13:55:14', 'MatchedVol': 200, 'BidPrice2': 18.4, 'BidPrice3': 18.35, 'Code': 'AAA', 'FBuyVol': 75000, 'CeiPrice': 19.25, 'BidPrice1': 18.45, 'LowPrice': 17.9, 'RefPrice': 18.0, 'AmPm': 'PM', 'MatchedTotalVol': 7257600}
[scrapy.core.scraper] DEBUG: Scraped from <200 https://livedragon.vdsc.com.vn/general/intradaySearch.rv>
...
...
...

Edit:

Set the user agent in settings.py, in main.py, or in the start_requests function.
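
Example in settings.py (a one-line sketch: USER_AGENT is Scrapy's standard setting, shown with the same browser string used below):

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'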

Example in main.py:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
process = CrawlerProcess(settings)
process.crawl('vdsc')
process.start()
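
Running python main.py then starts the crawl with that user agent applied through the project settings, instead of invoking scrapy crawl vdsc directly.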

Example in start_requests:

def start_requests(self):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    }

    for url in self.start_urls:
        yield scrapy.Request(url=url, headers=headers)
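
Note that headers set this way apply only to these initial requests; the FormRequest sent from parse still needs its own headers, as in the spider above.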