Scrapy.Request 返回 <GET url>,却没有抓取任何内容

Scrapy.Request returns <GET url> without scraping anything

我想抓取 SitePoint 网站的文章列表,这是我的代码:

import scrapy
from urllib.parse import urljoin

# Spider as posted in the question: parse() collects per-article fields from
# the listing page and tries to attach the article body via a second request.
class SitepointSpider(scrapy.Spider):
    # TODO: Add url tags (like /javascript) to the spider based on class parameters
    name = "sitepoint"
    # NOTE(review): the domain and start URL are empty strings here; presumably
    # the real values were stripped from the post — confirm before running.
    allowed_domains = [""]
    start_urls = [""]

    # Callback for the listing page: reads title, link, image and date of
    # every <article> element.
    def parse(self, response):
        data = []
        for article in response.css("article"):
            title = article.css("a.t12xxw3g::text").get()
            href = article.css("a.t12xxw3g::attr(href)").get()
            img = article.css("img.f13hvvvv::attr(src)").get()
            time = article.css("time::text").get()
            url = urljoin("", href)
            # NOTE(review): this stores the scrapy.Request object itself in
            # `text`. The request is never yielded from this callback, which
            # matches the `'text': <GET ...>` value in the output shown below.
            text = scrapy.Request(url, callback=self.parse_article)

            # NOTE(review): the dict literal below is indented deeper than the
            # loop body and is not attached to any statement; presumably a
            # `data.append(` line was lost from the post — confirm.
                {"title": title, "href": href, "img": img, "time": time, "text": text}
        # Yields the whole list once, after the loop has finished.
        yield data

    # Callback intended to extract the article text from an article page.
    def parse_article(self, response):
        # NOTE(review): the xpath(...) call is cut off here (no argument and no
        # closing parenthesis); the expression appears to have been lost.
        text = response.xpath(
        yield text


[{'title': 'How to Build an MVP with React and Firebase', 
'href': '/react-firebase-build-mvp/', 
'img': ' 
'time': 'September 28, 2021', 
'text': <GET>}]

它根本没有抓取该网址。我按照相关文档中所说的一切进行了操作,但仍然无法正常工作。


在这种情况下,您必须先 yield 针对该 URL 的请求(Request),然后在最后一个回调函数中 yield 数据。

此外,//*[@id="main-content"]/article/div/div/div[1]/section/text() 不会返回任何文本,因为 section 标签下嵌套了许多 HTML 元素,文本并不是 section 的直接子节点。

一种解决方案是先抓取 section 标签内的全部 HTML,然后清除其中的标签,从而得到文章的文本内容。


import re

import scrapy
from urllib.parse import urljoin

class SitepointSpider(scrapy.Spider):
    """Crawl an article listing and yield one item per article, text included.

    ``parse`` handles the listing page: for every ``<article>`` it schedules a
    request for the article page and forwards the listing fields through the
    request ``meta``. ``parse_article`` handles the article page: it combines
    the forwarded fields with the tag-stripped article body and yields the
    finished item.
    """

    # TODO: Add url tags (like /javascript) to the spider based on class parameters
    name = "sitepoint"
    # NOTE(review): both values are empty placeholders in this snippet; fill in
    # the real domain and start URL before running the spider.
    allowed_domains = [""]
    start_urls = [""]

    # Matches HTML tags and character entities (named, decimal, hex).
    # Compiled once at class-creation time instead of on every call.
    _TAG_OR_ENTITY_RE = re.compile(
        r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});"
    )

    def clean_text(self, raw_html):
        """Strip HTML tags and character entities from a markup string.

        :param raw_html: raw HTML code, or ``None`` when nothing was selected
        :return: text without html tags; ``""`` for ``None`` or empty input
        """
        # Selector.get() returns None when the expression matches nothing;
        # re.sub would raise TypeError on that, so treat it as "no text".
        if not raw_html:
            return ""
        return self._TAG_OR_ENTITY_RE.sub("", raw_html)

    def parse(self, response):
        """Yield one article-page request per ``<article>`` on the listing.

        :param response: response for the listing page
        :return: generator of ``scrapy.Request`` objects whose ``meta``
            carries the ``title``, ``href``, ``img`` and ``time`` fields
        """
        for article in response.css("article"):
            href = article.css("a.t12xxw3g::attr(href)").get()
            if not href:
                # No link in this <article>: there is no page to request.
                continue
            yield scrapy.Request(
                # Resolve relative links such as "/react-firebase-build-mvp/"
                # against the page URL; Request rejects URLs without a scheme.
                response.urljoin(href),
                callback=self.parse_article,
                meta={
                    "title": article.css("a.t12xxw3g::text").get(),
                    "href": href,
                    "img": article.css("img.f13hvvvv::attr(src)").get(),
                    "time": article.css("time::text").get(),
                },
            )

    def parse_article(self, response):
        """Yield the finished item for a single article page.

        :param response: response for the article page; its ``meta`` must
            contain the ``title``, ``href``, ``img`` and ``time`` keys set by
            ``parse``
        :return: generator yielding one dict with those four fields plus
            ``text``
        :raises KeyError: if one of the expected ``meta`` keys is missing
        """
        meta = response.meta
        # Select the whole <section> element rather than .../section/text():
        # the text sits inside nested elements, not directly under <section>.
        article_html = response.xpath(
            '//*[@id="main-content"]/article/div/div/div[1]/section'
        ).get()
        yield {
            "title": meta["title"],
            "href": meta["href"],
            "img": meta["img"],
            "time": meta["time"],
            "text": self.clean_text(article_html),
        }