Web Crawler - 使用 Scrapy 迭代 Postgres 数据库结果

Web Crawler - Iterating over Postgres database result with Scrapy

我正在尝试编写一个 scraper,从数据库查询结果中获取域名。我能够从数据库中取到数据,但不知道如何把它交给 Scrapy。我在这里搜索过,也找到了很多建议,但没有一个与我要做的事情完全相符。当我运行下面的代码时,没有任何反应,甚至没有报错。

scraper.py

#import json
import json

#import database library
import psycopg2

#import scrapy library
import scrapy

#create database connection
# NOTE(review): host, database, user, password and port are hard-coded literals
conn = psycopg2.connect(
    host="localhost",
    database="mydb",
    user="dbuser",
    password="postgres",
    port=5432
)

#create cursor from database
#cursor() is python equivalent to query() to fetch the rows
# the cursor is bound to the module-level name `query`, which MySpider.start_requests iterates
query = conn.cursor()

#execute query from database
# runs once, at module level, before the spider class below is defined
query.execute('SELECT info FROM domains')

#create scrapy class
class MySpider(scrapy.Spider):  
    """Spider named "scrap_domains" that walks the module-level cursor ``query``."""
    name = "scrap_domains"

    #start_requests with scrapy
    def start_requests(self):
        """Yield one ``scrapy.Request`` per item of every row produced by ``query``."""

        #iterate over database result
        for url in query:

            #iterate over each json object
            # `url` is one row of the result; `item` is one column value of that row
            for item in url:

                #get domain name
                # NOTE(review): domain_name is assigned here but never used afterwards
                domain_name = item['domain']

                #grab information from url
                # NOTE(review): Request() is created with no arguments (no url, no callback);
                # presumably it was meant to target domain_name - confirm
                yield scrapy.Request()

        #print response
        # NOTE(review): this def is indented inside start_requests, after the loop, so it is a
        # local function of that method and not a `parse` method of MySpider
        def parse(self, response):
            print(response)

# we close the cursor and conn both
# NOTE(review): these are module-level statements, so they execute as soon as the file has
# been loaded, i.e. before anything calls MySpider.start_requests, which iterates `query`.
# Presumably this is why the spider yields nothing - confirm.
query.close()
conn.close()

我的爬虫终于可以正常工作了。问题是由于每次迭代都关闭游标和数据库连接引起的。我在学习过程中才了解到,Python 并不像 Node 那样是异步的。更好的做法是编写一个函数来检测迭代何时完成,然后再继续执行后续任务(例如关闭连接);但在本示例中,我们只是像文件底部那样把这两行关闭语句注释掉。我把详细的答案发布在这里,以供将来参考。

Note: I use this scraper to scrape through a list of 300 million records stored in my database. Just change your limit per page and the code below will do the rest for you until it's all done. When it's finished, just grab your json file and upload it to your database. I suffered so that you don't have to.

我正在使用 PostgreSQL 并将数据存储在 JSONB 中。我的 table 只有 2 列,看起来像这样:

id (int) | info (jsonb)
1        | {"domain": "weerstation-aarschot.be","timestamp":1646106745}
2        | {"domain": "wereldvakanties.be","timestamp":1646106746}
3        | {"domain": "welzijnscentrum.be","timestamp":1646106747}

按照 Scrapy 文档,复制/粘贴下面的代码,然后在终端中运行以下命令,将所有域名写入 json 文件:

scrapy runspider scraper.py -o domains.json

使用Selectors从正文中提取HTML数据

scraper.py

#import datetime
from datetime import datetime

#calendar
import calendar
from email import header;
import time;

#import json
import json
from urllib import request
from wsgiref import headers

#import database library
import psycopg2

#import scrapy library
import scrapy

#connection settings for the local PostgreSQL server
_db_settings = {
    "host": "localhost",
    "database": "mydb",
    "user": "username",
    "password": "postgres",
    "port": 5432,
}

#open the connection, then the cursor the spider reads its rows from
#(cursor() is the python counterpart of query() for fetching rows)
conn = psycopg2.connect(**_db_settings)
query = conn.cursor()

#spiderclass
class MySpider(scrapy.Spider):
    """Crawl the HTTPS home page of every domain stored in the ``domains`` table.

    Rows are read page by page through the module-level cursor ``query``.
    Each row's ``info`` value is expected to be a JSON object with a
    ``domain`` key. One item describing the fetched page is yielded per
    response.
    """

    name = "domains"

    #number of rows requested from the database per page
    page_size = 1000000

    def start_requests(self):
        """Yield one request per stored domain, paging through the table.

        Pages are read with LIMIT/OFFSET ordered by ``id``; iteration stops
        at the first page that returns no rows.
        """
        limit = int(self.page_size)
        offset = 0

        while True:

            #LIMIT/OFFSET are sent as bound parameters instead of being concatenated into the SQL
            query.execute(
                "SELECT info FROM domains ORDER BY id ASC LIMIT %s OFFSET %s",
                (limit, offset),
            )

            #an empty page means the table is exhausted; checking rowcount (rather than
            #calling fetchone()) keeps the first row of the page available to the loop below
            if query.rowcount == 0:
                break

            offset += limit

            #every row is a 1-tuple holding the info column
            for (info,) in query:

                #a malformed row must not abort the whole crawl
                try:
                    hostname = info['domain']
                except (KeyError, TypeError):
                    self.logger.warning("skipping row without a domain: %r", info)
                    continue

                #fetch https request
                yield scrapy.Request(url="https://" + hostname, callback=self.parse)

    def parse(self, response):
        """Extract metadata, links and text from ``response`` and yield them as one item."""

        #current date
        date_created = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        timestamp = calendar.timegm(time.gmtime())

        #header information (placeholders, not extracted yet)
        content_language = ""
        protocol = ""

        #meta tags: the values live in <meta name="..." content="..."> and <link rel="icon">
        favicon = response.css('link[rel~="icon"]::attr(href)').get()
        title = response.xpath('//title/text()').get()
        description = response.xpath('//meta[@name="description"]/@content').get()
        keywords = response.xpath('//meta[@name="keywords"]/@content').get()
        author = response.xpath('//meta[@name="author"]/@content').get()

        # NOTE(review): this selects a <content-type> element, which HTML pages do not
        # normally contain - confirm where the page type is supposed to come from
        page_type = response.xpath('//content-type/text()').get()

        #open graph tags (placeholders, not extracted yet)
        og_title = ""
        og_type = ""
        og_url = ""
        og_image = ""
        og_site_name = ""
        og_description = ""

        #get all the links to follow
        links = response.css('a::attr(href)').getall()

        #get text from every header tag (<h>)
        h1_text = response.css('h1::text').getall()
        h2_text = response.css('h2::text').getall()
        h3_text = response.css('h3::text').getall()

        #get span text
        span_text = response.css('span::text').getall()

        #get text from every paragraph tag (<p>)
        p_text = response.css('p::text').getall()

        #get text from every div tag (<div>)
        div_text = response.css('div::text').getall()

        #get every image
        images = response.css('img').xpath('@src').getall()

        #get every video (placeholder, not extracted yet)
        videos = []

        #category, score (placeholder)
        website_score = ""

        #the key names, including the trailing space in "favicon " and "images ",
        #are kept exactly as they were so the output schema does not change
        yield {
            'url': response.url,
            'status': response.status,
            "score": website_score,
            "type": page_type,
            "category": "",
            "industry": "",
            "timestamp": timestamp,
            "date_created": date_created,
            "headers": {
                "content_language": content_language,
                "protocol": protocol,
            },
            "metas": [
                {
                    "favicon ": favicon,
                    "title": title,
                    "description": description,
                    "keywords": keywords,
                    "author": author,
                }
            ],
            "open_graph": [
                {
                    "og_title": og_title,
                    "og_type": og_type,
                    "og_url": og_url,
                    "og_image": og_image,
                    "og_site_name": og_site_name,
                    "og_description": og_description,
                }
            ],
            "links": links,
            "h1_text": h1_text,
            "h2_text": h2_text,
            "h3_text": h3_text,
            "div_text": div_text,
            "p_text": p_text,
            "span_text": span_text,
            "images ": images,
            "videos": videos,
        }

    def closed(self, reason):
        """Release the database cursor and connection when the spider closes.

        Scrapy calls this hook once the crawl is over, so the cursor is no
        longer being iterated when it is closed.
        """
        query.close()
        conn.close()

        #print result
        #print(request)

    # we close the cursor and conn both
    #query.close()
    #conn.close()
    #scrapy runspider scraper.py -o domains.json

domains.json(示例输出)

[
{"url": "https://weerstation-aarschot.be", "status": 200, "score": "", "page": "", "offset ": "", "per_page": "", "type": "", "category": "", "industry": "", "timestamp": 1646535167.804621, "date_created": "2022-03-05 21:52:47", "headers": {"content_language": "", "protocol": ""}, "metas": [{"favicon ": "", "title": "", "description": "", "keywords": "", "author": ""}], "open_graph": [{"og_title": "", "og_type": "", "og_url": "", "og_image": "", "og_site_name": "", "og_description": ""}], "links": [[]], "h_text": [[]], "div_text": [[]], "p_text": [[]], "images ": [[]], "videos": [[]]},
{"url": "https://wereldvakanties.be", "status": 200, "score": "", "page": "", "offset ": "", "per_page": "", "type": "", "category": "", "industry": "", "timestamp": 1646535168.069924, "date_created": "2022-03-05 21:52:48", "headers": {"content_language": "", "protocol": ""}, "metas": [{"favicon ": "", "title": "", "description": "", "keywords": "", "author": ""}], "open_graph": [{"og_title": "", "og_type": "", "og_url": "", "og_image": "", "og_site_name": "", "og_description": ""}], "links": [[]], "h_text": [[]], "div_text": [[]], "p_text": [[]], "images ": [[]], "videos": [[]]},
{"url": "https://welzijnscentrum.be", "status": 200, "score": "", "page": "", "offset ": "", "per_page": "", "type": "", "category": "", "industry": "", "timestamp": 1646535168.096689, "date_created": "2022-03-05 21:52:48", "headers": {"content_language": "", "protocol": ""}, "metas": [{"favicon ": "", "title": "", "description": "", "keywords": "", "author": ""}], "open_graph": [{"og_title": "", "og_type": "", "og_url": "", "og_image": "", "og_site_name": "", "og_description": ""}], "links": [[]], "h_text": [[]], "div_text": [[]], "p_text": [[]], "images ": [[]], "videos": [[]]}
]