Web Crawler - Iterating over Postgres database result with Scrapy
I'm trying to write a scraper that fetches domains from a database result. I'm able to get the data out of the database, but I don't know how to feed it to Scrapy. I've looked around here and found many suggestions, but none really match what I'm doing. When I run the code below, nothing happens, not even an error.
scraper.py
#import json
import json
#import database library
import psycopg2
#import scrapy library
import scrapy

#create database connection
conn = psycopg2.connect(
    host="localhost",
    database="mydb",
    user="dbuser",
    password="postgres",
    port=5432
)

#create cursor from database
#cursor() is python equivalent to query() to fetch the rows
query = conn.cursor()
#execute query from database
query.execute('SELECT info FROM domains')

#create scrapy class
class MySpider(scrapy.Spider):
    name = "scrap_domains"

    #start_requests with scrapy
    def start_requests(self):
        #iterate over database result
        for url in query:
            #iterate over each json object
            for item in url:
                #get domain name
                domain_name = item['domain']
                #grab information from url
                yield scrapy.Request()

    #print response
    def parse(self, response):
        print(response)

# we close the cursor and conn both
query.close()
conn.close()
My crawler finally works. The problem was caused by the cursor and the database connection being closed while the iteration was still running. Python is not asynchronous the way Node is, as I have been learning. One should write a function that detects when the iteration is done and only then moves on to the cleanup, but for the purposes of this example we simply comment those calls out, as done at the bottom of the file. I'm posting a detailed answer below for future reference.
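If you do want to release the cursor and connection cleanly instead of just commenting the calls out, Scrapy calls a spider's closed() method once the crawl is finished; a minimal sketch, assuming the module-level conn and query from the code below:

import scrapy

class MySpider(scrapy.Spider):
    name = "domains"

    #Scrapy invokes closed() exactly once, after the spider has finished crawling
    def closed(self, reason):
        query.close()  #the module-level cursor
        conn.close()   #the module-level connection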
Notes: I use this scraper to scrape through a list of 300 million records stored in my database. Just change your limit per page and the code below will do the rest for you until it's all done. When it's finished, just grab your json file and upload it to your database. I suffered so that you don't have to.
I'm using PostgreSQL and storing the data as JSONB. My table has only 2 columns and looks like this:
id (int) | info (jsonb)
1 | {"domain": "weerstation-aarschot.be","timestamp":1646106745}
2 | {"domain": "wereldvakanties.be","timestamp":1646106746}
3 | {"domain": "welzijnscentrum.be","timestamp":1646106747}
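For reference, a minimal sketch of how such a table could be created and seeded with psycopg2 (the table and column names match the ones above; the sample row is hypothetical):

import psycopg2
from psycopg2.extras import Json

conn = psycopg2.connect(host="localhost", database="mydb",
                        user="dbuser", password="postgres", port=5432)
cur = conn.cursor()
#two columns: a serial primary key and the JSONB payload
cur.execute("""
    CREATE TABLE IF NOT EXISTS domains (
        id   serial PRIMARY KEY,
        info jsonb NOT NULL
    )
""")
#Json() adapts a python dict to a jsonb value
cur.execute("INSERT INTO domains (info) VALUES (%s)",
            (Json({"domain": "example.be", "timestamp": 1646106748}),))
conn.commit()
cur.close()
conn.close()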
Following the scrapy documents below, copy/paste the code and run it with this command in your terminal to write all domains to a json file:
scrapy runspider scraper.py -o domains.json
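Note that recent versions of Scrapy append to an existing output file when you pass -o, which can leave invalid JSON behind across runs; the capital -O flag overwrites the file instead.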
Use Selectors to extract data from the HTML body
scraper.py
#import datetime
from datetime import datetime
#calendar and time for the unix timestamp
import calendar
import time
#import database library
import psycopg2
#import scrapy library
import scrapy
#create database connection
conn = psycopg2.connect(
    host="localhost",
    database="mydb",
    user="username",
    password="postgres",
    port=5432
)
#create cursor from database
#cursor() is python equivalent to query() to fetch the rows
query = conn.cursor()
#spider class
class MySpider(scrapy.Spider):
    name = "domains"

    def start_requests(self):
        #database pagination:
        #loop through extremely large datasets automatically
        offset = 0
        limit = 1000000
        flag = True
        while flag:
            #query db one page at a time, example: select * from domains limit %limit% offset %offset%
            #parameterized to avoid SQL injection
            query.execute("SELECT info FROM domains ORDER BY id ASC LIMIT %s OFFSET %s",
                          (limit, offset))
            unique_domains = query.fetchall()
            #stop when the page comes back empty
            if not unique_domains:
                flag = False
            else:
                #advance to the next page
                offset += limit
                #iterate over database result
                for url in unique_domains:
                    #iterate over each json object
                    for item in url:
                        #variables from result
                        hostname = item['domain']
                        https_url = "https://" + hostname
                        http_url = "http://" + hostname
                        #fetch http request
                        yield scrapy.Request(url=https_url, callback=self.parse)
    #parse response
    def parse(self, response):
        #current date and unix timestamp
        date_created = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        timestamp = calendar.timegm(time.gmtime())
        #variables from response
        url = response.url
        status = response.status
        headers = response.headers
        #header information (placeholders, not extracted yet)
        content_language = ""
        protocol = ""
        #meta tags
        favicon = response.css('link[rel~="icon"]::attr(href)').get()
        title = response.xpath('//title/text()').get()
        description = response.xpath('//meta[@name="description"]/@content').get()
        keywords = response.xpath('//meta[@name="keywords"]/@content').get()
        author = response.xpath('//meta[@name="author"]/@content').get()
        content_type = headers.get('Content-Type', b'').decode()
        #open graph tags (placeholders, not extracted yet)
        og_title = ""
        og_type = ""
        og_url = ""
        og_image = ""
        og_site_name = ""
        og_description = ""
        #get all the links to follow
        links = response.css('a::attr(href)').getall()
        #get text from every header tag (<h1>-<h3>)
        h1_text = response.css('h1::text').getall()
        h2_text = response.css('h2::text').getall()
        h3_text = response.css('h3::text').getall()
        #get span text
        span_text = response.css('span::text').getall()
        #get text from every paragraph tag (<p>)
        p_text = response.css('p::text').getall()
        #get text from every div tag (<div>)
        div_text = response.css('div::text').getall()
        #get every image
        images = response.css('img').xpath('@src').getall()
        #get every video (placeholder)
        videos = []
        #category, score (placeholder)
        websiteScore = ""
        #yield the scraped item; scrapy writes it to the output feed
        yield {
            "url": url,
            "status": status,
            "score": websiteScore,
            "type": content_type,
            "category": "",
            "industry": "",
            "timestamp": timestamp,
            "date_created": date_created,
            "headers": {
                "content_language": content_language,
                "protocol": protocol,
            },
            "metas": [
                {
                    "favicon": favicon,
                    "title": title,
                    "description": description,
                    "keywords": keywords,
                    "author": author,
                }
            ],
            "open_graph": [
                {
                    "og_title": og_title,
                    "og_type": og_type,
                    "og_url": og_url,
                    "og_image": og_image,
                    "og_site_name": og_site_name,
                    "og_description": og_description,
                }
            ],
            "links": links,
            "h1_text": h1_text,
            "h2_text": h2_text,
            "h3_text": h3_text,
            "div_text": div_text,
            "p_text": p_text,
            "span_text": span_text,
            "images": images,
            "videos": videos
        }

# we close the cursor and conn both (commented out: closing here would
# kill the connection while the spider is still iterating)
#query.close()
#conn.close()
#scrapy runspider scraper.py -o domains.json
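One design note on the pagination above: OFFSET gets slower as it grows, because Postgres still has to walk past every skipped row, which hurts at 300 million records. A minimal sketch of keyset pagination on the id column instead (reusing the query cursor from above; the page size is arbitrary):

last_id = 0
page_size = 100000
while True:
    #seek past the last seen id instead of skipping rows with OFFSET
    query.execute(
        "SELECT id, info FROM domains WHERE id > %s ORDER BY id ASC LIMIT %s",
        (last_id, page_size)
    )
    rows = query.fetchall()
    if not rows:
        break
    #remember the highest id on this page for the next iteration
    last_id = rows[-1][0]
    for _id, item in rows:
        hostname = item['domain']
        #yield scrapy.Request("https://" + hostname, callback=self.parse)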
domains.json (sample output)
[
{"url": "https://weerstation-aarschot.be", "status": 200, "score": "", "page": "", "offset ": "", "per_page": "", "type": "", "category": "", "industry": "", "timestamp": 1646535167.804621, "date_created": "2022-03-05 21:52:47", "headers": {"content_language": "", "protocol": ""}, "metas": [{"favicon ": "", "title": "", "description": "", "keywords": "", "author": ""}], "open_graph": [{"og_title": "", "og_type": "", "og_url": "", "og_image": "", "og_site_name": "", "og_description": ""}], "links": [[]], "h_text": [[]], "div_text": [[]], "p_text": [[]], "images ": [[]], "videos": [[]]},
{"url": "https://wereldvakanties.be", "status": 200, "score": "", "page": "", "offset ": "", "per_page": "", "type": "", "category": "", "industry": "", "timestamp": 1646535168.069924, "date_created": "2022-03-05 21:52:48", "headers": {"content_language": "", "protocol": ""}, "metas": [{"favicon ": "", "title": "", "description": "", "keywords": "", "author": ""}], "open_graph": [{"og_title": "", "og_type": "", "og_url": "", "og_image": "", "og_site_name": "", "og_description": ""}], "links": [[]], "h_text": [[]], "div_text": [[]], "p_text": [[]], "images ": [[]], "videos": [[]]},
{"url": "https://welzijnscentrum.be", "status": 200, "score": "", "page": "", "offset ": "", "per_page": "", "type": "", "category": "", "industry": "", "timestamp": 1646535168.096689, "date_created": "2022-03-05 21:52:48", "headers": {"content_language": "", "protocol": ""}, "metas": [{"favicon ": "", "title": "", "description": "", "keywords": "", "author": ""}], "open_graph": [{"og_title": "", "og_type": "", "og_url": "", "og_image": "", "og_site_name": "", "og_description": ""}], "links": [[]], "h_text": [[]], "div_text": [[]], "p_text": [[]], "images ": [[]], "videos": [[]]},
]
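As for the final step mentioned in the notes, uploading domains.json back into the database, a minimal sketch (the scraped_domains table name is hypothetical):

import json
import psycopg2
from psycopg2.extras import Json

conn = psycopg2.connect(host="localhost", database="mydb",
                        user="username", password="postgres", port=5432)
cur = conn.cursor()
#hypothetical destination table for the scraped items
cur.execute("CREATE TABLE IF NOT EXISTS scraped_domains (id serial PRIMARY KEY, info jsonb)")
#load the feed scrapy produced and insert one jsonb row per item
with open("domains.json") as f:
    for item in json.load(f):
        cur.execute("INSERT INTO scraped_domains (info) VALUES (%s)", (Json(item),))
conn.commit()
cur.close()
conn.close()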