Scraping a page returns a 200, checking that page later returns a 403
I'm using Scrapy to scrape job postings from a few websites. If a page on a site matches what I'm looking for, I store the link to that page in a database. No problem there. I also wrote a script that goes through every link in the database and pings the URL. If it returns a 404, the record is deleted. The problem I'm running into is that some sites return a 403 when I run the deletion check. Strangely, they all allow the scraping, but they block the check. This is the script I use for the deletion check:
from pymongo import MongoClient
import requests
import urllib3
from operator import itemgetter
import random
import time

client = MongoClient("path-to-mongo")
db = client["mongoDB"]
col = db['mongoCollection']

openings = list(col.find())
sorted_openings = sorted(openings, key=itemgetter('Company'))

del_counter = 0

user_agents = ["Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)",
               "Mozilla/5.0 CK={} (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
               "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
               "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
               "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko)"]

headers = {"User-Agent": user_agents[random.randint(0,11)]}

counter = 0
del_counter = 0
passed_counter = 0
deleted_links = []
passed_links = []
forbidden = []

for item in sorted_openings:
    try:
        if requests.get(item['Link'], allow_redirects=False, verify=False, headers=headers).status_code == 200:
            print(str(requests.get(item['Link'])) + ' ' + item['Link'])
            counter += 1
            print(counter)
        elif requests.get(item['Link'], allow_redirects=False, verify=False, headers=headers).status_code == 304:
            print(requests.get(item['Link']))
            counter += 1
            print(counter)
        elif requests.get(item['Link'], allow_redirects=False, verify=False, headers=headers).status_code == 403:
            forbidden.append(item['Link'])
            print(requests.get(item['Link']))
            counter += 1
            print(counter)
        else:
            db.openings.remove(item)
            deleted_links.append(item['Link'])
            del_counter += 1
            counter += 1
            print('Deleted ' + item['Link'])
            print(counter)
    except:
        pass
        passed_links.append(item['Link'])
        passed_counter += 1
        counter += 1
        print('Passed link ' + item['Link'])
        print(counter)
You are sending a new request in every condition. Send the request once, store the result in a variable, and then check the conditions against that variable:
from pymongo import MongoClient
import requests
import urllib3
from operator import itemgetter
import random
import time

client = MongoClient("path-to-mongo")
db = client["mongoDB"]
col = db['mongoCollection']

openings = list(col.find())
sorted_openings = sorted(openings, key=itemgetter('Company'))

del_counter = 0

user_agents = ["Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)",
               "Mozilla/5.0 CK={} (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
               "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
               "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
               "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko)"]

headers = {"User-Agent": user_agents[random.randint(0,11)]}

counter = 0
del_counter = 0
passed_counter = 0
deleted_links = []
passed_links = []
forbidden = []

for item in sorted_openings:
    try:
        # send the request once and reuse the response in every branch
        response = requests.get(item['Link'], allow_redirects=False, verify=False, headers=headers)
        if response.status_code == 200:
            print(str(response) + ' ' + item['Link'])
            counter += 1
            print(counter)
        elif response.status_code == 304:
            print(response)
            counter += 1
            print(counter)
        elif response.status_code == 403:
            forbidden.append(item['Link'])
            print(response)
            counter += 1
            print(counter)
        else:
            db.openings.remove(item)
            deleted_links.append(item['Link'])
            del_counter += 1
            counter += 1
            print('Deleted ' + item['Link'])
            print(counter)
    except:
        passed_links.append(item['Link'])
        passed_counter += 1
        counter += 1
        print('Passed link ' + item['Link'])
        print(counter)
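For illustration only, here is a minimal sketch (not taken from the answer above) of how the check loop could be tightened further: one request per link, a random User-Agent per request instead of one per run, a short pause between requests, and deletion only on an explicit 404/410. The field names ('Link', '_id') follow the script above; the delete uses pymongo's delete_one, which assumes a reasonably recent pymongo.

import random
import time

import requests

def check_openings(col, openings, user_agents, delay=1.0):
    session = requests.Session()          # reuse TCP connections across checks
    deleted, forbidden, passed = [], [], []
    for item in openings:
        url = item['Link']
        headers = {"User-Agent": random.choice(user_agents)}
        try:
            response = session.get(url, headers=headers,
                                   allow_redirects=False, timeout=10)
        except requests.RequestException:
            passed.append(url)            # network error: skip and keep the record
            continue
        if response.status_code in (404, 410):
            col.delete_one({'_id': item['_id']})   # page is gone: remove from MongoDB
            deleted.append(url)
        elif response.status_code == 403:
            forbidden.append(url)         # blocked: keep the record and review manually
        time.sleep(delay)                 # be polite; hammering a site invites 403s
    return deleted, forbidden, passed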