常见抓取请求 returns 403 WARC
Common Crawl Request returns 403 WARC
我正在尝试从 Common Crawl 公共存档中抓取一些 WARC 文件,但我向服务器发出的请求似乎一直失败。下面提供了一个最小的 Python 示例来复现该错误。我尝试在请求 header 中添加 User-Agent,但并没有帮助。关于如何继续,有什么想法吗?
import io
import time
import justext  # justext >= 2.2.0
import argparse
import requests  # requests >= 2.23.0
import pandas as pd  # pandas >= 1.0.3
from tqdm import tqdm
from warcio.archiveiterator import ArchiveIterator  # warcio >= 1.7.3


def debug():
    """Fetch a single WARC record from Common Crawl via an HTTP Range request.

    Returns:
        (uri, page): the 'WARC-Target-URI' header and the decoded body of the
        last record found in the requested byte range (both None if the range
        contained no records).
    """
    common_crawl_data = {
        "filename": "crawl-data/CC-MAIN-2016-07/segments/1454702018134.95/warc/CC-MAIN-20160205195338-00121-ip-10-236-182-209.ec2.internal.warc.gz",
        "offset": 244189209,
        "length": 989,
    }
    offset, length = int(common_crawl_data['offset']), int(common_crawl_data['length'])
    # HTTP Range end is inclusive, hence the -1.
    offset_end = offset + length - 1
    # Unauthenticated requests to commoncrawl.s3.amazonaws.com now return
    # 403; the free anonymous HTTP endpoint is data.commoncrawl.org.
    prefix = 'https://data.commoncrawl.org/'
    resp = requests.get(
        prefix + common_crawl_data['filename'],
        headers={'Range': 'bytes={}-{}'.format(offset, offset_end)},
    )
    # Fail loudly on 403/404 instead of feeding an HTML error page to warcio.
    resp.raise_for_status()
    raw_data = io.BytesIO(resp.content)
    uri = None
    page = None
    for record in ArchiveIterator(raw_data, arc2warc=True):
        uri = record.rec_headers.get_header('WARC-Target-URI')
        R = record.content_stream().read()
        try:
            page = R.strip().decode('utf-8')
        except UnicodeDecodeError:
            # Only fall back to latin1 on a decode failure; a bare except
            # would also hide unrelated bugs.
            page = R.strip().decode('latin1')
        print(uri, page)
    return uri, page


debug()
请参阅 this commoncrawl blog posting 了解最近为某些未经身份验证的请求生成 403 的更改。
我正在尝试从 Common Crawl 公共存档中抓取一些 WARC 文件,但我向服务器发出的请求似乎一直失败。下面是一个用来复现该错误的最小 Python 示例。我尝试过在请求 header 中添加 User-Agent,但并没有帮助。请问接下来该怎么排查?
import io
import time
import justext  # justext >= 2.2.0
import argparse
import requests  # requests >= 2.23.0
import pandas as pd  # pandas >= 1.0.3
from tqdm import tqdm
from warcio.archiveiterator import ArchiveIterator  # warcio >= 1.7.3


def debug():
    """Download one byte range of a Common Crawl WARC file and print its records.

    Returns:
        (uri, page): 'WARC-Target-URI' and decoded body of the last record in
        the range, or (None, None) when the range yields no records.
    """
    common_crawl_data = {
        "filename": "crawl-data/CC-MAIN-2016-07/segments/1454702018134.95/warc/CC-MAIN-20160205195338-00121-ip-10-236-182-209.ec2.internal.warc.gz",
        "offset": 244189209,
        "length": 989,
    }
    offset = int(common_crawl_data['offset'])
    length = int(common_crawl_data['length'])
    # The end of an HTTP byte range is inclusive.
    offset_end = offset + length - 1
    # The S3 endpoint (commoncrawl.s3.amazonaws.com) rejects unauthenticated
    # requests with 403; anonymous access goes through data.commoncrawl.org.
    prefix = 'https://data.commoncrawl.org/'
    headers = {'Range': 'bytes={}-{}'.format(offset, offset_end)}
    resp = requests.get(prefix + common_crawl_data['filename'], headers=headers)
    # Raise on HTTP errors so a 403 body is never parsed as WARC data.
    resp.raise_for_status()
    raw_data = io.BytesIO(resp.content)
    uri = None
    page = None
    for record in ArchiveIterator(raw_data, arc2warc=True):
        uri = record.rec_headers.get_header('WARC-Target-URI')
        body = record.content_stream().read()
        try:
            page = body.strip().decode('utf-8')
        except UnicodeDecodeError:
            # Narrow except: only catch decode failures, then retry as latin1.
            page = body.strip().decode('latin1')
        print(uri, page)
    return uri, page


debug()
请参阅 this commoncrawl blog posting 了解最近为某些未经身份验证的请求生成 403 的更改。