Scrape page using Python requests
I'm having some problems with web scraping. Here is my code:
from bs4 import BeautifulSoup
import requests
import re
import csv
import argparse

def save_csv_file(filename, array):
    with open(filename, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(["item_name", "item_price", "item_category"])
        writer.writerows(array)

def process_data(name, price, category):
    item_name = name.text if name else 'NA'
    item_price = price.text if price else 'NA'
    item_category = category.text if category else 'NA'
    item_name = item_name.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    item_price = item_price.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    item_category = item_category.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    return (item_name, item_price, item_category)

def do_scrap(filename, url, payload, headers):
    # Request the URL with parameters and headers
    r = requests.post(url, data=payload, headers=headers, allow_redirects=True)
    if r.status_code == 200:
        # Save the response content in the html variable
        html = r.content
        # Parse the HTML content with bs4
        parsed_html = BeautifulSoup(html, "html.parser")
        # Print the document title
        print parsed_html.head.find('title').text
        # Find all of the HTML elements that describe products
        tables = parsed_html.find_all("a", {"class": "result-link"})
        # Print the number of results
        print "Found %s records." % len(tables)
        # Empty helpers
        items = []
        count = 0
        # Loop over the HTML elements and collect the properties of each product
        for table in tables:
            name = table.find("h3", {"class": "result-title"})
            price = table.find("p", {"class": "price text-truncate"})
            category = table.find("p", {"class": "merchant-name text-truncate"})
            items.append(process_data(name, price, category))
            count += 1
        if count > 0:
            # Save the collected data to a csv file
            save_csv_file(filename=filename, array=items)
            # Print end-of-job info
            print "\n%s records downloaded and saved to %s." % (count, filename)
    else:
        print "Code error: %s" % r.status_code

if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", "--product", required=True, help="Product name")
    ap.add_argument("-c", "--category", default="", help="Product category")
    args = vars(ap.parse_args())
    product = args['product']
    category = args['category']
    payload = {
        'siteSearchQuery': product,
        'from': 'colibri'
    }
    headers = {
        'Host': 'www.kelkoo.co.uk',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36'
    }
    url = "http://www.kelkoo.co.uk/ctl/do/search"
    filename = "%s_co_uk_kelkoo_data.csv" % product
    do_scrap(
        filename=filename,
        url=url,
        payload=payload,
        headers=headers)
After this request I get a result that differs from what I see when I enter
www.kelkoo.co.uk/ctl/do/search?siteSearchQuery=nokia+130&from=colibri
into my web browser. What is causing the difference? Is it something to do with page redirection?
I can see multiple things that will cause you to get different results:
- You are making a POST instead of a GET request. Look up the params argument of requests.get; a minimal sketch follows this list.
- They modify the page with javascript.
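
Here is a minimal sketch of the GET approach, reusing the url, query parameters, and User-Agent from your script (the rest of your parsing can stay the same); requests encodes the params dict into the query string for you:

import requests

url = "http://www.kelkoo.co.uk/ctl/do/search"
params = {
    'siteSearchQuery': 'nokia 130',
    'from': 'colibri'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36'
}

# GET with params reproduces the browser URL:
# /ctl/do/search?siteSearchQuery=nokia+130&from=colibri
r = requests.get(url, params=params, headers=headers, allow_redirects=True)
print(r.url)          # final URL after encoding and any redirects
print(r.status_code)

If the result still differs after switching to GET, the gap is most likely the javascript point above: requests only returns the raw HTML, so anything the page builds client-side will never appear in r.content. A quick check is to search the raw response for a class you expect, e.g. print('result-link' in r.text).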