使用 BeautifulSoup 并带请求 header 抓取图片
scrap image with request header on beautifulsoup
我有一段抓取图片的代码:
import requests, base64
from bs4 import BeautifulSoup
# Google search URL (tbm=shop) for the query "cat".
baseurl = "https://www.google.com/search?q=cat&sxsrf=APq-WBuyx07rsOeGlVQpTsxLt262WbhlfA:1650636332756&source=lnms&tbm=shop&sa=X&ved=2ahUKEwjQr5HC66f3AhXxxzgGHejKC9sQ_AUoAXoECAIQAw&biw=1920&bih=937&dpr=1"
# Browser-like User-Agent sent with the request.
headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0"}
r_images = requests.get(url=baseurl, headers=headers)
soup_for_image = BeautifulSoup(r_images.text, 'html.parser')

# Collect the src of every <img> on the page, except inline SVG data URIs.
productimages = []
product_images = soup_for_image.find_all('img')
for item in product_images:
    # Read src with .get(): an <img> without a src attribute would raise
    # KeyError on item['src'], so such tags are skipped instead.
    src = item.get('src')
    if src is None:
        continue
    if "data:image/svg+xml" not in src:
        productimages.append(src)
print(productimages)
如果不带 header 就没问题,但是,如果我使用请求 header,结果将是 base64 图像。那么有什么方法可以让我带着请求 headers 抓取图片?
您可以添加 CONSENT cookie,这样就有效了。
将来某些选择器可能会改变。
import requests, base64
from bs4 import BeautifulSoup
# Google search URL (tbm=shop) for the query "cat".
baseurl = "https://www.google.com/search?q=cat&sxsrf=APq-WBuyx07rsOeGlVQpTsxLt262WbhlfA:1650636332756&source=lnms&tbm=shop&sa=X&ved=2ahUKEwjQr5HC66f3AhXxxzgGHejKC9sQ_AUoAXoECAIQAw&biw=1920&bih=937&dpr=1"
# The only header sent is a CONSENT cookie.
headers = {"cookie": "CONSENT=YES+cb.20230531-04-p0.en+FX+908"}
result = requests.get(url=baseurl, headers=headers)
soup = BeautifulSoup(result.text, 'html.parser')

# NOTE(review): the class names below are hard-coded selectors for the
# returned markup and may stop matching if the page layout changes.
allProducts = soup.findAll(class_="u30d4")
number = 0
for card in allProducts:
    title_tag = card.find(class_="rgHvZc")
    # Entries without a name element are not counted or printed.
    if title_tag is None:
        continue

    number += 1
    print("Product number %d:" % number)
    print("Name : " + title_tag.text)

    link_tag = card.find('a')
    href = link_tag["href"]
    # Drops the first 7 characters of the href -- presumably a "/url?q="
    # redirect prefix; TODO confirm against the actual markup.
    print("Link: " + href[7:])

    image_tag = card.find('img')
    print("Image: " + image_tag["src"])

    price_tag = card.find(class_="HRLxBb")
    print("Price " + price_tag.text)
希望能帮到你。
我有一段抓取图片的代码:
import requests, base64
from bs4 import BeautifulSoup
# Google search URL (tbm=shop) for the query "cat".
baseurl = "https://www.google.com/search?q=cat&sxsrf=APq-WBuyx07rsOeGlVQpTsxLt262WbhlfA:1650636332756&source=lnms&tbm=shop&sa=X&ved=2ahUKEwjQr5HC66f3AhXxxzgGHejKC9sQ_AUoAXoECAIQAw&biw=1920&bih=937&dpr=1"
# Browser-like User-Agent sent with the request.
headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0"}
r_images = requests.get(url=baseurl, headers=headers)
soup_for_image = BeautifulSoup(r_images.text, 'html.parser')

# Collect the src of every <img> on the page, except inline SVG data URIs.
productimages = []
product_images = soup_for_image.find_all('img')
for item in product_images:
    # Read src with .get(): an <img> without a src attribute would raise
    # KeyError on item['src'], so such tags are skipped instead.
    src = item.get('src')
    if src is None:
        continue
    if "data:image/svg+xml" not in src:
        productimages.append(src)
print(productimages)
如果不带 header 就没问题,但是,如果我使用请求 header,结果将是 base64 图像。那么有什么方法可以让我带着请求 headers 抓取图片?
您可以添加 CONSENT cookie,这样就有效了。
将来某些选择器可能会改变。
import requests, base64
from bs4 import BeautifulSoup
# Google search URL (tbm=shop) for the query "cat".
baseurl = "https://www.google.com/search?q=cat&sxsrf=APq-WBuyx07rsOeGlVQpTsxLt262WbhlfA:1650636332756&source=lnms&tbm=shop&sa=X&ved=2ahUKEwjQr5HC66f3AhXxxzgGHejKC9sQ_AUoAXoECAIQAw&biw=1920&bih=937&dpr=1"
# The only header sent is a CONSENT cookie.
headers = {"cookie": "CONSENT=YES+cb.20230531-04-p0.en+FX+908"}
result = requests.get(url=baseurl, headers=headers)
soup = BeautifulSoup(result.text, 'html.parser')

# NOTE(review): the class names below are hard-coded selectors for the
# returned markup and may stop matching if the page layout changes.
allProducts = soup.findAll(class_="u30d4")
number = 0
for card in allProducts:
    title_tag = card.find(class_="rgHvZc")
    # Entries without a name element are not counted or printed.
    if title_tag is None:
        continue

    number += 1
    print("Product number %d:" % number)
    print("Name : " + title_tag.text)

    link_tag = card.find('a')
    href = link_tag["href"]
    # Drops the first 7 characters of the href -- presumably a "/url?q="
    # redirect prefix; TODO confirm against the actual markup.
    print("Link: " + href[7:])

    image_tag = card.find('img')
    print("Image: " + image_tag["src"])

    price_tag = card.find(class_="HRLxBb")
    print("Price " + price_tag.text)
希望能帮到你。