python 图片抓取工具,无法在 bing 上正常工作
python image scraper, not working properly on bing
我正在尝试构建图像抓取工具,我首先尝试 Google 但没有图像被抓取
所以我尝试了 Bing 并且成功了,但是有一些问题
- 被抓取的图片链接只是搜索引擎中显示内容的一小部分。
- 抓取到的图片来自预览中的未知页面。
- 图像默认是在安全搜索过滤器开启的情况下被抓取的。
我想抓取 bing.com/images/search 中显示的所有图像(或其中的某些页面),
但它只抓取到了很少一部分。
检查后我发现 Bing 中的图像链接存储在 'thumb' class 中,所以我抓取了所有带有 thumb class 的链接,
但看起来还不够。
在查看源代码后,我发现只有 thumb class 的链接实际上以 .jpg 结尾。
import requests
from bs4 import BeautifulSoup
import os
import random
from urllib.parse import urljoin
import string

url = "https://www.bing.com"
dir_name = "Result"
# Seconds to wait for Bing (or an image host) before giving up on a request.
REQUEST_TIMEOUT = 30


def generateRandomSequence():
    """Return a random file-name stem made of five letter+number pairs.

    Each pair is one ASCII letter (either case) followed by an integer in
    the range 1..999, e.g. ``"a12B7c999d1E45"``. The helper is defined
    before ``main`` so that it exists by the time the download loop calls it.
    """
    return "".join(
        random.choice(string.ascii_letters) + str(random.randrange(1, 1000))
        for _ in range(5)
    )


def main():
    """Search Bing Images and optionally download every 'thumb' link found.

    Prompts for a search term, collects the ``href`` of every
    ``<a class="thumb">`` on the result page, and, if the user confirms,
    saves each linked image into ``dir_name`` under a random name.
    """
    search = input("enter the search term: ")
    r = requests.get(
        url + "/images/search", params={"q": search}, timeout=REQUEST_TIMEOUT
    )
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")

    # getting links from thumb class; anchors without an href are skipped and
    # relative links are resolved against the Bing base URL so that
    # requests.get() always receives an absolute URL.
    links = [
        urljoin(url, anchor["href"])
        for anchor in soup.find_all("a", class_="thumb")
        if anchor.get("href")
    ]
    print("{0} results found with the search term: {1}".format(len(links), search))
    choice = input("Do You Want To Extract The Images? Y or N ")

    # Creating the Result named directory if it didn't exist
    if not os.path.isdir(dir_name):
        print("[+] Creating Directory Named '{0}'".format(dir_name))
        os.mkdir(dir_name)

    if choice not in ("Y", "y"):
        return

    for n, link in enumerate(links, start=1):
        print("[+] Extracting Image #", n)
        try:
            req = requests.get(link, timeout=REQUEST_TIMEOUT)
            req.raise_for_status()
        except requests.RequestException as exc:
            # One dead link should not abort the remaining downloads.
            print("[-] Skipping {0}: {1}".format(link, exc))
            continue
        # The default titles taken from the URL were unreliable, so files
        # get a randomly generated name instead.
        target = os.path.join(dir_name, generateRandomSequence() + ".jpg")
        with open(target, "wb") as img:
            img.write(req.content)


if __name__ == "__main__":
    main()
这是给你的一个抓取脚本:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urlencode

SEARCH_URL = "https://bing.com/images/search"
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"


def build_search_url(seartext, count, adlt="off"):
    """Return the Bing image-search URL for the given query.

    ``seartext`` is stripped of surrounding whitespace and URL-encoded
    (spaces become ``+``; characters such as ``&`` are escaped instead of
    corrupting the query string). ``count`` is the number of images to
    request and ``adlt`` is the safe-search level: ``'off'`` by default,
    can be set to ``'moderate'``.
    """
    query = {"q": seartext.strip(), "safeSearch": adlt, "count": str(count).strip()}
    return SEARCH_URL + "?" + urlencode(query)


def parse_media_url(metadata):
    """Return the ``murl`` entry of an anchor's ``m`` attribute, or ``None``.

    The attribute is parsed as JSON rather than passed to ``eval``: it comes
    from a remote page, so evaluating it would run arbitrary code. Missing
    attributes, malformed JSON and entries without ``murl`` all yield ``None``.
    """
    try:
        return json.loads(metadata)["murl"]
    except (TypeError, ValueError, KeyError):
        return None


def main():
    """Prompt for a query and print the full-size image URL of each result."""
    seartext = input("enter the search term: ")
    count = input("Enter the number of images you need:")
    URL = build_search_url(seartext, count)
    print(URL)

    # A browser-like user agent is sent along with the request.
    resp = requests.get(URL, headers={"user-agent": USER_AGENT}, timeout=30)
    soup = BeautifulSoup(resp.content, "html.parser")

    for anchor in soup.find_all("a", class_="iusc"):
        murl = parse_media_url(anchor.get("m"))
        if murl is not None:
            print(murl)
            print()


if __name__ == "__main__":
    main()
在这里,您可以找到 Bing 的查询参数。
我正在尝试构建图像抓取工具,我首先尝试 Google 但没有图像被抓取 所以我尝试了 Bing 并且成功了,但是有一些问题
- 被抓取的图片链接只是搜索引擎中显示内容的一小部分。
- 抓取到的图片来自预览中的未知页面。
- 图像默认是在安全搜索过滤器开启的情况下被抓取的。
我想抓取 bing.com/images/search 中显示的所有图像(或其中的某些页面),但它只抓取到了很少一部分。
检查后我发现 Bing 中的图像链接存储在 'thumb' class 中,所以我抓取了所有带有 thumb class 的链接,但看起来还不够。
在查看源代码后,我发现只有 thumb class 的链接实际上以 .jpg 结尾。
import requests
from bs4 import BeautifulSoup
import os
import random
from urllib.parse import urljoin
import string

url = "https://www.bing.com"
dir_name = "Result"
# Seconds to wait for Bing (or an image host) before giving up on a request.
REQUEST_TIMEOUT = 30


def generateRandomSequence():
    """Return a random file-name stem made of five letter+number pairs.

    Each pair is one ASCII letter (either case) followed by an integer in
    the range 1..999, e.g. ``"a12B7c999d1E45"``. The helper is defined
    before ``main`` so that it exists by the time the download loop calls it.
    """
    return "".join(
        random.choice(string.ascii_letters) + str(random.randrange(1, 1000))
        for _ in range(5)
    )


def main():
    """Search Bing Images and optionally download every 'thumb' link found.

    Prompts for a search term, collects the ``href`` of every
    ``<a class="thumb">`` on the result page, and, if the user confirms,
    saves each linked image into ``dir_name`` under a random name.
    """
    search = input("enter the search term: ")
    r = requests.get(
        url + "/images/search", params={"q": search}, timeout=REQUEST_TIMEOUT
    )
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")

    # getting links from thumb class; anchors without an href are skipped and
    # relative links are resolved against the Bing base URL so that
    # requests.get() always receives an absolute URL.
    links = [
        urljoin(url, anchor["href"])
        for anchor in soup.find_all("a", class_="thumb")
        if anchor.get("href")
    ]
    print("{0} results found with the search term: {1}".format(len(links), search))
    choice = input("Do You Want To Extract The Images? Y or N ")

    # Creating the Result named directory if it didn't exist
    if not os.path.isdir(dir_name):
        print("[+] Creating Directory Named '{0}'".format(dir_name))
        os.mkdir(dir_name)

    if choice not in ("Y", "y"):
        return

    for n, link in enumerate(links, start=1):
        print("[+] Extracting Image #", n)
        try:
            req = requests.get(link, timeout=REQUEST_TIMEOUT)
            req.raise_for_status()
        except requests.RequestException as exc:
            # One dead link should not abort the remaining downloads.
            print("[-] Skipping {0}: {1}".format(link, exc))
            continue
        # The default titles taken from the URL were unreliable, so files
        # get a randomly generated name instead.
        target = os.path.join(dir_name, generateRandomSequence() + ".jpg")
        with open(target, "wb") as img:
            img.write(req.content)


if __name__ == "__main__":
    main()
这是给你的一个抓取脚本:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urlencode

SEARCH_URL = "https://bing.com/images/search"
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"


def build_search_url(seartext, count, adlt="off"):
    """Return the Bing image-search URL for the given query.

    ``seartext`` is stripped of surrounding whitespace and URL-encoded
    (spaces become ``+``; characters such as ``&`` are escaped instead of
    corrupting the query string). ``count`` is the number of images to
    request and ``adlt`` is the safe-search level: ``'off'`` by default,
    can be set to ``'moderate'``.
    """
    query = {"q": seartext.strip(), "safeSearch": adlt, "count": str(count).strip()}
    return SEARCH_URL + "?" + urlencode(query)


def parse_media_url(metadata):
    """Return the ``murl`` entry of an anchor's ``m`` attribute, or ``None``.

    The attribute is parsed as JSON rather than passed to ``eval``: it comes
    from a remote page, so evaluating it would run arbitrary code. Missing
    attributes, malformed JSON and entries without ``murl`` all yield ``None``.
    """
    try:
        return json.loads(metadata)["murl"]
    except (TypeError, ValueError, KeyError):
        return None


def main():
    """Prompt for a query and print the full-size image URL of each result."""
    seartext = input("enter the search term: ")
    count = input("Enter the number of images you need:")
    URL = build_search_url(seartext, count)
    print(URL)

    # A browser-like user agent is sent along with the request.
    resp = requests.get(URL, headers={"user-agent": USER_AGENT}, timeout=30)
    soup = BeautifulSoup(resp.content, "html.parser")

    for anchor in soup.find_all("a", class_="iusc"):
        murl = parse_media_url(anchor.get("m"))
        if murl is not None:
            print(murl)
            print()


if __name__ == "__main__":
    main()
在这里,您可以找到 Bing 的查询参数。