无法从 Flipkart.com 网站抓取图片,抓取到的 src 属性为空
Not able to scrape the images from the Flipkart.com website: the src attribute is coming back empty
我能够抓取 flipkart 网站上除图片之外的所有数据
使用以下代码:
# Print name, offer price, MRP, link and image container for every product
# card on the parsed page. `soup` (the parsed document) and `url` (the prefix
# put in front of each href) must already be defined by earlier code.


def _text_or_na(node):
    """Return the text of *node*, or "N/A" when the lookup found nothing."""
    return node.text if node else "N/A"


for card in soup.find_all('div', attrs={"class": "IIdQZO _1R0K0g _1SSAGr"}):
    name = _text_or_na(card.find('a', attrs={'class': '_2mylT6'}))
    offer_price = _text_or_na(card.find('div', attrs={'class': '_1vC4OE'}))
    mrp = _text_or_na(card.find('div', attrs={'class': '_3auQ3N'}))

    anchor = card.find('a', attrs={'class': '_3dqZjq'})
    href = anchor.get('href') if anchor else "N/A"
    link = url + href

    # This is the wrapping <div>, not the <img> itself, so the whole element
    # (including whatever `src` the parsed HTML carries) is printed.
    image_box = card.find('div', attrs={'class': '_3ZJShS _31bMyl'})

    report = 'product name {}\nproduct offer price {}\nproduct mrp {}\nproduct link {}\nproduct image {}'
    print(report.format(name, offer_price, mrp, link, image_box))
    print('\n')
结果例如:
product name UV Protection Wayfarer Sunglasses (54)
product offer price ₹8,000
product mrp ₹8,890
product link https://www.flipkart.com/search?q=rayban/ray-ban-wayfarer-
product image <img alt="" class="_3togXc" src=""/>
当我手动检查页面时,src 在那里,但是当抓取它时,它像上面一样变空了
图片的 src 属性是由页面中的 JavaScript 动态添加的。
您应该使用 selenium 获取页面源代码。
检查代码:
from bs4 import BeautifulSoup as soup
from selenium import webdriver

url = 'https://www.flipkart.com/search?q=rayban'

# The image `src` values are added by JavaScript, so the HTML is taken from a
# real browser session instead of a plain HTTP response.
driver = webdriver.Firefox()
try:
    driver.get(url)
    html = driver.page_source
finally:
    # Close the browser even when loading the page fails.
    driver.quit()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
page = soup(html, 'html.parser')

jobs = page.find_all('div', {"class": "IIdQZO _1R0K0g _1SSAGr"})
for job in jobs:
    product_name = job.find('a', {'class': '_2mylT6'})
    product_name = product_name.text if product_name else "N/A"

    product_offer_price = job.find('div', {'class': '_1vC4OE'})
    product_offer_price = product_offer_price.text if product_offer_price else "N/A"

    product_mrp = job.find('div', {'class': '_3auQ3N'})
    product_mrp = product_mrp.text if product_mrp else "N/A"

    # NOTE: the href is appended to the search URL, so the printed link keeps
    # the `search?q=rayban` prefix; join against the site root instead if a
    # directly usable link is needed.
    link_anchor = job.find('a', {'class': '_3dqZjq'})
    href = link_anchor.get('href') if link_anchor else None
    product_link = url + href if href else "N/A"

    # The wrapping <div> is printed as-is so the populated <img src="...">
    # inside it is visible in the output.
    product_img = job.find('div', {'class': '_3ZJShS _31bMyl'})

    print('product name {}\nproduct offer price {}\nproduct mrp {}\nproduct link {}\nproduct image {}'.format(
        product_name, product_offer_price, product_mrp, product_link, product_img))
    print('\n')
输出:
product name Aviator Sunglasses (58)
product offer price ₹4,760
product mrp ₹5,290
product link https://www.flipkart.com/search?q=rayban/ray-ban-aviator-sunglasses/p/itmf3yh25kzjsapz?pid=SGLDPXDY4CMRNEY9&lid=LSTSGLDPXDY4CMRNEY9TNASSX&marketplace=FLIPKART&srno=s_1_1&otracker=search&fm=organic&iid=5ed8f60d-e315-4440-bfdc-76049d80e5da.SGLDPXDY4CMRNEY9.SEARCH&qH=95c0daefc80c4a70
product image <div class="_3ZJShS _31bMyl" style="padding-top:120.00%"><img alt="" class="_3togXc" src="https://rukminim1.flixcart.com/image/454/545/sunglass/e/y/9/0rb3129iw0228-rayban-58-original-imadqb2nzmwzfup6.jpeg?q=50"/></div>
.
.
.
正如我在评论中提到的,请使用 Selenium。
import time
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver

# Site root that the product hrefs are resolved against.
url = 'https://www.flipkart.com'

driver = webdriver.Chrome()
try:
    driver.get('https://www.flipkart.com/search?q=rayban/ray-ban-wayfarer')
    # Pause before reading the page source -- presumably to let the page's
    # scripts fill in the image `src` values.
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Close the browser even when loading or parsing fails.
    driver.quit()

jobs = soup.find_all('div', {"class": "IIdQZO _1R0K0g _1SSAGr"})
for job in jobs:
    product_name = job.find('a', {'class': '_2mylT6'})
    product_name = product_name.text if product_name else "N/A"

    product_offer_price = job.find('div', {'class': '_1vC4OE'})
    product_offer_price = product_offer_price.text if product_offer_price else "N/A"

    product_mrp = job.find('div', {'class': '_3auQ3N'})
    product_mrp = product_mrp.text if product_mrp else "N/A"

    # Only build a link when an href was actually found; otherwise report
    # "N/A" instead of gluing the placeholder onto the site root.
    link_anchor = job.find('a', {'class': '_3dqZjq'})
    href = link_anchor.get('href') if link_anchor else None
    product_link = urljoin(url, href) if href else "N/A"

    # Step through the lookups one at a time so a card without an image
    # container, <img> tag or `src` value yields "N/A" instead of raising.
    image_box = job.find('div', {'class': '_3ZJShS _31bMyl'})
    image_tag = image_box.find('img') if image_box else None
    product_img = (image_tag.get('src') if image_tag else None) or "N/A"

    print('product name {}\nproduct offer price {}\nproduct mrp {}\nproduct link {}\nproduct image {}'.format(
        product_name, product_offer_price, product_mrp, product_link, product_img))
    print('\n')
我能够抓取 flipkart 网站上除图片之外的所有数据 使用以下代码:
# Report the scraped fields of each product card found in `soup`. Both `soup`
# (the parsed page) and `url` (prefix for the product href) are expected to
# exist already; they are not defined in this snippet.
REPORT = 'product name {}\nproduct offer price {}\nproduct mrp {}\nproduct link {}\nproduct image {}'

cards = soup.find_all('div', attrs={"class": "IIdQZO _1R0K0g _1SSAGr"})
for card in cards:
    name_tag = card.find('a', attrs={'class': '_2mylT6'})
    price_tag = card.find('div', attrs={'class': '_1vC4OE'})
    mrp_tag = card.find('div', attrs={'class': '_3auQ3N'})
    link_tag = card.find('a', attrs={'class': '_3dqZjq'})
    # The image lookup returns the wrapping <div>; it is printed unchanged.
    image_box = card.find('div', attrs={'class': '_3ZJShS _31bMyl'})

    # Every lookup that found nothing is reported as "N/A".
    name = name_tag.text if name_tag else "N/A"
    offer_price = price_tag.text if price_tag else "N/A"
    mrp = mrp_tag.text if mrp_tag else "N/A"
    href = link_tag.get('href') if link_tag else "N/A"

    print(REPORT.format(name, offer_price, mrp, url + href, image_box))
    print('\n')
结果例如:
product name UV Protection Wayfarer Sunglasses (54)
product offer price ₹8,000
product mrp ₹8,890
product link https://www.flipkart.com/search?q=rayban/ray-ban-wayfarer-
product image <img alt="" class="_3togXc" src=""/>
当我手动检查页面时,src 在那里,但是当抓取它时,它像上面一样变空了
图片的 src 属性是由页面中的 JavaScript 动态添加的。
您应该使用 selenium 获取页面源代码。
检查代码:
from bs4 import BeautifulSoup as soup
from selenium import webdriver

url = 'https://www.flipkart.com/search?q=rayban'

# The image `src` values are added by JavaScript, so the HTML is taken from a
# real browser session instead of a plain HTTP response.
driver = webdriver.Firefox()
try:
    driver.get(url)
    html = driver.page_source
finally:
    # Close the browser even when loading the page fails.
    driver.quit()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
page = soup(html, 'html.parser')

jobs = page.find_all('div', {"class": "IIdQZO _1R0K0g _1SSAGr"})
for job in jobs:
    product_name = job.find('a', {'class': '_2mylT6'})
    product_name = product_name.text if product_name else "N/A"

    product_offer_price = job.find('div', {'class': '_1vC4OE'})
    product_offer_price = product_offer_price.text if product_offer_price else "N/A"

    product_mrp = job.find('div', {'class': '_3auQ3N'})
    product_mrp = product_mrp.text if product_mrp else "N/A"

    # NOTE: the href is appended to the search URL, so the printed link keeps
    # the `search?q=rayban` prefix; join against the site root instead if a
    # directly usable link is needed.
    link_anchor = job.find('a', {'class': '_3dqZjq'})
    href = link_anchor.get('href') if link_anchor else None
    product_link = url + href if href else "N/A"

    # The wrapping <div> is printed as-is so the populated <img src="...">
    # inside it is visible in the output.
    product_img = job.find('div', {'class': '_3ZJShS _31bMyl'})

    print('product name {}\nproduct offer price {}\nproduct mrp {}\nproduct link {}\nproduct image {}'.format(
        product_name, product_offer_price, product_mrp, product_link, product_img))
    print('\n')
输出:
product name Aviator Sunglasses (58)
product offer price ₹4,760
product mrp ₹5,290
product link https://www.flipkart.com/search?q=rayban/ray-ban-aviator-sunglasses/p/itmf3yh25kzjsapz?pid=SGLDPXDY4CMRNEY9&lid=LSTSGLDPXDY4CMRNEY9TNASSX&marketplace=FLIPKART&srno=s_1_1&otracker=search&fm=organic&iid=5ed8f60d-e315-4440-bfdc-76049d80e5da.SGLDPXDY4CMRNEY9.SEARCH&qH=95c0daefc80c4a70
product image <div class="_3ZJShS _31bMyl" style="padding-top:120.00%"><img alt="" class="_3togXc" src="https://rukminim1.flixcart.com/image/454/545/sunglass/e/y/9/0rb3129iw0228-rayban-58-original-imadqb2nzmwzfup6.jpeg?q=50"/></div>
.
.
.
正如我在评论中提到的,请使用 Selenium。
import time
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver

# Site root that the product hrefs are resolved against.
url = 'https://www.flipkart.com'

driver = webdriver.Chrome()
try:
    driver.get('https://www.flipkart.com/search?q=rayban/ray-ban-wayfarer')
    # Pause before reading the page source -- presumably to let the page's
    # scripts fill in the image `src` values.
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Close the browser even when loading or parsing fails.
    driver.quit()

jobs = soup.find_all('div', {"class": "IIdQZO _1R0K0g _1SSAGr"})
for job in jobs:
    product_name = job.find('a', {'class': '_2mylT6'})
    product_name = product_name.text if product_name else "N/A"

    product_offer_price = job.find('div', {'class': '_1vC4OE'})
    product_offer_price = product_offer_price.text if product_offer_price else "N/A"

    product_mrp = job.find('div', {'class': '_3auQ3N'})
    product_mrp = product_mrp.text if product_mrp else "N/A"

    # Only build a link when an href was actually found; otherwise report
    # "N/A" instead of gluing the placeholder onto the site root.
    link_anchor = job.find('a', {'class': '_3dqZjq'})
    href = link_anchor.get('href') if link_anchor else None
    product_link = urljoin(url, href) if href else "N/A"

    # Step through the lookups one at a time so a card without an image
    # container, <img> tag or `src` value yields "N/A" instead of raising.
    image_box = job.find('div', {'class': '_3ZJShS _31bMyl'})
    image_tag = image_box.find('img') if image_box else None
    product_img = (image_tag.get('src') if image_tag else None) or "N/A"

    print('product name {}\nproduct offer price {}\nproduct mrp {}\nproduct link {}\nproduct image {}'.format(
        product_name, product_offer_price, product_mrp, product_link, product_img))
    print('\n')