尝试抓取图片网址,但使用 Beautiful Soup 和 Python 无法获取
Trying to scrape image URLs but unable to get them using Beautiful Soup and Python
我正在尝试用下面的代码抓取页面并获取图片 URL:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
# Page to scrape (the tracking query string is kept exactly as posted).
AMEXurl = ['https://www.americanexpress.com/in/credit-cards/all-cards/?sourcecode=A0000FCRAA&cpid=100370494&dsparms=dc_pcrid_408453063287_kword_american%20express%20credit%20card_match_e&gclid=Cj0KCQiApY6BBhCsARIsAOI_GjaRsrXTdkvQeJWvKzFy_9BhDeBe2L2N668733FSHTHm96wrPGxkv7YaAl6qEALw_wcB&gclsrc=aw.ds']
# CSS class of the outer <div> that is searched for image containers.
identity = ['filmstrip_container']

html_1 = urlopen(AMEXurl[0])
soup_1 = BeautifulSoup(html_1, 'lxml')
address = soup_1.find('div', attrs={"class": identity[0]})

containers = address.find_all('div', class_='filmstrip-imgContainer')
for container in containers:
    inner_div = container.find('div')
    # NOTE: Tag.get() reads an *attribute* called "img" from the inner <div>;
    # it does not look for a child <img> tag, so None is printed each time.
    print(inner_div.get('img'))
但我得到如下输出:
None
None
None
None
None
None
None
下面是 HTML 代码的截图,我想从中获取图片网址:
这是我想从中获取网址的页面部分
我想知道是否需要对代码进行任何更改,以便我获取所有图片网址。
试试这个
import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
import requests
import re
# Page to scrape (the tracking query string is kept exactly as posted).
AMEXurl = ['https://www.americanexpress.com/in/credit-cards/all-cards/?sourcecode=A0000FCRAA&cpid=100370494&dsparms=dc_pcrid_408453063287_kword_american%20express%20credit%20card_match_e&gclid=Cj0KCQiApY6BBhCsARIsAOI_GjaRsrXTdkvQeJWvKzFy_9BhDeBe2L2N668733FSHTHm96wrPGxkv7YaAl6qEALw_wcB&gclsrc=aw.ds']
# CSS class name carried over from the question's code.
identity = ['filmstrip_container']

# Download the page once. The earlier extra urlopen() call fetched the same
# URL a second time and its response was never read or closed, so it is gone.
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(AMEXurl[0], timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()
soup_1 = BeautifulSoup(r.content, 'lxml')
提取所有图像
# Print the src of every <img> tag that actually carries a src attribute.
for img_tag in soup_1.find_all('img', src=True):
    print(img_tag['src'])
查找 src 以 Platinum_Card.png 结尾的图像标签,并打印其地址。
# Locate the first <img> whose src ends with "Platinum_Card.png".
# The pattern is a raw string so that "\." reaches the regex engine as an
# escaped dot instead of being treated as an invalid string escape.
platinum_card_image = soup_1.find('img', src=re.compile(r'Platinum_Card\.png$'))
# find() returns None when nothing matches; guard against that so the script
# does not die with AttributeError on a page without this image.
if platinum_card_image is not None:
    print(platinum_card_image.get('src'))
else:
    print('Platinum_Card.png image not found')
显示 svg 文件的所有图像标签。
# Collect every <img> whose src ends in ".svg" and print each address.
# Raw string: "\." must reach the regex engine as an escaped dot rather than
# being parsed as an invalid string escape.
svg_images = soup_1.find_all('img', src=re.compile(r'\.svg$'))
for img in svg_images:
    print(img.get('src'))
编辑
# The card data sits in a <script> tag as "__REDUX_STATE__ = {...json...}".
state_marker = '__REDUX_STATE__ = '

# Find the script by its content instead of a fixed position ([8]); the
# position shifts as soon as the page adds or removes any other <script>.
state_script = next(
    (
        tag.string
        for tag in soup_1.find_all('script')
        if tag.string and state_marker in tag.string
    ),
    None,
)
if state_script is None:
    raise ValueError('no <script> tag containing __REDUX_STATE__ was found')

data = state_script.split(state_marker, 1)[1]
# raw_decode() parses the JSON value at the start of the text and ignores
# anything after it (e.g. a trailing ";"), which json.loads() would reject.
state, _ = json.JSONDecoder().raw_decode(data.lstrip())

for d in state["modelData"]['componentFeaturedCards']['cards']:
    print(d['imgurl'])
它们是由 script 标签中的数据动态加载的。您可以直接用正则表达式从响应的 .text 中提取它们。下面的正则表达式专门匹配您想要获取、并在截图中显示的 7 张图片。
import requests, re
# Download the page and keep the decoded body text.
page_text = requests.get('https://www.americanexpress.com/in/credit-cards/all-cards/?sourcecode=A0000FCRAA&cpid=100370494&dsparms=dc_pcrid_408453063287_kword_american%20express%20credit%20card_match_e&gclid=Cj0KCQiApY6BBhCsARIsAOI_GjaRsrXTdkvQeJWvKzFy_9BhDeBe2L2N668733FSHTHm96wrPGxkv7YaAl6qEALw_wcB&gclsrc=aw.ds').text
# Non-greedy capture of each value that follows an `imgurl":"` key, up to the
# next double quote.
links = re.findall(r'imgurl":"(.*?)"', page_text)
print(links)
正则表达式解释:
如果您决定使用开销更大的 Selenium,则可以这样匹配:
# NOTE(review): `driver` is assumed to be a Selenium WebDriver that has
# already loaded the page -- confirm against the surrounding setup.
# WebDriver has no `find_all_elements_with_css_selector`; elements are located
# with find_elements(by, value), where "css selector" is the value of
# By.CSS_SELECTOR. WebElement objects are not subscriptable, so the attribute
# is read with get_attribute() instead of ['src'].
links = [
    img.get_attribute('src')
    for img in driver.find_elements('css selector', '.filmstrip-imgContainer img')
]
我正在尝试用下面的代码抓取页面并获取图片 URL:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
# Page to scrape (the tracking query string is kept exactly as posted).
AMEXurl = ['https://www.americanexpress.com/in/credit-cards/all-cards/?sourcecode=A0000FCRAA&cpid=100370494&dsparms=dc_pcrid_408453063287_kword_american%20express%20credit%20card_match_e&gclid=Cj0KCQiApY6BBhCsARIsAOI_GjaRsrXTdkvQeJWvKzFy_9BhDeBe2L2N668733FSHTHm96wrPGxkv7YaAl6qEALw_wcB&gclsrc=aw.ds']
# CSS class of the outer <div> that is searched for image containers.
identity = ['filmstrip_container']

html_1 = urlopen(AMEXurl[0])
soup_1 = BeautifulSoup(html_1, 'lxml')
address = soup_1.find('div', attrs={"class": identity[0]})

containers = address.find_all('div', class_='filmstrip-imgContainer')
for container in containers:
    inner_div = container.find('div')
    # NOTE: Tag.get() reads an *attribute* called "img" from the inner <div>;
    # it does not look for a child <img> tag, so None is printed each time.
    print(inner_div.get('img'))
但我得到如下输出:
None
None
None
None
None
None
None
下面是 HTML 代码的截图,我想从中获取图片网址:
这是我想从中获取网址的页面部分
我想知道是否需要对代码进行任何更改,以便我获取所有图片网址。
试试这个
import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
import requests
import re
# Page to scrape (the tracking query string is kept exactly as posted).
AMEXurl = ['https://www.americanexpress.com/in/credit-cards/all-cards/?sourcecode=A0000FCRAA&cpid=100370494&dsparms=dc_pcrid_408453063287_kword_american%20express%20credit%20card_match_e&gclid=Cj0KCQiApY6BBhCsARIsAOI_GjaRsrXTdkvQeJWvKzFy_9BhDeBe2L2N668733FSHTHm96wrPGxkv7YaAl6qEALw_wcB&gclsrc=aw.ds']
# CSS class name carried over from the question's code.
identity = ['filmstrip_container']

# Download the page once. The earlier extra urlopen() call fetched the same
# URL a second time and its response was never read or closed, so it is gone.
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(AMEXurl[0], timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()
soup_1 = BeautifulSoup(r.content, 'lxml')
提取所有图像
# Print the src of every <img> tag that actually carries a src attribute.
for img_tag in soup_1.find_all('img', src=True):
    print(img_tag['src'])
查找 src 以 Platinum_Card.png 结尾的图像标签,并打印其地址。
# Locate the first <img> whose src ends with "Platinum_Card.png".
# The pattern is a raw string so that "\." reaches the regex engine as an
# escaped dot instead of being treated as an invalid string escape.
platinum_card_image = soup_1.find('img', src=re.compile(r'Platinum_Card\.png$'))
# find() returns None when nothing matches; guard against that so the script
# does not die with AttributeError on a page without this image.
if platinum_card_image is not None:
    print(platinum_card_image.get('src'))
else:
    print('Platinum_Card.png image not found')
显示 svg 文件的所有图像标签。
# Collect every <img> whose src ends in ".svg" and print each address.
# Raw string: "\." must reach the regex engine as an escaped dot rather than
# being parsed as an invalid string escape.
svg_images = soup_1.find_all('img', src=re.compile(r'\.svg$'))
for img in svg_images:
    print(img.get('src'))
编辑
# The card data sits in a <script> tag as "__REDUX_STATE__ = {...json...}".
state_marker = '__REDUX_STATE__ = '

# Find the script by its content instead of a fixed position ([8]); the
# position shifts as soon as the page adds or removes any other <script>.
state_script = next(
    (
        tag.string
        for tag in soup_1.find_all('script')
        if tag.string and state_marker in tag.string
    ),
    None,
)
if state_script is None:
    raise ValueError('no <script> tag containing __REDUX_STATE__ was found')

data = state_script.split(state_marker, 1)[1]
# raw_decode() parses the JSON value at the start of the text and ignores
# anything after it (e.g. a trailing ";"), which json.loads() would reject.
state, _ = json.JSONDecoder().raw_decode(data.lstrip())

for d in state["modelData"]['componentFeaturedCards']['cards']:
    print(d['imgurl'])
它们是由 script 标签中的数据动态加载的。您可以直接用正则表达式从响应的 .text 中提取它们。下面的正则表达式专门匹配您想要获取、并在截图中显示的 7 张图片。
import requests, re
# Download the page and keep the decoded body text.
page_text = requests.get('https://www.americanexpress.com/in/credit-cards/all-cards/?sourcecode=A0000FCRAA&cpid=100370494&dsparms=dc_pcrid_408453063287_kword_american%20express%20credit%20card_match_e&gclid=Cj0KCQiApY6BBhCsARIsAOI_GjaRsrXTdkvQeJWvKzFy_9BhDeBe2L2N668733FSHTHm96wrPGxkv7YaAl6qEALw_wcB&gclsrc=aw.ds').text
# Non-greedy capture of each value that follows an `imgurl":"` key, up to the
# next double quote.
links = re.findall(r'imgurl":"(.*?)"', page_text)
print(links)
正则表达式解释:
如果您决定使用开销更大的 Selenium,则可以这样匹配:
# NOTE(review): `driver` is assumed to be a Selenium WebDriver that has
# already loaded the page -- confirm against the surrounding setup.
# WebDriver has no `find_all_elements_with_css_selector`; elements are located
# with find_elements(by, value), where "css selector" is the value of
# By.CSS_SELECTOR. WebElement objects are not subscriptable, so the attribute
# is read with get_attribute() instead of ['src'].
links = [
    img.get_attribute('src')
    for img in driver.find_elements('css selector', '.filmstrip-imgContainer img')
]