如何从 python 列表中过滤某些网址?
How to filter some urls from python list?
我编写了下面这段代码,用于从指定网页中提取图片 URL,它会打印出所有图片的 URL。但我只想筛选出包含 “https://images.unsplash.com/profile” 的 URL 并打印它们。
from bs4 import BeautifulSoup
import urllib.request
import re
# Fetch the page and print the `src` attribute of every <img> tag.
url = "https://unsplash.com/t/nature"
# Close the HTTP response as soon as the markup has been parsed.
with urllib.request.urlopen(url) as html_page:
    # Name the parser explicitly so parsing does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html_page, "html.parser")

# `get` yields None for an <img> that has no `src` attribute.
images = [img.get('src') for img in soup.find_all('img')]

print(images)
我试过了;
from bs4 import BeautifulSoup
import urllib.request
import re
# Attempted filter from the question; kept as posted because it is the
# behavior being asked about.
# NOTE(review): the indentation of the loop/if bodies appears to have been
# lost in this copy of the snippet — confirm against the original post.
url= "https://unsplash.com/t/nature"
html_page= urllib.request.urlopen(url)
soup= BeautifulSoup(html_page)
images= []
for img in soup.findAll('img'):
images.append(img.get('src'))
# `in` applied to a list checks whether some element EQUALS the string; it
# does not search inside each URL. The URLs on the page carry extra text
# after this prefix, so none of them equals the bare prefix.
if "https://images.unsplash.com/profile" in images:
print(images)
并没有奏效!
您需要遍历 images,然后检查其中的每个 image 是否包含所需的字符串。
from bs4 import BeautifulSoup
import urllib.request
import re
# Fetch the page, collect every <img> `src`, and print only the URLs that
# contain the Unsplash profile-image prefix, one per line.
url = "https://unsplash.com/t/nature"
# Close the HTTP response as soon as the markup has been parsed.
with urllib.request.urlopen(url) as html_page:
    # Name the parser explicitly so parsing does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html_page, "html.parser")

images = [img.get('src') for img in soup.find_all('img')]

for image in images:
    # An <img> without a `src` attribute yields None; skip it so the
    # substring test below cannot raise TypeError.
    if image and "https://images.unsplash.com/profile" in image:
        print(image)
输出-
https://images.unsplash.com/profile-1544707963613-16baf868f301?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1604758536753-68fd6f23aaf7image?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1516997253075-2a25da8007e7?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1628142977790-d9f66dcbc498image?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1593541755358-41ff2a4e41efimage?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1635425197470-04119ef8fe14image?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1578469980049-1a3edf161dd6image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1634467404821-bfebba1c1fa0image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1604529169445-f6fb0ce4419bimage?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1557995124272-d62d831ec026?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1629238389240-a728c73e5f2e?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1557995124272-d62d831ec026?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1639922696359-2aa9a8957e24image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1604529169445-f6fb0ce4419bimage?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1638495379168-a1d47187bac3?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1618382084065-fc61a6f289a5image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1604529169445-f6fb0ce4419bimage?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1638257899437-e27df4493c8a?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1613909172589-d6917d507f51image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1643893364520-052a85760bbeimage?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1613830526692-e8a5006d9b70image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1620741299521-03deb22f2a20image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1557995124272-d62d831ec026?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1641356925987-40e732340cc4?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1641356925987-40e732340cc4?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1582127823399-8b7c96db846eimage?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
您可以使用 CSS 选择器(CSS selectors),让选择更加精确:
# `[src*="..."]` is the CSS attribute-substring selector: it matches <img>
# elements whose `src` attribute contains the given text.
soup.select('img[src*="https://images.unsplash.com/profile"]')
示例
from bs4 import BeautifulSoup
import urllib.request
# Fetch the page and print a list of the profile-image URLs.
url = "https://unsplash.com/t/nature"
# Close the HTTP response as soon as the markup has been parsed.
with urllib.request.urlopen(url) as html_page:
    # Name the parser explicitly so parsing does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html_page, "html.parser")

# The selector only matches <img> tags that HAVE a `src` containing the
# prefix, so indexing with img['src'] cannot raise KeyError here.
images = [img['src'] for img in soup.select('img[src*="https://images.unsplash.com/profile"]')]
print(images)
输出
['https://images.unsplash.com/profile-1544707963613-16baf868f301?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff', 'https://images.unsplash.com/profile-1604758536753-68fd6f23aaf7image?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff', 'https://images.unsplash.com/profile-1516997253075-2a25da8007e7?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff', 'https://images.unsplash.com/profile-1628142977790-d9f66dcbc498image?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff']
编辑
如果只想逐行打印 URL:
# Print each matching profile-image URL on its own line. The selector only
# matches <img> tags whose `src` contains the prefix, so img['src'] exists.
for img in soup.select('img[src*="https://images.unsplash.com/profile"]'):
    print(img['src'])
我编写了下面这段代码,用于从指定网页中提取图片 URL,它会打印出所有图片的 URL。但我只想筛选出包含 “https://images.unsplash.com/profile” 的 URL 并打印它们。
from bs4 import BeautifulSoup
import urllib.request
import re
# Fetch the page and print the `src` attribute of every <img> tag.
url = "https://unsplash.com/t/nature"
# Close the HTTP response as soon as the markup has been parsed.
with urllib.request.urlopen(url) as html_page:
    # Name the parser explicitly so parsing does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html_page, "html.parser")

# `get` yields None for an <img> that has no `src` attribute.
images = [img.get('src') for img in soup.find_all('img')]

print(images)
我试过了;
from bs4 import BeautifulSoup
import urllib.request
import re
# Attempted filter from the question; kept as posted because it is the
# behavior being asked about.
# NOTE(review): the indentation of the loop/if bodies appears to have been
# lost in this copy of the snippet — confirm against the original post.
url= "https://unsplash.com/t/nature"
html_page= urllib.request.urlopen(url)
soup= BeautifulSoup(html_page)
images= []
for img in soup.findAll('img'):
images.append(img.get('src'))
# `in` applied to a list checks whether some element EQUALS the string; it
# does not search inside each URL. The URLs on the page carry extra text
# after this prefix, so none of them equals the bare prefix.
if "https://images.unsplash.com/profile" in images:
print(images)
并没有奏效!
您需要遍历 images,然后检查其中的每个 image 是否包含所需的字符串。
from bs4 import BeautifulSoup
import urllib.request
import re
# Fetch the page, collect every <img> `src`, and print only the URLs that
# contain the Unsplash profile-image prefix, one per line.
url = "https://unsplash.com/t/nature"
# Close the HTTP response as soon as the markup has been parsed.
with urllib.request.urlopen(url) as html_page:
    # Name the parser explicitly so parsing does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html_page, "html.parser")

images = [img.get('src') for img in soup.find_all('img')]

for image in images:
    # An <img> without a `src` attribute yields None; skip it so the
    # substring test below cannot raise TypeError.
    if image and "https://images.unsplash.com/profile" in image:
        print(image)
输出-
https://images.unsplash.com/profile-1544707963613-16baf868f301?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1604758536753-68fd6f23aaf7image?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1516997253075-2a25da8007e7?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1628142977790-d9f66dcbc498image?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1593541755358-41ff2a4e41efimage?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1635425197470-04119ef8fe14image?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1578469980049-1a3edf161dd6image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1634467404821-bfebba1c1fa0image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1604529169445-f6fb0ce4419bimage?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1557995124272-d62d831ec026?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1629238389240-a728c73e5f2e?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1557995124272-d62d831ec026?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1639922696359-2aa9a8957e24image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1604529169445-f6fb0ce4419bimage?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1638495379168-a1d47187bac3?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1618382084065-fc61a6f289a5image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1604529169445-f6fb0ce4419bimage?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1638257899437-e27df4493c8a?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1613909172589-d6917d507f51image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1643893364520-052a85760bbeimage?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1613830526692-e8a5006d9b70image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1620741299521-03deb22f2a20image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1557995124272-d62d831ec026?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1641356925987-40e732340cc4?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1641356925987-40e732340cc4?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
https://images.unsplash.com/profile-1582127823399-8b7c96db846eimage?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
您可以使用 CSS 选择器(CSS selectors),让选择更加精确:
# `[src*="..."]` is the CSS attribute-substring selector: it matches <img>
# elements whose `src` attribute contains the given text.
soup.select('img[src*="https://images.unsplash.com/profile"]')
示例
from bs4 import BeautifulSoup
import urllib.request
# Fetch the page and print a list of the profile-image URLs.
url = "https://unsplash.com/t/nature"
# Close the HTTP response as soon as the markup has been parsed.
with urllib.request.urlopen(url) as html_page:
    # Name the parser explicitly so parsing does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html_page, "html.parser")

# The selector only matches <img> tags that HAVE a `src` containing the
# prefix, so indexing with img['src'] cannot raise KeyError here.
images = [img['src'] for img in soup.select('img[src*="https://images.unsplash.com/profile"]')]
print(images)
输出
['https://images.unsplash.com/profile-1544707963613-16baf868f301?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff', 'https://images.unsplash.com/profile-1604758536753-68fd6f23aaf7image?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff', 'https://images.unsplash.com/profile-1516997253075-2a25da8007e7?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff', 'https://images.unsplash.com/profile-1628142977790-d9f66dcbc498image?auto=format&fit=crop&w=16&h=16&q=60&crop=faces&bg=fff']
编辑
如果只想逐行打印 URL:
# Print each matching profile-image URL on its own line. The selector only
# matches <img> tags whose `src` contains the prefix, so img['src'] exists.
for img in soup.select('img[src*="https://images.unsplash.com/profile"]'):
    print(img['src'])