在 BeautifulSoup/Python 中选择具有特定属性的标签
Selecting tags with specific attributes in BeautifulSoup/Python
import os
from bs4 import BeautifulSoup
# Process every file under the source directory with BeautifulSoup and write
# the result, under a "_mod" name, into the output directory.
#
# Raw strings are required here: in an ordinary literal the "\n" of
# "\new_folder" and the "\t" of "\test" are read as newline and tab, which
# silently corrupts both paths (and "\U" is a syntax error on Python 3).
do = dir_with_original_files = r'C:\Users\ADMIN\Desktop\new_folder'
dm = dir_with_modified_files = r'C:\Users\ADMIN\Desktop\new_folder\test'

# The output directory is located inside the input tree. Keep a normalised
# copy of its path so the walk below can recognise and skip it; otherwise
# the generated "_mod" files would be picked up and processed again.
dm_normalised = os.path.normcase(os.path.abspath(dm))

for root, dirs, files in os.walk(do):
    # Editing `dirs` in place tells os.walk not to descend into the
    # directories that were removed.
    dirs[:] = [
        d for d in dirs
        if os.path.normcase(os.path.abspath(os.path.join(root, d)))
        != dm_normalised
    ]
    for f in files:
        print(f.title())
        if f.endswith('~'):  # you don't want to process backups
            continue
        original_file = os.path.join(root, f)
        mf = f.split('.')
        mf = ''.join(mf[:-1]) + '_mod.' + mf[-1]  # you can keep the same name
                                                  # if you omit the last two lines.
                                                  # They are in separate directories
                                                  # anyway. In that case, mf = f
        modified_file = os.path.join(dm, mf)
        # Binary modes: BeautifulSoup detects the encoding from raw bytes, and
        # the encoded output below is bytes as well. This works unchanged on
        # both Python 2 and Python 3.
        with open(original_file, 'rb') as orig_f, \
                open(modified_file, 'wb') as modi_f:
            soup = BeautifulSoup(orig_f.read())
            # Wrap the text of every <font> inside the selected cells in <h2>.
            for t in soup.find_all('td', class_='findThisClass'):
                for child in t.find_all("font"):
                    # .string is None when the tag has more than one child.
                    if child.string is not None:
                        child.string.wrap(soup.new_tag('h2'))
            # Remove the unwanted tables from the document.
            for t in soup.find_all('table', class_='tableClass'):
                t.extract()
            # This is where you create your new modified file.
            # original_encoding can be None (e.g. nothing was detected);
            # fall back to UTF-8 instead of crashing in encode().
            encoding = soup.original_encoding or 'utf-8'
            modi_f.write(soup.prettify().encode(encoding))
此代码会在 class 为 findThisClass 的 <td class=findThisClass>
中找到所有 <font>
标签,并把这些 font 标签中的文本包裹在一个 <h2> 标签里。
我想做的是找到所有带有此标记的 html:
<font color="#333333" face="Verdana" size="3" style="font-weight: bold; background-color: rgb(255, 255, 255);">
在以下两种情况下,
最好的方法分别是什么?
(a) 我确信字体将始终遵循相同的形式(所有属性的顺序相同,使用此字符串的 ctrl + f 会找到我想要的所有匹配项) :
<font color="#333333" face="Verdana" size="3" style="font-weight: bold; background-color: rgb(255, 255, 255);">
(b) 如果我想让它工作,即使属性顺序被切换,例如:
<font color="#333333" face="Verdana" size="3" style="font-weight: bold; background-color: rgb(255, 255, 255);">
被写成了
<font face="Verdana" color="#333333" size="3" style="font-weight: bold; background-color: rgb(255, 255, 255);">
非常感谢。
提供具有特定值的 attrs
字典:
# Select <font> tags whose attributes carry exactly these values. The filter
# is a dict keyed by attribute name, so it does not depend on the order in
# which the attributes are written in the markup.
# NOTE(review): `t` is presumably the <td> tag from the loop in the script
# above -- confirm when pasting this line into it.
t.find_all("font", attrs={'face': 'Verdana', 'color': '#333333', 'size': '3', 'style': 'font-weight: bold; background-color: rgb(255, 255, 255);'})
import os
from bs4 import BeautifulSoup
# Process every file under the source directory with BeautifulSoup and write
# the result, under a "_mod" name, into the output directory.
#
# Raw strings are required here: in an ordinary literal the "\n" of
# "\new_folder" and the "\t" of "\test" are read as newline and tab, which
# silently corrupts both paths (and "\U" is a syntax error on Python 3).
do = dir_with_original_files = r'C:\Users\ADMIN\Desktop\new_folder'
dm = dir_with_modified_files = r'C:\Users\ADMIN\Desktop\new_folder\test'

# The output directory is located inside the input tree. Keep a normalised
# copy of its path so the walk below can recognise and skip it; otherwise
# the generated "_mod" files would be picked up and processed again.
dm_normalised = os.path.normcase(os.path.abspath(dm))

for root, dirs, files in os.walk(do):
    # Editing `dirs` in place tells os.walk not to descend into the
    # directories that were removed.
    dirs[:] = [
        d for d in dirs
        if os.path.normcase(os.path.abspath(os.path.join(root, d)))
        != dm_normalised
    ]
    for f in files:
        print(f.title())
        if f.endswith('~'):  # you don't want to process backups
            continue
        original_file = os.path.join(root, f)
        mf = f.split('.')
        mf = ''.join(mf[:-1]) + '_mod.' + mf[-1]  # you can keep the same name
                                                  # if you omit the last two lines.
                                                  # They are in separate directories
                                                  # anyway. In that case, mf = f
        modified_file = os.path.join(dm, mf)
        # Binary modes: BeautifulSoup detects the encoding from raw bytes, and
        # the encoded output below is bytes as well. This works unchanged on
        # both Python 2 and Python 3.
        with open(original_file, 'rb') as orig_f, \
                open(modified_file, 'wb') as modi_f:
            soup = BeautifulSoup(orig_f.read())
            # Wrap the text of every <font> inside the selected cells in <h2>.
            for t in soup.find_all('td', class_='findThisClass'):
                for child in t.find_all("font"):
                    # .string is None when the tag has more than one child.
                    if child.string is not None:
                        child.string.wrap(soup.new_tag('h2'))
            # Remove the unwanted tables from the document.
            for t in soup.find_all('table', class_='tableClass'):
                t.extract()
            # This is where you create your new modified file.
            # original_encoding can be None (e.g. nothing was detected);
            # fall back to UTF-8 instead of crashing in encode().
            encoding = soup.original_encoding or 'utf-8'
            modi_f.write(soup.prettify().encode(encoding))
此代码会在 class 为 findThisClass 的 <td class=findThisClass>
中找到所有 <font>
标签,并把这些 font 标签中的文本包裹在一个 <h2> 标签里。
我想做的是找到所有带有此标记的 html:
<font color="#333333" face="Verdana" size="3" style="font-weight: bold; background-color: rgb(255, 255, 255);">
在以下两种情况下,最好的方法分别是什么?
(a) 我确信字体将始终遵循相同的形式(所有属性的顺序相同,使用此字符串的 ctrl + f 会找到我想要的所有匹配项) :
<font color="#333333" face="Verdana" size="3" style="font-weight: bold; background-color: rgb(255, 255, 255);">
(b) 如果我想让它工作,即使属性顺序被切换,例如:
<font color="#333333" face="Verdana" size="3" style="font-weight: bold; background-color: rgb(255, 255, 255);">
被写成了
<font face="Verdana" color="#333333" size="3" style="font-weight: bold; background-color: rgb(255, 255, 255);">
非常感谢。
提供具有特定值的 attrs
字典:
# Select <font> tags whose attributes carry exactly these values. The filter
# is a dict keyed by attribute name, so it does not depend on the order in
# which the attributes are written in the markup.
# NOTE(review): `t` is presumably the <td> tag from the loop in the script
# above -- confirm when pasting this line into it.
t.find_all("font", attrs={'face': 'Verdana', 'color': '#333333', 'size': '3', 'style': 'font-weight: bold; background-color: rgb(255, 255, 255);'})