Python BeautifulSoup 替换img src
Python BeautifulSoup replace img src
我正在尝试解析某个站点的 HTML 内容,并修改其中的 href 和 img src。href 修改成功,但 img src 修改失败。
img src 的值在变量中改变了,但在 HTML(post_content)中没有改变:
<p><img alt="alt text" src="https://lifehacker.ru/wp-content/uploads/2016/08/15120903sa_d2__1471520915-630x523.jpg" title="Title"/></p>
而不是预期的 http://site.ru...:
<p><img alt="alt text" src="http://site.ru/wp-content/uploads/2016/08/15120903sa_d2__1471520915-630x523.jpg" title="Title"/></p>
我的代码
if "app-store" not in url:
r = requests.get("https://lifehacker.ru/2016/08/23/kak-vybrat-trimmer/")
soup = BeautifulSoup(r.content)
post_content = soup.find("div", {"class", "post-content"})
for tag in post_content():
for attribute in ["class", "id", "style", "height", "width", "sizes"]:
del tag[attribute]
for a in post_content.find_all('a'):
a['href'] = a['href'].replace("https://lifehacker.ru", "http://site.ru")
for img in post_content.find_all('img'):
img_urls = img['src']
if "https:" not in img_urls:
img_urls="http:{}".format(img_urls)
thumb_url = img_urls.split('/')
urllib.urlretrieve(img_urls, "/Users/kr/PycharmProjects/education_py/{}/{}".format(folder_name, thumb_url[-1]))
file_url = "/Users/kr/PycharmProjects/education_py/{}/{}".format(folder_name, thumb_url[-1])
data = {
'name': '{}'.format(thumb_url[-1]),
'type': 'image/jpeg',
}
with open(file_url, 'rb') as img:
data['bits'] = xmlrpc_client.Binary(img.read())
response = client.call(media.UploadFile(data))
attachment_url = response['url']
img_urls = img_urls.replace(img_urls, attachment_url)
[s.extract() for s in post_content('script')]
post_content_insert = bleach.clean(post_content)
post_content_insert = post_content_insert.replace('&lt;', '<')
post_content_insert = post_content_insert.replace('&gt;', '>')
print post_content_insert
看起来您从未将 img_urls 赋值回 img['src']。请尝试在该代码块的末尾这样做:
img_urls = img_urls.replace(img_urls, attachment_url)
img['src'] = img_urls
……但首先,您需要修改 with 语句,让文件对象使用 img 以外的名称。目前该名称遮蔽(shadow)了 DOM 元素 img,导致您之后无法再访问该元素:
with open(file_url, 'rb') as some_file:
data['bits'] = xmlrpc_client.Binary(some_file.read())
我正在尝试解析某个站点的 HTML 内容,并修改其中的 href 和 img src。href 修改成功,但 img src 修改失败。
img src 的值在变量中改变了,但在 HTML(post_content)中没有改变:
<p><img alt="alt text" src="https://lifehacker.ru/wp-content/uploads/2016/08/15120903sa_d2__1471520915-630x523.jpg" title="Title"/></p>
而不是预期的 http://site.ru...:
<p><img alt="alt text" src="http://site.ru/wp-content/uploads/2016/08/15120903sa_d2__1471520915-630x523.jpg" title="Title"/></p>
我的代码
if "app-store" not in url:
r = requests.get("https://lifehacker.ru/2016/08/23/kak-vybrat-trimmer/")
soup = BeautifulSoup(r.content)
post_content = soup.find("div", {"class", "post-content"})
for tag in post_content():
for attribute in ["class", "id", "style", "height", "width", "sizes"]:
del tag[attribute]
for a in post_content.find_all('a'):
a['href'] = a['href'].replace("https://lifehacker.ru", "http://site.ru")
for img in post_content.find_all('img'):
img_urls = img['src']
if "https:" not in img_urls:
img_urls="http:{}".format(img_urls)
thumb_url = img_urls.split('/')
urllib.urlretrieve(img_urls, "/Users/kr/PycharmProjects/education_py/{}/{}".format(folder_name, thumb_url[-1]))
file_url = "/Users/kr/PycharmProjects/education_py/{}/{}".format(folder_name, thumb_url[-1])
data = {
'name': '{}'.format(thumb_url[-1]),
'type': 'image/jpeg',
}
with open(file_url, 'rb') as img:
data['bits'] = xmlrpc_client.Binary(img.read())
response = client.call(media.UploadFile(data))
attachment_url = response['url']
img_urls = img_urls.replace(img_urls, attachment_url)
[s.extract() for s in post_content('script')]
post_content_insert = bleach.clean(post_content)
post_content_insert = post_content_insert.replace('&lt;', '<')
post_content_insert = post_content_insert.replace('&gt;', '>')
print post_content_insert
看起来您从未将 img_urls 赋值回 img['src']。请尝试在该代码块的末尾这样做:
img_urls = img_urls.replace(img_urls, attachment_url)
img['src'] = img_urls
……但首先,您需要修改 with 语句,让文件对象使用 img 以外的名称。目前该名称遮蔽(shadow)了 DOM 元素 img,导致您之后无法再访问该元素:
with open(file_url, 'rb') as some_file:
data['bits'] = xmlrpc_client.Binary(some_file.read())