selenium 和 lxml - 如何设置 html?
selenium and lxml - how to set html?
我有一个脚本可以解析文档并使用 lxml 更改表单值。现在我想使用selenium导航到页面并执行它。
不幸的是,selenium 似乎不够先进,无法支持我在 lxml 中做的一些事情,例如,"if 'attrName' in tag.attrib"
然后我说,"Aha!, But selenium provides webdriver.page_source where I can get the html!"
是的,算是吧......我可以获取它,然后用 lxml 解析......但恼人的是,page_source 是只读的,所以我无法把修改保存回去!
我仍然觉得我的方向是正确的,但是有什么方法可以将修改后的 html 写回浏览器吗?
更新:
我尝试了几种重写整个页面内容的方法,但是 firefox webdriver 开始抛出安全异常,似乎它不喜欢我尝试做的事情。下面我将贴出我最终采用的替代方法。
我最后做的是创建一个类似 lxml 的中间层类(wrapper class)。它只实现了我需要的那些功能,但我仍想把它分享出来,作为其他想做同样事情的人的起点。
import lxml.etree
import collections.abc
class AttribWrapper(collections.abc.MutableMapping):
    """
    Dict-like view over the attributes of a selenium element.

    Intended as a stand-in for the ``attrib`` mapping of an
    ``lxml.etree`` element, so that code such as
    ``if 'attrName' in tag.attrib`` keeps working when ``tag`` is backed
    by a live browser element.  Reads go through the element's
    ``get_attribute``; writes, deletes and enumeration go through
    ``execute_script`` on the owning driver.

    see also:
    https://docs.python.org/3/reference/datamodel.html?emulating-container-types#emulating-container-types
    """

    def __init__(self, seleniumTag):
        """
        Wrap ``seleniumTag`` and locate the object able to run scripts.

        :param seleniumTag: the element whose attributes are exposed.
        """
        self.seleniumTag = seleniumTag
        # Follow the ``parent`` chain until something offers
        # ``execute_script``; that object is used for all JavaScript calls.
        owner = seleniumTag
        while not hasattr(owner, 'execute_script'):
            owner = owner.parent
        self.webdriver = owner

    def hasAttribute(self, attrName):
        """
        Determine if a tag has an attribute.

        Returns False when ``get_attribute`` raises or yields None.
        """
        try:
            val = self.seleniumTag.get_attribute(attrName)
        except Exception:
            # Deliberate best-effort: a failed lookup counts as "absent".
            return False
        return val is not None

    def getAttribute(self, attrName, default=None):
        """
        Get a tag's attribute, or ``default`` if not present.

        ``default`` is returned both when the lookup raises and when it
        yields None (``get_attribute`` reports a missing attribute as None
        rather than raising).
        """
        try:
            val = self.seleniumTag.get_attribute(attrName)
        except Exception:
            # Deliberate best-effort: a failed lookup falls back to default.
            return default
        return default if val is None else val

    def setAttribute(self, attrName, attrValue):
        """
        Set a tag's attribute to ``attrValue`` via JavaScript.
        """
        self.webdriver.execute_script(
            "arguments[0].setAttribute(arguments[1],arguments[2]);",
            self.seleniumTag, attrName, attrValue)

    def __getitem__(self, key):
        """Return the value of attribute ``key``; raise KeyError if absent."""
        if key not in self.keys():
            raise KeyError(key)
        return self.getAttribute(key)

    def __setitem__(self, key, value):
        """Create or overwrite attribute ``key``."""
        self.setAttribute(key, value)

    def __delitem__(self, key):
        """
        Remove attribute ``key`` from the element.

        The name is handed to the script as an argument instead of being
        pasted into the JavaScript source, so names containing quotes or
        backslashes cannot corrupt the script.
        """
        self.webdriver.execute_script(
            'arguments[0].attributes.removeNamedItem(arguments[1]);',
            self.seleniumTag, key)

    def __iter__(self):
        """Iterate over the attribute names currently on the element."""
        return iter(self.keys())

    def __len__(self):
        """Return the number of attributes currently on the element."""
        return self.webdriver.execute_script(
            'return arguments[0].attributes.length', self.seleniumTag)

    def keys(self):
        """
        Return the attribute names as a list.

        All names are collected inside a single script call rather than
        one call per attribute.
        """
        names = self.webdriver.execute_script(
            'var a=arguments[0].attributes,r=[];'
            'for(var i=0;i<a.length;i++){r.push(a.item(i).nodeName);}'
            'return r;',
            self.seleniumTag)
        return list(names or [])

    def __repr__(self):
        """Render every attribute as ``("name":"value")`` inside braces."""
        ret = ['("%s":"%s")' % (k, v) for k, v in self.items()]
        return "{%s}" % (', '.join(ret))
class LikeEtreeElement:
    """
    Minimal ``lxml.etree``-element look-alike backed by a selenium element.

    Only a small part of the etree element interface is provided:
    ``tag``, ``attrib``, ``getchildren()``, indexing, iteration and
    ``len()``, plus an ``innerHTML`` property for reading and replacing
    the element's markup.
    """

    def __init__(self, seleniumTag):
        """
        Wrap ``seleniumTag`` and locate the object able to run scripts.

        :param seleniumTag: the element to expose through the etree-like API.
        """
        self.seleniumTag = seleniumTag
        self.attrib = AttribWrapper(seleniumTag)
        # Follow the ``parent`` chain until something offers
        # ``execute_script``; that object is used for all JavaScript calls.
        owner = seleniumTag
        while not hasattr(owner, 'execute_script'):
            owner = owner.parent
        self.webdriver = owner

    @property
    def tag(self):
        """The element's tag name, as reported by ``tag_name``."""
        return self.seleniumTag.tag_name

    def getchildren(self):
        """
        Get all direct child elements.

        (NOTE: this will dip into the html every single time just
        in case things have changed.)

        The XPath is relative (``./*``): a leading ``/`` would be resolved
        from the document root instead of from this element.  The children
        are returned exactly as the element lookup yields them; they are
        not wrapped in ``LikeEtreeElement``.
        """
        # 'xpath' is the locator-strategy string for XPath; the generic
        # find_elements(by, value) call is used because the
        # find_elements_by_* helpers are not available in Selenium 4.
        return self.seleniumTag.find_elements('xpath', './*')

    @property
    def innerHTML(self):
        """The element's current ``innerHTML``, read via JavaScript."""
        return self.webdriver.execute_script(
            'return arguments[0].innerHTML', self.seleniumTag)

    @innerHTML.setter
    def innerHTML(self, value):
        # The markup is passed as a script argument rather than being
        # spliced into the JavaScript source, so quotes, backslashes and
        # newlines in ``value`` need no manual escaping.
        self.webdriver.execute_script(
            'arguments[0].innerHTML=arguments[1]',
            self.seleniumTag, str(value))

    def __getitem__(self, idx):
        """Return the child at position ``idx`` (freshly queried)."""
        return self.getchildren()[idx]

    def __iter__(self):
        """Iterate over the direct children (freshly queried)."""
        return iter(self.getchildren())

    def __len__(self):
        """Return the number of direct children (freshly queried)."""
        return len(self.getchildren())
我有一个脚本可以解析文档并使用 lxml 更改表单值。现在我想使用selenium导航到页面并执行它。
不幸的是,selenium 似乎不够先进,无法支持我在 lxml 中做的一些事情,例如,"if 'attrName' in tag.attrib"
然后我说,"Aha!, But selenium provides webdriver.page_source where I can get the html!" 是的,算是吧......我可以获取它,然后用 lxml 解析......但恼人的是,page_source 是只读的,所以我无法把修改保存回去!
我仍然觉得我的方向是正确的,但是有什么方法可以将修改后的 html 写回浏览器吗?
更新: 我尝试了几种重写整个页面内容的方法,但是 firefox webdriver 开始抛出安全异常,似乎它不喜欢我尝试做的事情。下面我将贴出我最终采用的替代方法。
我最后做的是创建一个类似 lxml 的中间层类(wrapper class)。它只实现了我需要的那些功能,但我仍想把它分享出来,作为其他想做同样事情的人的起点。
import lxml.etree
import collections.abc
class AttribWrapper(collections.abc.MutableMapping):
    """
    Dict-like view over the attributes of a selenium element.

    Intended as a stand-in for the ``attrib`` mapping of an
    ``lxml.etree`` element, so that code such as
    ``if 'attrName' in tag.attrib`` keeps working when ``tag`` is backed
    by a live browser element.  Reads go through the element's
    ``get_attribute``; writes, deletes and enumeration go through
    ``execute_script`` on the owning driver.

    see also:
    https://docs.python.org/3/reference/datamodel.html?emulating-container-types#emulating-container-types
    """

    def __init__(self, seleniumTag):
        """
        Wrap ``seleniumTag`` and locate the object able to run scripts.

        :param seleniumTag: the element whose attributes are exposed.
        """
        self.seleniumTag = seleniumTag
        # Follow the ``parent`` chain until something offers
        # ``execute_script``; that object is used for all JavaScript calls.
        owner = seleniumTag
        while not hasattr(owner, 'execute_script'):
            owner = owner.parent
        self.webdriver = owner

    def hasAttribute(self, attrName):
        """
        Determine if a tag has an attribute.

        Returns False when ``get_attribute`` raises or yields None.
        """
        try:
            val = self.seleniumTag.get_attribute(attrName)
        except Exception:
            # Deliberate best-effort: a failed lookup counts as "absent".
            return False
        return val is not None

    def getAttribute(self, attrName, default=None):
        """
        Get a tag's attribute, or ``default`` if not present.

        ``default`` is returned both when the lookup raises and when it
        yields None (``get_attribute`` reports a missing attribute as None
        rather than raising).
        """
        try:
            val = self.seleniumTag.get_attribute(attrName)
        except Exception:
            # Deliberate best-effort: a failed lookup falls back to default.
            return default
        return default if val is None else val

    def setAttribute(self, attrName, attrValue):
        """
        Set a tag's attribute to ``attrValue`` via JavaScript.
        """
        self.webdriver.execute_script(
            "arguments[0].setAttribute(arguments[1],arguments[2]);",
            self.seleniumTag, attrName, attrValue)

    def __getitem__(self, key):
        """Return the value of attribute ``key``; raise KeyError if absent."""
        if key not in self.keys():
            raise KeyError(key)
        return self.getAttribute(key)

    def __setitem__(self, key, value):
        """Create or overwrite attribute ``key``."""
        self.setAttribute(key, value)

    def __delitem__(self, key):
        """
        Remove attribute ``key`` from the element.

        The name is handed to the script as an argument instead of being
        pasted into the JavaScript source, so names containing quotes or
        backslashes cannot corrupt the script.
        """
        self.webdriver.execute_script(
            'arguments[0].attributes.removeNamedItem(arguments[1]);',
            self.seleniumTag, key)

    def __iter__(self):
        """Iterate over the attribute names currently on the element."""
        return iter(self.keys())

    def __len__(self):
        """Return the number of attributes currently on the element."""
        return self.webdriver.execute_script(
            'return arguments[0].attributes.length', self.seleniumTag)

    def keys(self):
        """
        Return the attribute names as a list.

        All names are collected inside a single script call rather than
        one call per attribute.
        """
        names = self.webdriver.execute_script(
            'var a=arguments[0].attributes,r=[];'
            'for(var i=0;i<a.length;i++){r.push(a.item(i).nodeName);}'
            'return r;',
            self.seleniumTag)
        return list(names or [])

    def __repr__(self):
        """Render every attribute as ``("name":"value")`` inside braces."""
        ret = ['("%s":"%s")' % (k, v) for k, v in self.items()]
        return "{%s}" % (', '.join(ret))
class LikeEtreeElement:
    """
    Minimal ``lxml.etree``-element look-alike backed by a selenium element.

    Only a small part of the etree element interface is provided:
    ``tag``, ``attrib``, ``getchildren()``, indexing, iteration and
    ``len()``, plus an ``innerHTML`` property for reading and replacing
    the element's markup.
    """

    def __init__(self, seleniumTag):
        """
        Wrap ``seleniumTag`` and locate the object able to run scripts.

        :param seleniumTag: the element to expose through the etree-like API.
        """
        self.seleniumTag = seleniumTag
        self.attrib = AttribWrapper(seleniumTag)
        # Follow the ``parent`` chain until something offers
        # ``execute_script``; that object is used for all JavaScript calls.
        owner = seleniumTag
        while not hasattr(owner, 'execute_script'):
            owner = owner.parent
        self.webdriver = owner

    @property
    def tag(self):
        """The element's tag name, as reported by ``tag_name``."""
        return self.seleniumTag.tag_name

    def getchildren(self):
        """
        Get all direct child elements.

        (NOTE: this will dip into the html every single time just
        in case things have changed.)

        The XPath is relative (``./*``): a leading ``/`` would be resolved
        from the document root instead of from this element.  The children
        are returned exactly as the element lookup yields them; they are
        not wrapped in ``LikeEtreeElement``.
        """
        # 'xpath' is the locator-strategy string for XPath; the generic
        # find_elements(by, value) call is used because the
        # find_elements_by_* helpers are not available in Selenium 4.
        return self.seleniumTag.find_elements('xpath', './*')

    @property
    def innerHTML(self):
        """The element's current ``innerHTML``, read via JavaScript."""
        return self.webdriver.execute_script(
            'return arguments[0].innerHTML', self.seleniumTag)

    @innerHTML.setter
    def innerHTML(self, value):
        # The markup is passed as a script argument rather than being
        # spliced into the JavaScript source, so quotes, backslashes and
        # newlines in ``value`` need no manual escaping.
        self.webdriver.execute_script(
            'arguments[0].innerHTML=arguments[1]',
            self.seleniumTag, str(value))

    def __getitem__(self, idx):
        """Return the child at position ``idx`` (freshly queried)."""
        return self.getchildren()[idx]

    def __iter__(self):
        """Iterate over the direct children (freshly queried)."""
        return iter(self.getchildren())

    def __len__(self):
        """Return the number of direct children (freshly queried)."""
        return len(self.getchildren())