用什么替换 xml.dom.minidom 才能得到可以高效 pickle(序列化)的东西?
What to replace xml.dom.minidom with to get something that can pickle efficiently?
我有一个处理约 1-2 MB XML 文件的应用程序。听起来不多,但我还是遇到了性能问题。
因为有一些计算密集型(compute-bound)任务需要加速,所以我尝试使用 multiprocessing 的 imap 来实现 - 这就需要对这些 XML 数据进行 pickle 序列化。结果发现,对包含这个 DOM 引用的数据结构进行 pickle,比那些计算密集型处理本身还要慢,罪魁祸首似乎是递归 - 我必须先把递归限制设置为 10'000,pickle 才能正常工作 :-S。
无论如何,我的问题是:
如果我想从性能的角度来解决这个问题,我应该用什么来代替 minidom?评判标准既包括 pickle 的性能,也包括迁移的难易程度。
为了让您了解需要哪些方法,我粘贴了一个包装类(它是早些时候为了加快 getElementsByTagName 调用而编写的)。
用遵循与此类相同接口的节点替换所有 minidom 节点是可以接受的,也就是说,我并不需要 minidom 中的所有方法。
去掉 parentNode 方法也是可以接受的(而且这可能是提高 pickle 性能的好办法)。
是的,如果现在重新设计,我一开始就不会选择保存 XML 节点引用,但现在要把这些全部去掉需要做很多工作,所以我希望这个问题可以通过打补丁的方式解决。
我应该只用 Python 内置类型或 collections 库自己动手写这个东西吗?
class ImmutableDOMNode(object):
    """Read-only wrapper around a DOM-like node with cached tag lookups.

    Read access is forwarded to the wrapped node.  The DOM methods that
    would modify the tree (appendChild, removeChild, create*) and cloneNode
    raise instead.  Results of getElementsByTagName are memoized per
    wrapper instance.
    """

    def __init__(self, node):
        """Wrap *node*; every read is delegated to it."""
        self.node = node
        # Tag name -> list of wrapped matches, filled on first lookup.
        self.cachedElementsByTagName = {}

    @staticmethod
    def _wrapOrNone(node):
        """Return a wrapper for *node*, or None when there is no node."""
        # A wrapper around None would be truthy and would only fail later,
        # on the first attribute access, so pass the None through instead.
        if node is None:
            return None
        return ImmutableDOMNode(node)

    @property
    def nodeType(self):
        return self.node.nodeType

    @property
    def tagName(self):
        return self.node.tagName

    @property
    def ownerDocument(self):
        # Returned as-is, i.e. NOT wrapped.
        return self.node.ownerDocument

    @property
    def nodeName(self):
        return self.node.nodeName

    @property
    def nodeValue(self):
        return self.node.nodeValue

    @property
    def attributes(self):
        # Returned as-is, i.e. NOT wrapped.
        return self.node.attributes

    @property
    def parentNode(self):
        """Wrapped parent, or None when the wrapped node has no parent."""
        return self._wrapOrNone(self.node.parentNode)

    @property
    def firstChild(self):
        """Wrapped first child, or None when the wrapped node has none."""
        return self._wrapOrNone(self.node.firstChild)

    @property
    def childNodes(self):
        """A new list of freshly wrapped children on every access."""
        return [ImmutableDOMNode(child) for child in self.node.childNodes]

    def getElementsByTagName(self, name):
        """Return the wrapped matches for *name*, caching the list.

        Repeated calls with the same name return the same list object.
        """
        cached = self.cachedElementsByTagName.get(name)
        if cached is not None:
            return cached
        wrapped = [
            ImmutableDOMNode(match)
            for match in self.node.getElementsByTagName(name)
        ]
        self.cachedElementsByTagName[name] = wrapped
        return wrapped

    def getAttribute(self, qName):
        return self.node.getAttribute(qName)

    def toxml(self, encoding=None):
        return self.node.toxml(encoding)

    def toprettyxml(self, indent="", newl="", encoding=None):
        return self.node.toprettyxml(indent, newl, encoding)

    def appendChild(self, node):
        raise Exception("cannot append child to immutable node")

    def removeChild(self, node):
        raise Exception("cannot remove child from immutable node")

    def cloneNode(self, deep):
        raise Exception("clone node not implemented")

    def createElement(self, tagName):
        raise Exception("cannot create element for immutable node")

    def createTextNode(self, tagName):
        raise Exception("cannot create text node for immutable node")

    def createAttribute(self, qName):
        raise Exception("cannot create attribute for immutable node")
所以我决定自己实现一个刚好满足需求的 DOM,代码粘贴在下面,希望对别人有所帮助。它依赖于 "memoization library for python 2.7" 中的 lru_cache,以及 @Raymond Hettinger 在 "Immutable dictionary, only use as a key for another dictionary" 中给出的不可变字典(ImmutableDict)。不过,如果您不介意牺牲一些安全性/性能,这些依赖项很容易去掉。
class CycleFreeDOMNode(object):
    """Copy of a minidom node and its subtree without parent references.

    Each node stores only its own fields, a tuple of child nodes and an
    index of its direct children keyed by tag name, so the object graph
    contains no parent/child cycles.
    """

    def __init__(self, minidomNode=None):
        """Copy *minidomNode* and, recursively, all of its children.

        Called without an argument the instance is left without any
        fields; cloneNode() uses this and fills them in itself.

        Raises:
            ValueError: if *minidomNode* is not a minidom Node, or if it
                has both a nodeValue and child nodes.
        """
        if minidomNode is None:
            return
        if not isinstance(minidomNode, xml.dom.minidom.Node):
            raise ValueError("%s needs to be instantiated with a minidom.Node" %(
                type(self).__name__
            ))
        if minidomNode.nodeValue and minidomNode.childNodes:
            raise ValueError(
                "both nodeValue and childNodes in same node are not supported"
            )
        # Nodes without a tagName attribute are stored with tagName None.
        self._tagName = getattr(minidomNode, "tagName", None)
        self._nodeType = minidomNode.nodeType
        self._nodeName = minidomNode.nodeName
        self._nodeValue = minidomNode.nodeValue
        sourceAttributes = minidomNode.attributes
        self._attributes = (
            dict(sourceAttributes.items()) if sourceAttributes else {}
        )
        self._childNodes = tuple(
            CycleFreeDOMNode(cn) for cn in minidomNode.childNodes
        )
        self._childNodesByTagName = self._indexByTagName(self._childNodes)

    @staticmethod
    def _indexByTagName(nodes):
        """Group *nodes* by tagName into an ImmutableDict of lists."""
        grouped = defaultdict(list)
        for node in nodes:
            grouped[node.tagName].append(node)
        return ImmutableDict(grouped)

    @property
    def nodeType(self):
        return self._nodeType

    @property
    def tagName(self):
        return self._tagName

    @property
    def nodeName(self):
        return self._nodeName

    @property
    def nodeValue(self):
        return self._nodeValue

    @property
    def attributes(self):
        return self._attributes

    @property
    def firstChild(self):
        """First child node, or None when there are no children."""
        return self._childNodes[0] if self._childNodes else None

    @property
    def childNodes(self):
        return self._childNodes

    # NOTE(review): lru_cache comes from outside this block. If it behaves
    # like functools.lru_cache, the cache is shared by all instances and
    # keeps up to maxsize (node, name) entries alive - confirm that is
    # intended.
    @lru_cache(maxsize = 100)
    def getElementsByTagName(self, name):
        """Return a list of all descendants whose tagName equals *name*.

        Matching direct children come first, followed by the matches
        found below each child in turn. This is not document order.
        """
        # Start from a copy: extending the indexed list in place would add
        # descendants to the direct-children index on every lookup.
        result = list(self._childNodesByTagName.get(name, ()))
        for cn in self.childNodes:
            result.extend(cn.getElementsByTagName(name))
        return result

    def cloneNode(self, deep=False):
        """Return a copy of this node with its own attribute dict.

        With *deep* true the children are cloned recursively and indexed
        again; otherwise the clone shares the child nodes and the
        tag-name index with this node.
        """
        clone = CycleFreeDOMNode()
        clone._tagName = self._tagName
        clone._nodeType = self._nodeType
        clone._nodeName = self._nodeName
        clone._nodeValue = self._nodeValue
        # Separate dict so setAttribute() on the clone leaves this node alone.
        clone._attributes = dict(self._attributes)
        if deep:
            clone._childNodes = tuple(
                cn.cloneNode(deep) for cn in self.childNodes
            )
            clone._childNodesByTagName = self._indexByTagName(
                clone._childNodes
            )
        else:
            clone._childNodes = tuple(self.childNodes)
            clone._childNodesByTagName = self._childNodesByTagName
        return clone

    def toxml(self):
        """Serialize this node and its subtree to an XML string.

        A node with a falsy tagName contributes only its content (its
        escaped nodeValue, or else its serialized children). Any other
        node is written as ``<tag attr="value" ...>content</tag>``.
        """
        # Imported here so the block does not depend on a file-level import.
        from xml.sax.saxutils import escape, quoteattr

        if self.nodeValue:
            content = escape(self.nodeValue)
        else:
            content = "".join([cn.toxml() for cn in self.childNodes])
        if not self.tagName:
            return content
        # Attributes are separated by whitespace (not commas) and their
        # values quoted and escaped, so the output is well-formed XML.
        attributeText = "".join([
            " %s=%s" % (key, quoteattr("%s" % (value,)))
            for key, value in self.attributes.items()
        ])
        return "<%s%s>%s</%s>" % (
            self.tagName,
            attributeText,
            content,
            self.tagName,
        )

    def getAttribute(self, name):
        """Return the value of attribute *name*, or "" when it is not set."""
        return self._attributes.get(name, "")

    def setAttribute(self, name, value):
        """Set attribute *name* to *value* on this node."""
        self._attributes[name] = value
我有一个处理约 1-2 MB XML 文件的应用程序。听起来不多,但我还是遇到了性能问题。
因为有一些计算密集型(compute-bound)任务需要加速,所以我尝试使用 multiprocessing 的 imap 来实现 - 这就需要对这些 XML 数据进行 pickle 序列化。结果发现,对包含这个 DOM 引用的数据结构进行 pickle,比那些计算密集型处理本身还要慢,罪魁祸首似乎是递归 - 我必须先把递归限制设置为 10'000,pickle 才能正常工作 :-S。
无论如何,我的问题是:
如果我想从性能的角度来解决这个问题,我应该用什么来代替 minidom?评判标准既包括 pickle 的性能,也包括迁移的难易程度。
为了让您了解需要哪些方法,我粘贴了一个包装类(它是早些时候为了加快 getElementsByTagName 调用而编写的)。
用遵循与此类相同接口的节点替换所有 minidom 节点是可以接受的,也就是说,我并不需要 minidom 中的所有方法。
去掉 parentNode 方法也是可以接受的(而且这可能是提高 pickle 性能的好办法)。
是的,如果现在重新设计,我一开始就不会选择保存 XML 节点引用,但现在要把这些全部去掉需要做很多工作,所以我希望这个问题可以通过打补丁的方式解决。
我应该只用 Python 内置类型或 collections 库自己动手写这个东西吗?
class ImmutableDOMNode(object):
    """Read-only wrapper around a DOM-like node with cached tag lookups.

    Read access is forwarded to the wrapped node.  The DOM methods that
    would modify the tree (appendChild, removeChild, create*) and cloneNode
    raise instead.  Results of getElementsByTagName are memoized per
    wrapper instance.
    """

    def __init__(self, node):
        """Wrap *node*; every read is delegated to it."""
        self.node = node
        # Tag name -> list of wrapped matches, filled on first lookup.
        self.cachedElementsByTagName = {}

    @staticmethod
    def _wrapOrNone(node):
        """Return a wrapper for *node*, or None when there is no node."""
        # A wrapper around None would be truthy and would only fail later,
        # on the first attribute access, so pass the None through instead.
        if node is None:
            return None
        return ImmutableDOMNode(node)

    @property
    def nodeType(self):
        return self.node.nodeType

    @property
    def tagName(self):
        return self.node.tagName

    @property
    def ownerDocument(self):
        # Returned as-is, i.e. NOT wrapped.
        return self.node.ownerDocument

    @property
    def nodeName(self):
        return self.node.nodeName

    @property
    def nodeValue(self):
        return self.node.nodeValue

    @property
    def attributes(self):
        # Returned as-is, i.e. NOT wrapped.
        return self.node.attributes

    @property
    def parentNode(self):
        """Wrapped parent, or None when the wrapped node has no parent."""
        return self._wrapOrNone(self.node.parentNode)

    @property
    def firstChild(self):
        """Wrapped first child, or None when the wrapped node has none."""
        return self._wrapOrNone(self.node.firstChild)

    @property
    def childNodes(self):
        """A new list of freshly wrapped children on every access."""
        return [ImmutableDOMNode(child) for child in self.node.childNodes]

    def getElementsByTagName(self, name):
        """Return the wrapped matches for *name*, caching the list.

        Repeated calls with the same name return the same list object.
        """
        cached = self.cachedElementsByTagName.get(name)
        if cached is not None:
            return cached
        wrapped = [
            ImmutableDOMNode(match)
            for match in self.node.getElementsByTagName(name)
        ]
        self.cachedElementsByTagName[name] = wrapped
        return wrapped

    def getAttribute(self, qName):
        return self.node.getAttribute(qName)

    def toxml(self, encoding=None):
        return self.node.toxml(encoding)

    def toprettyxml(self, indent="", newl="", encoding=None):
        return self.node.toprettyxml(indent, newl, encoding)

    def appendChild(self, node):
        raise Exception("cannot append child to immutable node")

    def removeChild(self, node):
        raise Exception("cannot remove child from immutable node")

    def cloneNode(self, deep):
        raise Exception("clone node not implemented")

    def createElement(self, tagName):
        raise Exception("cannot create element for immutable node")

    def createTextNode(self, tagName):
        raise Exception("cannot create text node for immutable node")

    def createAttribute(self, qName):
        raise Exception("cannot create attribute for immutable node")
所以我决定自己实现一个刚好满足需求的 DOM,代码粘贴在下面,希望对别人有所帮助。它依赖于 "memoization library for python 2.7" 中的 lru_cache,以及 @Raymond Hettinger 在 "Immutable dictionary, only use as a key for another dictionary" 中给出的不可变字典(ImmutableDict)。不过,如果您不介意牺牲一些安全性/性能,这些依赖项很容易去掉。
class CycleFreeDOMNode(object):
    """Copy of a minidom node and its subtree without parent references.

    Each node stores only its own fields, a tuple of child nodes and an
    index of its direct children keyed by tag name, so the object graph
    contains no parent/child cycles.
    """

    def __init__(self, minidomNode=None):
        """Copy *minidomNode* and, recursively, all of its children.

        Called without an argument the instance is left without any
        fields; cloneNode() uses this and fills them in itself.

        Raises:
            ValueError: if *minidomNode* is not a minidom Node, or if it
                has both a nodeValue and child nodes.
        """
        if minidomNode is None:
            return
        if not isinstance(minidomNode, xml.dom.minidom.Node):
            raise ValueError("%s needs to be instantiated with a minidom.Node" %(
                type(self).__name__
            ))
        if minidomNode.nodeValue and minidomNode.childNodes:
            raise ValueError(
                "both nodeValue and childNodes in same node are not supported"
            )
        # Nodes without a tagName attribute are stored with tagName None.
        self._tagName = getattr(minidomNode, "tagName", None)
        self._nodeType = minidomNode.nodeType
        self._nodeName = minidomNode.nodeName
        self._nodeValue = minidomNode.nodeValue
        sourceAttributes = minidomNode.attributes
        self._attributes = (
            dict(sourceAttributes.items()) if sourceAttributes else {}
        )
        self._childNodes = tuple(
            CycleFreeDOMNode(cn) for cn in minidomNode.childNodes
        )
        self._childNodesByTagName = self._indexByTagName(self._childNodes)

    @staticmethod
    def _indexByTagName(nodes):
        """Group *nodes* by tagName into an ImmutableDict of lists."""
        grouped = defaultdict(list)
        for node in nodes:
            grouped[node.tagName].append(node)
        return ImmutableDict(grouped)

    @property
    def nodeType(self):
        return self._nodeType

    @property
    def tagName(self):
        return self._tagName

    @property
    def nodeName(self):
        return self._nodeName

    @property
    def nodeValue(self):
        return self._nodeValue

    @property
    def attributes(self):
        return self._attributes

    @property
    def firstChild(self):
        """First child node, or None when there are no children."""
        return self._childNodes[0] if self._childNodes else None

    @property
    def childNodes(self):
        return self._childNodes

    # NOTE(review): lru_cache comes from outside this block. If it behaves
    # like functools.lru_cache, the cache is shared by all instances and
    # keeps up to maxsize (node, name) entries alive - confirm that is
    # intended.
    @lru_cache(maxsize = 100)
    def getElementsByTagName(self, name):
        """Return a list of all descendants whose tagName equals *name*.

        Matching direct children come first, followed by the matches
        found below each child in turn. This is not document order.
        """
        # Start from a copy: extending the indexed list in place would add
        # descendants to the direct-children index on every lookup.
        result = list(self._childNodesByTagName.get(name, ()))
        for cn in self.childNodes:
            result.extend(cn.getElementsByTagName(name))
        return result

    def cloneNode(self, deep=False):
        """Return a copy of this node with its own attribute dict.

        With *deep* true the children are cloned recursively and indexed
        again; otherwise the clone shares the child nodes and the
        tag-name index with this node.
        """
        clone = CycleFreeDOMNode()
        clone._tagName = self._tagName
        clone._nodeType = self._nodeType
        clone._nodeName = self._nodeName
        clone._nodeValue = self._nodeValue
        # Separate dict so setAttribute() on the clone leaves this node alone.
        clone._attributes = dict(self._attributes)
        if deep:
            clone._childNodes = tuple(
                cn.cloneNode(deep) for cn in self.childNodes
            )
            clone._childNodesByTagName = self._indexByTagName(
                clone._childNodes
            )
        else:
            clone._childNodes = tuple(self.childNodes)
            clone._childNodesByTagName = self._childNodesByTagName
        return clone

    def toxml(self):
        """Serialize this node and its subtree to an XML string.

        A node with a falsy tagName contributes only its content (its
        escaped nodeValue, or else its serialized children). Any other
        node is written as ``<tag attr="value" ...>content</tag>``.
        """
        # Imported here so the block does not depend on a file-level import.
        from xml.sax.saxutils import escape, quoteattr

        if self.nodeValue:
            content = escape(self.nodeValue)
        else:
            content = "".join([cn.toxml() for cn in self.childNodes])
        if not self.tagName:
            return content
        # Attributes are separated by whitespace (not commas) and their
        # values quoted and escaped, so the output is well-formed XML.
        attributeText = "".join([
            " %s=%s" % (key, quoteattr("%s" % (value,)))
            for key, value in self.attributes.items()
        ])
        return "<%s%s>%s</%s>" % (
            self.tagName,
            attributeText,
            content,
            self.tagName,
        )

    def getAttribute(self, name):
        """Return the value of attribute *name*, or "" when it is not set."""
        return self._attributes.get(name, "")

    def setAttribute(self, name, value):
        """Set attribute *name* to *value* on this node."""
        self._attributes[name] = value