从 Objectified XML 中高效地获取多个切片

Obtain multiple slices efficiently from Objectified XML

如果我有一个 XML 并使用 lxml 进行对象化,我如何有效地获取列表的切片?

我的脚本。

# from lxml import etree
from lxml import objectify
import argparse
import os

# Command-line interface: one or more positional paths plus an optional
# file-extension option.
parser = argparse.ArgumentParser()
parser.add_argument("path", type=str, nargs="+")
parser.add_argument('-e',
                    '--extension',
                    default='',
                    help='File extension to filter by.')

args = parser.parse_args()
# Glob-style pattern built from the requested extension.
name_pattern = "*" + args.extension
# Only the first positional path is used as the directory to walk.
my_dir = args.path[0]

# Walk the tree rooted at my_dir.
# NOTE(review): the inner loop rebinds name_pattern to each file name, so the
# pattern built above is never applied as a filter within this block.
# NOTE(review): full_path is overwritten on every iteration and is not used
# within this block.
# NOTE(review): after the loop, file_list stays bound to the file names of
# the last directory visited by os.walk.
for dir_path, subdir_list, file_list in os.walk(my_dir):
    for name_pattern in file_list:
        full_path = os.path.join(dir_path, name_pattern)


def getsMeet(file_list, base_dir=None):
    """Yield a path for each file name, in sorted name order.

    Args:
        file_list: Iterable of bare file names.
        base_dir: Directory the names are relative to. When omitted, the
            module-level ``my_dir`` is used.

    Yields:
        ``base_dir`` joined with each file name.
    """
    if base_dir is None:
        base_dir = my_dir
    for filename in sorted(file_list):
        # os.path.join inserts the separator when base_dir lacks a trailing
        # one; plain string concatenation would fuse "data" + "a.xml".
        yield os.path.join(base_dir, filename)

def parseXML():
    """Objectify each file from ``getsMeet(file_list)`` and print a summary.

    For every file this prints a dump of the first ``nomination`` element
    under ``race``, followed by whether the ``.race.nomination`` path exists
    on the document root.
    """
    for path in getsMeet(file_list):
        # Read raw bytes so the XML parser decodes the document itself; lxml
        # rejects str input that carries an encoding declaration, and text
        # mode would decode with the locale's default encoding.
        with open(path, "rb") as f:
            xml = f.read()

        root = objectify.fromstring(xml)
        print(objectify.dump(root.race.nomination[0]))
        find = objectify.ObjectPath(".race.nomination")
        print(find.hasattr(root))


# Run the parser when the script executes.
parseXML()

XML 的层级结构为:根元素 meeting > club > race > condition | nomination

下面的输出展示了 nomination 元素的结构:

print(objectify.dump(root.race.nomination[0]))

(pyxml) [sayth@localhost pyxml]$ python xrace.py data/ -e .xml
nomination = '' [StringElement]
  * number = '8'
  * saddlecloth = '8'
  * horse = 'Chipanda'
  * id = '198926'
  * idnumber = ''
  * regnumber = ''
  * blinkers = '0'
  * trainernumber = '235'
  * trainersurname = "O'Shea"
  * trainerfirstname = 'John'
  * trainertrack = 'Agnes Banks/Hawkesbury'
  * rsbtrainername = "John O'Shea"
  * jockeynumber = '84015'
  * jockeysurname = 'Avdulla'
  * jockeyfirstname = 'Brenton'
  * barrier = '5'
  * weight = '54'
  * rating = '0'
  * description = 'B F 2 Sepoy x Lobola (Anabaa(USA))'
  * colours = 'Royal Blue'
  * owners = 'Godolphin '
  * dob = '2013-10-08T00:00:00'
  * age = '3'
  * sex = 'F'
  * career = '2-0-0-2 225.00'
  * thistrack = '1-0-0-1 000.00'
  * thisdistance = '0-0-0-0'
  * goodtrack = '0-0-0-0'
  * heavytrack = '0-0-0-0'
  * slowtrack = ''
  * deadtrack = ''
  * fasttrack = '0-0-0-0'
  * firstup = '2-0-0-2 225.00'
  * secondup = '0-0-0-0'
  * mindistancewin = '0'
  * maxdistancewin = '0'
  * finished = '1'
  * weightvariation = '0'
  * variedweight = '54'
  * decimalmargin = '0.00'
  * penalty = '0'
  * pricestarting = '.50'
  * sectional200 = '0'
  * sectional400 = '0'
  * sectional600 = '0'
  * sectional800 = '0'
  * sectional1200 = '0'
  * bonusindicator = 'E'
True

如果我只想返回下面这些属性,应该怎么做?

 * number = '8'
  * saddlecloth = '8'
  * horse = 'Chipanda'
  * id = '198926'
  * barrier = '5'
  * weight = '54'
  * rating = '0'
  * description = 'B F 2 Sepoy x Lobola (Anabaa(USA))'
  * colours = 'Royal Blue'
  * owners = 'Godolphin '
  * dob = '2013-10-08T00:00:00'
  * age = '3'
  * sex = 'F'
  * career = '2-0-0-2 225.00'
  * thistrack = '1-0-0-1 000.00'
  * thisdistance = '0-0-0-0'
  * goodtrack = '0-0-0-0'
  * heavytrack = '0-0-0-0'
  * finished = '1'
  * weightvariation = '0'
  * variedweight = '54'
  * decimalmargin = '0.00'
  * penalty = '0'
  * pricestarting = '.50'

样本XML

<meeting id="42977" barriertrial="0" venue="Rosehill Gardens" date="2016-05-21T00:00:00" gearchanges="-1" stewardsreport="-1" gearlist="-1" racebook="0" postracestewards="0" meetingtype="TAB" rail="Timing - Electronic : Rail - +6m" weather="Fine      " trackcondition="Good 3    " nomsdeadline="2016-05-16T11:00:00" weightsdeadline="2016-05-17T16:00:00" acceptdeadline="2016-05-18T09:00:00" jockeydeadline="2016-05-18T12:00:00">
  <club abbrevname="Australian Turf Club" code="56398" associationclass="1" website="http://" />
  <race id="215411" number="1" nomnumber="9" division="0" name="LES CARLYON AC PLATE" mediumname="2Y-SWP" shortname="2Y-SWP" stage="Results" distance="1200" minweight="0" raisedweight="0" class="~         " age="2         " grade="0" weightcondition="SWP       " trophy="0" owner="0" trainer="0" jockey="0" strapper="0" totalprize="85000" first="48750" second="16750" third="8350" fourth="4150" fifth="2000" time="2016-05-21T11:25:00" bonustype="BOB7      " nomsfee="0" acceptfee="0" trackcondition="Good 3    " timingmethod="Electronic" fastesttime="1-10.22   " sectionaltime="600/34.78 " formavailable="0" racebookprize="Of 000. First 750, second 750, third 50, fourth 50, fifth 00, sixth 00, seventh 00, eighth 00, ninth 00, tenth 00">
    <condition line="1">Of 000. First 750, second 750, third 50, fourth 50, fifth 00, sixth 00, seventh 00, eighth 00, ninth 00, tenth 00</condition>
    <condition line="2">Starter Subsidy: 0 for non-prize earning runners.</condition>
    <condition line="3">No class restriction, Set Weights plus Penalties, For Two-Years-Old, No sex restriction</condition>
    <condition line="4">BOBS Bonus available: ,000</condition>
    <condition line="5">Apprentices can claim. Field Limit: 16 + 4 EM</condition>
    <nomination number="8" saddlecloth="8" horse="Chipanda" id="198926" idnumber="" regnumber="" blinkers="0" trainernumber="235" trainersurname="O'Shea" trainerfirstname="John" trainertrack="Agnes Banks/Hawkesbury" rsbtrainername="John O'Shea" jockeynumber="84015" jockeysurname="Avdulla" jockeyfirstname="Brenton" barrier="5" weight="54" rating="0" description="B F 2 Sepoy x Lobola (Anabaa(USA))" colours="Royal Blue" owners="Godolphin " dob="2013-10-08T00:00:00" age="3" sex="F" career="2-0-0-2 225.00" thistrack="1-0-0-1 000.00" thisdistance="0-0-0-0" goodtrack="0-0-0-0" heavytrack="0-0-0-0" slowtrack="" deadtrack="" fasttrack="0-0-0-0" firstup="2-0-0-2 225.00" secondup="0-0-0-0" mindistancewin="0" maxdistancewin="0" finished="1" weightvariation="0" variedweight="54" decimalmargin="0.00" penalty="0" pricestarting=".50" sectional200="0" sectional400="0" sectional600="0" sectional800="0" sectional1200="0" bonusindicator="E" />
    <nomination number="1" saddlecloth="1" horse="Legerity" id="200769" idnumber="" regnumber="" blinkers="0" trainernumber="77974" trainersurname="Hawkes" trainerfirstname="Michael" trainertrack="Rosehill" rsbtrainername="Michael, Wayne &amp; John Hawkes" jockeynumber="2687" jockeysurname="Reith" jockeyfirstname="Christian" barrier="1" weight="57.5" rating="0" description="B C 2 Snitzel x Simply Spiteful(USA) (Speightstown(USA))" colours="Purple, Gold Checks, Quartered Cap" owners="Highgrove Stud Syndicate (Mgr: R T Gilbert)" dob="2013-08-30T00:00:00" age="3" sex="C" career="4-1-1-1 075.00" thistrack="1-1-0-0 750.00" thisdistance="0-0-0-0" goodtrack="3-1-0-1 150.00" heavytrack="0-0-0-0" slowtrack="" deadtrack="" fasttrack="0-0-0-0" firstup="2-0-1-1 125.00" secondup="2-1-0-0 950.00" mindistancewin="0" maxdistancewin="0" finished="2" weightvariation="0" variedweight="57.5" decimalmargin="0.50" penalty="0" pricestarting=".50F" sectional200="0" sectional400="0" sectional600="0" sectional800="0" sectional1200="0" bonusindicator="E" />
</race>
</meeting>

我可以使用这个 defaultdict 获取所有值,但是我似乎没有正确使用 objectify。

from collections import defaultdict
from pprint import pprint

# Group every attribute value of every nomination by attribute name.
d = defaultdict(list)
# nomItems = ['id', 'horse']
for sample in root.xpath('//race/nomination'):
    for attr_name, attr_value in sample.items():
        d[attr_name].append(attr_value)

pprint(dict(d))

您可以用单个 XPath 表达式一次取出多个属性;但结果只是一个值列表,如果想知道每个值属于哪个属性,就需要按照属性在元素中出现的顺序来对应:

# Select only the number, saddlecloth and horse attributes of every nomination.
values = root.xpath('//race/nomination/@*[name() = "number" or name() = "saddlecloth" or name() = "horse"]')

另一种选择是从 attrib 字典中提取,使用 operator.itemgetter:

from operator import itemgetter

# Attribute names to extract from each nomination, in output order.
atts = ("number", "id", "horse", "saddlecloth", "barrier", "weight", "rating", "description", "colours",
    "owners", "dob", "age", "sex", "career", "thistrack", "thisdistance", "goodtrack", "heavytrack",
    "finished", "weightvariation", "variedweight", "decimalmargin", "penalty", "pricestarting")

# Fetch all wanted attributes of a nomination in one call and pair each
# value with its name.
for sample in root.xpath('//race/nomination'):
    print(dict(zip(atts, itemgetter(*atts)(sample.attrib))))

对于您的示例 xml 将输出:

{'thistrack': '1-0-0-1 000.00', 'rating': '0', 'weight': '54', 'number': '8', 'sex': 'F', 'id': '198926', 'penalty': '0', 'horse': 'Chipanda', 'pricestarting': '.50', 'colours': 'Royal Blue', 'saddlecloth': '8', 'description': 'B F 2 Sepoy x Lobola (Anabaa(USA))', 'barrier': '5', 'weightvariation': '0', 'finished': '1', 'variedweight': '54', 'goodtrack': '0-0-0-0', 'owners': 'Godolphin ', 'decimalmargin': '0.00', 'dob': '2013-10-08T00:00:00', 'thisdistance': '0-0-0-0', 'age': '3', 'heavytrack': '0-0-0-0', 'career': '2-0-0-2 225.00'}
{'thistrack': '1-1-0-0 750.00', 'rating': '0', 'weight': '57.5', 'number': '1', 'sex': 'C', 'id': '200769', 'penalty': '0', 'horse': 'Legerity', 'pricestarting': '.50F', 'colours': 'Purple, Gold Checks, Quartered Cap', 'saddlecloth': '1', 'description': 'B C 2 Snitzel x Simply Spiteful(USA) (Speightstown(USA))', 'barrier': '1', 'weightvariation': '0', 'finished': '2', 'variedweight': '57.5', 'goodtrack': '3-1-0-1 150.00', 'owners': 'Highgrove Stud Syndicate (Mgr: R T Gilbert)', 'decimalmargin': '0.50', 'dob': '2013-08-30T00:00:00', 'thisdistance': '0-0-0-0', 'age': '3', 'heavytrack': '0-0-0-0', 'career': '4-1-1-1 075.00'}

或者,如果您想用 defaultdict 按属性名分组:

from collections import defaultdict
from operator import itemgetter

# Maps each attribute name to the list of its values, one per nomination.
d = defaultdict(list)

for nomination in root.xpath('//race/nomination'):
    # Pull every wanted attribute in a single call, then file each value
    # under its name.
    values = itemgetter(*atts)(nomination.attrib)
    for name, value in zip(atts, values):
        d[name].append(value)

print(d)

这会输出:

 defaultdict(<type 'list'>, {'thistrack': ['1-0-0-1 000.00', '1-1-0-0 750.00'], 'rating': ['0', '0'], 'weight': ['54', '57.5'], 'number': ['8', '1'], 'sex': ['F', 'C'], 'id': ['198926', '200769'], 'penalty': ['0', '0'], 'horse': ['Chipanda', 'Legerity'], 'pricestarting': ['.50', '.50F'], 'colours': ['Royal Blue', 'Purple, Gold Checks, Quartered Cap'], 'saddlecloth': ['8', '1'], 'description': ['B F 2 Sepoy x Lobola (Anabaa(USA))', 'B C 2 Snitzel x Simply Spiteful(USA) (Speightstown(USA))'], 'barrier': ['5', '1'], 'weightvariation': ['0', '0'], 'finished': ['1', '2'], 'variedweight': ['54', '57.5'], 'goodtrack': ['0-0-0-0', '3-1-0-1 150.00'], 'owners': ['Godolphin ', 'Highgrove Stud Syndicate (Mgr: R T Gilbert)'], 'decimalmargin': ['0.00', '0.50'], 'dob': ['2013-10-08T00:00:00', '2013-08-30T00:00:00'], 'thisdistance': ['0-0-0-0', '0-0-0-0'], 'age': ['3', '3'], 'heavytrack': ['0-0-0-0', '0-0-0-0'], 'career': ['2-0-0-2 225.00', '4-1-1-1 075.00']})

或者:

# Maps each attribute name to the list of its values, one per nomination.
d = defaultdict(list)

for nomination in root.xpath('//race/nomination'):
    attributes = nomination.attrib
    # Direct indexing: a nomination lacking a wanted attribute raises KeyError.
    for name in atts:
        value = attributes[name]
        d[name].append(value)

print(d)

如果希望任何缺失的键/属性取默认值 None:

for nomination in root.xpath('//race/nomination'):
    # .get returns None for an attribute the element does not have.
    lookup = nomination.attrib.get
    print({name: lookup(name) for name in atts})

dict 查找的复杂度为 O(1),因此我认为您不会获得更有效的方法来提取所需的属性。