解析 xml 文件并提供包含检索到的数据的对象列表
Parsing an xml file and feed a list of objects with retrieved data
编辑:我在帖子底部添加了一个可以运行的解决方案,供大家审查。
每次碰到 xml,我都想拿头撞墙。以往通常只是需要写出一个文件,我最终也摸索出了应对各种不一致之处的办法,但这次我必须解析一个文档。
场景如下:我有一个 xml 文件列出游戏,每个游戏都有一些属性(或子节点?实际上我不确定)。我想要的是:
For each game:
Get its path, name, and genre
Build a Game object with this
Store the object in an array list
我理解 "findall" 命令,但我不明白如何把查到的数据彼此关联起来。既然它是一棵树,我想我应该能够逐个遍历游戏节点,取出我需要的数据,然后继续处理下一个游戏,但是见鬼,我卡住了。
这里是我需要解析的 xml 文件的摘录:
<?xml version="1.0"?>
<gameList>
<provider>
<System>Megadrive</System>
<software>Skraper</software>
<database>ScreenScraper.fr</database>
<web>http://www.screenscraper.fr</web>
</provider>
<game id="574" source="ScreenScraper.fr">
<path>./3 Ninjas Kick Back.zip</path>
<name>3 Ninjas Kick Back</name>
<genre>Platform-Action</genre>
</game>
<game id="394" source="ScreenScraper.fr">
<path>./688 Attack Sub.zip</path>
<name>688 Attack Sub</name>
<genre>Simulation</genre>
</game>
</gameList>
这是我目前的代码,还处于沙箱里的试验阶段:
import os
from xml.etree import ElementTree
class GameListParser:
    """Load a ``gamelist.xml`` file and expose the games it describes.

    The file is parsed once, at construction time. In the sample document
    each ``<game>`` element sits directly under the root and stores its
    name, genre and path as *child elements* (not XML attributes), so the
    values are read with ``Element.findtext`` instead of ``Element.attrib``.
    """

    GAMELIST_FILE = 'gamelist.xml'
    GAMELIST_KEY = "gameList"
    GAME_KEY = "game"
    GENRE_KEY = "genre"
    PATH_KEY = "path"
    NAME_KEY = "name"
    # Expected document layout: root tag -> game tag -> child tags to read.
    keys_map = {
        GAMELIST_KEY: {
            GAME_KEY: [NAME_KEY, GENRE_KEY, PATH_KEY]
        }
    }

    def __init__(self, gamelist_path):
        """Parse ``<gamelist_path>/gamelist.xml``.

        Args:
            gamelist_path: Directory that contains the game list file.

        Raises:
            OSError: If the file cannot be opened.
            xml.etree.ElementTree.ParseError: If the file is not valid XML.
        """
        self.gamelist = os.path.join(gamelist_path, self.GAMELIST_FILE)
        self.parsed_gamelist = None
        self.__parse()

    def __parse(self):
        """Read the XML file into an ``ElementTree``."""
        self.parsed_gamelist = ElementTree.parse(self.gamelist)

    def __get_root(self):
        """Return the root element of the parsed document."""
        return self.parsed_gamelist.getroot()

    def get_all_games(self):
        """Return the ``<game>`` elements that are direct children of the root."""
        return self.parsed_gamelist.findall(self.GAME_KEY)

    def get_games_details(self):
        """Return one ``{tag: text}`` dict per game, in document order.

        The tags looked up are the ones listed in ``keys_map``. A child
        element that is missing or has no text yields an empty string.
        """
        wanted_tags = self.keys_map[self.GAMELIST_KEY][self.GAME_KEY]
        return [
            {tag: game.findtext(tag, default="") for tag in wanted_tags}
            for game in self.get_all_games()
        ]

    def print_games_details(self):
        """Print the name, genre and path of every game, one game per line."""
        # The previous version used Element.getiterator() (removed in
        # Python 3.9) and looked the values up in ``attrib``, which only
        # holds XML attributes such as ``id`` -- hence the ``None`` results.
        for details in self.get_games_details():
            print("{0} ({1}): {2}".format(
                details[self.NAME_KEY],
                details[self.GENRE_KEY],
                details[self.PATH_KEY],
            ))
我只是想用 print_games_details 方法打印游戏数据,但实际上 node 和 game 是同一个对象,所以 name 和 genre 都是 None,我没能取到我需要的数据。
我很确定这其实很简单,但我这辈子只用过三四次 xml:唯一一次需要把它解析成对象是用 C++,当时是对整个系统做彻底重构;另外两次分别是在 Matlab 和 Python 中把对象写入 xml 文件。每次我都很难理解树的逻辑,不知道该如何解析或创建它,网上的资料对我也帮助不大。
编辑:所以我研究了一个解决方案,虽然它给了我预期的结果,但我对它一点也不满意。
我的问题在于,这个解决方案要求我事先非常清楚 xml 文件的结构,然后只是按照这个结构逐层遍历。
我无法用它做一些通用的事情,这是我对 xml 方法的主要担忧之一。
如果你们中的任何人可以审阅以下代码并提供反馈和改进,我将不胜感激:
import os
from xml.etree import ElementTree
class GameListParser:
    """Parse a ``gamelist.xml`` file into a ``{game id: details}`` mapping.

    After construction, ``game_map`` maps the ``id`` attribute of every
    ``<game>`` element found directly under the root to a dict holding the
    text of the child elements listed in ``keys_map``. Games without an
    ``id`` attribute are skipped.
    """

    GAMELIST_FILE = 'gamelist.xml'
    GAME_ID = 'id'
    GAME_KEY = "game"
    GENRE_KEY = "genre"
    PATH_KEY = "path"
    NAME_KEY = "name"
    # Child tags collected for every game, in output order.
    keys_map = [NAME_KEY, GENRE_KEY, PATH_KEY]

    def __init__(self, gamelist_path):
        """Parse ``<gamelist_path>/gamelist.xml`` and fill ``game_map``.

        Args:
            gamelist_path: Directory that contains the game list file.

        Raises:
            OSError: If the file cannot be opened.
            xml.etree.ElementTree.ParseError: If the file is not valid XML.
        """
        self.gamelist = os.path.join(gamelist_path, self.GAMELIST_FILE)
        self.parsed_gamelist = None
        # Per-instance storage: a class-level dict would be shared by every
        # parser, so games from one file would leak into all the others.
        self.game_map = {}
        self.__parse()

    def __str__(self):
        """Return a multi-line, human-readable listing of all games."""
        text_output = []
        for game_id, game in self.game_map.items():
            text_output.append("Game " + game_id + " has properties:")
            text_output.extend(
                key + ": " + value for key, value in game.items()
            )
            text_output.append("\n")
        return "\n".join(text_output)

    def __get_game_id(self, game):
        """Return the ``id`` attribute of *game*, or ``None`` if absent."""
        return game.get(self.GAME_ID)

    def __game_is_valid(self, game):
        """Return ``True`` when *game* carries an ``id`` attribute."""
        return self.__get_game_id(game) is not None

    def __get_all_games(self):
        """Return the ``<game>`` elements that are direct children of the root."""
        return self.parsed_gamelist.findall(self.GAME_KEY)

    def __process_all_games(self):
        """Record the details of every game element in ``game_map``."""
        for game in self.__get_all_games():
            self.__process_game_nodes(game)

    def __process_game_nodes(self, game):
        """Store the wanted child texts of *game* under its id."""
        if not self.__game_is_valid(game):
            return
        # findtext() yields "" both for a missing child (via ``default``) and
        # for an empty one such as <genre/>, so every stored value is a str
        # and __str__ can concatenate it safely.
        self.game_map[self.__get_game_id(game)] = {
            key: game.findtext(key, default="") for key in self.keys_map
        }

    def __parse(self):
        """Read the XML file, then extract the games from it."""
        self.parsed_gamelist = ElementTree.parse(self.gamelist)
        self.__process_all_games()
推荐一个第三方库:SimplifiedDoc。安装命令:pip install -U simplified_scrapy
from simplified_scrapy import SimplifiedDoc
# Sample game list embedded verbatim as the input document.
html = '''
<?xml version="1.0"?>
<gameList>
<provider>
<System>Megadrive</System>
<software>Skraper</software>
<database>ScreenScraper.fr</database>
<web>http://www.screenscraper.fr</web>
</provider>
<game id="574" source="ScreenScraper.fr">
<path>./3 Ninjas Kick Back.zip</path>
<name>3 Ninjas Kick Back</name>
<genre>Platform-Action</genre>
</game>
<game id="394" source="ScreenScraper.fr">
<path>./688 Attack Sub.zip</path>
<name>688 Attack Sub</name>
<genre>Simulation</genre>
</game>
</gameList>
'''
doc = SimplifiedDoc(html)
games = doc.gameList.games
# One [path, name, genre] row per game node.
datas = [
    [entry.path.text, entry.name.text, entry.genre.text]
    for entry in games
]
print(datas)
结果:
[['./3 Ninjas Kick Back.zip', '3 Ninjas Kick Back', 'Platform-Action'], ['./688 Attack Sub.zip', '688 Attack Sub', 'Simulation']]
这里有更多例子:https://github.com/yiyedata/simplified-scrapy-demo/tree/master/doc_examples
编辑:我在帖子底部添加了一个可以运行的解决方案,供大家审查。
每次碰到 xml,我都想拿头撞墙。以往通常只是需要写出一个文件,我最终也摸索出了应对各种不一致之处的办法,但这次我必须解析一个文档。
场景如下:我有一个 xml 文件列出游戏,每个游戏都有一些属性(或子节点?实际上我不确定)。我想要的是:
For each game:
Get its path, name, and genre
Build a Game object with this
Store the object in an array list
我理解 "findall" 命令,但我不明白如何把查到的数据彼此关联起来。既然它是一棵树,我想我应该能够逐个遍历游戏节点,取出我需要的数据,然后继续处理下一个游戏,但是见鬼,我卡住了。
这里是我需要解析的 xml 文件的摘录:
<?xml version="1.0"?>
<gameList>
<provider>
<System>Megadrive</System>
<software>Skraper</software>
<database>ScreenScraper.fr</database>
<web>http://www.screenscraper.fr</web>
</provider>
<game id="574" source="ScreenScraper.fr">
<path>./3 Ninjas Kick Back.zip</path>
<name>3 Ninjas Kick Back</name>
<genre>Platform-Action</genre>
</game>
<game id="394" source="ScreenScraper.fr">
<path>./688 Attack Sub.zip</path>
<name>688 Attack Sub</name>
<genre>Simulation</genre>
</game>
</gameList>
这是我目前的代码,还处于沙箱里的试验阶段:
import os
from xml.etree import ElementTree
class GameListParser:
    """Load a ``gamelist.xml`` file and expose the games it describes.

    The file is parsed once, at construction time. In the sample document
    each ``<game>`` element sits directly under the root and stores its
    name, genre and path as *child elements* (not XML attributes), so the
    values are read with ``Element.findtext`` instead of ``Element.attrib``.
    """

    GAMELIST_FILE = 'gamelist.xml'
    GAMELIST_KEY = "gameList"
    GAME_KEY = "game"
    GENRE_KEY = "genre"
    PATH_KEY = "path"
    NAME_KEY = "name"
    # Expected document layout: root tag -> game tag -> child tags to read.
    keys_map = {
        GAMELIST_KEY: {
            GAME_KEY: [NAME_KEY, GENRE_KEY, PATH_KEY]
        }
    }

    def __init__(self, gamelist_path):
        """Parse ``<gamelist_path>/gamelist.xml``.

        Args:
            gamelist_path: Directory that contains the game list file.

        Raises:
            OSError: If the file cannot be opened.
            xml.etree.ElementTree.ParseError: If the file is not valid XML.
        """
        self.gamelist = os.path.join(gamelist_path, self.GAMELIST_FILE)
        self.parsed_gamelist = None
        self.__parse()

    def __parse(self):
        """Read the XML file into an ``ElementTree``."""
        self.parsed_gamelist = ElementTree.parse(self.gamelist)

    def __get_root(self):
        """Return the root element of the parsed document."""
        return self.parsed_gamelist.getroot()

    def get_all_games(self):
        """Return the ``<game>`` elements that are direct children of the root."""
        return self.parsed_gamelist.findall(self.GAME_KEY)

    def get_games_details(self):
        """Return one ``{tag: text}`` dict per game, in document order.

        The tags looked up are the ones listed in ``keys_map``. A child
        element that is missing or has no text yields an empty string.
        """
        wanted_tags = self.keys_map[self.GAMELIST_KEY][self.GAME_KEY]
        return [
            {tag: game.findtext(tag, default="") for tag in wanted_tags}
            for game in self.get_all_games()
        ]

    def print_games_details(self):
        """Print the name, genre and path of every game, one game per line."""
        # The previous version used Element.getiterator() (removed in
        # Python 3.9) and looked the values up in ``attrib``, which only
        # holds XML attributes such as ``id`` -- hence the ``None`` results.
        for details in self.get_games_details():
            print("{0} ({1}): {2}".format(
                details[self.NAME_KEY],
                details[self.GENRE_KEY],
                details[self.PATH_KEY],
            ))
我只是想用 print_games_details 方法打印游戏数据,但实际上 node 和 game 是同一个对象,所以 name 和 genre 都是 None,我没能取到我需要的数据。
我很确定这其实很简单,但我这辈子只用过三四次 xml:唯一一次需要把它解析成对象是用 C++,当时是对整个系统做彻底重构;另外两次分别是在 Matlab 和 Python 中把对象写入 xml 文件。每次我都很难理解树的逻辑,不知道该如何解析或创建它,网上的资料对我也帮助不大。
编辑:我研究出了一个解决方案,虽然它给出了我预期的结果,但我对它一点也不满意。我的问题在于,这个解决方案要求我事先非常清楚 xml 文件的结构,然后只是按照这个结构逐层遍历。
我无法用它做一些通用的事情,这是我对 xml 方法的主要担忧之一。
如果你们中的任何人可以审阅以下代码并提供反馈和改进,我将不胜感激:
import os
from xml.etree import ElementTree
class GameListParser:
    """Parse a ``gamelist.xml`` file into a ``{game id: details}`` mapping.

    After construction, ``game_map`` maps the ``id`` attribute of every
    ``<game>`` element found directly under the root to a dict holding the
    text of the child elements listed in ``keys_map``. Games without an
    ``id`` attribute are skipped.
    """

    GAMELIST_FILE = 'gamelist.xml'
    GAME_ID = 'id'
    GAME_KEY = "game"
    GENRE_KEY = "genre"
    PATH_KEY = "path"
    NAME_KEY = "name"
    # Child tags collected for every game, in output order.
    keys_map = [NAME_KEY, GENRE_KEY, PATH_KEY]

    def __init__(self, gamelist_path):
        """Parse ``<gamelist_path>/gamelist.xml`` and fill ``game_map``.

        Args:
            gamelist_path: Directory that contains the game list file.

        Raises:
            OSError: If the file cannot be opened.
            xml.etree.ElementTree.ParseError: If the file is not valid XML.
        """
        self.gamelist = os.path.join(gamelist_path, self.GAMELIST_FILE)
        self.parsed_gamelist = None
        # Per-instance storage: a class-level dict would be shared by every
        # parser, so games from one file would leak into all the others.
        self.game_map = {}
        self.__parse()

    def __str__(self):
        """Return a multi-line, human-readable listing of all games."""
        text_output = []
        for game_id, game in self.game_map.items():
            text_output.append("Game " + game_id + " has properties:")
            text_output.extend(
                key + ": " + value for key, value in game.items()
            )
            text_output.append("\n")
        return "\n".join(text_output)

    def __get_game_id(self, game):
        """Return the ``id`` attribute of *game*, or ``None`` if absent."""
        return game.get(self.GAME_ID)

    def __game_is_valid(self, game):
        """Return ``True`` when *game* carries an ``id`` attribute."""
        return self.__get_game_id(game) is not None

    def __get_all_games(self):
        """Return the ``<game>`` elements that are direct children of the root."""
        return self.parsed_gamelist.findall(self.GAME_KEY)

    def __process_all_games(self):
        """Record the details of every game element in ``game_map``."""
        for game in self.__get_all_games():
            self.__process_game_nodes(game)

    def __process_game_nodes(self, game):
        """Store the wanted child texts of *game* under its id."""
        if not self.__game_is_valid(game):
            return
        # findtext() yields "" both for a missing child (via ``default``) and
        # for an empty one such as <genre/>, so every stored value is a str
        # and __str__ can concatenate it safely.
        self.game_map[self.__get_game_id(game)] = {
            key: game.findtext(key, default="") for key in self.keys_map
        }

    def __parse(self):
        """Read the XML file, then extract the games from it."""
        self.parsed_gamelist = ElementTree.parse(self.gamelist)
        self.__process_all_games()
推荐一个第三方库:SimplifiedDoc。安装命令:pip install -U simplified_scrapy
from simplified_scrapy import SimplifiedDoc
# Sample game list embedded verbatim as the input document.
html = '''
<?xml version="1.0"?>
<gameList>
<provider>
<System>Megadrive</System>
<software>Skraper</software>
<database>ScreenScraper.fr</database>
<web>http://www.screenscraper.fr</web>
</provider>
<game id="574" source="ScreenScraper.fr">
<path>./3 Ninjas Kick Back.zip</path>
<name>3 Ninjas Kick Back</name>
<genre>Platform-Action</genre>
</game>
<game id="394" source="ScreenScraper.fr">
<path>./688 Attack Sub.zip</path>
<name>688 Attack Sub</name>
<genre>Simulation</genre>
</game>
</gameList>
'''
doc = SimplifiedDoc(html)
games = doc.gameList.games
# One [path, name, genre] row per game node.
datas = [
    [entry.path.text, entry.name.text, entry.genre.text]
    for entry in games
]
print(datas)
结果:
[['./3 Ninjas Kick Back.zip', '3 Ninjas Kick Back', 'Platform-Action'], ['./688 Attack Sub.zip', '688 Attack Sub', 'Simulation']]
这里有更多例子:https://github.com/yiyedata/simplified-scrapy-demo/tree/master/doc_examples