SAX 混淆 XML 中同名的两个节点
SAX Confusing Two Nodes With Same Name in XML
我有一个 XML 需要使用 SAX 在 python 中解析。
这是我的一小部分 XML :
<MovieRating>
<Movie Id="1">
<Title>Father Figures</Title>
<Duration>01:53:00</Duration>
<Description>Upon learning that their mother has been lying to them for years about their allegedly deceased father, two fraternal twin brothers hit the road in order to find him.</Description>
<Release_Date>2017-12-22</Release_Date>
<Image_URL>https://image.com/1.jpg</Image_URL>
<Country>USA</Country>
<Genres>
<Genre Id="5">
<Title>Comedy</Title>
<Description>None</Description>
</Genre>
</Genres>
</Movie>
</MovieRating>
我正在尝试解析它并将其保存到数据库中,具体如下:
import sqlite3
import xml.sax
class MoviesHandler(xml.sax.ContentHandler):
    """Load <Movie> elements from a SAX event stream into the SQLite Movie table.

    Only the name of the most recently opened field element is remembered,
    not the element it is nested in, so text from a same-named element deeper
    inside <Movie> (for example <Genre>/<Title>) is appended to the movie's
    own value of that name.
    """

    def __init__(self):
        self.conn = None
        self.sql_attrs = dict()
        self.sql_attr_name = None

    def startDocument(self):
        """Open moviez_sax.db and recreate an empty Movie table."""
        self.conn = sqlite3.connect('moviez_sax.db')
        cursor = self.conn.cursor()
        cursor.execute('DROP TABLE IF EXISTS MOVIE')
        create_sql = '''
CREATE TABLE IF NOT EXISTS Movie (
Id INTEGER NOT NULL,
Title VARCHAR (1000) NOT NULL,
Duration TIME NOT NULL,
Description VARCHAR (5000),
Release_Date DATE NOT NULL,
Image_URL VARCHAR (1000),
Country VARCHAR (150),
PRIMARY KEY (Id)
);'''
        cursor.execute(create_sql)

    def endDocument(self):
        """Commit the inserted rows and close the database connection."""
        self.conn.commit()
        self.conn.close()

    def startElement(self, xml_name, xml_attrs):
        """Start a fresh record on <Movie>; otherwise remember known field tags."""
        tag = xml_name.lower()
        if tag == 'movie':
            self.sql_attr_name = None
            self.sql_attrs = dict.fromkeys(
                ('Id', 'Title', 'Duration', 'Description',
                 'Release_Date', 'Image_URL', 'Country'),
                '')
            self.sql_attrs['Id'] += xml_attrs['Id']
            return
        if tag in ('id', 'title', 'duration', 'description',
                   'release_date', 'image_url', 'country'):
            # The tag is stored in its original spelling and later used
            # directly as the key into sql_attrs.
            self.sql_attr_name = xml_name

    def characters(self, text):
        """Append text to the value of the most recently opened field."""
        field = self.sql_attr_name
        if field is None:
            return
        self.sql_attrs[field] += text

    def endElement(self, xml_name):
        """Insert the collected values as one row when </Movie> is reached."""
        if xml_name.lower() != 'movie':
            return
        columns = ('Id', 'Title', 'Duration', 'Description',
                   'Release_Date', 'Image_URL', 'Country')
        values = tuple(self.sql_attrs[column].strip() for column in columns)
        insert_sql = ('INSERT INTO MOVIE(Id,Title,Duration,'
                      'Description,Release_Date,Image_URL, Country) VALUES '
                      '(?,?,?,?,?,?,?)')
        self.conn.cursor().execute(insert_sql, values)
if __name__ == '__main__':
    parser = xml.sax.make_parser()
    # The handler class in this script is named MoviesHandler; the previous
    # MovieRatingHandler() call referred to a name that is never defined.
    parser.setContentHandler(MoviesHandler())
    # Binary mode hands raw bytes to the XML parser instead of text decoded
    # with the platform's default encoding; `with` closes the file even if
    # parsing raises.
    with open('movies.xml', 'rb') as xml_file:
        parser.parse(xml_file)
我的问题是:每当我读取 self.sql_attrs['Title'].strip() 时,它会同时读到两个节点(Movie 和 Genre)下的 Title 节点,
并把这两个值拼接在一起保存。例如在上面的例子中,我得到的标题值是:
Father Figures \n Comedy
有没有办法告诉 SAX 要读取哪个节点下的 Title,也就是指定要读取的节点路径?因为我有许多同名的节点,但我想分别读取 "Movie" 和 "Genre",并将它们保存到两个不同的表中。
谢谢。
如果 genre 和 movie 都需要记录,那么您可以使用栈(stack/list)来压入当前所在的节点。
在这种情况下,您还应该为每个节点维护一个字典。下面只是对您代码的一次更新,并不是最高效的写法,但展示了如何扩展您的示例:
import sqlite3
import xml.sax
class MoviesHandler(xml.sax.ContentHandler):
    """SAX handler that stores every <Movie> element in the SQLite Movie table.

    A stack of open record elements (<Movie>, <Genre>) decides which record
    receives character data, so a <Title> inside <Genre> is kept apart from
    the movie's own <Title>. Genre values are collected in ``sql_attrs`` but
    are not written to the database by this class.
    """

    # Lower-cased tag name -> key used in the record dictionaries. Tags are
    # matched case-insensitively, so the key must not depend on how the
    # document happens to spell the tag.
    _FIELD_NAMES = {
        'id': 'Id',
        'title': 'Title',
        'duration': 'Duration',
        'description': 'Description',
        'release_date': 'Release_Date',
        'image_url': 'Image_URL',
        'country': 'Country',
    }
    _MOVIE_COLUMNS = ('Id', 'Title', 'Duration', 'Description',
                      'Release_Date', 'Image_URL', 'Country')
    _GENRE_COLUMNS = ('Title', 'Description')

    def __init__(self):
        super().__init__()
        # Key of the field currently receiving text, or None.
        self.sql_attr_name = None
        # Stack of open record element names; the last entry is the innermost.
        self.nodes = []
        # Record element name -> {field key: accumulated text}.
        self.sql_attrs = dict()
        self.conn = None

    def startDocument(self):
        """Open moviez_sax.db and recreate an empty Movie table."""
        self.conn = sqlite3.connect('moviez_sax.db')
        c = self.conn.cursor()
        c.execute('DROP TABLE IF EXISTS MOVIE')
        c.execute('''
            CREATE TABLE IF NOT EXISTS Movie (
                Id INTEGER NOT NULL,
                Title VARCHAR (1000) NOT NULL,
                Duration TIME NOT NULL,
                Description VARCHAR (5000),
                Release_Date DATE NOT NULL,
                Image_URL VARCHAR (1000),
                Country VARCHAR (150),
                PRIMARY KEY (Id)
            );''')

    def endDocument(self):
        """Commit the inserted rows and close the database connection."""
        self.conn.commit()
        self.conn.close()

    def startElement(self, xml_name, xml_attrs):
        """Open a new record for <Movie>/<Genre>, or select the field to fill.

        Raises:
            KeyError: if a <Movie> element has no ``Id`` attribute.
        """
        tag = xml_name.lower()
        if tag == 'genre':
            self.nodes.append(xml_name)
            self.sql_attr_name = None
            self.sql_attrs[xml_name] = dict.fromkeys(self._GENRE_COLUMNS, '')
        elif tag == 'movie':
            self.nodes.append(xml_name)
            self.sql_attr_name = None
            self.sql_attrs[xml_name] = dict.fromkeys(self._MOVIE_COLUMNS, '')
            self.sql_attrs[xml_name]['Id'] += xml_attrs['Id']
        else:
            # Unknown tags map to None, which makes characters() ignore
            # their text.
            self.sql_attr_name = self._FIELD_NAMES.get(tag)

    def characters(self, text):
        """Append text to the current field of the innermost open record."""
        if self.sql_attr_name is None or not self.nodes:
            return
        record = self.sql_attrs[self.nodes[-1]]
        # Skip fields the current record has no slot for (e.g. <Duration>
        # inside <Genre>) instead of failing with KeyError.
        if self.sql_attr_name in record:
            record[self.sql_attr_name] += text

    def endElement(self, xml_name):
        """Close the current field; insert the movie row on </Movie>."""
        # Once an element is closed, following text (such as the whitespace
        # between tags) no longer belongs to the field opened before it.
        self.sql_attr_name = None
        tag = xml_name.lower()
        if tag in ('movie', 'genre'):
            self.nodes.pop()
        if tag == 'movie':
            movie = self.sql_attrs[xml_name]
            c = self.conn.cursor()
            c.execute(
                'INSERT INTO MOVIE(Id,Title,Duration,'
                'Description,Release_Date,Image_URL, Country) VALUES '
                '(?,?,?,?,?,?,?)',
                tuple(movie[column].strip() for column in self._MOVIE_COLUMNS))
if __name__ == '__main__':
    parser = xml.sax.make_parser()
    parser.setContentHandler(MoviesHandler())
    # Binary mode hands raw bytes to the XML parser instead of text decoded
    # with the platform's default encoding; `with` closes the file even if
    # parsing raises.
    with open('movies.xml', 'rb') as xml_file:
        parser.parse(xml_file)
如您所见,在调试会话中标题是正确的。
我有一个 XML 需要使用 SAX 在 python 中解析。
这是我的一小部分 XML :
<MovieRating>
<Movie Id="1">
<Title>Father Figures</Title>
<Duration>01:53:00</Duration>
<Description>Upon learning that their mother has been lying to them for years about their allegedly deceased father, two fraternal twin brothers hit the road in order to find him.</Description>
<Release_Date>2017-12-22</Release_Date>
<Image_URL>https://image.com/1.jpg</Image_URL>
<Country>USA</Country>
<Genres>
<Genre Id="5">
<Title>Comedy</Title>
<Description>None</Description>
</Genre>
</Genres>
</Movie>
</MovieRating>
我正在尝试解析它并将其保存到数据库中,具体如下:
import sqlite3
import xml.sax
class MoviesHandler(xml.sax.ContentHandler):
    """Load <Movie> elements from a SAX event stream into the SQLite Movie table.

    Only the name of the most recently opened field element is remembered,
    not the element it is nested in, so text from a same-named element deeper
    inside <Movie> (for example <Genre>/<Title>) is appended to the movie's
    own value of that name.
    """

    def __init__(self):
        self.conn = None
        self.sql_attrs = dict()
        self.sql_attr_name = None

    def startDocument(self):
        """Open moviez_sax.db and recreate an empty Movie table."""
        self.conn = sqlite3.connect('moviez_sax.db')
        cursor = self.conn.cursor()
        cursor.execute('DROP TABLE IF EXISTS MOVIE')
        create_sql = '''
CREATE TABLE IF NOT EXISTS Movie (
Id INTEGER NOT NULL,
Title VARCHAR (1000) NOT NULL,
Duration TIME NOT NULL,
Description VARCHAR (5000),
Release_Date DATE NOT NULL,
Image_URL VARCHAR (1000),
Country VARCHAR (150),
PRIMARY KEY (Id)
);'''
        cursor.execute(create_sql)

    def endDocument(self):
        """Commit the inserted rows and close the database connection."""
        self.conn.commit()
        self.conn.close()

    def startElement(self, xml_name, xml_attrs):
        """Start a fresh record on <Movie>; otherwise remember known field tags."""
        tag = xml_name.lower()
        if tag == 'movie':
            self.sql_attr_name = None
            self.sql_attrs = dict.fromkeys(
                ('Id', 'Title', 'Duration', 'Description',
                 'Release_Date', 'Image_URL', 'Country'),
                '')
            self.sql_attrs['Id'] += xml_attrs['Id']
            return
        if tag in ('id', 'title', 'duration', 'description',
                   'release_date', 'image_url', 'country'):
            # The tag is stored in its original spelling and later used
            # directly as the key into sql_attrs.
            self.sql_attr_name = xml_name

    def characters(self, text):
        """Append text to the value of the most recently opened field."""
        field = self.sql_attr_name
        if field is None:
            return
        self.sql_attrs[field] += text

    def endElement(self, xml_name):
        """Insert the collected values as one row when </Movie> is reached."""
        if xml_name.lower() != 'movie':
            return
        columns = ('Id', 'Title', 'Duration', 'Description',
                   'Release_Date', 'Image_URL', 'Country')
        values = tuple(self.sql_attrs[column].strip() for column in columns)
        insert_sql = ('INSERT INTO MOVIE(Id,Title,Duration,'
                      'Description,Release_Date,Image_URL, Country) VALUES '
                      '(?,?,?,?,?,?,?)')
        self.conn.cursor().execute(insert_sql, values)
if __name__ == '__main__':
    parser = xml.sax.make_parser()
    # The handler class in this script is named MoviesHandler; the previous
    # MovieRatingHandler() call referred to a name that is never defined.
    parser.setContentHandler(MoviesHandler())
    # Binary mode hands raw bytes to the XML parser instead of text decoded
    # with the platform's default encoding; `with` closes the file even if
    # parsing raises.
    with open('movies.xml', 'rb') as xml_file:
        parser.parse(xml_file)
我的问题是:每当我读取 self.sql_attrs['Title'].strip() 时,它会同时读到两个节点(Movie 和 Genre)下的 Title 节点,
并把这两个值拼接在一起保存。例如在上面的例子中,我得到的标题值是:
Father Figures \n Comedy
有没有办法告诉 SAX 要读取哪个节点下的 Title,也就是指定要读取的节点路径?因为我有许多同名的节点,但我想分别读取 "Movie" 和 "Genre",并将它们保存到两个不同的表中。
谢谢。
如果 genre 和 movie 都需要记录,那么您可以使用栈(stack/list)来压入当前所在的节点。
在这种情况下,您还应该为每个节点维护一个字典。下面只是对您代码的一次更新,并不是最高效的写法,但展示了如何扩展您的示例:
import sqlite3
import xml.sax
class MoviesHandler(xml.sax.ContentHandler):
    """SAX handler that stores every <Movie> element in the SQLite Movie table.

    A stack of open record elements (<Movie>, <Genre>) decides which record
    receives character data, so a <Title> inside <Genre> is kept apart from
    the movie's own <Title>. Genre values are collected in ``sql_attrs`` but
    are not written to the database by this class.
    """

    # Lower-cased tag name -> key used in the record dictionaries. Tags are
    # matched case-insensitively, so the key must not depend on how the
    # document happens to spell the tag.
    _FIELD_NAMES = {
        'id': 'Id',
        'title': 'Title',
        'duration': 'Duration',
        'description': 'Description',
        'release_date': 'Release_Date',
        'image_url': 'Image_URL',
        'country': 'Country',
    }
    _MOVIE_COLUMNS = ('Id', 'Title', 'Duration', 'Description',
                      'Release_Date', 'Image_URL', 'Country')
    _GENRE_COLUMNS = ('Title', 'Description')

    def __init__(self):
        super().__init__()
        # Key of the field currently receiving text, or None.
        self.sql_attr_name = None
        # Stack of open record element names; the last entry is the innermost.
        self.nodes = []
        # Record element name -> {field key: accumulated text}.
        self.sql_attrs = dict()
        self.conn = None

    def startDocument(self):
        """Open moviez_sax.db and recreate an empty Movie table."""
        self.conn = sqlite3.connect('moviez_sax.db')
        c = self.conn.cursor()
        c.execute('DROP TABLE IF EXISTS MOVIE')
        c.execute('''
            CREATE TABLE IF NOT EXISTS Movie (
                Id INTEGER NOT NULL,
                Title VARCHAR (1000) NOT NULL,
                Duration TIME NOT NULL,
                Description VARCHAR (5000),
                Release_Date DATE NOT NULL,
                Image_URL VARCHAR (1000),
                Country VARCHAR (150),
                PRIMARY KEY (Id)
            );''')

    def endDocument(self):
        """Commit the inserted rows and close the database connection."""
        self.conn.commit()
        self.conn.close()

    def startElement(self, xml_name, xml_attrs):
        """Open a new record for <Movie>/<Genre>, or select the field to fill.

        Raises:
            KeyError: if a <Movie> element has no ``Id`` attribute.
        """
        tag = xml_name.lower()
        if tag == 'genre':
            self.nodes.append(xml_name)
            self.sql_attr_name = None
            self.sql_attrs[xml_name] = dict.fromkeys(self._GENRE_COLUMNS, '')
        elif tag == 'movie':
            self.nodes.append(xml_name)
            self.sql_attr_name = None
            self.sql_attrs[xml_name] = dict.fromkeys(self._MOVIE_COLUMNS, '')
            self.sql_attrs[xml_name]['Id'] += xml_attrs['Id']
        else:
            # Unknown tags map to None, which makes characters() ignore
            # their text.
            self.sql_attr_name = self._FIELD_NAMES.get(tag)

    def characters(self, text):
        """Append text to the current field of the innermost open record."""
        if self.sql_attr_name is None or not self.nodes:
            return
        record = self.sql_attrs[self.nodes[-1]]
        # Skip fields the current record has no slot for (e.g. <Duration>
        # inside <Genre>) instead of failing with KeyError.
        if self.sql_attr_name in record:
            record[self.sql_attr_name] += text

    def endElement(self, xml_name):
        """Close the current field; insert the movie row on </Movie>."""
        # Once an element is closed, following text (such as the whitespace
        # between tags) no longer belongs to the field opened before it.
        self.sql_attr_name = None
        tag = xml_name.lower()
        if tag in ('movie', 'genre'):
            self.nodes.pop()
        if tag == 'movie':
            movie = self.sql_attrs[xml_name]
            c = self.conn.cursor()
            c.execute(
                'INSERT INTO MOVIE(Id,Title,Duration,'
                'Description,Release_Date,Image_URL, Country) VALUES '
                '(?,?,?,?,?,?,?)',
                tuple(movie[column].strip() for column in self._MOVIE_COLUMNS))
if __name__ == '__main__':
    parser = xml.sax.make_parser()
    parser.setContentHandler(MoviesHandler())
    # Binary mode hands raw bytes to the XML parser instead of text decoded
    # with the platform's default encoding; `with` closes the file even if
    # parsing raises.
    with open('movies.xml', 'rb') as xml_file:
        parser.parse(xml_file)
如您所见,在调试会话中标题是正确的。