解析根文件夹及其子文件夹中的 XML 文件
parse xml files in root folder and its sub folders
我正在处理一个目录,其中包含 27 个文件夹,每个文件夹内有各种 XML 文件。
已经能够:
- 解析一个 XML 文件并写入 CSV 文件
- 遍历一个文件夹,读取并解析其中的所有 XML 文件
挑战:
- 尝试从根文件夹到其子文件夹遍历并解析所有 XML 文件时遇到问题
请帮帮我,谢谢。代码片段如下:
# working in one folder only
import csv
import xml.etree.ElementTree as ET
import os

## directory
path = "/Users.../y"

print("\n")

# Collect the full path of every XML file that sits directly inside `path`.
# os.listdir does not descend into sub-folders, so nested files are not seen.
filenames = []
for filename in os.listdir(path):
    if not filename.endswith('.xml'):
        continue
    fullname = os.path.join(path, filename)
    print("\n", fullname)
    filenames.append(fullname)

# The `with` block closes the CSV file even if parsing one of the XML files
# raises; newline='' is what the csv module expects so that it controls the
# line endings itself (avoids blank rows on Windows).
with open('/Users.../xml_extract.csv', 'w', newline='') as xml_data_to_csv:
    csvwriter = csv.writer(xml_data_to_csv)

    # parse elements in each XML file
    for filename in filenames:
        tree = ET.parse(filename)
        root = tree.getroot()
        extract_xml = []

        ## extract child elements per xml file
        print("\n")
        for x in root.iter('Info'):
            for element in x:
                print(element.tag, element.text)
                extract_xml.append(element.text)

        ## Write list nodes to csv (one row per XML file)
        csvwriter.writerow(extract_xml)
您可以使用 os.walk:
import os

# os.walk yields one (directory path, sub-directory names, file names) tuple
# for the root directory and for every directory below it.
for dir_name, dirs, files in os.walk('<root_dir>'):
    # parse files
    pass  # placeholder so the loop is valid Python; put the parsing code here
您可以使用 os.walk 获取给定路径中所有 XML 文件的列表:
import os

path = "main/root"

# Walk the whole tree under `path` and keep the full path of every file
# whose name ends in '.xml'.
filelist = [
    os.path.join(dirpath, name)
    for dirpath, _subdirs, names in os.walk(path)
    for name in names
    if name.endswith('.xml')
]

for file in filelist:
    print(file)
    # or in your case parse the XML 'file'
例如,如果目录结构如下:
$ tree /main/root
/main/root
├── a
│   ├── a.xml
│   ├── b.xml
│   └── c.xml
├── b
│   ├── d.xml
│   ├── e.xml
│   └── x.txt
└── c
    ├── f.xml
    └── g.xml
我们得到:
/main/root/c/g.xml
/main/root/c/f.xml
/main/root/b/e.xml
/main/root/b/d.xml
/main/root/a/c.xml
/main/root/a/b.xml
/main/root/a/a.xml
如果要对目录和文件进行排序:
for folder, subfolders, names in os.walk(path):
    # Sorting the sub-directory list in place makes os.walk (top-down by
    # default) descend into the sub-directories in sorted order.
    subfolders.sort()
    # Add this folder's XML files, in sorted name order.
    filelist.extend(
        os.path.join(folder, name)
        for name in sorted(names)
        if name.endswith('.xml')
    )
您可以使用 pathlib 模块来“glob” XML 文件。它会在所有子目录中搜索您提供的模式,
并返回已经包含文件路径的 Path 对象。稍微清理一下你的脚本,就会得到:
import csv
import xml.etree.ElementTree as ET
from pathlib import Path

## directory
path = Path("/Users.../y")

# newline='' lets the csv module control line endings itself (avoids blank
# rows on Windows); the `with` block closes the file when the loop finishes
# or raises.
with open('/Users.../xml_extract.csv', 'w', newline='') as xml_data_to_csv:
    csvwriter = csv.writer(xml_data_to_csv)

    # Read XML files in the folder and, via the "**" pattern, in every
    # sub-folder below it.
    for filepath in path.glob("**/*.xml"):
        # Parse the path produced by the loop (the name `filename` was never
        # defined in this script and raised NameError).
        tree = ET.parse(filepath)
        root = tree.getroot()
        extract_xml = []

        ## extract child elements per xml file
        print("\n")
        for x in root.iter('Info'):
            for element in x:
                print(element.tag, element.text)
                extract_xml.append(element.text)

        ## Write list nodes to csv (one row per XML file)
        csvwriter.writerow(extract_xml)
我正在处理一个目录,其中包含 27 个文件夹,每个文件夹内有各种 XML 文件。
已经能够:
- 解析一个 XML 文件并写入 CSV 文件
- 遍历一个文件夹,读取并解析其中的所有 XML 文件
挑战:
- 尝试从根文件夹到其子文件夹遍历并解析所有 XML 文件时遇到问题
请帮帮我,谢谢。代码片段如下:
# working in one folder only
import csv
import xml.etree.ElementTree as ET
import os

## directory
path = "/Users.../y"

print("\n")

# Collect the full path of every XML file that sits directly inside `path`.
# os.listdir does not descend into sub-folders, so nested files are not seen.
filenames = []
for filename in os.listdir(path):
    if not filename.endswith('.xml'):
        continue
    fullname = os.path.join(path, filename)
    print("\n", fullname)
    filenames.append(fullname)

# The `with` block closes the CSV file even if parsing one of the XML files
# raises; newline='' is what the csv module expects so that it controls the
# line endings itself (avoids blank rows on Windows).
with open('/Users.../xml_extract.csv', 'w', newline='') as xml_data_to_csv:
    csvwriter = csv.writer(xml_data_to_csv)

    # parse elements in each XML file
    for filename in filenames:
        tree = ET.parse(filename)
        root = tree.getroot()
        extract_xml = []

        ## extract child elements per xml file
        print("\n")
        for x in root.iter('Info'):
            for element in x:
                print(element.tag, element.text)
                extract_xml.append(element.text)

        ## Write list nodes to csv (one row per XML file)
        csvwriter.writerow(extract_xml)
您可以使用 os.walk:
import os

# os.walk yields one (directory path, sub-directory names, file names) tuple
# for the root directory and for every directory below it.
for dir_name, dirs, files in os.walk('<root_dir>'):
    # parse files
    pass  # placeholder so the loop is valid Python; put the parsing code here
您可以使用 os.walk 获取给定路径中所有 XML 文件的列表:
import os
path = "main/root"

# Walk the whole tree under `path` and keep the full path of every file
# whose name ends in '.xml'.
filelist = [
    os.path.join(dirpath, name)
    for dirpath, _subdirs, names in os.walk(path)
    for name in names
    if name.endswith('.xml')
]

for file in filelist:
    print(file)
    # or in your case parse the XML 'file'
例如,如果目录结构如下:
$ tree /main/root
/main/root
├── a
│   ├── a.xml
│   ├── b.xml
│   └── c.xml
├── b
│   ├── d.xml
│   ├── e.xml
│   └── x.txt
└── c
    ├── f.xml
    └── g.xml
我们得到:
/main/root/c/g.xml
/main/root/c/f.xml
/main/root/b/e.xml
/main/root/b/d.xml
/main/root/a/c.xml
/main/root/a/b.xml
/main/root/a/a.xml
如果要对目录和文件进行排序:
for folder, subfolders, names in os.walk(path):
    # Sorting the sub-directory list in place makes os.walk (top-down by
    # default) descend into the sub-directories in sorted order.
    subfolders.sort()
    # Add this folder's XML files, in sorted name order.
    filelist.extend(
        os.path.join(folder, name)
        for name in sorted(names)
        if name.endswith('.xml')
    )
您可以使用 pathlib 模块来“glob” XML 文件。它会在所有子目录中搜索您提供的模式,
并返回已经包含文件路径的 Path 对象。稍微清理一下你的脚本,就会得到:
import csv
import xml.etree.ElementTree as ET
from pathlib import Path

## directory
path = Path("/Users.../y")

# newline='' lets the csv module control line endings itself (avoids blank
# rows on Windows); the `with` block closes the file when the loop finishes
# or raises.
with open('/Users.../xml_extract.csv', 'w', newline='') as xml_data_to_csv:
    csvwriter = csv.writer(xml_data_to_csv)

    # Read XML files in the folder and, via the "**" pattern, in every
    # sub-folder below it.
    for filepath in path.glob("**/*.xml"):
        # Parse the path produced by the loop (the name `filename` was never
        # defined in this script and raised NameError).
        tree = ET.parse(filepath)
        root = tree.getroot()
        extract_xml = []

        ## extract child elements per xml file
        print("\n")
        for x in root.iter('Info'):
            for element in x:
                print(element.tag, element.text)
                extract_xml.append(element.text)

        ## Write list nodes to csv (one row per XML file)
        csvwriter.writerow(extract_xml)