Python 解析 XML 文件:增、删、查找
XML文件

判断bndbox大小,并为object添加子节点difficult
import os
import os.path
from xml.etree import ElementTree as ET
# Batch script: for every annotation file in `path`, add a <difficult> child to
# each <object> based on the size of its <bndbox>, then save the file in place.
path='C:/Users/my/Desktop/xmllll/city/1'
new_path='C:/Users/my/Desktop/xmllll/city/1/'
files=os.listdir(path)
# Threshold compared against both the box width and the box height.
my_area = int(input("请输入area区域大小:"))

for xmlFile in files:
    xml_path = new_path + xmlFile
    # Test the full path: a bare file name would be resolved against the
    # current working directory instead of the data folder.
    if os.path.isdir(xml_path):
        continue
    print(xmlFile)
    per = ET.parse(xml_path)
    changed = False
    for oneper in per.findall('./object'):
        # Look the box up by name; Element.getchildren() no longer exists
        # on Python 3.9+.
        bndbox = oneper.find('bndbox')
        if bndbox is None:
            continue
        xmin = int(bndbox.find('xmin').text)
        ymin = int(bndbox.find('ymin').text)
        xmax = int(bndbox.find('xmax').text)
        ymax = int(bndbox.find('ymax').text)

        element = ET.Element("difficult")
        # "0" only when the box is larger than the threshold in BOTH
        # dimensions; otherwise the object is marked difficult ("1").
        if (xmax - xmin) > my_area and (ymax - ymin) > my_area:
            element.text = "0"
        else:
            element.text = "1"
        # insert() places the new node at a fixed position (index 1).
        oneper.insert(1, element)
        changed = True
    # The modified tree lives only in memory, so it has to be written back.
    # Writing once per file (not once per object) avoids repeated disk I/O.
    if changed:
        per.write(xml_path, encoding="utf-8", xml_declaration=True)
    print('-------')
- 该文件可以批处理xml文件
- 可以添加子节点,索引
- 但是存在速度问题,一直没法提高速度。除了使用了多层for循环之外,更主要的开销是磁盘写入:应在每个文件的所有object都处理完之后只调用一次write(),而不是每处理一个object就把整个文件重新写一次
- 注意:修改后的xml树只缓存在内存当中,因此一定要调用write()保存到文件
- 添加子节点可以用append或者insert,指定位置用insert
Result

xml增删查找
"""
Created on Mon Mar 18 17:36:45 2019
@author: psqk
"""
from xml.etree import ElementTree
from xml.etree.ElementTree import Element, SubElement
from lxml import etree
import codecs
# File extension required of annotation files that are read, and appended to
# the default output path when an annotation is saved.
XML_EXT = '.xml'
# Text encoding used when parsing, serializing and writing annotation XML.
ENCODE_METHOD = 'utf-8'
class PascalVocWriter:
    """Collect bounding boxes and write them as a PASCAL VOC annotation file.

    Boxes added with :meth:`addBndBox` accumulate in ``boxlist``. Setting
    ``verified`` to True makes the root element carry ``verified="yes"``.
    """

    def __init__(self, foldername, filename, imgSize, databaseSrc='Unknown', localImgPath=None):
        """Store the annotation metadata.

        Args:
            foldername: Text of the ``<folder>`` element.
            filename: Output path without extension. It is the default save
                location (plus ``XML_EXT``) and its last six characters are
                used to name the image in ``<filename>``.
            imgSize: Sequence ``(height, width)`` or ``(height, width, depth)``.
            databaseSrc: Text of ``<source>/<database>``.
            localImgPath: Optional text of the ``<path>`` element.
        """
        self.foldername = foldername
        self.filename = filename
        self.databaseSrc = databaseSrc
        self.imgSize = imgSize
        self.boxlist = []
        self.localImgPath = localImgPath
        self.verified = False

    def prettify(self, elem):
        """Return *elem* as pretty-printed, tab-indented XML bytes.

        Only leading indentation is converted to tabs, so spaces inside
        element text and attributes are preserved.
        """
        rough_string = ElementTree.tostring(elem, 'utf8')
        root = etree.fromstring(rough_string)
        pretty = etree.tostring(root, pretty_print=True, encoding=ENCODE_METHOD)

        lines = []
        for line in pretty.split(b"\n"):
            stripped = line.lstrip(b" ")
            indent = len(line) - len(stripped)
            # One tab per two-space indentation level; keep an odd leftover space.
            lines.append(b"\t" * (indent // 2) + b" " * (indent % 2) + stripped)
        return b"\n".join(lines)

    def genXML(self):
        """Build the ``<annotation>`` root element without any objects.

        Returns:
            The root element, or None when ``filename``, ``foldername`` or
            ``imgSize`` is None.
        """
        if self.filename is None or self.foldername is None or self.imgSize is None:
            return None

        top = Element('annotation')
        if self.verified:
            top.set('verified', 'yes')

        folder = SubElement(top, 'folder')
        folder.text = self.foldername

        filename = SubElement(top, 'filename')
        # The image name is the last six characters of the output path.
        filename.text = self.filename[-6:] + '.jpg'

        if self.localImgPath is not None:
            localImgPath = SubElement(top, 'path')
            localImgPath.text = self.localImgPath

        source = SubElement(top, 'source')
        database = SubElement(source, 'database')
        database.text = self.databaseSrc
        annotation = SubElement(source, 'annotation')
        annotation.text = 'PASCAL VOC2007'

        size_part = SubElement(top, 'size')
        width = SubElement(size_part, 'width')
        height = SubElement(size_part, 'height')
        depth = SubElement(size_part, 'depth')
        # imgSize is ordered (height, width[, depth]).
        width.text = str(self.imgSize[1])
        height.text = str(self.imgSize[0])
        if len(self.imgSize) == 3:
            depth.text = str(self.imgSize[2])
        else:
            depth.text = '1'
        return top

    def addBndBox(self, xmin, ymin, xmax, ymax, name, difficult):
        """Queue one bounding box with its label and difficult flag."""
        bndbox = {'xmin': xmin, 'ymin': ymin, 'xmax': xmax, 'ymax': ymax}
        bndbox['name'] = name
        bndbox['difficult'] = difficult
        self.boxlist.append(bndbox)

    def appendObjects(self, top):
        """Append one ``<object>`` element to *top* for every queued box."""
        for each_object in self.boxlist:
            object_item = SubElement(top, 'object')
            name = SubElement(object_item, 'name')
            try:
                name.text = unicode(each_object['name'])
            except NameError:
                # Python 3: there is no `unicode` builtin, use the value as is.
                name.text = each_object['name']
            difficult = SubElement(object_item, 'difficult')
            # Normalise any truthy/falsy flag to "1"/"0".
            difficult.text = str(int(bool(each_object['difficult'])))
            bndbox = SubElement(object_item, 'bndbox')
            for key in ('xmin', 'ymin', 'xmax', 'ymax'):
                coord = SubElement(bndbox, key)
                coord.text = str(each_object[key])

    def save(self, targetFile=None):
        """Write the annotation to *targetFile*.

        Args:
            targetFile: Destination path; defaults to ``filename + XML_EXT``.
        """
        root = self.genXML()
        self.appendObjects(root)
        # Serialise before opening the file so a failure cannot leave an
        # empty or truncated file behind.
        prettifyResult = self.prettify(root)

        if targetFile is None:
            targetFile = self.filename + XML_EXT
        with codecs.open(targetFile, 'w', encoding=ENCODE_METHOD) as out_file:
            out_file.write(prettifyResult.decode('utf8'))
class PascalVocReader:
    """Read the labelled boxes of a PASCAL VOC annotation file.

    Parsing is attempted on construction. If it fails, the instance keeps
    whatever shapes were collected before the failure (none if the file could
    not be opened at all).
    """

    def __init__(self, filepath):
        """Remember *filepath* and try to parse it."""
        self.shapes = []
        self.filepath = filepath
        self.verified = False
        try:
            self.parseXML()
        except Exception:
            # Best effort: an unreadable or malformed file is not fatal, but
            # KeyboardInterrupt/SystemExit must still propagate.
            pass

    def getShapes(self):
        """Return the list of shapes collected so far."""
        return self.shapes

    def addShape(self, label, bndbox, difficult):
        """Append one shape built from a ``<bndbox>`` element.

        The shape is ``(label, points, None, None, difficult)`` where
        ``points`` lists the four box corners starting at (xmin, ymin).
        """
        xmin = int(bndbox.find('xmin').text)
        ymin = int(bndbox.find('ymin').text)
        xmax = int(bndbox.find('xmax').text)
        ymax = int(bndbox.find('ymax').text)
        points = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)]
        self.shapes.append((label, points, None, None, difficult))

    def parseXML(self):
        """Parse ``self.filepath`` and fill ``shapes`` and ``verified``.

        Returns:
            True once every ``<object>`` has been added.

        Raises:
            AssertionError: If the path does not end with ``XML_EXT``.
        """
        if not self.filepath.endswith(XML_EXT):
            # Raised explicitly so the check also runs under ``python -O``.
            raise AssertionError("Unsupport file format")
        parser = etree.XMLParser(encoding=ENCODE_METHOD)
        xmltree = ElementTree.parse(self.filepath, parser=parser).getroot()
        self.verified = xmltree.attrib.get('verified') == 'yes'

        for object_iter in xmltree.findall('object'):
            bndbox = object_iter.find("bndbox")
            label = object_iter.find('name').text
            difficult_node = object_iter.find('difficult')
            # A missing <difficult> element means "not difficult".
            difficult = False
            if difficult_node is not None:
                difficult = bool(int(difficult_node.text))
            self.addShape(label, bndbox, difficult)
        return True
class PascalVocReader2:
    """Read a PASCAL VOC annotation file and also report the last box read.

    Parsing is attempted on construction. If it fails, the instance keeps
    whatever shapes were collected before the failure.
    """

    def __init__(self, filepath):
        """Remember *filepath* and try to parse it."""
        self.shapes = []
        self.filepath = filepath
        self.verified = False
        try:
            self.parseXML()
        except Exception:
            # Best effort: an unreadable or malformed file is not fatal, but
            # KeyboardInterrupt/SystemExit must still propagate.
            pass

    def getShapes(self):
        """Return the list of shapes collected so far."""
        return self.shapes

    def addShape(self, label, bndbox, difficult):
        """Append one shape built from a ``<bndbox>`` element.

        The shape is ``(label, points, None, None, difficult)`` where
        ``points`` lists the four box corners starting at (xmin, ymin).
        """
        xmin = int(bndbox.find('xmin').text)
        ymin = int(bndbox.find('ymin').text)
        xmax = int(bndbox.find('xmax').text)
        ymax = int(bndbox.find('ymax').text)
        points = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)]
        self.shapes.append((label, points, None, None, difficult))

    def parseXML(self):
        """Parse ``self.filepath``, printing each label as it is read.

        Returns:
            ``(True, xmin, xmax, ymin, ymax)`` where the coordinates are the
            text values of the last ``<object>`` in the file, or None for
            each of them when the file contains no objects.

        Raises:
            AssertionError: If the path does not end with ``XML_EXT``.
        """
        if not self.filepath.endswith(XML_EXT):
            # Raised explicitly so the check also runs under ``python -O``.
            raise AssertionError("Unsupport file format")
        parser = etree.XMLParser(encoding=ENCODE_METHOD)
        xmltree = ElementTree.parse(self.filepath, parser=parser).getroot()
        self.verified = xmltree.attrib.get('verified') == 'yes'

        # Defined up front so a file without objects still returns a 5-tuple.
        xmin = xmax = ymin = ymax = None
        for object_iter in xmltree.findall('object'):
            bndbox = object_iter.find("bndbox")
            xmin = bndbox.find('xmin').text
            xmax = bndbox.find('xmax').text
            ymin = bndbox.find('ymin').text
            ymax = bndbox.find('ymax').text
            label = object_iter.find('name').text
            print(label)
            difficult_node = object_iter.find('difficult')
            # A missing <difficult> element means "not difficult".
            difficult = False
            if difficult_node is not None:
                difficult = bool(int(difficult_node.text))
            self.addShape(label, bndbox, difficult)
        return True, xmin, xmax, ymin, ymax
'''
data_path = 'VOC2007'
xmax = 100
ymax = 235
xmin = 25
ymin = 12
pw = PascalVocWriter(data_path,filename = 'haha.jpg',imgSize = (12,12,3))
pw.genXML()
pw.addBndBox(xmin,ymin,xmax,ymax,'car',difficult = 1)
pw.addBndBox(xmin*2,ymin,xmax*3,ymax,'carban',difficult = 0)
pw.save()
print("finished")
'''
if __name__ == '__main__':
    # Re-write every annotation under Annotations/ into output/, recomputing
    # the <difficult> flag of each object from the size of its bounding box.
    import os

    # Annotation files are named with zero-padded, six-digit numbers.
    first_index, last_index = 1, 22062
    # A box is flagged difficult when BOTH its width and height are below this.
    small_box_limit = 15

    # The writer does not create directories, so make sure the target exists.
    if not os.path.isdir('output'):
        os.makedirs('output')

    for i in range(first_index, last_index + 1):
        pr = 'Annotations/{}.xml'.format(str(i).zfill(6))
        parser = etree.XMLParser(encoding=ENCODE_METHOD)
        xmltree = ElementTree.parse(pr, parser=parser).getroot()

        foldername = xmltree.find('folder').text
        file_name = xmltree.find('filename').text
        database_name = xmltree.find('source').find('database').text
        size = xmltree.find('size')
        # PascalVocWriter expects (height, width, depth).
        img_size = (
            size.find('height').text,
            size.find('width').text,
            size.find('depth').text,
        )

        # Drop the image extension; the writer appends XML_EXT when saving.
        file_names = 'output/{}'.format(os.path.splitext(file_name)[0])
        pw = PascalVocWriter(foldername, filename=file_names,
                             imgSize=img_size, databaseSrc=database_name)

        for object_iter in xmltree.findall('object'):
            bndbox = object_iter.find("bndbox")
            xmin = bndbox.find('xmin').text
            xmax = bndbox.find('xmax').text
            ymin = bndbox.find('ymin').text
            ymax = bndbox.find('ymax').text
            label = object_iter.find('name').text

            box_width = int(xmax) - int(xmin)
            box_height = int(ymax) - int(ymin)
            if box_width < small_box_limit and box_height < small_box_limit:
                difficult_val = 1
                # Report which annotation file contains a small box.
                print(pr[-10:])
            else:
                difficult_val = 0
            pw.addBndBox(xmin, ymin, xmax, ymax, label, difficult=difficult_val)
        pw.save()