本文主要参考官方文档https://lxml.de/tutorial.html 整理,如有错误欢迎指出。
from copy import deepcopy

from lxml import etree
# Build a small tree: one child appended explicitly, two created via SubElement.
root = etree.Element("root")
root.append(etree.Element("child1"))
child2 = etree.SubElement(root, "child2")
child3 = etree.SubElement(root, "child3")

# tostring() returns bytes, so the first three prints show b'...' literals.
print(etree.tostring(root))
print(etree.tostring(root, pretty_print=True))
print(etree.tostring(root, encoding='iso-8859-1'))

# Decode the bytes so the pretty-printed newlines are rendered, not escaped.
print(str(etree.tostring(root, pretty_print=True), encoding='utf-8'))
b'<root><child1/><child2/><child3/></root>'
b'<root>\n  <child1/>\n  <child2/>\n  <child3/>\n</root>\n'
b"<?xml version='1.0' encoding='iso-8859-1'?>\n<root><child1/><child2/><child3/></root>"
<root>
  <child1/>
  <child2/>
  <child3/>
</root>
# Elements support list-style indexing and len() over their children.
child1 = root[0]
print(child1.tag, len(root))

# insert() places the new element at the given child position.
root.insert(0, etree.Element("child0"))
print(root[0].tag)
child1 3
child0
# list() over an element yields its direct children.
children = list(root)
print(root)
print(children)
<Element root at 0x107b03888>
[<Element child0 at 0x107b034c8>, <Element child1 at 0x107b03448>, <Element child2 at 0x107b038c8>, <Element child3 at 0x107b03908>]
# Use len() to test whether an element has children.
if len(root):
    print('root has children')

if len(child1):
    print('child1 has children')
else:
    print('child1 has no child')
root has children
child1 has no child
# Assigning an element to another position MOVES it: afterwards the tree
# holds three children, with child3 in first place.
print(etree.tostring(root))
root[0] = root[-1]
print(etree.tostring(root))

# A plain Python list behaves differently: the same assignment copies the
# reference, so the value appears twice.
numbers = [0, 1, 2, 3]
print(numbers)
numbers[0] = numbers[-1]
print(numbers)
b'<root><child0/><child1/><child2/><child3/></root>'
b'<root><child3/><child1/><child2/></root>'
[0, 1, 2, 3]
[3, 1, 2, 3]
# deepcopy() duplicates the element, so appending the copy elsewhere leaves
# the original tree unchanged.
newroot = etree.Element('newroot')
newroot.append(deepcopy(root[1]))
print(etree.tostring(root))
print(etree.tostring(newroot))
b'<root><child3/><child1/><child2/></root>'
b'<newroot><child1/></newroot>'
# Navigate to the parent and to the previous/next siblings of an element.
print(root is child1.getparent())
print(root[0] is root[1].getprevious())
print(root[1] is root[0].getnext())
True
True
True
# Attributes can be passed as keyword arguments when the element is created.
root = etree.Element("root", interesting="totally")
print(etree.tostring(root))

# get() returns None for an attribute that has not been set.
print(root.get("interesting"))
print(root.get("hello"))

root.set("hello", "Huhu")
print(etree.tostring(root))
b'<root interesting="totally"/>'
totally
None
b'<root interesting="totally" hello="Huhu"/>'
print(root.items(), root.keys(), root.values())

# .attrib exposes the attributes through a dict-like interface.
attributes = root.attrib
print(attributes.get("no-such-attribute"))

# Writing through .attrib changes the element itself, as root.get() confirms.
attributes["hello"] = "Guten Tag"
print(attributes["hello"])
print(root.get("hello"))

# dict() builds a separate plain dictionary from the current attributes.
d = dict(root.attrib)
print(d.items())
[('interesting', 'totally'), ('hello', 'Huhu')] ['interesting', 'hello'] ['totally', 'Huhu']
None
Guten Tag
Guten Tag
dict_items([('interesting', 'totally'), ('hello', 'Guten Tag')])
# .text holds the text content placed directly inside the element.
root = etree.Element("root")
root.text = "Hello World"
print(root.text)
print(etree.tostring(root))
Hello World
b'<root>Hello World</root>'
html = etree.Element("html")
body = etree.SubElement(html, "body")
body.text = "TEXT"
print(etree.tostring(html))

br = etree.SubElement(body, "br")
print(etree.tostring(html))

# .tail is the text that follows the element's closing tag.
br.tail = "TAIL"
print(etree.tostring(html))

# Changing body.text replaces only the text before <br/>; the tail stays.
body.text = "HEAD"
print(etree.tostring(html))
b'<html><body>TEXT</body></html>'
b'<html><body>TEXT<br/></body></html>'
b'<html><body>TEXT<br/>TAIL</body></html>'
b'<html><body>HEAD<br/>TAIL</body></html>'
# Serializing an element includes its tail unless with_tail=False is given.
print(etree.tostring(br))
print(etree.tostring(br, with_tail=False))

# method="text" emits only the text content, without any markup.
print(etree.tostring(html, method="text"))
b'<br/>TAIL'
b'<br/>'
b'HEADTAIL'
# string() returns the concatenated text; //text() returns the chunks as a list.
print(html.xpath("string()"))
print(html.xpath("//text()"))
HEADTAIL
['HEAD', 'TAIL']
# Wrap the expression in an XPath object so it can be called like a function.
build_text_list = etree.XPath("//text()")
print(build_text_list(html))
['HEAD', 'TAIL']
# Each text result knows which element it belongs to.
texts = build_text_list(html)
print(texts[0])
parent = texts[0].getparent()
print(parent.tag)

# The tail text reports the element it follows (<br>) as its parent.
print(texts[1])
print(texts[1].getparent().tag)
HEAD
body
TAIL
br
# is_text / is_tail tell whether a result came from .text or from .tail.
print(texts[0].is_text)
print(texts[1].is_text)
print(texts[1].is_tail)
True
False
True
# The result of string() has no single origin, so getparent() returns None.
stringify = etree.XPath("string()")
print(stringify(html))
print(stringify(html).getparent())
HEADTAIL
None
root = etree.Element("root")
etree.SubElement(root, "child").text = "Child 1"
etree.SubElement(root, "child").text = "Child 2"
etree.SubElement(root, "another").text = "Child 3"
print(etree.tostring(root, pretty_print=True))

# iter() walks the element itself and then its descendants in document order.
for element in root.iter():
    print("%s - %s" % (element.tag, element.text))
b'<root>\n  <child>Child 1</child>\n  <child>Child 2</child>\n  <another>Child 3</another>\n</root>\n'
root - None
child - Child 1
child - Child 2
another - Child 3
# Passing a tag name restricts the iteration to matching elements.
for element in root.iter("child"):
    print("%s - %s" % (element.tag, element.text))

# Several tag names may be given; results still come in document order.
for element in root.iter("another", "child"):
    print("%s - %s" % (element.tag, element.text))
child - Child 1
child - Child 2
child - Child 1
child - Child 2
another - Child 3
# Entities and comments live in the tree alongside ordinary elements.
root.append(etree.Entity("#234"))
root.append(etree.Comment("some comment"))
print(etree.tostring(root))

# Only ordinary elements have a string tag; anything else is reported as SPECIAL.
for element in root.iter():
    if isinstance(element.tag, str):
        print("%s - %s" % (element.tag, element.text))
    else:
        print("SPECIAL: %s - %s" % (element, element.text))

# A factory function can be passed as the tag to select one kind of node.
for element in root.iter(tag=etree.Element):
    print("%s - %s" % (element.tag, element.text))

for element in root.iter(tag=etree.Entity):
    print(element.text)
b'<root><child>Child 1</child><child>Child 2</child><another>Child 3</another>&#234;<!--some comment--></root>'
root - None
child - Child 1
child - Child 2
another - Child 3
SPECIAL: &#234; - &#234;
SPECIAL: <!--some comment--> - some comment
root - None
child - Child 1
child - Child 2
another - Child 3
&#234;
root = etree.XML('<html><head/><body><p>Hello<br/>World</p></body></html>')

# The default method is XML; 'html' and 'text' choose other output formats.
print(etree.tostring(root))
print(etree.tostring(root, method='html'))
print(etree.tostring(root, method='text'))
b'<html><head/><body><p>Hello<br/>World</p></body></html>'
b'<html><head></head><body><p>Hello<br>World</p></body></html>'
b'HelloWorld'
# The backslash after the opening quotes keeps the XML declaration on the
# very first line of the string.
root = etree.XML('''\
<?xml version="1.0"?>
<!DOCTYPE root SYSTEM "test" [ <!ENTITY tasty "parsnips"> ]>
<root>
 <a>&tasty;</a>
</root>
''')

# Wrapping the root in an ElementTree gives access to document-level info.
tree = etree.ElementTree(root)
print(tree.docinfo.xml_version)
print(tree.docinfo.doctype)

# Setting public_id / system_url changes the DOCTYPE that is reported.
tree.docinfo.public_id = '-//W3C//DTD XHTML 1.0 Transitional//EN'
tree.docinfo.system_url = 'file://local.dtd'
print(tree.docinfo.doctype)
1.0
<!DOCTYPE root SYSTEM "test">
<!DOCTYPE root PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "file://local.dtd">
# Serializing the ElementTree includes the DOCTYPE; serializing only the
# root element does not.
print(etree.tostring(tree))
print(etree.tostring(tree.getroot()))
b'<!DOCTYPE root PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "file://local.dtd" [\n<!ENTITY tasty "parsnips">\n]>\n<root>\n <a>parsnips</a>\n</root>'
b'<root>\n <a>parsnips</a>\n</root>'
# fromstring() parses a string and returns the root element.
some_xml_data = "<root>data</root>"

root = etree.fromstring(some_xml_data)
print(etree.tostring(root))
b'<root>data</root>'
from io import BytesIO

some_file_or_file_like_object = BytesIO(b"<root>data</root>")

# parse() accepts a file-like object; the result is used below as a tree
# wrapper from which getroot() extracts the root element.
tree = etree.parse(some_file_or_file_like_object)
print(etree.tostring(tree))

root = tree.getroot()
print(etree.tostring(root))

# The tree object has no .tag attribute, so the lookup fails and the except
# branch prints the root's tag instead. Catch only AttributeError so that
# unrelated failures are not hidden.
try:
    print("tree执行:", tree.tag)
except AttributeError:
    print("root执行:", root.tag)
b'<root>data</root>'
b'<root>data</root>'
root执行: root
# remove_blank_text drops the whitespace-only text between tags; the output
# below shows the blank text inside <b> is kept.
parser = etree.XMLParser(remove_blank_text=True)
root = etree.XML("<root> <a/> <b> </b> </root>", parser)
print(etree.tostring(root))
b'<root><a/><b> </b></root>'
class DataSource:
    """Minimal file-like object that hands out a document in small fragments."""

    def __init__(self):
        # Keep the fragment queue per instance. As a class attribute the list
        # would be shared and emptied by the first reader, leaving every
        # later DataSource() with nothing to return.
        self.data = [b"<roo", b"t><", b"a/", b"><", b"/root>"]

    def read(self, requested_size):
        """Return the next fragment, or b'' once all fragments are consumed.

        ``requested_size`` is accepted for file-like compatibility but is
        ignored: each call returns one whole fragment.
        """
        try:
            return self.data.pop(0)
        except IndexError:
            return b''


# Any object with a read() method can be passed to parse().
tree = etree.parse(DataSource())
print(etree.tostring(tree))

# Feed interface: push fragments into the parser, then close() to get the root.
parser = etree.XMLParser()
parser.feed("<roo")
parser.feed("t><")
parser.feed("a/")
parser.feed("><")
parser.feed("/root>")
root = parser.close()
print(etree.tostring(root))
b'<root><a/></root>'
b'<root><a/></root>'
# Without an events argument only "end" events are reported (see output).
some_file_like = BytesIO(b"<root><a>data</a></root>")
for event, element in etree.iterparse(some_file_like):
    print("%s, %4s, %s" % (event, element.tag, element.text))

print()

# Request "start" events as well to see elements when they are opened.
some_file_like = BytesIO(b"<root><a>data</a></root>")
for event, element in etree.iterparse(some_file_like,
                                      events=("start", "end")):
    print("%5s, %4s, %s" % (event, element.tag, element.text))
end, a, data
end, root, None
start, root, None
start, a, data
end, a, data
end, root, None
class ParserTarget:
    """Parser target that records start events and counts close() calls."""

    def __init__(self):
        # Per-instance state. A class-level list would be shared by all
        # instances and could leak events from one target into another.
        self.events = []
        self.close_count = 0

    def start(self, tag, attrib):
        """Record an opening tag together with its attributes."""
        self.events.append(("start", tag, attrib))

    def close(self):
        """Return the collected events and reset the target for reuse."""
        events, self.events = self.events, []
        self.close_count += 1
        return events


parser_target = ParserTarget()

# With a target, fromstring() returns whatever the target's close() returns.
parser = etree.XMLParser(target=parser_target)
events = etree.fromstring('<root test="true"/>', parser)

print(parser_target.close_count)

for event in events:
    print('event: %s - tag: %s' % (event[0], event[1]))
    for attr, value in event[2].items():
        print(' * %s = %s' % (attr, value))
1
event: start - tag: root
* test = true
/Users/imonce/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:15: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec()
from ipykernel import kernelapp as app
# The parser and its target can be reused; close() runs once per document.
events = etree.fromstring('<root test="true"/>', parser)
print(parser_target.close_count)
events = etree.fromstring('<root test="true"/>', parser)
print(parser_target.close_count)
2
3
# Each event is a (name, tag, attributes) tuple recorded by the target.
for event in events:
    print('event: %s - tag: %s' % (event[0], event[1]))
    for attr, value in event[2].items():
        print(' * %s = %s' % (attr, value))
event: start - tag: root
* test = true
# A namespace is written into the tag name as "{uri}localname".
xhtml = etree.Element("{http://www.w3.org/1999/test}testhtml")
body = etree.SubElement(xhtml, "{http://www.w3.org/1999/test}tbody")
body.text = "Hello World"
print(etree.tostring(xhtml))

# Same structure with the XHTML namespace; compare the prefix used on
# serialization with the generated "ns0" above.
xhtml = etree.Element("{http://www.w3.org/1999/xhtml}html")
body = etree.SubElement(xhtml, "{http://www.w3.org/1999/xhtml}body")
body.text = "Hello World"
print(etree.tostring(xhtml))
b'<ns0:testhtml xmlns:ns0="http://www.w3.org/1999/test"><ns0:tbody>Hello World</ns0:tbody></ns0:testhtml>'
b'<html:html xmlns:html="http://www.w3.org/1999/xhtml"><html:body>Hello World</html:body></html:html>'
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
XHTML = "{%s}" % XHTML_NAMESPACE

# Mapping the None prefix declares the namespace as the default one, so the
# serialized tags carry no prefix.
NSMAP = {None: XHTML_NAMESPACE}

xhtml = etree.Element(XHTML + "html", nsmap=NSMAP)
body = etree.SubElement(xhtml, XHTML + "body")
body.text = "Hello World"
print(etree.tostring(xhtml))
b'<html xmlns="http://www.w3.org/1999/xhtml"><body>Hello World</body></html>'
# QName splits a qualified name into its namespace and local parts.
tag = etree.QName('http://www.w3.org/1999/xhtml', 'html')
print(tag.localname)
print(tag.namespace)
print(tag.text)

# A QName can also be built directly from an element.
root = etree.Element('{http://www.w3.org/1999/xhtml}html')
tag = etree.QName(root)
print(tag.localname)

# The prefix-to-namespace mapping of the earlier xhtml element.
print(xhtml.nsmap)
html
http://www.w3.org/1999/xhtml
{http://www.w3.org/1999/xhtml}html
html
{None: 'http://www.w3.org/1999/xhtml'}
root = etree.Element('root', nsmap={'a': 'http://a.b/c'})
child = etree.SubElement(root, 'child', nsmap={'b': 'http://b.c/d'})

# The child's nsmap also contains the prefix declared on its parent.
print(root.nsmap)
print(child.nsmap)
{'a': 'http://a.b/c'}
{'b': 'http://b.c/d', 'a': 'http://a.b/c'}
body.set(XHTML + "bgcolor", "#CCFFAA")
print(etree.tostring(xhtml))

# The attribute was stored under its qualified name, so the plain name
# finds nothing.
print(body.get("bgcolor"))
print(body.get(XHTML + "bgcolor"))
b'<html xmlns="http://www.w3.org/1999/xhtml"><body xmlns:html="http://www.w3.org/1999/xhtml" html:bgcolor="#CCFFAA">Hello World</body></html>'
None
#CCFFAA
# ETXPath accepts the "{uri}name" notation directly in the expression.
find_xhtml_body = etree.ETXPath("//{%s}body" % XHTML_NAMESPACE)
results = find_xhtml_body(xhtml)

print(results[0].tag)
{http://www.w3.org/1999/xhtml}body
# "{*}" matches the local name in any namespace.
for el in xhtml.iter('{*}body'):
    print(el.tag)
{http://www.w3.org/1999/xhtml}body
from lxml.builder import E


def CLASS(*args):
    """Return a ``{"class": ...}`` attribute dict with *args* joined by spaces.

    ``class`` is a Python keyword and cannot be passed as a keyword argument,
    so this helper builds the attribute mapping instead.
    """
    return {"class": ' '.join(args)}


# Strings become text, dicts and keyword arguments become attributes, and
# nested elements (E-built or parsed) become children.
html = page = (
    E.html(
        E.head(
            E.title("This is a sample document")
        ),
        E.body(
            E.h1("Hello!", CLASS("title")),
            E.p("This is a paragraph with ", E.b("bold"), " text in it!"),
            E.p("This is another paragraph, with a", "\n ",
                E.a("link", href="http://www.python.org"), "."),
            E.p("Here are some reserved characters: <spam&egg>."),
            etree.XML("<p>And finally an embedded XHTML fragment.</p>"),
        )
    )
)

print(str(etree.tostring(page, pretty_print=True), encoding='utf-8'))
<html>
  <head>
    <title>This is a sample document</title>
  </head>
  <body>
    <h1 class="title">Hello!</h1>
    <p>This is a paragraph with <b>bold</b> text in it!</p>
    <p>This is another paragraph, with a
<a href="http://www.python.org">link</a>.</p>
    <p>Here are some reserved characters: &lt;spam&amp;egg&gt;.</p>
    <p>And finally an embedded XHTML fragment.</p>
  </body>
</html>
from lxml.builder import ElementMaker

# An ElementMaker bound to a namespace creates every element in it; the
# nsmap fixes the prefix used on serialization.
E = ElementMaker(namespace="http://my.de/fault/namespace",
                 nsmap={'p': "http://my.de/fault/namespace"})

# Name the element factories once so the document reads declaratively.
DOC = E.doc
TITLE = E.title
SECTION = E.section
PAR = E.par

my_doc = DOC(
    TITLE("The dog and the hog"),
    SECTION(
        TITLE("The dog", tType='title'),
        PAR("Once upon a time, ..."),
        PAR("And then ...")
    ),
    SECTION(
        TITLE("The hog"),
        PAR("Sooner or later ...")
    )
)

print(str(etree.tostring(my_doc, pretty_print=True), encoding='utf-8'))
<p:doc xmlns:p="http://my.de/fault/namespace">
  <p:title>The dog and the hog</p:title>
  <p:section>
    <p:title tType="title">The dog</p:title>
    <p:par>Once upon a time, ...</p:par>
    <p:par>And then ...</p:par>
  </p:section>
  <p:section>
    <p:title>The hog</p:title>
    <p:par>Sooner or later ...</p:par>
  </p:section>
</p:doc>
root = etree.XML("<root><a x='123'>aText<b/><c/><b/></a></root>")

# A bare tag name matches direct children only, so the nested <b> is not found.
print(root.find("b"))
print(root.find("a").tag)
None
a
# ".//" searches the whole subtree.
print(root.find(".//b").tag)
b
# "[@x]" keeps only elements that carry an "x" attribute.
print(root.findall(".//a[@x]")[0].tag)
a
# getelementpath() returns a path expression that locates the given element
# within the tree.
tree = etree.ElementTree(root)
a = root[0]

print(tree.getelementpath(a[0]))
print(tree.getelementpath(a[1]))
print(tree.getelementpath(a[2]))
a/b[1]
a/c
a/b[2]