cpython/Lib/xml/dom/pulldom.py
Fred Drake fbdeaad069 expunge the xmlcore changes:
41667, 41668 - initial switch to xmlcore
  47044        - mention of xmlcore in What's New
  50687        - mention of xmlcore in the library reference

re-apply xmlcore changes to xml:
  41674        - line ending changes (re-applied manually), directory props
  41677        - add cElementTree wrapper
  41678        - PSF licensing for etree
  41812        - whitespace normalization
  42724        - fix svn:eol-style settings
  43681, 43682 - remove Python version-compatibility cruft from minidom
  46773        - fix encoding of \r\n\t in attr values in saxutils
  47269        - added XMLParser alias for cElementTree compatibility

additional tests were added in Lib/test/test_sax.py that failed with
the xmlcore changes; these relate to SF bugs #1511497, #1513611
2006-07-29 16:56:15 +00:00

351 lines
12 KiB
Python

import xml.sax
import xml.sax.handler
import types
try:
_StringTypes = [types.StringType, types.UnicodeType]
except AttributeError:
_StringTypes = [types.StringType]
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"
class PullDOM(xml.sax.ContentHandler):
_locator = None
document = None
def __init__(self, documentFactory=None):
from xml.dom import XML_NAMESPACE
self.documentFactory = documentFactory
self.firstEvent = [None, None]
self.lastEvent = self.firstEvent
self.elementStack = []
self.push = self.elementStack.append
try:
self.pop = self.elementStack.pop
except AttributeError:
# use class' pop instead
pass
self._ns_contexts = [{XML_NAMESPACE:'xml'}] # contains uri -> prefix dicts
self._current_context = self._ns_contexts[-1]
self.pending_events = []
def pop(self):
result = self.elementStack[-1]
del self.elementStack[-1]
return result
def setDocumentLocator(self, locator):
self._locator = locator
def startPrefixMapping(self, prefix, uri):
if not hasattr(self, '_xmlns_attrs'):
self._xmlns_attrs = []
self._xmlns_attrs.append((prefix or 'xmlns', uri))
self._ns_contexts.append(self._current_context.copy())
self._current_context[uri] = prefix or None
def endPrefixMapping(self, prefix):
self._current_context = self._ns_contexts.pop()
def startElementNS(self, name, tagName , attrs):
# Retrieve xml namespace declaration attributes.
xmlns_uri = 'http://www.w3.org/2000/xmlns/'
xmlns_attrs = getattr(self, '_xmlns_attrs', None)
if xmlns_attrs is not None:
for aname, value in xmlns_attrs:
attrs._attrs[(xmlns_uri, aname)] = value
self._xmlns_attrs = []
uri, localname = name
if uri:
# When using namespaces, the reader may or may not
# provide us with the original name. If not, create
# *a* valid tagName from the current context.
if tagName is None:
prefix = self._current_context[uri]
if prefix:
tagName = prefix + ":" + localname
else:
tagName = localname
if self.document:
node = self.document.createElementNS(uri, tagName)
else:
node = self.buildDocument(uri, tagName)
else:
# When the tagname is not prefixed, it just appears as
# localname
if self.document:
node = self.document.createElement(localname)
else:
node = self.buildDocument(None, localname)
for aname,value in attrs.items():
a_uri, a_localname = aname
if a_uri == xmlns_uri:
if a_localname == 'xmlns':
qname = a_localname
else:
qname = 'xmlns:' + a_localname
attr = self.document.createAttributeNS(a_uri, qname)
node.setAttributeNodeNS(attr)
elif a_uri:
prefix = self._current_context[a_uri]
if prefix:
qname = prefix + ":" + a_localname
else:
qname = a_localname
attr = self.document.createAttributeNS(a_uri, qname)
node.setAttributeNodeNS(attr)
else:
attr = self.document.createAttribute(a_localname)
node.setAttributeNode(attr)
attr.value = value
self.lastEvent[1] = [(START_ELEMENT, node), None]
self.lastEvent = self.lastEvent[1]
self.push(node)
def endElementNS(self, name, tagName):
self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
self.lastEvent = self.lastEvent[1]
def startElement(self, name, attrs):
if self.document:
node = self.document.createElement(name)
else:
node = self.buildDocument(None, name)
for aname,value in attrs.items():
attr = self.document.createAttribute(aname)
attr.value = value
node.setAttributeNode(attr)
self.lastEvent[1] = [(START_ELEMENT, node), None]
self.lastEvent = self.lastEvent[1]
self.push(node)
def endElement(self, name):
self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
self.lastEvent = self.lastEvent[1]
def comment(self, s):
if self.document:
node = self.document.createComment(s)
self.lastEvent[1] = [(COMMENT, node), None]
self.lastEvent = self.lastEvent[1]
else:
event = [(COMMENT, s), None]
self.pending_events.append(event)
def processingInstruction(self, target, data):
if self.document:
node = self.document.createProcessingInstruction(target, data)
self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
self.lastEvent = self.lastEvent[1]
else:
event = [(PROCESSING_INSTRUCTION, target, data), None]
self.pending_events.append(event)
def ignorableWhitespace(self, chars):
node = self.document.createTextNode(chars)
self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
self.lastEvent = self.lastEvent[1]
def characters(self, chars):
node = self.document.createTextNode(chars)
self.lastEvent[1] = [(CHARACTERS, node), None]
self.lastEvent = self.lastEvent[1]
def startDocument(self):
if self.documentFactory is None:
import xml.dom.minidom
self.documentFactory = xml.dom.minidom.Document.implementation
def buildDocument(self, uri, tagname):
# Can't do that in startDocument, since we need the tagname
# XXX: obtain DocumentType
node = self.documentFactory.createDocument(uri, tagname, None)
self.document = node
self.lastEvent[1] = [(START_DOCUMENT, node), None]
self.lastEvent = self.lastEvent[1]
self.push(node)
# Put everything we have seen so far into the document
for e in self.pending_events:
if e[0][0] == PROCESSING_INSTRUCTION:
_,target,data = e[0]
n = self.document.createProcessingInstruction(target, data)
e[0] = (PROCESSING_INSTRUCTION, n)
elif e[0][0] == COMMENT:
n = self.document.createComment(e[0][1])
e[0] = (COMMENT, n)
else:
raise AssertionError("Unknown pending event ",e[0][0])
self.lastEvent[1] = e
self.lastEvent = e
self.pending_events = None
return node.firstChild
def endDocument(self):
self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
self.pop()
def clear(self):
"clear(): Explicitly release parsing structures"
self.document = None
class ErrorHandler:
def warning(self, exception):
print exception
def error(self, exception):
raise exception
def fatalError(self, exception):
raise exception
class DOMEventStream:
def __init__(self, stream, parser, bufsize):
self.stream = stream
self.parser = parser
self.bufsize = bufsize
if not hasattr(self.parser, 'feed'):
self.getEvent = self._slurp
self.reset()
def reset(self):
self.pulldom = PullDOM()
# This content handler relies on namespace support
self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
self.parser.setContentHandler(self.pulldom)
def __getitem__(self, pos):
rc = self.getEvent()
if rc:
return rc
raise IndexError
def next(self):
rc = self.getEvent()
if rc:
return rc
raise StopIteration
def __iter__(self):
return self
def expandNode(self, node):
event = self.getEvent()
parents = [node]
while event:
token, cur_node = event
if cur_node is node:
return
if token != END_ELEMENT:
parents[-1].appendChild(cur_node)
if token == START_ELEMENT:
parents.append(cur_node)
elif token == END_ELEMENT:
del parents[-1]
event = self.getEvent()
def getEvent(self):
# use IncrementalParser interface, so we get the desired
# pull effect
if not self.pulldom.firstEvent[1]:
self.pulldom.lastEvent = self.pulldom.firstEvent
while not self.pulldom.firstEvent[1]:
buf = self.stream.read(self.bufsize)
if not buf:
self.parser.close()
return None
self.parser.feed(buf)
rc = self.pulldom.firstEvent[1][0]
self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
return rc
def _slurp(self):
""" Fallback replacement for getEvent() using the
standard SAX2 interface, which means we slurp the
SAX events into memory (no performance gain, but
we are compatible to all SAX parsers).
"""
self.parser.parse(self.stream)
self.getEvent = self._emit
return self._emit()
def _emit(self):
""" Fallback replacement for getEvent() that emits
the events that _slurp() read previously.
"""
rc = self.pulldom.firstEvent[1][0]
self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
return rc
def clear(self):
"""clear(): Explicitly release parsing objects"""
self.pulldom.clear()
del self.pulldom
self.parser = None
self.stream = None
class SAX2DOM(PullDOM):
def startElementNS(self, name, tagName , attrs):
PullDOM.startElementNS(self, name, tagName, attrs)
curNode = self.elementStack[-1]
parentNode = self.elementStack[-2]
parentNode.appendChild(curNode)
def startElement(self, name, attrs):
PullDOM.startElement(self, name, attrs)
curNode = self.elementStack[-1]
parentNode = self.elementStack[-2]
parentNode.appendChild(curNode)
def processingInstruction(self, target, data):
PullDOM.processingInstruction(self, target, data)
node = self.lastEvent[0][1]
parentNode = self.elementStack[-1]
parentNode.appendChild(node)
def ignorableWhitespace(self, chars):
PullDOM.ignorableWhitespace(self, chars)
node = self.lastEvent[0][1]
parentNode = self.elementStack[-1]
parentNode.appendChild(node)
def characters(self, chars):
PullDOM.characters(self, chars)
node = self.lastEvent[0][1]
parentNode = self.elementStack[-1]
parentNode.appendChild(node)
default_bufsize = (2 ** 14) - 20
def parse(stream_or_string, parser=None, bufsize=None):
if bufsize is None:
bufsize = default_bufsize
if type(stream_or_string) in _StringTypes:
stream = open(stream_or_string)
else:
stream = stream_or_string
if not parser:
parser = xml.sax.make_parser()
return DOMEventStream(stream, parser, bufsize)
def parseString(string, parser=None):
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
bufsize = len(string)
buf = StringIO(string)
if not parser:
parser = xml.sax.make_parser()
return DOMEventStream(buf, parser, bufsize)