Sjoerd's latest.

2025-08-04 00:48:58 +00:00 · 1998-04-03 16:02:39 +00:00 · 1998-04-03 16:02:39 +00:00 · 7e07b3845b
commit 7e07b3845b
parent 0454b51282
1 changed file with 194 additions and 116 deletions
--- a/Lib/xmllib.py
+++ b/Lib/xmllib.py
@ -5,34 +5,50 @@ import re
 import string


+version = '0.1'
+
 # Regular expressions used for parsing

 _S = '[ \t\r\n]+'
 _opS = '[ \t\r\n]*'
 _Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'
-interesting = re.compile('[&<]')
-incomplete = re.compile('&(' + _Name + '|#[0-9]*|#x[0-9a-fA-F]*)?|'
-                           '<([a-zA-Z_:][^<>]*|'
-                              '/([a-zA-Z_:][^<>]*)?|'
-                              '![^<>]*|'
-                              r'\?[^<>]*)?')
+illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
+interesting = re.compile('[]&<]')

-ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+);?')
+amp = re.compile('&')
+ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
 entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
 charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
-space = re.compile(_S)
+space = re.compile(_S + '$')
 newline = re.compile('\n')

 starttagopen = re.compile('<' + _Name)
 endtagopen = re.compile('</')
 starttagend = re.compile(_opS + '(?P<slash>/?)>')
-endbracket = re.compile('>')
+endbracket = re.compile(_opS + '>')
 tagfind = re.compile(_Name)
 cdataopen = re.compile(r'<!\[CDATA\[')
 cdataclose = re.compile(r'\]\]>')
-doctype = re.compile('<!DOCTYPE' + _S + '(?P<name>' + _Name + ')' + _S)
-special = re.compile('<!(?P<special>[^<>]*)>')
-procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _S)
+# this matches one of the following:
+# SYSTEM SystemLiteral
+# PUBLIC PubidLiteral SystemLiteral
+_SystemLiteral = '(?P<%s>\'[^\']*\'|"[^"]*")'
+_PublicLiteral = '(?P<%s>"[-\'()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
+                        "'[-()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
+_ExternalId = '(?:SYSTEM|' \
+                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
+              ')'+_S+_SystemLiteral%'syslit'
+doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
+                     '(?:'+_S+_ExternalId+')?'+_opS)
+xmldecl = re.compile('<\?xml'+_S+
+                     'version'+_opS+'='+_opS+'(?P<version>\'[^\']*\'|"[^"]*")'+
+                     '(?:'+_S+'encoding'+_opS+'='+_opS+
+                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
+                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
+                     '(?:'+_S+'standalone'+_opS+'='+_opS+
+                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
+                     _opS+'\?>')
+procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
 procclose = re.compile(_opS + r'\?>')
 commentopen = re.compile('<!--')
 commentclose = re.compile('-->')
@ -41,6 +57,7 @@ attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9.:+*%?!()_#=~]+))')
+attrtrans = string.maketrans(' \r\n\t', '    ')


 # XML parser base class -- find tags and call handler functions.
@ -92,30 +109,43 @@ class XMLParser:
        self.goahead(1)

    # Interface -- translate references
-    def translate_references(self, data):
-        newdata = []
+    def translate_references(self, data, all = 1):
        i = 0
        while 1:
-            res = ref.search(data, i)
+            res = amp.search(data, i)
            if res is None:
-                newdata.append(data[i:])
-                return string.join(newdata, '')
-            if data[res.end(0) - 1] != ';':
+                return data
+            res = ref.match(data, res.start(0))
+            if res is None:
+                self.syntax_error("bogus `&'")
+                i =i+1
+                continue
+            i = res.end(0)
+            if data[i - 1] != ';':
                self.syntax_error("`;' missing after entity/char reference")
-            newdata.append(data[i:res.start(0)])
+                i = i-1
            str = res.group(1)
+            pre = data[:res.start(0)]
+            post = data[i:]
            if str[0] == '#':
                if str[1] == 'x':
-                    newdata.append(chr(string.atoi(str[2:], 16)))
+                    str = chr(string.atoi(str[2:], 16))
                else:
-                    newdata.append(chr(string.atoi(str[1:])))
-            else:
-                try:
-                    newdata.append(self.entitydefs[str])
-                except KeyError:
+                    str = chr(string.atoi(str[1:]))
+                data = pre + str + post
+                i = res.start(0)+len(str)
+            elif all:
+                if self.entitydefs.has_key(str):
+                    data = pre + self.entitydefs[str] + post
+                    i = res.start(0)    # rescan substituted text
+                else:
+                    self.syntax_error('reference to unknown entity')
                    # can't do it, so keep the entity ref in
-                    newdata.append('&' + str + ';')
-            i = res.end(0)
+                    data = pre + '&' + str + ';' + post
+                    i = res.start(0) + len(str) + 2
+            else:
+                # just translating character references
+                pass                    # i is already positioned correctly

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
@ -139,8 +169,14 @@ class XMLParser:
            else:
                    j = n
            if i < j:
+                if self.__at_start:
+                    self.syntax_error('illegal data at start of file')
                self.__at_start = 0
                data = rawdata[i:j]
+                if not self.stack and not space.match(data):
+                    self.syntax_error('data not in content')
+                if illegal.search(data):
+                    self.syntax_error('illegal character in content')
                self.handle_data(data)
                self.lineno = self.lineno + string.count(data, '\n')
            i = j
@ -184,6 +220,20 @@ class XMLParser:
                    self.lineno = self.lineno + string.count(rawdata[i:i], '\n')
                    i = k
                    continue
+                res = xmldecl.match(rawdata, i)
+                if res:
+                    if not self.__at_start:
+                        self.syntax_error("<?xml?> declaration not at start of document")
+                    version, encoding, standalone = res.group('version',
+                                                              'encoding',
+                                                              'standalone')
+                    if version[1:-1] != '1.0':
+                        raise RuntimeError, 'only XML version 1.0 supported'
+                    if encoding: encoding = encoding[1:-1]
+                    if standalone: standalone = standalone[1:-1]
+                    self.handle_xml(encoding, standalone)
+                    i = res.end(0)
+                    continue
                res = procopen.match(rawdata, i)
                if res:
                    k = self.parse_proc(i)
@ -209,18 +259,6 @@ class XMLParser:
                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                    i = k
                    continue
-                res = special.match(rawdata, i)
-                if res:
-                    if self.literal:
-                        data = rawdata[i]
-                        self.handle_data(data)
-                        self.lineno = self.lineno + string.count(data, '\n')
-                        i = i+1
-                        continue
-                    self.handle_special(res.group('special'))
-                    self.lineno = self.lineno + string.count(res.group(0), '\n')
-                    i = res.end(0)
-                    continue
            elif rawdata[i] == '&':
                res = charref.match(rawdata, i)
                if res is not None:
@ -228,6 +266,8 @@ class XMLParser:
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in charref")
                        i = i-1
+                    if not self.stack:
+                        self.syntax_error('data not in content')
                    self.handle_charref(res.group('char')[:-1])
                    self.lineno = self.lineno + string.count(res.group(0), '\n')
                    continue
@ -237,36 +277,45 @@ class XMLParser:
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in entityref")
                        i = i-1
-                    self.handle_entityref(res.group('name'))
+                    name = res.group('name')
+                    if self.entitydefs.has_key(name):
+                        self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
+                        n = len(rawdata)
+                        i = res.start(0)
+                    else:
+                        self.syntax_error('reference to unknown entity')
+                        self.unknown_entityref(name)
                    self.lineno = self.lineno + string.count(res.group(0), '\n')
                    continue
+            elif rawdata[i] == ']':
+                if n-i < 3:
+                    break
+                if cdataclose.match(rawdata, i):
+                    self.syntax_error("bogus `]]>'")
+                self.handle_data(rawdata[i])
+                i = i+1
+                continue
            else:
                raise RuntimeError, 'neither < nor & ??'
            # We get here only if incomplete matches but
            # nothing else
-            res = incomplete.match(rawdata, i)
-            if not res:
-                data = rawdata[i]
-                self.handle_data(data)
-                self.lineno = self.lineno + string.count(data, '\n')
-                i = i+1
-                continue
-            j = res.end(0)
-            if j == n:
-                break # Really incomplete
-            self.syntax_error("bogus `<' or `&'")
-            data = res.group(0)
-            self.handle_data(data)
-            self.lineno = self.lineno + string.count(data, '\n')
-            i = j
+            break
        # end while
+        if i > 0:
+            self.__at_start = 0
        if end and i < n:
-            data = rawdata[i:n]
+            data = rawdata[i]
+            self.syntax_error("bogus `%s'" % data)
+            if illegal.search(data):
+                self.syntax_error('illegal character in content')
            self.handle_data(data)
            self.lineno = self.lineno + string.count(data, '\n')
-            i = n
+            self.rawdata = rawdata[i+1:]
+            return self.goahead(end)
        self.rawdata = rawdata[i:]
        if end:
+            if not self.__seen_starttag:
+                self.syntax_error('no elements in file')
            if self.stack:
                self.syntax_error('missing end tags')
                while self.stack:
@ -280,9 +329,12 @@ class XMLParser:
        res = commentclose.search(rawdata, i+4)
        if not res:
            return -1
-        # doubledash search will succeed because it's a subset of commentclose
-        if doubledash.search(rawdata, i+4).start(0) < res.start(0):
+        if doubledash.search(rawdata, i+4, res.start(0)):
            self.syntax_error("`--' inside comment")
+        if rawdata[res.start(0)-1] == '-':
+            self.syntax_error('comment cannot end in three dashes')
+        if illegal.search(rawdata, i+4, res.start(0)):
+            self.syntax_error('illegal character in comment')
        self.handle_comment(rawdata[i+4: res.start(0)])
        return res.end(0)

@ -291,28 +343,59 @@ class XMLParser:
        rawdata = self.rawdata
        n = len(rawdata)
        name = res.group('name')
+        pubid, syslit = res.group('pubid', 'syslit')
+        if pubid is not None:
+            pubid = pubid[1:-1]         # remove quotes
+            pubid = string.join(string.split(pubid)) # normalize
+        if syslit is not None: syslit = syslit[1:-1] # remove quotes
        j = k = res.end(0)
-        level = 0
-        while k < n:
-            c = rawdata[k]
-            if c == '<':
-                level = level + 1
-            elif c == '>':
-                if level == 0:
-                    self.handle_doctype(name, rawdata[j:k])
-                    return k+1
-                level = level - 1
+        if k >= n:
+            return -1
+        if rawdata[k] == '[':
+            level = 0
            k = k+1
-        return -1
+            dq = sq = 0
+            while k < n:
+                c = rawdata[k]
+                if not sq and c == '"':
+                    dq = not dq
+                elif not dq and c == "'":
+                    sq = not sq
+                elif sq or dq:
+                    pass
+                elif level <= 0 and c == ']':
+                    res = endbracket.match(rawdata, k+1)
+                    if not res:
+                        return -1
+                    self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
+                    return res.end(0)
+                elif c == '<':
+                    level = level + 1
+                elif c == '>':
+                    level = level - 1
+                    if level < 0:
+                        self.syntax_error("bogus `>' in DOCTYPE")
+                k = k+1
+        res = endbracket.search(rawdata, k)
+        if not res:
+            return -1
+        if res.start(0) != k:
+            self.syntax_error('garbage in DOCTYPE')
+        self.handle_doctype(name, pubid, syslit, None)
+        return res.end(0)

    # Internal -- handle CDATA tag, return length or -1 if not terminated
    def parse_cdata(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+9] <> '<![CDATA[':
-            raise RuntimeError, 'unexpected call to handle_cdata'
+            raise RuntimeError, 'unexpected call to parse_cdata'
        res = cdataclose.search(rawdata, i+9)
        if not res:
            return -1
+        if illegal.search(rawdata, i+9, res.start(0)):
+            self.syntax_error('illegal character in CDATA')
+        if not self.stack:
+            self.syntax_error('CDATA not in content')
        self.handle_cdata(rawdata[i+9:res.start(0)])
        return res.end(0)

@ -324,24 +407,15 @@ class XMLParser:
        if not end:
            return -1
        j = end.start(0)
+        if illegal.search(rawdata, i+2, j):
+            self.syntax_error('illegal character in processing instruction')
        res = tagfind.match(rawdata, i+2)
        if not res:
            raise RuntimeError, 'unexpected call to parse_proc'
        k = res.end(0)
        name = res.group(0)
-        if name == 'xml':
-            if self.__at_start:
-                attrdict, k = self.parse_attributes('xml', k, j,
-                                                    self.__xml_attributes)
-                if k != j:
-                    self.syntax_error('garbage at end of <?xml?>')
-                if attrdict['version'] != '1.0':
-                    self.syntax_error('only XML version 1.0 supported')
-                self.handle_xml(attrdict.get('encoding', None),
-                                attrdict['standalone'])
-                return end.end(0)
-            else:
-                self.syntax_error("<?xml?> tag not at start of document")
+        if string.find(string.lower(name), 'xml') >= 0:
+            self.syntax_error('illegal processing instruction target name')
        self.handle_proc(name, rawdata[k:j])
        return end.end(0)

@ -375,6 +449,7 @@ class XMLParser:
                                  (attrname, tag))
            if attrdict.has_key(attrname):
                self.syntax_error('attribute specified twice')
+            attrvalue = string.translate(attrvalue, attrtrans)
            attrdict[attrname] = self.translate_references(attrvalue)
            k = res.end(0)
        if attributes is not None:
@ -400,6 +475,8 @@ class XMLParser:
        if not self.__seen_starttag and self.__seen_doctype:
            if tag != self.__seen_doctype:
                self.syntax_error('starttag does not match DOCTYPE')
+        if self.__seen_starttag and not self.stack:
+            self.syntax_error('multiple elements on top level')
        if hasattr(self, tag + '_attributes'):
            attributes = getattr(self, tag + '_attributes')
        else:
@ -428,10 +505,7 @@ class XMLParser:
            tag = res.group(0)
            k = res.end(0)
        if k != end.start(0):
-            # check that there is only white space at end of tag
-            res = space.match(rawdata, k)
-            if res is None or res.end(0) != end.start(0):
-                self.syntax_error('garbage in end tag')
+            self.syntax_error('garbage in end tag')
        self.finish_endtag(tag)
        return end.end(0)

@ -439,17 +513,18 @@ class XMLParser:
    # Return -1 for unknown tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        self.stack.append(tag)
-        try:
-            method = getattr(self, 'start_' + tag)
-        except AttributeError:
-            self.unknown_starttag(tag, attrs)
-            return -1
-        else:
+        methodname = 'start_' + tag
+        if hasattr(self, methodname):
+            method = getattr(self, methodname)
            self.handle_starttag(tag, method, attrs)
            return 1
+        else:
+            self.unknown_starttag(tag, attrs)
+            return -1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
+        methodname = 'end_' + tag
        if not tag:
            self.syntax_error('name-less end tag')
            found = len(self.stack) - 1
@ -459,9 +534,10 @@ class XMLParser:
        else:
            if tag not in self.stack:
                self.syntax_error('unopened end tag')
-                try:
-                    method = getattr(self, 'end_' + tag)
-                except AttributeError:
+                if hasattr(self, methodname):
+                    method = getattr(self, methodname)
+                    self.handle_endtag(tag, method)
+                else:
                    self.unknown_endtag(tag)
                return
            found = len(self.stack)
@ -472,11 +548,8 @@ class XMLParser:
            if found < len(self.stack) - 1:
                self.syntax_error('missing close tag for %s' % self.stack[-1])
            tag = self.stack[-1]
-            try:
-                method = getattr(self, 'end_' + tag)
-            except AttributeError:
-                method = None
-            if method:
+            if hasattr(self, methodname):
+                method = getattr(self, methodname)
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
@ -487,7 +560,7 @@ class XMLParser:
        pass

    # Overridable -- handle DOCTYPE
-    def handle_doctype(self, tag, data):
+    def handle_doctype(self, tag, pubid, syslit, data):
        pass

    # Overridable -- handle start tag
@ -514,7 +587,12 @@ class XMLParser:
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
-    entitydefs = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': "'"}
+    entitydefs = {'lt': '&#60;',        # must use charref
+                  'gt': '&#62;',
+                  'amp': '&#38;',       # must use charref
+                  'quot': '&#34;',
+                  'apos': '&#39;',
+                  }

    # Example -- handle entity reference, no need to override
    def handle_entityref(self, name):
@ -541,10 +619,6 @@ class XMLParser:
    def handle_proc(self, name, data):
        pass

-    # Example -- handle special instructions, could be overridden
-    def handle_special(self, data):
-        pass
-
    # Example -- handle relatively harmless syntax errors, could be overridden
    def syntax_error(self, message):
        raise RuntimeError, 'Syntax error at line %d: %s' % (self.lineno, message)
@ -566,10 +640,14 @@ class TestXMLParser(XMLParser):
        self.flush()
        print 'xml: encoding =',encoding,'standalone =',standalone

-    def handle_doctype(self, tag, data):
+    def handle_doctype(self, tag, pubid, syslit, data):
        self.flush()
        print 'DOCTYPE:',tag, `data`

+    def handle_entity(self, name, strval, pubid, syslit, ndata):
+        self.flush()
+        print 'ENTITY:',`data`
+
    def handle_data(self, data):
        self.testdata = self.testdata + data
        if len(`self.testdata`) >= 70:
@ -589,10 +667,6 @@ class TestXMLParser(XMLParser):
        self.flush()
        print 'processing:',name,`data`

-    def handle_special(self, data):
-        self.flush()
-        print 'special:',`data`
-
    def handle_comment(self, data):
        self.flush()
        r = `data`
@ -660,9 +734,13 @@ def test(args = None):
        f.close()

    x = klass()
-    for c in data:
-        x.feed(c)
-    x.close()
+    try:
+        for c in data:
+            x.feed(c)
+        x.close()
+    except RuntimeError, msg:
+        print msg
+        sys.exit(1)


 if __name__ == '__main__':