SF patch 1504676: Make sgmllib char and entity references pluggable

(implementation/tests contributed by Sam Ruby)
This commit is contained in:
Fred Drake 2006-06-16 23:45:06 +00:00
parent 274facfd1d
commit fab461a4b5
4 changed files with 115 additions and 48 deletions

View file

@ -132,27 +132,59 @@ nothing.
\begin{methoddesc}{handle_charref}{ref} \begin{methoddesc}{handle_charref}{ref}
This method is called to process a character reference of the form This method is called to process a character reference of the form
\samp{\&\#\var{ref};}. In the base implementation, \var{ref} must \samp{\&\#\var{ref};}. The base implementation uses
be a decimal number in the \method{convert_charref()} to convert the reference to a string. If
range 0-255. It translates the character to \ASCII{} and calls the that method returns a string, it is passed to \method{handle_data()},
method \method{handle_data()} with the character as argument. If otherwise \method{unknown_charref(\var{ref})} is called to handle the
\var{ref} is invalid or out of range, the method error.
\code{unknown_charref(\var{ref})} is called to handle the error. A \versionchanged[Use \method{convert_charref()} instead of hard-coding
subclass must override this method to provide support for named the conversion]{2.5}
character entities. \end{methoddesc}
\begin{methoddesc}{convert_charref}{ref}
Convert a character reference to a string, or \code{None}. \var{ref}
is the reference passed in as a string. In the base implementation,
\var{ref} must be a decimal number in the range 0-255. It converts
the code point found using the \method{convert_codepoint()} method.
If \var{ref} is invalid or out of range, this method returns
\code{None}. This method is called by the default
\method{handle_charref()} implementation and by the attribute value
parser.
\versionadded{2.5}
\end{methoddesc}
\begin{methoddesc}{convert_codepoint}{codepoint}
Convert a codepoint to a \class{str} value. Encodings can be handled
here if appropriate, though the rest of \module{sgmllib} is oblivious
to this matter.
\versionadded{2.5}
\end{methoddesc} \end{methoddesc}
\begin{methoddesc}{handle_entityref}{ref} \begin{methoddesc}{handle_entityref}{ref}
This method is called to process a general entity reference of the This method is called to process a general entity reference of the
form \samp{\&\var{ref};} where \var{ref} is a general entity form \samp{\&\var{ref};} where \var{ref} is a general entity
reference. It looks for \var{ref} in the instance (or class) reference. It converts \var{ref} by passing it to
variable \member{entitydefs} which should be a mapping from entity \method{convert_entityref()}. If a translation is returned, it
names to corresponding translations. If a translation is found, it
calls the method \method{handle_data()} with the translation; calls the method \method{handle_data()} with the translation;
otherwise, it calls the method \code{unknown_entityref(\var{ref})}. otherwise, it calls the method \code{unknown_entityref(\var{ref})}.
The default \member{entitydefs} defines translations for The default \member{entitydefs} defines translations for
\code{\&amp;}, \code{\&apos;}, \code{\&gt;}, \code{\&lt;}, and \code{\&amp;}, \code{\&apos;}, \code{\&gt;}, \code{\&lt;}, and
\code{\&quot;}. \code{\&quot;}.
\versionchanged[Use \method{convert_entityref()} instead of hard-coding
the conversion]{2.5}
\end{methoddesc}
\begin{methoddesc}{convert_entityref}{ref}
Convert a named entity reference to a \class{str} value, or
\code{None}. The resulting value will not be parsed. \var{ref} will
be only the name of the entity. The default implementation looks for
\var{ref} in the instance (or class) variable \member{entitydefs}
which should be a mapping from entity names to corresponding
translations. If no translation is available for \var{ref}, this
method returns \code{None}. This method is called by the default
\method{handle_entityref()} implementation and by the attribute value
parser.
\versionadded{2.5}
\end{methoddesc} \end{methoddesc}
\begin{methoddesc}{handle_comment}{comment} \begin{methoddesc}{handle_comment}{comment}

View file

@ -53,6 +53,10 @@ class SGMLParseError(RuntimeError):
# self.handle_entityref() with the entity reference as argument. # self.handle_entityref() with the entity reference as argument.
class SGMLParser(markupbase.ParserBase): class SGMLParser(markupbase.ParserBase):
# Pattern used to find references inside attribute values: group 1 is an
# entity name, group 2 is a decimal character reference number, and
# group 3 is the optional terminating ';' -- derived classes may override
entity_or_charref = re.compile('&(?:'
'([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
')(;?)')
def __init__(self, verbose=0): def __init__(self, verbose=0):
"""Initialize and reset this instance.""" """Initialize and reset this instance."""
@ -277,32 +281,8 @@ class SGMLParser(markupbase.ParserBase):
attrvalue[:1] == '"' == attrvalue[-1:]): attrvalue[:1] == '"' == attrvalue[-1:]):
# strip quotes # strip quotes
attrvalue = attrvalue[1:-1] attrvalue = attrvalue[1:-1]
l = 0 attrvalue = self.entity_or_charref.sub(
new_attrvalue = '' self._convert_ref, attrvalue)
while l < len(attrvalue):
av_match = entityref.match(attrvalue, l)
if (av_match and av_match.group(1) in self.entitydefs and
attrvalue[av_match.end(1)] == ';'):
# only substitute entityrefs ending in ';' since
# otherwise we may break <a href='?p=x&q=y'>
# which is very common
new_attrvalue += self.entitydefs[av_match.group(1)]
l = av_match.end(0)
continue
ch_match = charref.match(attrvalue, l)
if ch_match:
try:
char = chr(int(ch_match.group(1)))
new_attrvalue += char
l = ch_match.end(0)
continue
except ValueError:
# invalid character reference, don't substitute
pass
# all other cases
new_attrvalue += attrvalue[l]
l += 1
attrvalue = new_attrvalue
attrs.append((attrname.lower(), attrvalue)) attrs.append((attrname.lower(), attrvalue))
k = match.end(0) k = match.end(0)
if rawdata[j] == '>': if rawdata[j] == '>':
@ -311,6 +291,17 @@ class SGMLParser(markupbase.ParserBase):
self.finish_starttag(tag, attrs) self.finish_starttag(tag, attrs)
return j return j
# Internal -- convert entity or character reference
def _convert_ref(self, match):
if match.group(2):
return self.convert_charref(match.group(2)) or \
'&#%s%s' % match.groups()[1:]
elif match.group(3):
return self.convert_entityref(match.group(1)) or \
'&%s;' % match.group(1)
else:
return '&%s' % match.group(1)
# Internal -- parse endtag # Internal -- parse endtag
def parse_endtag(self, i): def parse_endtag(self, i):
rawdata = self.rawdata rawdata = self.rawdata
@ -394,35 +385,51 @@ class SGMLParser(markupbase.ParserBase):
print '*** Unbalanced </' + tag + '>' print '*** Unbalanced </' + tag + '>'
print '*** Stack:', self.stack print '*** Stack:', self.stack
def handle_charref(self, name): def convert_charref(self, name):
"""Handle character reference, no need to override.""" """Convert character reference, may be overridden."""
try: try:
n = int(name) n = int(name)
except ValueError: except ValueError:
self.unknown_charref(name)
return return
if not 0 <= n <= 255: if not 0 <= n <= 255:
self.unknown_charref(name)
return return
self.handle_data(chr(n)) return self.convert_codepoint(n)
def convert_codepoint(self, codepoint):
    """Return the string produced by chr() for the integer codepoint."""
    character = chr(codepoint)
    return character
def handle_charref(self, name):
    """Handle character reference, no need to override.

    The reference text *name* is converted with self.convert_charref().
    A result of None is reported through self.unknown_charref(name);
    any other result is passed to self.handle_data().
    """
    # The hook must be looked up on the instance: a bare
    # convert_charref(...) call would raise NameError.
    replacement = self.convert_charref(name)
    if replacement is None:
        self.unknown_charref(name)
    else:
        # Reuse the value already computed so the hook runs only once
        # per reference.
        self.handle_data(replacement)
# Definition of entities -- derived classes may override # Definition of entities -- derived classes may override
entitydefs = \ entitydefs = \
{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
def handle_entityref(self, name): def convert_entityref(self, name):
"""Handle entity references. """Convert entity references.
There should be no need to override this method; it can be As an alternative to overriding this method; one can tailor the
tailored by setting up the self.entitydefs mapping appropriately. results by setting up the self.entitydefs mapping appropriately.
""" """
table = self.entitydefs table = self.entitydefs
if name in table: if name in table:
self.handle_data(table[name]) return table[name]
else: else:
self.unknown_entityref(name)
return return
def handle_entityref(self, name):
    """Handle entity references, no need to override.

    The entity *name* is converted with self.convert_entityref().
    A result of None is reported through self.unknown_entityref(name);
    any other result is passed to self.handle_data().
    """
    # The hook must be looked up on the instance: a bare
    # convert_entityref(...) call would raise NameError.
    replacement = self.convert_entityref(name)
    if replacement is None:
        self.unknown_entityref(name)
    else:
        # Reuse the value already computed so the hook runs only once
        # per reference.
        self.handle_data(replacement)
# Example -- handle data, should be overridden # Example -- handle data, should be overridden
def handle_data(self, data): def handle_data(self, data):
pass pass

View file

@ -64,6 +64,23 @@ class CDATAEventCollector(EventCollector):
self.setliteral() self.setliteral()
class HTMLEntityCollector(EventCollector):
    """Collector whose conversion hooks log each request and return unicode.

    The overriding pattern additionally accepts 'x'-prefixed character
    references, and entity names are resolved through
    htmlentitydefs.name2codepoint.
    """

    # Imported in the class body so both modules are reachable as
    # class attributes (self.htmlentitydefs is used below).
    import re
    import htmlentitydefs

    entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
                                   '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')

    def convert_charref(self, name):
        """Log the request, then decode a decimal or 'x'-prefixed hex ref."""
        self.append(("charref", "convert", name))
        codepoint = int(name[1:], 16) if name.startswith('x') else int(name)
        return unichr(codepoint)

    def convert_entityref(self, name):
        """Log the request, then look the entity name up as a code point."""
        self.append(("entityref", "convert", name))
        codepoint = self.htmlentitydefs.name2codepoint[name]
        return unichr(codepoint)
class SGMLParserTestCase(unittest.TestCase): class SGMLParserTestCase(unittest.TestCase):
collector = EventCollector collector = EventCollector
@ -233,6 +250,16 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
("k", "&#42;"), ("k", "&#42;"),
])]) ])])
def test_convert_overrides(self):
    """Attribute values must be converted through the overridden hooks."""
    self.collector = HTMLEntityCollector
    markup = '<a title="&ldquo;test&#x201d;">foo</a>'
    expected_events = [
        # Both conversion requests are logged before the start tag
        # event that carries the converted attribute value.
        ('entityref', 'convert', 'ldquo'),
        ('charref', 'convert', 'x201d'),
        ('starttag', 'a', [('title', u'\u201ctest\u201d')]),
        ('data', 'foo'),
        ('endtag', 'a'),
    ]
    self.check_events(markup, expected_events)
def test_attr_funky_names(self): def test_attr_funky_names(self):
self.check_events("""<a a.b='v' c:d=v e-f=v>""", [ self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),

View file

@ -528,6 +528,7 @@ Hugo van Rossum
Saskia van Rossum Saskia van Rossum
Donald Wallace Rouse II Donald Wallace Rouse II
Liam Routt Liam Routt
Sam Ruby
Paul Rubin Paul Rubin
Audun S. Runde Audun S. Runde
Jeff Rush Jeff Rush