mirror of
https://github.com/python/cpython.git
synced 2025-07-30 14:44:10 +00:00
SF patch 1504676: Make sgmllib char and entity references pluggable
(implementation/tests contributed by Sam Ruby)
This commit is contained in:
parent
274facfd1d
commit
fab461a4b5
4 changed files with 115 additions and 48 deletions
|
@ -132,27 +132,59 @@ nothing.
|
||||||
|
|
||||||
\begin{methoddesc}{handle_charref}{ref}
|
\begin{methoddesc}{handle_charref}{ref}
|
||||||
This method is called to process a character reference of the form
|
This method is called to process a character reference of the form
|
||||||
\samp{\&\#\var{ref};}. In the base implementation, \var{ref} must
|
\samp{\&\#\var{ref};}. The base implementation uses
|
||||||
be a decimal number in the
|
\method{convert_charref()} to convert the reference to a string. If
|
||||||
range 0-255. It translates the character to \ASCII{} and calls the
|
that method returns a string, it is passed to \method{handle_data()},
|
||||||
method \method{handle_data()} with the character as argument. If
|
otherwise \method{unknown_charref(\var{ref})} is called to handle the
|
||||||
\var{ref} is invalid or out of range, the method
|
error.
|
||||||
\code{unknown_charref(\var{ref})} is called to handle the error. A
|
\versionchanged[Use \method{convert_charref()} instead of hard-coding
|
||||||
subclass must override this method to provide support for named
|
the conversion]{2.5}
|
||||||
character entities.
|
\end{methoddesc}
|
||||||
|
|
||||||
|
\begin{methoddesc}{convert_charref}{ref}
|
||||||
|
Convert a character reference to a string, or \code{None}. \var{ref}
|
||||||
|
is the reference passed in as a string. In the base implementation,
|
||||||
|
\var{ref} must be a decimal number in the range 0-255. It converts
|
||||||
|
the code point found using the \method{convert_codepoint()} method.
|
||||||
|
If \var{ref} is invalid or out of range, this method returns
|
||||||
|
\code{None}. This method is called by the default
|
||||||
|
\method{handle_charref()} implementation and by the attribute value
|
||||||
|
parser.
|
||||||
|
\versionadded{2.5}
|
||||||
|
\end{methoddesc}
|
||||||
|
|
||||||
|
\begin{methoddesc}{convert_codepoint}{codepoint}
|
||||||
|
Convert a codepoint to a \class{str} value. Encodings can be handled
|
||||||
|
here if appropriate, though the rest of \module{sgmllib} is oblivious
|
||||||
|
to this matter.
|
||||||
|
\versionadded{2.5}
|
||||||
\end{methoddesc}
|
\end{methoddesc}
|
||||||
|
|
||||||
\begin{methoddesc}{handle_entityref}{ref}
|
\begin{methoddesc}{handle_entityref}{ref}
|
||||||
This method is called to process a general entity reference of the
|
This method is called to process a general entity reference of the
|
||||||
form \samp{\&\var{ref};} where \var{ref} is a general entity
|
form \samp{\&\var{ref};} where \var{ref} is a general entity
|
||||||
reference. It looks for \var{ref} in the instance (or class)
|
reference. It converts \var{ref} by passing it to
|
||||||
variable \member{entitydefs} which should be a mapping from entity
|
\method{convert_entityref()}. If a translation is returned, it
|
||||||
names to corresponding translations. If a translation is found, it
|
|
||||||
calls the method \method{handle_data()} with the translation;
|
calls the method \method{handle_data()} with the translation;
|
||||||
otherwise, it calls the method \code{unknown_entityref(\var{ref})}.
|
otherwise, it calls the method \code{unknown_entityref(\var{ref})}.
|
||||||
The default \member{entitydefs} defines translations for
|
The default \member{entitydefs} defines translations for
|
||||||
\code{\&amp;}, \code{\&apos;}, \code{\&gt;}, \code{\&lt;}, and
|
\code{\&amp;}, \code{\&apos;}, \code{\&gt;}, \code{\&lt;}, and
|
||||||
\code{\&quot;}.
|
\code{\&quot;}.
|
||||||
|
\versionchanged[Use \method{convert_entityref()} instead of hard-coding
|
||||||
|
the conversion]{2.5}
|
||||||
|
\end{methoddesc}
|
||||||
|
|
||||||
|
\begin{methoddesc}{convert_entityref}{ref}
|
||||||
|
Convert a named entity reference to a \class{str} value, or
|
||||||
|
\code{None}. The resulting value will not be parsed. \var{ref} will
|
||||||
|
be only the name of the entity. The default implementation looks for
|
||||||
|
\var{ref} in the instance (or class) variable \member{entitydefs}
|
||||||
|
which should be a mapping from entity names to corresponding
|
||||||
|
translations. If no translation is available for \var{ref}, this
|
||||||
|
method returns \code{None}. This method is called by the default
|
||||||
|
\method{handle_entityref()} implementation and by the attribute value
|
||||||
|
parser.
|
||||||
|
\versionadded{2.5}
|
||||||
\end{methoddesc}
|
\end{methoddesc}
|
||||||
|
|
||||||
\begin{methoddesc}{handle_comment}{comment}
|
\begin{methoddesc}{handle_comment}{comment}
|
||||||
|
|
|
@ -53,6 +53,10 @@ class SGMLParseError(RuntimeError):
|
||||||
# self.handle_entityref() with the entity reference as argument.
|
# self.handle_entityref() with the entity reference as argument.
|
||||||
|
|
||||||
class SGMLParser(markupbase.ParserBase):
|
class SGMLParser(markupbase.ParserBase):
|
||||||
|
# Pattern matching named entity and decimal character references -- derived classes may override
|
||||||
|
entity_or_charref = re.compile('&(?:'
|
||||||
|
'([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
|
||||||
|
')(;?)')
|
||||||
|
|
||||||
def __init__(self, verbose=0):
|
def __init__(self, verbose=0):
|
||||||
"""Initialize and reset this instance."""
|
"""Initialize and reset this instance."""
|
||||||
|
@ -277,32 +281,8 @@ class SGMLParser(markupbase.ParserBase):
|
||||||
attrvalue[:1] == '"' == attrvalue[-1:]):
|
attrvalue[:1] == '"' == attrvalue[-1:]):
|
||||||
# strip quotes
|
# strip quotes
|
||||||
attrvalue = attrvalue[1:-1]
|
attrvalue = attrvalue[1:-1]
|
||||||
l = 0
|
attrvalue = self.entity_or_charref.sub(
|
||||||
new_attrvalue = ''
|
self._convert_ref, attrvalue)
|
||||||
while l < len(attrvalue):
|
|
||||||
av_match = entityref.match(attrvalue, l)
|
|
||||||
if (av_match and av_match.group(1) in self.entitydefs and
|
|
||||||
attrvalue[av_match.end(1)] == ';'):
|
|
||||||
# only substitute entityrefs ending in ';' since
|
|
||||||
# otherwise we may break <a href='?p=x&q=y'>
|
|
||||||
# which is very common
|
|
||||||
new_attrvalue += self.entitydefs[av_match.group(1)]
|
|
||||||
l = av_match.end(0)
|
|
||||||
continue
|
|
||||||
ch_match = charref.match(attrvalue, l)
|
|
||||||
if ch_match:
|
|
||||||
try:
|
|
||||||
char = chr(int(ch_match.group(1)))
|
|
||||||
new_attrvalue += char
|
|
||||||
l = ch_match.end(0)
|
|
||||||
continue
|
|
||||||
except ValueError:
|
|
||||||
# invalid character reference, don't substitute
|
|
||||||
pass
|
|
||||||
# all other cases
|
|
||||||
new_attrvalue += attrvalue[l]
|
|
||||||
l += 1
|
|
||||||
attrvalue = new_attrvalue
|
|
||||||
attrs.append((attrname.lower(), attrvalue))
|
attrs.append((attrname.lower(), attrvalue))
|
||||||
k = match.end(0)
|
k = match.end(0)
|
||||||
if rawdata[j] == '>':
|
if rawdata[j] == '>':
|
||||||
|
@ -311,6 +291,17 @@ class SGMLParser(markupbase.ParserBase):
|
||||||
self.finish_starttag(tag, attrs)
|
self.finish_starttag(tag, attrs)
|
||||||
return j
|
return j
|
||||||
|
|
||||||
|
# Internal -- convert entity or character reference
|
||||||
|
def _convert_ref(self, match):
    """Return the replacement text for one entity_or_charref match.

    match is expected to carry three groups: the entity name, the
    character reference body, and the optional trailing ';'.

    Character references are passed to self.convert_charref() and
    ';'-terminated entity references to self.convert_entityref().
    When the converter returns None the reference is reproduced
    unchanged.  Entity references without a trailing ';' are never
    converted.
    """
    name, number, terminator = match.group(1, 2, 3)
    if number:
        converted = self.convert_charref(number)
        # Compare against None rather than relying on truthiness so
        # that a converter may legitimately return an empty string.
        if converted is None:
            return '&#%s%s' % (number, terminator)
        return converted
    if terminator:
        converted = self.convert_entityref(name)
        if converted is None:
            return '&%s;' % name
        return converted
    # Unterminated entity reference: leave it alone.
    return '&%s' % name
|
||||||
|
|
||||||
# Internal -- parse endtag
|
# Internal -- parse endtag
|
||||||
def parse_endtag(self, i):
|
def parse_endtag(self, i):
|
||||||
rawdata = self.rawdata
|
rawdata = self.rawdata
|
||||||
|
@ -394,35 +385,51 @@ class SGMLParser(markupbase.ParserBase):
|
||||||
print '*** Unbalanced </' + tag + '>'
|
print '*** Unbalanced </' + tag + '>'
|
||||||
print '*** Stack:', self.stack
|
print '*** Stack:', self.stack
|
||||||
|
|
||||||
def handle_charref(self, name):
|
def convert_charref(self, name):
|
||||||
"""Handle character reference, no need to override."""
|
"""Convert character reference, may be overridden."""
|
||||||
try:
|
try:
|
||||||
n = int(name)
|
n = int(name)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
self.unknown_charref(name)
|
|
||||||
return
|
return
|
||||||
if not 0 <= n <= 255:
|
if not 0 <= n <= 255:
|
||||||
self.unknown_charref(name)
|
|
||||||
return
|
return
|
||||||
self.handle_data(chr(n))
|
return self.convert_codepoint(n)
|
||||||
|
|
||||||
|
def convert_codepoint(self, codepoint):
    """Return the result of chr() for the integer codepoint."""
    character = chr(codepoint)
    return character
|
||||||
|
|
||||||
|
def handle_charref(self, name):
    """Handle character reference, no need to override.

    name is the reference text to convert.  It is passed to
    self.convert_charref(); a result other than None is forwarded to
    self.handle_data(), otherwise self.unknown_charref(name) is
    called.
    """
    # Look the converter up on self (a bare convert_charref() is not a
    # global name) and convert only once so the result is reused.
    replacement = self.convert_charref(name)
    if replacement is None:
        self.unknown_charref(name)
    else:
        self.handle_data(replacement)
|
||||||
|
|
||||||
# Definition of entities -- derived classes may override
|
# Definition of entities -- derived classes may override
|
||||||
entitydefs = \
|
entitydefs = \
|
||||||
{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
|
{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
|
||||||
|
|
||||||
def handle_entityref(self, name):
|
def convert_entityref(self, name):
|
||||||
"""Handle entity references.
|
"""Convert entity references.
|
||||||
|
|
||||||
There should be no need to override this method; it can be
|
As an alternative to overriding this method, one can tailor the
|
||||||
tailored by setting up the self.entitydefs mapping appropriately.
|
results by setting up the self.entitydefs mapping appropriately.
|
||||||
"""
|
"""
|
||||||
table = self.entitydefs
|
table = self.entitydefs
|
||||||
if name in table:
|
if name in table:
|
||||||
self.handle_data(table[name])
|
return table[name]
|
||||||
else:
|
else:
|
||||||
self.unknown_entityref(name)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def handle_entityref(self, name):
    """Handle entity references, no need to override.

    name is passed to self.convert_entityref(); a result other than
    None is forwarded to self.handle_data(), otherwise
    self.unknown_entityref(name) is called.
    """
    # Look the converter up on self (a bare convert_entityref() is not
    # a global name) and convert only once so the result is reused.
    replacement = self.convert_entityref(name)
    if replacement is None:
        self.unknown_entityref(name)
    else:
        self.handle_data(replacement)
|
||||||
|
|
||||||
# Example -- handle data, should be overridden
|
# Example -- handle data, should be overridden
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -64,6 +64,23 @@ class CDATAEventCollector(EventCollector):
|
||||||
self.setliteral()
|
self.setliteral()
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLEntityCollector(EventCollector):
    """Collector that converts HTML named entities and hex/decimal charrefs.

    Each conversion request is recorded through self.append() (inherited
    from EventCollector, which is defined outside this view) before the
    converted value is returned.
    """

    import re
    import htmlentitydefs

    # Three groups: entity name, character reference body, optional ';'.
    # The character reference body is decimal digits, or 'x' followed by
    # hexadecimal digits only, so that int(..., 16) cannot be handed
    # letters outside a-f.
    entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
        '|&#(x[0-9a-fA-F]+|[0-9]+))(;?)')

    def convert_charref(self, name):
        """Record the request; return the unicode character or None."""
        self.append(("charref", "convert", name))
        try:
            if name.startswith('x'):
                return unichr(int(name[1:], 16))
            return unichr(int(name))
        except (ValueError, OverflowError):
            # Malformed or out-of-range reference: report "no
            # conversion" instead of propagating the error.
            return None

    def convert_entityref(self, name):
        """Record the request; return the unicode character or None."""
        self.append(("entityref", "convert", name))
        codepoint = self.htmlentitydefs.name2codepoint.get(name)
        if codepoint is None:
            # Unknown entity name: no translation available.
            return None
        return unichr(codepoint)
|
||||||
|
|
||||||
|
|
||||||
class SGMLParserTestCase(unittest.TestCase):
|
class SGMLParserTestCase(unittest.TestCase):
|
||||||
|
|
||||||
collector = EventCollector
|
collector = EventCollector
|
||||||
|
@ -233,6 +250,16 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
|
||||||
("k", "*"),
|
("k", "*"),
|
||||||
])])
|
])])
|
||||||
|
|
||||||
|
def test_convert_overrides(self):
    # Attribute values must be run through the collector's
    # convert_entityref() / convert_charref() overrides.  The input has
    # to contain the literal references "&ldquo;" and "&#x201d;" so that
    # the two "convert" events listed below are actually produced.
    self.collector = HTMLEntityCollector
    self.check_events('<a title="&ldquo;test&#x201d;">foo</a>', [
        ('entityref', 'convert', 'ldquo'),
        ('charref', 'convert', 'x201d'),
        ('starttag', 'a', [('title', u'\u201ctest\u201d')]),
        ('data', 'foo'),
        ('endtag', 'a'),
        ])
|
||||||
|
|
||||||
def test_attr_funky_names(self):
|
def test_attr_funky_names(self):
|
||||||
self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
|
self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
|
||||||
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
|
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
|
||||||
|
|
|
@ -528,6 +528,7 @@ Hugo van Rossum
|
||||||
Saskia van Rossum
|
Saskia van Rossum
|
||||||
Donald Wallace Rouse II
|
Donald Wallace Rouse II
|
||||||
Liam Routt
|
Liam Routt
|
||||||
|
Sam Ruby
|
||||||
Paul Rubin
|
Paul Rubin
|
||||||
Audun S. Runde
|
Audun S. Runde
|
||||||
Jeff Rush
|
Jeff Rush
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue