mirror of
				https://github.com/python/cpython.git
				synced 2025-11-03 19:34:08 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			196 lines
		
	
	
	
		
			7.7 KiB
		
	
	
	
		
			TeX
		
	
	
	
	
	
			
		
		
	
	
			196 lines
		
	
	
	
		
			7.7 KiB
		
	
	
	
		
			TeX
		
	
	
	
	
	
\section{Standard Module \sectcode{sgmllib}}
 | 
						|
\label{module-sgmllib}
 | 
						|
\stmodindex{sgmllib}
 | 
						|
\index{SGML}
 | 
						|
 | 
						|
This module defines a class \code{SGMLParser} which serves as the
 | 
						|
basis for parsing text files formatted in SGML (Standard Generalized
 | 
						|
Mark-up Language).  In fact, it does not provide a full SGML parser
 | 
						|
--- it only parses SGML insofar as it is used by HTML, and the module
 | 
						|
only exists as a base for the \code{htmllib} module.
 | 
						|
\refstmodindex{htmllib}
 | 
						|
 | 
						|
In particular, the parser is hardcoded to recognize the following
 | 
						|
constructs:
 | 
						|
 | 
						|
\begin{itemize}
 | 
						|
 | 
						|
\item
 | 
						|
Opening and closing tags of the form
 | 
						|
``\code{<\var{tag} \var{attr}="\var{value}" ...>}'' and
 | 
						|
``\code{</\var{tag}>}'', respectively.
 | 
						|
 | 
						|
\item
 | 
						|
Numeric character references of the form ``\code{\&\#\var{name};}''.
 | 
						|
 | 
						|
\item
 | 
						|
Entity references of the form ``\code{\&\var{name};}''.
 | 
						|
 | 
						|
\item
 | 
						|
SGML comments of the form ``\code{<!--\var{text}-->}''.  Note that
 | 
						|
spaces, tabs, and newlines are allowed between the trailing
 | 
						|
``\code{>}'' and the immediately preceding ``\code{--}''.
 | 
						|
 | 
						|
\end{itemize}
 | 
						|
 | 
						|
The \code{SGMLParser} class must be instantiated without arguments.
 | 
						|
It has the following interface methods:
 | 
						|
 | 
						|
\renewcommand{\indexsubitem}{(SGMLParser method)}
 | 
						|
 | 
						|
\begin{funcdesc}{reset}{}
 | 
						|
Reset the instance.  Loses all unprocessed data.  This is called
 | 
						|
implicitly at instantiation time.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{setnomoretags}{}
 | 
						|
Stop processing tags.  Treat all following input as literal input
 | 
						|
(CDATA).  (This is only provided so the HTML tag \code{<PLAINTEXT>}
 | 
						|
can be implemented.)
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{setliteral}{}
 | 
						|
Enter literal mode (CDATA mode).
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{feed}{data}
 | 
						|
Feed some text to the parser.  It is processed insofar as it consists
 | 
						|
of complete elements; incomplete data is buffered until more data is
 | 
						|
fed or \code{close()} is called.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{close}{}
 | 
						|
Force processing of all buffered data as if it were followed by an
 | 
						|
end-of-file mark.  This method may be redefined by a derived class to
 | 
						|
define additional processing at the end of the input, but the
 | 
						|
redefined version should always call \code{SGMLParser.close()}.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{handle_starttag}{tag\, method\, attributes}
 | 
						|
This method is called to handle start tags for which either a
 | 
						|
\code{start_\var{tag}()} or \code{do_\var{tag}()} method has been
 | 
						|
defined.  The \code{tag} argument is the name of the tag converted to
 | 
						|
lower case, and the \code{method} argument is the bound method which
 | 
						|
should be used to support semantic interpretation of the start tag.
 | 
						|
The \var{attributes} argument is a list of (\var{name}, \var{value})
 | 
						|
pairs containing the attributes found inside the tag's \code{<>}
 | 
						|
brackets.  The \var{name} has been translated to lower case and double
 | 
						|
quotes and backslashes in the \var{value} have been interpreted.  For
 | 
						|
instance, for the tag \code{<A HREF="http://www.cwi.nl/">}, this
 | 
						|
method would be called as \code{handle_starttag('a', \var{method}, [('href',
 | 
						|
'http://www.cwi.nl/')])}.  The base implementation simply calls
 | 
						|
\code{method} with \code{attributes} as the only argument.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{handle_endtag}{tag\, method}
 | 
						|
 | 
						|
This method is called to handle end tags for which an
 | 
						|
\code{end_\var{tag}()} method has been defined.  The \code{tag}
 | 
						|
argument is the name of the tag converted to lower case, and the
 | 
						|
\code{method} argument is the bound method which should be used to
 | 
						|
support semantic interpretation of the end tag.  If no
 | 
						|
\code{end_\var{tag}()} method is defined for the closing element, this
 | 
						|
handler is not called.  The base implementation simply calls
 | 
						|
\code{method}.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{handle_data}{data}
 | 
						|
This method is called to process arbitrary data.  It is intended to be
 | 
						|
overridden by a derived class; the base class implementation does
 | 
						|
nothing.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{handle_charref}{ref}
 | 
						|
This method is called to process a character reference of the form
 | 
						|
``\code{\&\#\var{ref};}''.  In the base implementation, \var{ref} must
 | 
						|
be a decimal number in the
 | 
						|
range 0--255.  It translates the character to \ASCII{} and calls the
 | 
						|
method \code{handle_data()} with the character as argument.  If
 | 
						|
\var{ref} is invalid or out of range, the method
 | 
						|
\code{unknown_charref(\var{ref})} is called to handle the error.  A
 | 
						|
subclass must override this method to provide support for named
 | 
						|
character entities.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{handle_entityref}{ref}
 | 
						|
This method is called to process a general entity reference of the form
 | 
						|
``\code{\&\var{ref};}'' where \var{ref} is a general entity
 | 
						|
reference.  It looks for \var{ref} in the instance (or class)
 | 
						|
variable \code{entitydefs} which should be a mapping from entity names
 | 
						|
to corresponding translations.
 | 
						|
If a translation is found, it calls the method \code{handle_data()}
 | 
						|
with the translation; otherwise, it calls the method
 | 
						|
\code{unknown_entityref(\var{ref})}.  The default \code{entitydefs}
 | 
						|
defines translations for \code{\&amp;}, \code{\&apos;}, \code{\&gt;},
 | 
						|
\code{\&lt;}, and \code{\&quot;}.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{handle_comment}{comment}
 | 
						|
This method is called when a comment is encountered.  The
 | 
						|
\code{comment} argument is a string containing the text between the
 | 
						|
``\code{<!--}'' and ``\code{-->}'' delimiters, but not the delimiters
 | 
						|
themselves.  For example, the comment ``\code{<!--text-->}'' will
 | 
						|
cause this method to be called with the argument \code{'text'}.  The
 | 
						|
default method does nothing.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{report_unbalanced}{tag}
 | 
						|
This method is called when an end tag is found which does not
 | 
						|
correspond to any open element.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{unknown_starttag}{tag\, attributes}
 | 
						|
This method is called to process an unknown start tag.  It is intended
 | 
						|
to be overridden by a derived class; the base class implementation
 | 
						|
does nothing.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{unknown_endtag}{tag}
 | 
						|
This method is called to process an unknown end tag.  It is intended
 | 
						|
to be overridden by a derived class; the base class implementation
 | 
						|
does nothing.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{unknown_charref}{ref}
 | 
						|
This method is called to process unresolvable numeric character
 | 
						|
references.  It is intended to be overridden by a derived class; the
 | 
						|
base class implementation does nothing.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{unknown_entityref}{ref}
 | 
						|
This method is called to process an unknown entity reference.  It is
 | 
						|
intended to be overridden by a derived class; the base class
 | 
						|
implementation does nothing.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
Apart from overriding or extending the methods listed above, derived
 | 
						|
classes may also define methods of the following form to define
 | 
						|
processing of specific tags.  Tag names in the input stream are case
 | 
						|
independent; the \var{tag} occurring in method names must be in lower
 | 
						|
case:
 | 
						|
 | 
						|
\begin{funcdesc}{start_\var{tag}}{attributes}
 | 
						|
This method is called to process an opening tag \var{tag}.  It has
 | 
						|
preference over \code{do_\var{tag}()}.  The \var{attributes} argument
 | 
						|
has the same meaning as described for \code{handle_starttag()} above.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{do_\var{tag}}{attributes}
 | 
						|
This method is called to process an opening tag \var{tag} that does
 | 
						|
not come with a matching closing tag.  The \var{attributes} argument
 | 
						|
has the same meaning as described for \code{handle_starttag()} above.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{end_\var{tag}}{}
 | 
						|
This method is called to process a closing tag \var{tag}.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
Note that the parser maintains a stack of open elements for which no
 | 
						|
end tag has been found yet.  Only tags processed by
 | 
						|
\code{start_\var{tag}()} are pushed on this stack.  Definition of an
 | 
						|
\code{end_\var{tag}()} method is optional for these tags.  For tags
 | 
						|
processed by \code{do_\var{tag}()} or by \code{unknown_starttag()}, no
 | 
						|
\code{end_\var{tag}()} method must be defined; if defined, it will not
 | 
						|
be used.  If both \code{start_\var{tag}()} and \code{do_\var{tag}()}
 | 
						|
methods exist for a tag, the \code{start_\var{tag}()} method takes
 | 
						|
precedence.
 |