mirror of
				https://github.com/python/cpython.git
				synced 2025-11-03 19:34:08 +00:00 
			
		
		
		
	namespaces with 1.5.2. Added an index entry. Fixed XML expansion: XML is the "Extensible Markup Language", not the "eXtended Markup Language".
		
			
				
	
	
		
			241 lines
		
	
	
	
		
			10 KiB
		
	
	
	
		
			TeX
		
	
	
	
	
	
			
		
		
	
	
			241 lines
		
	
	
	
		
			10 KiB
		
	
	
	
		
			TeX
		
	
	
	
	
	
\section{\module{xmllib} ---
 | 
						|
         A parser for XML documents.}
 | 
						|
\declaremodule{standard}{xmllib}
 | 
						|
\moduleauthor{Sjoerd Mullender}{Sjoerd.Mullender@cwi.nl}
 | 
						|
\sectionauthor{Sjoerd Mullender}{Sjoerd.Mullender@cwi.nl}
 | 
						|
 | 
						|
\modulesynopsis{A parser for XML documents.}
 | 
						|
 | 
						|
\index{XML}
 | 
						|
\index{Extensible Markup Language}
 | 
						|
 | 
						|
\versionchanged{1.5.2}
 | 
						|
 | 
						|
This module defines a class \class{XMLParser} which serves as the basis 
 | 
						|
for parsing text files formatted in XML (Extensible Markup Language).
 | 
						|
 | 
						|
\begin{classdesc}{XMLParser}{}
 | 
						|
The \class{XMLParser} class must be instantiated without arguments.
 | 
						|
\end{classdesc}
 | 
						|
 | 
						|
This class provides the following interface methods and instance variables:
 | 
						|
 | 
						|
\begin{memberdesc}{attributes}
 | 
						|
A mapping of element names to mappings.  The latter mapping maps
 | 
						|
attribute names that are valid for the element to the default value of 
 | 
						|
the attribute, or, if there is no default, to \code{None}.  The default
 | 
						|
value is the empty dictionary.
 | 
						|
\end{memberdesc}
 | 
						|
 | 
						|
\begin{memberdesc}{elements} 
 | 
						|
A mapping of element names to tuples.  The tuples contain a function
 | 
						|
for handling the start and end tag respectively of the element, or
 | 
						|
\code{None} if the method \method{unknown_starttag()} or
 | 
						|
\method{unknown_endtag()} is to be called.  The default value is the
 | 
						|
empty dictionary.
 | 
						|
\end{memberdesc}
 | 
						|
 | 
						|
\begin{memberdesc}{entitydefs}
 | 
						|
A mapping of entity names to their values.  The default value contains
 | 
						|
definitions for \code{'lt'}, \code{'gt'}, \code{'amp'}, \code{'quot'}, 
 | 
						|
and \code{'apos'}.
 | 
						|
\end{memberdesc}
 | 
						|
 | 
						|
\begin{methoddesc}{reset}{}
 | 
						|
Reset the instance.  Loses all unprocessed data.  This is called
 | 
						|
implicitly at instantiation time.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{setnomoretags}{}
 | 
						|
Stop processing tags.  Treat all following input as literal input
 | 
						|
(CDATA).
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{setliteral}{}
 | 
						|
Enter literal mode (CDATA mode).  This mode is automatically exited
 | 
						|
when the close tag matching the last unclosed open tag is encountered.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{feed}{data}
 | 
						|
Feed some text to the parser.  It is processed insofar as it consists
 | 
						|
of complete tags; incomplete data is buffered until more data is
 | 
						|
fed or \method{close()} is called.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{close}{}
 | 
						|
Force processing of all buffered data as if it were followed by an
 | 
						|
end-of-file mark.  This method may be redefined by a derived class to
 | 
						|
define additional processing at the end of the input, but the
 | 
						|
redefined version should always call the base class's \method{close()}.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{translate_references}{data}
 | 
						|
Translate all entity and character references in \var{data} and
 | 
						|
return the translated string.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{handle_xml}{encoding, standalone}
 | 
						|
This method is called when the \samp{<?xml ...?>} tag is processed.
 | 
						|
The arguments are the values of the encoding and standalone attributes 
 | 
						|
in the tag.  Both encoding and standalone are optional.  The values
 | 
						|
passed to \method{handle_xml()} default to \code{None} and the string
 | 
						|
\code{'no'} respectively.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{handle_doctype}{tag, data}
 | 
						|
This method is called when the \samp{<!DOCTYPE...>} tag is processed.
 | 
						|
The arguments are the name of the root element and the uninterpreted
 | 
						|
contents of the tag, starting after the white space after the name of
 | 
						|
the root element.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{handle_starttag}{tag, method, attributes}
 | 
						|
This method is called to handle start tags for which a start tag
 | 
						|
handler is defined in the instance variable \member{elements}.  The
 | 
						|
\var{tag} argument is the name of the tag, and the \var{method}
 | 
						|
argument is the function (method) which should be used to support semantic
 | 
						|
interpretation of the start tag.  The \var{attributes} argument is a
 | 
						|
dictionary of attributes, the key being the \var{name} and the value
 | 
						|
being the \var{value} of the attribute found inside the tag's
 | 
						|
\code{<>} brackets.  Character and entity references in the
 | 
						|
\var{value} have been interpreted.  For instance, for the start tag
 | 
						|
\code{<A HREF="http://www.cwi.nl/">}, this method would be called as
 | 
						|
\code{handle_starttag('A', self.elements['A'][0], \{'HREF': 'http://www.cwi.nl/'\})}.
 | 
						|
The base implementation simply calls \var{method} with \var{attributes}
 | 
						|
as the only argument.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{handle_endtag}{tag, method}
 | 
						|
This method is called to handle end tags for which an end tag handler
 | 
						|
is defined in the instance variable \member{elements}.  The \var{tag}
 | 
						|
argument is the name of the tag, and the \var{method} argument is the
 | 
						|
function (method) which should be used to support semantic
 | 
						|
interpretation of the end tag.  For instance, for the end tag
 | 
						|
\code{</A>}, this method would be called as \code{handle_endtag('A',
 | 
						|
self.elements['A'][1])}.  The base implementation simply calls
 | 
						|
\var{method}.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{handle_data}{data}
 | 
						|
This method is called to process arbitrary data.  It is intended to be
 | 
						|
overridden by a derived class; the base class implementation does
 | 
						|
nothing.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{handle_charref}{ref}
 | 
						|
This method is called to process a character reference of the form
 | 
						|
\samp{\&\#\var{ref};}.  \var{ref} can either be a decimal number,
 | 
						|
or a hexadecimal number when preceded by an \character{x}.
 | 
						|
In the base implementation, \var{ref} must be a number in the
 | 
						|
range 0--255.  It translates the character to \ASCII{} and calls the
 | 
						|
method \method{handle_data()} with the character as argument.  If
 | 
						|
\var{ref} is invalid or out of range, the method
 | 
						|
\code{unknown_charref(\var{ref})} is called to handle the error.  A
 | 
						|
subclass must override this method to provide support for character
 | 
						|
references outside of the \ASCII{} range.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{handle_entityref}{ref}
 | 
						|
This method is called to process a general entity reference of the
 | 
						|
form \samp{\&\var{ref};} where \var{ref} is a general entity
 | 
						|
reference.  It looks for \var{ref} in the instance (or class)
 | 
						|
variable \member{entitydefs} which should be a mapping from entity
 | 
						|
names to corresponding translations.
 | 
						|
If a translation is found, it calls the method \method{handle_data()}
 | 
						|
with the translation; otherwise, it calls the method
 | 
						|
\code{unknown_entityref(\var{ref})}.  The default \member{entitydefs}
 | 
						|
defines translations for \code{\&amp;}, \code{\&apos;}, \code{\&gt;},
 | 
						|
\code{\&lt;}, and \code{\&quot;}.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{handle_comment}{comment}
 | 
						|
This method is called when a comment is encountered.  The
 | 
						|
\var{comment} argument is a string containing the text between the
 | 
						|
\samp{<!--} and \samp{-->} delimiters, but not the delimiters
 | 
						|
themselves.  For example, the comment \samp{<!--text-->} will
 | 
						|
cause this method to be called with the argument \code{'text'}.  The
 | 
						|
default method does nothing.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{handle_cdata}{data}
 | 
						|
This method is called when a CDATA element is encountered.  The
 | 
						|
\var{data} argument is a string containing the text between the
 | 
						|
\samp{<![CDATA[} and \samp{]]>} delimiters, but not the delimiters
 | 
						|
themselves.  For example, the entity \samp{<![CDATA[text]]>} will
 | 
						|
cause this method to be called with the argument \code{'text'}.  The
 | 
						|
default method does nothing, and is intended to be overridden.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{handle_proc}{name, data}
 | 
						|
This method is called when a processing instruction (PI) is
 | 
						|
encountered.  The \var{name} is the PI target, and the \var{data}
 | 
						|
argument is a string containing the text between the PI target and the
 | 
						|
closing delimiter, but not the delimiter itself.  For example, the
 | 
						|
instruction \samp{<?XML text?>} will cause this method to be called
 | 
						|
with the arguments \code{'XML'} and \code{'text'}.  The default method
 | 
						|
does nothing.  Note that if a document starts with \samp{<?xml
 | 
						|
...?>}, \method{handle_xml()} is called to handle it.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{handle_special}{data}
 | 
						|
This method is called when a declaration is encountered.  The
 | 
						|
\var{data} argument is a string containing the text between the
 | 
						|
\samp{<!} and \samp{>} delimiters, but not the delimiters
 | 
						|
themselves.  For example, the entity \samp{<!ENTITY text>} will
 | 
						|
cause this method to be called with the argument \code{'ENTITY text'}.  The
 | 
						|
default method does nothing.  Note that \samp{<!DOCTYPE ...>} is
 | 
						|
handled separately if it is located at the start of the document.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{syntax_error}{message}
 | 
						|
This method is called when a syntax error is encountered.  The
 | 
						|
\var{message} is a description of what was wrong.  The default method 
 | 
						|
raises a \exception{RuntimeError} exception.  If this method is
 | 
						|
overridden, it is permissible for it to return.  This method is only
 | 
						|
called when the error can be recovered from.  Unrecoverable errors
 | 
						|
raise a \exception{RuntimeError} without first calling
 | 
						|
\method{syntax_error()}.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{unknown_starttag}{tag, attributes}
 | 
						|
This method is called to process an unknown start tag.  It is intended
 | 
						|
to be overridden by a derived class; the base class implementation
 | 
						|
does nothing.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{unknown_endtag}{tag}
 | 
						|
This method is called to process an unknown end tag.  It is intended
 | 
						|
to be overridden by a derived class; the base class implementation
 | 
						|
does nothing.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{unknown_charref}{ref}
 | 
						|
This method is called to process unresolvable numeric character
 | 
						|
references.  It is intended to be overridden by a derived class; the
 | 
						|
base class implementation does nothing.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\begin{methoddesc}{unknown_entityref}{ref}
 | 
						|
This method is called to process an unknown entity reference.  It is
 | 
						|
intended to be overridden by a derived class; the base class
 | 
						|
implementation does nothing.
 | 
						|
\end{methoddesc}
 | 
						|
 | 
						|
\subsection{XML Namespaces}
 | 
						|
 | 
						|
This module has support for XML namespaces as defined in the XML
 | 
						|
Namespaces proposed recommendation.
 | 
						|
 | 
						|
Tag and attribute names that are defined in an XML namespace are
 | 
						|
handled as if the name of the tag or attribute consisted of the
 | 
						|
namespace (i.e.\ the URL that defines the namespace) followed by a
 | 
						|
space and the name of the tag or attribute.  For instance, the tag
 | 
						|
\code{<html xmlns='http://www.w3.org/TR/REC-html40'>} is treated as if 
 | 
						|
the tag name were \code{'http://www.w3.org/TR/REC-html40 html'}, and
 | 
						|
the tag \code{<html:a href='http://frob.com'>} inside the above
 | 
						|
mentioned element is treated as if the tag name were
 | 
						|
\code{'http://www.w3.org/TR/REC-html40 a'} and the attribute name as
 | 
						|
if it were \code{'http://www.w3.org/TR/REC-html40 href'}.
 | 
						|
 | 
						|
An older draft of the XML Namespaces proposal is also recognized, but
 | 
						|
triggers a warning.
 |