mirror of https://github.com/python/cpython.git
263 lines
8.9 KiB
TeX
263 lines
8.9 KiB
TeX
\section{\module{pyexpat} ---
|
|
Fast XML parsing using the Expat C library}
|
|
|
|
\declaremodule{builtin}{pyexpat}
|
|
\modulesynopsis{An interface to the Expat XML parser.}
|
|
\moduleauthor{Paul Prescod}{paul@prescod.net}
|
|
\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
|
|
|
|
The \module{pyexpat} module is a Python interface to the Expat
|
|
non-validating XML parser.
|
|
The module provides a single extension type, \class{xmlparser}, that
|
|
represents the current state of an XML parser. After an
|
|
\class{xmlparser} object has been created, various attributes of the object
|
|
can be set to handler functions. When an XML document is then fed to
|
|
the parser, the handler functions are called for the character data
|
|
and markup in the XML document.
|
|
|
|
The \module{pyexpat} module contains two functions:
|
|
|
|
\begin{funcdesc}{ErrorString}{errno}
|
|
Returns an explanatory string for a given error number \var{errno}.
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{ParserCreate}{\optional{encoding\optional{, namespace_separator}}}
|
|
Creates and returns a new \class{xmlparser} object.
|
|
\var{encoding}, if specified, must be a string naming the encoding
|
|
used by the XML data. Expat doesn't support as many encodings as
|
|
Python does, and its repertoire of encodings can't be extended; it
|
|
supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII.
|
|
|
|
% XXX pyexpat.c should only allow a 1-char string for this parameter
|
|
Expat can optionally do XML namespace processing for you, enabled by
|
|
providing a value for \var{namespace_separator}. When namespace
|
|
processing is enabled, element type names and attribute names that
|
|
belong to a namespace will be expanded. The element name
|
|
passed to the element handlers
|
|
\function{StartElementHandler()} and \function{EndElementHandler()}
|
|
will be the concatenation of the namespace URI, the namespace
|
|
separator character, and the local part of the name. If the namespace
|
|
separator is a zero byte (\code{chr(0)}),
|
|
then the namespace URI and the local part will be
|
|
concatenated without any separator.
|
|
|
|
For example, if \var{namespace_separator} is set to
|
|
\samp{ }, and the following document is parsed:
|
|
|
|
\begin{verbatim}
|
|
<?xml version="1.0"?>
|
|
<root xmlns = "http://default-namespace.org/"
|
|
xmlns:py = "http://www.python.org/ns/">
|
|
<py:elem1 />
|
|
<elem2 xmlns="" />
|
|
</root>
|
|
\end{verbatim}
|
|
|
|
\function{StartElementHandler()} will receive the following strings for each element:
|
|
|
|
\begin{verbatim}
|
|
http://default-namespace.org/ root
|
|
http://www.python.org/ns/ elem1
|
|
elem2
|
|
\end{verbatim}
|
|
|
|
\end{funcdesc}
|
|
|
|
\class{xmlparser} objects have the following methods:
|
|
|
|
\begin{methoddesc}{Parse}{data \optional{, isfinal}}
|
|
Parses the contents of the string \var{data}, calling the appropriate
|
|
handler functions to process the parsed data. \var{isfinal} must be
|
|
true on the final call to this method. \var{data} can be the empty string at any time.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{ParseFile}{file}
|
|
Parses XML data read from the object \var{file}.  \var{file} only
|
|
needs to provide the \method{read(\var{nbytes})} method, returning the
|
|
empty string when there's no more data.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{SetBase}{base}
|
|
Sets the base to be used for resolving relative URIs in system identifiers in
|
|
declarations. Resolving relative identifiers is left to the application:
|
|
this value will be passed through as the base argument to the
|
|
\function{ExternalEntityRefHandler}, \function{NotationDeclHandler},
|
|
and \function{UnparsedEntityDeclHandler} functions.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{GetBase}{}
|
|
Returns a string containing the base set by a previous call to
|
|
\method{SetBase()}, or \code{None} if
|
|
\method{SetBase()} hasn't been called.
|
|
\end{methoddesc}
|
|
|
|
\class{xmlparser} objects have the following attributes, containing
|
|
values relating to the most recent error encountered by an
|
|
\class{xmlparser} object. These attributes will only have correct
|
|
values once a call to \method{Parse()} or \method{ParseFile()}
|
|
has raised a \exception{pyexpat.error} exception.
|
|
|
|
\begin{datadesc}{ErrorByteIndex}
|
|
Byte index at which an error occurred.
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{ErrorCode}
|
|
Numeric code specifying the problem. This value can be passed to the
|
|
\function{ErrorString()} function, or compared to one of the constants
|
|
defined in the \module{pyexpat.errors} submodule.
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{ErrorColumnNumber}
|
|
Column number at which an error occurred.
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{ErrorLineNumber}
|
|
Line number at which an error occurred.
|
|
\end{datadesc}
|
|
|
|
Here is the list of handlers that can be set. To set a handler on an
|
|
\class{xmlparser} object \var{o}, use \code{\var{o}.\var{handlername} = \var{func}}. \var{handlername} must be taken from the following list, and \var{func} must be a callable object accepting the correct number of arguments. The arguments are all strings, unless otherwise stated.
|
|
|
|
\begin{methoddesc}{StartElementHandler}{name, attributes}
|
|
Called for the start of every element. \var{name} is a string
|
|
containing the element name, and \var{attributes} is a dictionary
|
|
mapping attribute names to their values.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{EndElementHandler}{name}
|
|
Called for the end of every element.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{ProcessingInstructionHandler}{target, data}
|
|
Called for every processing instruction.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{CharacterDataHandler}{data}
|
|
Called for character data.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{UnparsedEntityDeclHandler}{entityName, base, systemId, publicId, notationName}
|
|
Called for unparsed (NDATA) entity declarations.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{NotationDeclHandler}{notationName, base, systemId, publicId}
|
|
Called for notation declarations.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{StartNamespaceDeclHandler}{prefix, uri}
|
|
Called when an element contains a namespace declaration.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{EndNamespaceDeclHandler}{prefix}
|
|
Called when the closing tag is reached for an element
|
|
that contained a namespace declaration.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{CommentHandler}{data}
|
|
Called for comments.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{StartCdataSectionHandler}{}
|
|
Called at the start of a CDATA section.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{EndCdataSectionHandler}{}
|
|
Called at the end of a CDATA section.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{DefaultHandler}{data}
|
|
Called for any characters in the XML document for
|
|
which no applicable handler has been specified. This means
|
|
characters that are part of a construct which could be reported, but
|
|
for which no handler has been supplied.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{DefaultHandlerExpand}{data}
|
|
This is the same as the \function{DefaultHandler},
|
|
but doesn't inhibit expansion of internal entities.
|
|
The entity reference will not be passed to the default handler.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{NotStandaloneHandler}{}
|
|
Called if the XML document hasn't been declared as being a standalone document.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}{ExternalEntityRefHandler}{context, base, systemId, publicId}
|
|
Called for references to external entities.
|
|
\end{methoddesc}
|
|
|
|
|
|
|
|
|
|
|
|
\subsection{\module{pyexpat.errors} --- Error constants}
|
|
|
|
The following table lists the error constants in the
|
|
\module{pyexpat.errors} submodule, available once the \module{pyexpat} module has been imported.
|
|
|
|
\begin{tableii}{l|l}{code}{Constants}{}{}
|
|
\lineii {XML_ERROR_ASYNC_ENTITY}
|
|
{XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF}
|
|
\lineii {XML_ERROR_BAD_CHAR_REF}
|
|
{XML_ERROR_BINARY_ENTITY_REF}
|
|
\lineii {XML_ERROR_DUPLICATE_ATTRIBUTE}
|
|
{XML_ERROR_INCORRECT_ENCODING}
|
|
\lineii {XML_ERROR_INVALID_TOKEN}
|
|
{XML_ERROR_JUNK_AFTER_DOC_ELEMENT}
|
|
\lineii {XML_ERROR_MISPLACED_XML_PI}
|
|
{XML_ERROR_NO_ELEMENTS}
|
|
\lineii {XML_ERROR_NO_MEMORY}
|
|
{XML_ERROR_PARAM_ENTITY_REF}
|
|
\lineii {XML_ERROR_PARTIAL_CHAR}
|
|
{XML_ERROR_RECURSIVE_ENTITY_REF}
|
|
\lineii {XML_ERROR_SYNTAX}
|
|
{XML_ERROR_TAG_MISMATCH}
|
|
\lineii {XML_ERROR_UNCLOSED_TOKEN}
|
|
{XML_ERROR_UNDEFINED_ENTITY}
|
|
\lineii {XML_ERROR_UNKNOWN_ENCODING}{}
|
|
\end{tableii}
|
|
|
|
\subsection{Example}
|
|
|
|
The following program defines three handlers that just print out their
|
|
arguments.
|
|
|
|
\begin{verbatim}
|
|
|
|
import pyexpat
|
|
|
|
# 3 handler functions
|
|
def start_element(name, attrs):
|
|
print 'Start element:', name, attrs
|
|
def end_element(name):
|
|
print 'End element:', name
|
|
def char_data(data):
|
|
print 'Character data:', repr(data)
|
|
|
|
p=pyexpat.ParserCreate()
|
|
|
|
p.StartElementHandler = start_element
|
|
p.EndElementHandler = end_element
|
|
p.CharacterDataHandler= char_data
|
|
|
|
p.Parse("""<?xml version="1.0"?>
|
|
<parent id="top"><child1 name="paul">Text goes here</child1>
|
|
<child2 name="fred">More text</child2>
|
|
</parent>""", 1)
|
|
\end{verbatim}
|
|
|
|
The output from this program is:
|
|
|
|
\begin{verbatim}
|
|
Start element: parent {'id': 'top'}
|
|
Start element: child1 {'name': 'paul'}
|
|
Character data: 'Text goes here'
|
|
End element: child1
|
|
Character data: '\012'
|
|
Start element: child2 {'name': 'fred'}
|
|
Character data: 'More text'
|
|
End element: child2
|
|
Character data: '\012'
|
|
End element: parent
|
|
\end{verbatim}
|