mirror of https://github.com/python/cpython.git
357 lines
12 KiB
TeX
357 lines
12 KiB
TeX
\section{\module{xml.parsers.expat} ---
|
|
Fast XML parsing using Expat}
|
|
|
|
\declaremodule{standard}{xml.parsers.expat}
|
|
\modulesynopsis{An interface to the Expat non-validating XML parser.}
|
|
\moduleauthor{Paul Prescod}{paul@prescod.net}
|
|
\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
|
|
|
|
\versionadded{2.0}
|
|
|
|
The \module{xml.parsers.expat} module is a Python interface to the
|
|
Expat\index{Expat} non-validating XML parser.
|
|
The module provides a single extension type, \class{xmlparser}, that
|
|
represents the current state of an XML parser. After an
|
|
\class{xmlparser} object has been created, various attributes of the object
|
|
can be set to handler functions. When an XML document is then fed to
|
|
the parser, the handler functions are called for the character data
|
|
and markup in the XML document.
|
|
|
|
This module uses the \module{pyexpat}\refbimodindex{pyexpat} module to
|
|
provide access to the Expat parser. Direct use of the
|
|
\module{pyexpat} module is deprecated.
|
|
|
|
This module provides one exception and one type object:
|
|
|
|
\begin{excdesc}{error}
|
|
The exception raised when Expat reports an error.
|
|
\end{excdesc}
|
|
|
|
\begin{datadesc}{XMLParserType}
|
|
The type of the return values from the \function{ParserCreate()}
|
|
function.
|
|
\end{datadesc}
|
|
|
|
|
|
The \module{xml.parsers.expat} module contains two functions:
|
|
|
|
\begin{funcdesc}{ErrorString}{errno}
|
|
Returns an explanatory string for a given error number \var{errno}.
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{ParserCreate}{\optional{encoding\optional{,
|
|
namespace_separator}}}
|
|
Creates and returns a new \class{xmlparser} object.
|
|
\var{encoding}, if specified, must be a string naming the encoding
|
|
used by the XML data. Expat doesn't support as many encodings as
|
|
Python does, and its repertoire of encodings can't be extended; it
|
|
supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII.
|
|
|
|
Expat can optionally do XML namespace processing for you, enabled by
|
|
providing a value for \var{namespace_separator}. The value must be a
|
|
one-character string; a \exception{ValueError} will be raised if the
|
|
string has an illegal length (\code{None} is considered the same as
|
|
omission). When namespace processing is enabled, element type names
|
|
and attribute names that belong to a namespace will be expanded. The
|
|
element name passed to the element handlers
|
|
\function{StartElementHandler()} and \function{EndElementHandler()}
|
|
will be the concatenation of the namespace URI, the namespace
|
|
separator character, and the local part of the name. If the namespace
|
|
separator is a zero byte (\code{chr(0)}) then the namespace URI and
|
|
the local part will be concatenated without any separator.
|
|
|
|
For example, if \var{namespace_separator} is set to a space character
|
|
(\character{ }) and the following document is parsed:
|
|
|
|
\begin{verbatim}
|
|
<?xml version="1.0"?>
|
|
<root xmlns = "http://default-namespace.org/"
|
|
xmlns:py = "http://www.python.org/ns/">
|
|
<py:elem1 />
|
|
<elem2 xmlns="" />
|
|
</root>
|
|
\end{verbatim}
|
|
|
|
\function{StartElementHandler()} will receive the following strings
|
|
for each element:
|
|
|
|
\begin{verbatim}
|
|
http://default-namespace.org/ root
|
|
http://www.python.org/ns/ elem1
|
|
elem2
|
|
\end{verbatim}
|
|
\end{funcdesc}
|
|
|
|
|
|
\subsection{XMLParser Objects \label{xmlparser-objects}}
|
|
|
|
\class{xmlparser} objects have the following methods:
|
|
|
|
\begin{methoddesc}[xmlparser]{Parse}{data\optional{, isfinal}}
|
|
Parses the contents of the string \var{data}, calling the appropriate
|
|
handler functions to process the parsed data. \var{isfinal} must be
|
|
true on the final call to this method. \var{data} can be the empty
|
|
string at any time.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{ParseFile}{file}
|
|
Parse XML data reading from the object \var{file}. \var{file} only
|
|
needs to provide the \method{read(\var{nbytes})} method, returning the
|
|
empty string when there's no more data.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{SetBase}{base}
|
|
Sets the base to be used for resolving relative URIs in system identifiers in
|
|
declarations. Resolving relative identifiers is left to the application:
|
|
this value will be passed through as the base argument to the
|
|
\function{ExternalEntityRefHandler()}, \function{NotationDeclHandler()},
|
|
and \function{UnparsedEntityDeclHandler()} functions.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{GetBase}{}
|
|
Returns a string containing the base set by a previous call to
|
|
\method{SetBase()}, or \code{None} if
|
|
\method{SetBase()} hasn't been called.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{ExternalEntityParserCreate}{context\optional{,
|
|
encoding}}
|
|
Create a ``child'' parser which can be used to parse an external
|
|
parsed entity referred to by content parsed by the parent parser. The
|
|
\var{context} parameter should be the string passed to the
|
|
\method{ExternalEntityRefHandler()} handler function, described below.
|
|
\end{methoddesc}
|
|
|
|
|
|
\class{xmlparser} objects have the following attributes:
|
|
|
|
\begin{memberdesc}[xmlparser]{returns_unicode}
|
|
If this attribute is set to 1, the handler functions will be passed
|
|
Unicode strings. If \member{returns_unicode} is 0, 8-bit strings
|
|
containing UTF-8 encoded data will be passed to the handlers.
|
|
\versionchanged[Can be changed at any time to affect the result
|
|
type.]{1.6}
|
|
\end{memberdesc}
|
|
|
|
The following attributes contain values relating to the most recent
|
|
error encountered by an \class{xmlparser} object, and will only have
|
|
correct values once a call to \method{Parse()} or \method{ParseFile()}
|
|
has raised a \exception{xml.parsers.expat.error} exception.
|
|
|
|
\begin{memberdesc}[xmlparser]{ErrorByteIndex}
|
|
Byte index at which an error occurred.
|
|
\end{memberdesc}
|
|
|
|
\begin{memberdesc}[xmlparser]{ErrorCode}
|
|
Numeric code specifying the problem. This value can be passed to the
|
|
\function{ErrorString()} function, or compared to one of the constants
|
|
defined in the \code{errors} object.
|
|
\end{memberdesc}
|
|
|
|
\begin{memberdesc}[xmlparser]{ErrorColumnNumber}
|
|
Column number at which an error occurred.
|
|
\end{memberdesc}
|
|
|
|
\begin{memberdesc}[xmlparser]{ErrorLineNumber}
|
|
Line number at which an error occurred.
|
|
\end{memberdesc}
|
|
|
|
Here is the list of handlers that can be set. To set a handler on an
|
|
\class{xmlparser} object \var{o}, use
|
|
\code{\var{o}.\var{handlername} = \var{func}}. \var{handlername} must
|
|
be taken from the following list, and \var{func} must be a callable
|
|
object accepting the correct number of arguments. The arguments are
|
|
all strings, unless otherwise stated.
|
|
|
|
\begin{methoddesc}[xmlparser]{StartElementHandler}{name, attributes}
|
|
Called for the start of every element. \var{name} is a string
|
|
containing the element name, and \var{attributes} is a dictionary
|
|
mapping attribute names to their values.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{EndElementHandler}{name}
|
|
Called for the end of every element.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{ProcessingInstructionHandler}{target, data}
|
|
Called for every processing instruction.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{CharacterDataHandler}{data}
|
|
Called for character data.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{UnparsedEntityDeclHandler}{entityName, base,
|
|
systemId, publicId,
|
|
notationName}
|
|
Called for unparsed (NDATA) entity declarations.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{NotationDeclHandler}{notationName, base,
|
|
systemId, publicId}
|
|
Called for notation declarations.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{StartNamespaceDeclHandler}{prefix, uri}
|
|
Called when an element contains a namespace declaration.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{EndNamespaceDeclHandler}{prefix}
|
|
Called when the closing tag is reached for an element
|
|
that contained a namespace declaration.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{CommentHandler}{data}
|
|
Called for comments.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{StartCdataSectionHandler}{}
|
|
Called at the start of a CDATA section.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{EndCdataSectionHandler}{}
|
|
Called at the end of a CDATA section.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{DefaultHandler}{data}
|
|
Called for any characters in the XML document for
|
|
which no applicable handler has been specified. This means
|
|
characters that are part of a construct which could be reported, but
|
|
for which no handler has been supplied.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{DefaultHandlerExpand}{data}
|
|
This is the same as the \function{DefaultHandler()},
|
|
but doesn't inhibit expansion of internal entities.
|
|
The entity reference will not be passed to the default handler.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{NotStandaloneHandler}{}
|
|
Called if the XML document hasn't been declared as being a standalone
|
|
document.
|
|
\end{methoddesc}
|
|
|
|
\begin{methoddesc}[xmlparser]{ExternalEntityRefHandler}{context, base,
|
|
systemId, publicId}
|
|
Called for references to external entities.
|
|
\end{methoddesc}
|
|
|
|
|
|
\subsection{Example \label{expat-example}}
|
|
|
|
The following program defines three handlers that just print out their
|
|
arguments.
|
|
|
|
\begin{verbatim}
|
|
import xml.parsers.expat
|
|
|
|
# 3 handler functions
|
|
def start_element(name, attrs):
|
|
print 'Start element:', name, attrs
|
|
def end_element(name):
|
|
print 'End element:', name
|
|
def char_data(data):
|
|
print 'Character data:', repr(data)
|
|
|
|
p = xml.parsers.expat.ParserCreate()
|
|
|
|
p.StartElementHandler = start_element
|
|
p.EndElementHandler = end_element
|
|
p.CharacterDataHandler = char_data
|
|
|
|
p.Parse("""<?xml version="1.0"?>
|
|
<parent id="top"><child1 name="paul">Text goes here</child1>
|
|
<child2 name="fred">More text</child2>
|
|
</parent>""")
|
|
\end{verbatim}
|
|
|
|
The output from this program is:
|
|
|
|
\begin{verbatim}
|
|
Start element: parent {'id': 'top'}
|
|
Start element: child1 {'name': 'paul'}
|
|
Character data: 'Text goes here'
|
|
End element: child1
|
|
Character data: '\012'
|
|
Start element: child2 {'name': 'fred'}
|
|
Character data: 'More text'
|
|
End element: child2
|
|
Character data: '\012'
|
|
End element: parent
|
|
\end{verbatim}
|
|
|
|
|
|
\subsection{Expat error constants \label{expat-errors}}
|
|
\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
|
|
|
|
The following table lists the error constants in the
|
|
\code{errors} object of the \module{xml.parsers.expat} module. These
|
|
constants are useful in interpreting some of the attributes of the
|
|
parser object after an error has occurred.
|
|
|
|
The \code{errors} object has the following attributes:
|
|
|
|
\begin{datadesc}{XML_ERROR_ASYNC_ENTITY}
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF}
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_BAD_CHAR_REF}
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_BINARY_ENTITY_REF}
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_DUPLICATE_ATTRIBUTE}
|
|
An attribute was used more than once in a start tag.
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_INCORRECT_ENCODING}
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_INVALID_TOKEN}
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_JUNK_AFTER_DOC_ELEMENT}
|
|
Something other than whitespace occurred after the document element.
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_MISPLACED_XML_PI}
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_NO_ELEMENTS}
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_NO_MEMORY}
|
|
Expat was not able to allocate memory internally.
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_PARAM_ENTITY_REF}
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_PARTIAL_CHAR}
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_RECURSIVE_ENTITY_REF}
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_SYNTAX}
|
|
Some unspecified syntax error was encountered.
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_TAG_MISMATCH}
|
|
An end tag did not match the innermost open start tag.
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_UNCLOSED_TOKEN}
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_UNDEFINED_ENTITY}
|
|
A reference was made to an entity which was not defined.
|
|
\end{datadesc}
|
|
|
|
\begin{datadesc}{XML_ERROR_UNKNOWN_ENCODING}
|
|
The document encoding is not supported by Expat.
|
|
\end{datadesc}
|