Implemented posix-mode parsing support in shlex.py, as discussed in the
mailing list, and in patch #722686.
Gustavo Niemeyer 2003-04-17 21:31:33 +00:00
parent 84c2b1b9aa
commit 68d8cef89a
4 changed files with 416 additions and 47 deletions

Doc/lib/libshlex.tex

@@ -4,26 +4,16 @@
 \declaremodule{standard}{shlex}
 \modulesynopsis{Simple lexical analysis for \UNIX\ shell-like languages.}
 \moduleauthor{Eric S. Raymond}{esr@snark.thyrsus.com}
+\moduleauthor{Gustavo Niemeyer}{niemeyer@conectiva.com}
 \sectionauthor{Eric S. Raymond}{esr@snark.thyrsus.com}
+\sectionauthor{Gustavo Niemeyer}{niemeyer@conectiva.com}

 \versionadded{1.5.2}

 The \class{shlex} class makes it easy to write lexical analyzers for
 simple syntaxes resembling that of the \UNIX{} shell.  This will often
-be useful for writing minilanguages, e.g.\ in run control files for
-Python applications.
+be useful for writing minilanguages (e.g.\ in run control files for
+Python applications) or for parsing quoted strings.

-\begin{classdesc}{shlex}{\optional{stream\optional{, file}}}
-A \class{shlex} instance or subclass instance is a lexical analyzer
-object.  The initialization argument, if present, specifies where to
-read characters from.  It must be a file- or stream-like object with
-\method{read()} and \method{readline()} methods.  If no argument is given,
-input will be taken from \code{sys.stdin}.  The second optional
-argument is a filename string, which sets the initial value of the
-\member{infile} member.  If the stream argument is omitted or
-equal to \code{sys.stdin}, this second argument defaults to ``stdin''.
-\end{classdesc}

 \begin{seealso}
   \seemodule{ConfigParser}{Parser for configuration files similar to the

@@ -31,16 +21,50 @@ equal to \code{sys.stdin}, this second argument defaults to ``stdin''.
 \end{seealso}

+\subsection{Module Contents}
+
+The \module{shlex} module defines the following function:
+
+\begin{funcdesc}{split}{s\optional{, posix=\code{True}\optional{,
+                        spaces=\code{True}}}}
+Split the string \var{s} using shell-like syntax.  If \var{posix} is
+\code{True}, operate in posix mode.  If \var{spaces} is \code{True},
+words are split only at whitespace (this sets the
+\member{whitespace_split} member of the \class{shlex} instance).
+\versionadded{2.3}
+\end{funcdesc}
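For illustration, the new helper can be exercised roughly like this (a
sketch based on the behaviour documented above; the exact results assume
the default posix and spaces settings introduced by this patch):

    import shlex

    # posix mode (the default): quotes are stripped, words split on whitespace
    print shlex.split('ls -l "My Documents"')
    # -> ['ls', '-l', 'My Documents']

    # compatibility (non-posix) mode keeps the quote characters in the token
    print shlex.split('a "b c"', posix=0)
    # -> ['a', '"b c"']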
+The \module{shlex} module defines the following class:
+
+\begin{classdesc}{shlex}{\optional{instream=\code{sys.stdin}\optional{,
+                         infile=\code{None}\optional{,
+                         posix=\code{False}}}}}
+A \class{shlex} instance or subclass instance is a lexical analyzer
+object.  The initialization argument, if present, specifies where to
+read characters from.  It must be a file- or stream-like object with
+\method{read()} and \method{readline()} methods, or a string (strings
+are accepted since Python 2.3).  If no argument is given, input will be
+taken from \code{sys.stdin}.  The second optional argument is a filename
+string, which sets the initial value of the \member{infile} member.  If
+the \var{instream} argument is omitted or equal to \code{sys.stdin},
+this second argument defaults to ``stdin''.  The \var{posix} argument
+was introduced in Python 2.3 and defines the operational mode.  When
+\var{posix} is not true (the default), the \class{shlex} instance will
+operate in compatibility mode.  When operating in posix mode,
+\class{shlex} will try to be as close as possible to the posix shell
+parsing rules.  See section~\ref{shlex-objects}.
+\end{classdesc}
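A minimal sketch of the new constructor behaviour (string input and the
posix flag, both described above):

    import shlex

    # a plain string may now be passed instead of a file-like object
    lexer = shlex.shlex('echo "hello world"', posix=1)
    print lexer.get_token()   # -> 'echo'
    print lexer.get_token()   # -> 'hello world'  (quotes stripped in posix mode)
    print lexer.get_token()   # -> None, i.e. lexer.eof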
 \subsection{shlex Objects \label{shlex-objects}}

 A \class{shlex} instance has the following methods:

 \begin{methoddesc}{get_token}{}
 Return a token.  If tokens have been stacked using
 \method{push_token()}, pop a token off the stack.  Otherwise, read one
 from the input stream.  If reading encounters an immediate
-end-of-file, an empty string is returned.
+end-of-file, \member{eof} is returned (\code{""} in non-posix mode,
+and \code{None} in posix mode).
 \end{methoddesc}
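A typical read loop therefore compares against the instance's eof
attribute rather than a literal empty string (a sketch):

    import shlex

    lexer = shlex.shlex('one two three', posix=1)
    token = lexer.get_token()
    while token != lexer.eof:     # None in posix mode, '' otherwise
        print token
        token = lexer.get_token()

With the iterator interface added by this patch, the same traversal can
also be written as a plain for loop over the lexer.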
 \begin{methoddesc}{push_token}{str}

@@ -132,6 +156,12 @@ bounds tokens.  By default, includes space, tab, linefeed and
 carriage-return.
 \end{memberdesc}

+\begin{memberdesc}{escape}
+Characters that will be considered as escape characters.  This is only
+used in posix mode, and includes just \character{\textbackslash} by
+default.
+\versionadded{2.3}
+\end{memberdesc}

 \begin{memberdesc}{quotes}
 Characters that will be considered string quotes.  The token
 accumulates until the same quote is encountered again (thus, different
@@ -139,6 +169,20 @@ quote types protect each other as in the shell.)  By default, includes
 \ASCII{} single and double quotes.
 \end{memberdesc}

+\begin{memberdesc}{escapedquotes}
+Characters in \member{quotes} that will interpret escape characters
+defined in \member{escape}.  This is only used in posix mode, and
+includes just \character{"} by default.
+\versionadded{2.3}
+\end{memberdesc}
+
+\begin{memberdesc}{whitespace_split}
+If \code{True}, tokens will only be split at whitespace.  This is
+useful, for example, for parsing command lines with \class{shlex},
+getting tokens in a similar way to shell arguments.
+\versionadded{2.3}
+\end{memberdesc}
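The effect of whitespace_split can be seen by tokenizing a command line
twice (a sketch; the token lists assume posix mode with the behaviour
described in this patch):

    import shlex

    lexer = shlex.shlex('ls --color=auto', posix=1)
    print [tok for tok in lexer]
    # -> ['ls', '-', '-', 'color', '=', 'auto']

    lexer = shlex.shlex('ls --color=auto', posix=1)
    lexer.whitespace_split = 1
    print [tok for tok in lexer]
    # -> ['ls', '--color=auto']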
 \begin{memberdesc}{infile}
 The name of the current input file, as initially set at class
 instantiation time or stacked by later source requests.  It may

@@ -168,13 +212,6 @@ need to use this, you can read the module source code to learn the
 details.
 \end{memberdesc}

-Note that any character not declared to be a word character,
-whitespace, or a quote will be returned as a single-character token.
-Quote and comment characters are not recognized within words.  Thus,
-the bare words \samp{ain't} and \samp{ain\#t} would be returned as single
-tokens by the default parser.

 \begin{memberdesc}{lineno}
 Source line number (count of newlines seen so far plus one).
 \end{memberdesc}

@@ -183,3 +220,56 @@ Source line number (count of newlines seen so far plus one).
 The token buffer.  It may be useful to examine this when catching
 exceptions.
 \end{memberdesc}

+\begin{memberdesc}{eof}
+Token used to determine end of file.  This will be set to the empty
+string (\code{""}) in non-posix mode, and to \code{None} in posix mode.
+\versionadded{2.3}
+\end{memberdesc}
+\subsection{Parsing Rules\label{shlex-parsing-rules}}
+
+When operating in non-posix mode, \class{shlex} will try to obey the
+following rules (a short example follows the list).
+
+\begin{itemize}
+\item Quote characters are not recognized within words
+      (\code{Do"Not"Separate} is parsed as the single word
+      \code{Do"Not"Separate});
+\item Escape characters are not recognized;
+\item Enclosing characters in quotes preserves the literal value of
+      all characters within the quotes;
+\item Closing quotes separate words (\code{"Do"Separate} is parsed
+      as \code{"Do"} and \code{Separate});
+\item If \member{whitespace_split} is \code{False}, any character not
+      declared to be a word character, whitespace, or a quote will be
+      returned as a single-character token.  If it is \code{True},
+      \class{shlex} will only split words at whitespace;
+\item EOF is signaled with an empty string (\code{""});
+\item It's not possible to parse empty strings, even if quoted.
+\end{itemize}
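A short illustration of these compatibility-mode rules (a sketch using
the split() helper added by this patch):

    import shlex

    print shlex.split('Do"Not"Separate "Do"Separate', posix=0, spaces=0)
    # -> ['Do"Not"Separate', '"Do"', 'Separate']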
+When operating in posix mode, \class{shlex} will try to obey the
+following parsing rules (again, a short example follows the list).
+
+\begin{itemize}
+\item Quotes are stripped out, and do not separate words
+      (\code{"Do"Not"Separate"} is parsed as the single word
+      \code{DoNotSeparate});
+\item Non-quoted escape characters (e.g. \character{\textbackslash})
+      preserve the literal value of the next character that follows;
+\item Enclosing characters in quotes which are not part of
+      \member{escapedquotes} (e.g. \character{'}) preserves the literal
+      value of all characters within the quotes;
+\item Enclosing characters in quotes which are part of
+      \member{escapedquotes} (e.g. \character{"}) preserves the literal
+      value of all characters within the quotes, with the exception of
+      the characters mentioned in \member{escape}.  The escape character
+      retains its special meaning only when followed by the quote in use
+      or the escape character itself; otherwise it is considered a
+      normal character;
+\item EOF is signaled with a \code{None} value;
+\item Quoted empty strings (\code{""}) are allowed.
+\end{itemize}
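And the corresponding posix-mode behaviour (again only a sketch; note
the stripped quotes, the honoured escape, and the quoted empty string):

    import shlex

    print shlex.split(r'"Do"Not"Separate" a\ b ""')
    # -> ['DoNotSeparate', 'a b', '']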

Lib/shlex.py

@@ -1,28 +1,51 @@
+# -*- coding: iso-8859-1 -*-
 """A lexical analyzer class for simple shell-like syntaxes."""

 # Module and documentation by Eric S. Raymond, 21 Dec 1998
 # Input stacking and error message cleanup added by ESR, March 2000
 # push_source() and pop_source() made explicit by ESR, January 2001.
+# Posix compliance, split(), string arguments, and
+# iterator interface by Gustavo Niemeyer, April 2003.

 import os.path
 import sys
-__all__ = ["shlex"]
+from types import StringTypes
+try:
+    from cStringIO import StringIO
+except ImportError:
+    from StringIO import StringIO
+
+__all__ = ["shlex", "split"]

 class shlex:
     "A lexical analyzer class for simple shell-like syntaxes."
-    def __init__(self, instream=None, infile=None):
+    def __init__(self, instream=None, infile=None, posix=0):
+        if type(instream) in StringTypes:
+            instream = StringIO(instream)
         if instream is not None:
             self.instream = instream
             self.infile = infile
         else:
             self.instream = sys.stdin
             self.infile = None
+        self.posix = posix
+        if posix:
+            self.eof = None
+        else:
+            self.eof = ''
         self.commenters = '#'
         self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
                           'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
+        if self.posix:
+            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
+                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
         self.whitespace = ' \t\r\n'
+        self.whitespace_split = 0
         self.quotes = '\'"'
+        self.escape = '\\'
+        self.escapedquotes = '"'
         self.state = ' '
         self.pushback = []
         self.lineno = 1
@@ -42,6 +65,8 @@ def push_token(self, tok):
     def push_source(self, newstream, newfile=None):
         "Push an input source onto the lexer's input source stack."
+        if type(newstream) in StringTypes:
+            newstream = StringIO(newstream)
         self.filestack.insert(0, (self.infile, self.instream, self.lineno))
         self.infile = newfile
         self.instream = newstream
@@ -73,29 +98,31 @@ def get_token(self):
         # No pushback.  Get a token.
         raw = self.read_token()
         # Handle inclusions
-        while raw == self.source:
-            spec = self.sourcehook(self.read_token())
-            if spec:
-                (newfile, newstream) = spec
-                self.push_source(newstream, newfile)
-                raw = self.get_token()
+        if self.source is not None:
+            while raw == self.source:
+                spec = self.sourcehook(self.read_token())
+                if spec:
+                    (newfile, newstream) = spec
+                    self.push_source(newstream, newfile)
+                    raw = self.get_token()
         # Maybe we got EOF instead?
-        while raw == "":
+        while raw == self.eof:
             if len(self.filestack) == 0:
-                return ""
+                return self.eof
             else:
                 self.pop_source()
                 raw = self.get_token()
         # Neither inclusion nor EOF
         if self.debug >= 1:
-            if raw:
+            if raw != self.eof:
                 print "shlex: token=" + `raw`
             else:
                 print "shlex: token=EOF"
         return raw

     def read_token(self):
-        "Read a token from the input stream (no pushback or inclusions)"
+        quoted = 0
+        escapedstate = ' '
         while 1:
             nextchar = self.instream.read(1)
             if nextchar == '\n':
@@ -113,35 +140,65 @@ def read_token(self):
                 elif nextchar in self.whitespace:
                     if self.debug >= 2:
                         print "shlex: I see whitespace in whitespace state"
-                    if self.token:
+                    if self.token or (self.posix and quoted):
                         break   # emit current token
                     else:
                         continue
                 elif nextchar in self.commenters:
                     self.instream.readline()
                     self.lineno = self.lineno + 1
+                elif self.posix and nextchar in self.escape:
+                    escapedstate = 'a'
+                    self.state = nextchar
                 elif nextchar in self.wordchars:
                     self.token = nextchar
                     self.state = 'a'
                 elif nextchar in self.quotes:
-                    self.token = nextchar
+                    if not self.posix:
+                        self.token = nextchar
                     self.state = nextchar
+                elif self.whitespace_split:
+                    self.token = nextchar
+                    self.state = 'a'
                 else:
                     self.token = nextchar
-                    if self.token:
+                    if self.token or (self.posix and quoted):
                         break   # emit current token
                     else:
                         continue
             elif self.state in self.quotes:
-                self.token = self.token + nextchar
-                if nextchar == self.state:
-                    self.state = ' '
-                    break
-                elif not nextchar:      # end of file
+                quoted = 1
+                if not nextchar:      # end of file
                     if self.debug >= 2:
                         print "shlex: I see EOF in quotes state"
                     # XXX what error should be raised here?
                     raise ValueError, "No closing quotation"
+                if nextchar == self.state:
+                    if not self.posix:
+                        self.token = self.token + nextchar
+                        self.state = ' '
+                        break
+                    else:
+                        self.state = 'a'
+                elif self.posix and nextchar in self.escape and \
+                     self.state in self.escapedquotes:
+                    escapedstate = self.state
+                    self.state = nextchar
+                else:
+                    self.token = self.token + nextchar
+            elif self.state in self.escape:
+                if not nextchar:      # end of file
+                    if self.debug >= 2:
+                        print "shlex: I see EOF in escape state"
+                    # XXX what error should be raised here?
+                    raise ValueError, "No escaped character"
+                # In posix shells, only the quote itself or the escape
+                # character may be escaped within quotes.
+                if escapedstate in self.quotes and \
+                   nextchar != self.state and nextchar != escapedstate:
+                    self.token = self.token + self.state
+                self.token = self.token + nextchar
+                self.state = escapedstate
             elif self.state == 'a':
                 if not nextchar:
                     self.state = None   # end of file
@@ -150,14 +207,26 @@ def read_token(self):
                     if self.debug >= 2:
                         print "shlex: I see whitespace in word state"
                     self.state = ' '
-                    if self.token:
+                    if self.token or (self.posix and quoted):
                         break   # emit current token
                     else:
                         continue
                 elif nextchar in self.commenters:
                     self.instream.readline()
                     self.lineno = self.lineno + 1
+                    if self.posix:
+                        self.state = ' '
+                        if self.token or (self.posix and quoted):
+                            break   # emit current token
+                        else:
+                            continue
+                elif self.posix and nextchar in self.quotes:
+                    self.state = nextchar
+                elif self.posix and nextchar in self.escape:
+                    escapedstate = 'a'
+                    self.state = nextchar
-                elif nextchar in self.wordchars or nextchar in self.quotes:
+                elif nextchar in self.wordchars or nextchar in self.quotes \
+                    or self.whitespace_split:
                     self.token = self.token + nextchar
                 else:
                     self.pushback = [nextchar] + self.pushback
@@ -170,6 +239,8 @@ def read_token(self):
                         continue
         result = self.token
         self.token = ''
+        if self.posix and not quoted and result == '':
+            result = None
         if self.debug > 1:
             if result:
                 print "shlex: raw token=" + `result`
@@ -182,7 +253,7 @@ def sourcehook(self, newfile):
         if newfile[0] == '"':
             newfile = newfile[1:-1]
         # This implements cpp-like semantics for relative-path inclusion.
-        if type(self.infile) == type("") and not os.path.isabs(newfile):
+        if type(self.infile) in StringTypes and not os.path.isabs(newfile):
             newfile = os.path.join(os.path.dirname(self.infile), newfile)
         return (newfile, open(newfile, "r"))
@@ -194,6 +265,19 @@ def error_leader(self, infile=None, lineno=None):
             lineno = self.lineno
         return "\"%s\", line %d: " % (infile, lineno)

+    def __iter__(self):
+        return self
+
+    def next(self):
+        token = self.get_token()
+        if token == self.eof:
+            raise StopIteration
+        return token
+
+def split(s, posix=1, spaces=1):
+    lex = shlex(s, posix=posix)
+    lex.whitespace_split = spaces
+    return list(lex)

 if __name__ == '__main__':
     if len(sys.argv) == 1:

Lib/test/test_shlex.py (new file, 191 lines)
# -*- coding: iso-8859-1 -*-
import unittest
import os, sys
import shlex
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
# The original test data set was from shellwords, by Hartmut Goebel.
data = r"""x|x|
foo bar|foo|bar|
foo bar|foo|bar|
foo bar |foo|bar|
foo bar bla fasel|foo|bar|bla|fasel|
x y z xxxx|x|y|z|xxxx|
\x bar|\|x|bar|
\ x bar|\|x|bar|
\ bar|\|bar|
foo \x bar|foo|\|x|bar|
foo \ x bar|foo|\|x|bar|
foo \ bar|foo|\|bar|
foo "bar" bla|foo|"bar"|bla|
"foo" "bar" "bla"|"foo"|"bar"|"bla"|
"foo" bar "bla"|"foo"|bar|"bla"|
"foo" bar bla|"foo"|bar|bla|
foo 'bar' bla|foo|'bar'|bla|
'foo' 'bar' 'bla'|'foo'|'bar'|'bla'|
'foo' bar 'bla'|'foo'|bar|'bla'|
'foo' bar bla|'foo'|bar|bla|
blurb foo"bar"bar"fasel" baz|blurb|foo"bar"bar"fasel"|baz|
blurb foo'bar'bar'fasel' baz|blurb|foo'bar'bar'fasel'|baz|
""|""|
''|''|
foo "" bar|foo|""|bar|
foo '' bar|foo|''|bar|
foo "" "" "" bar|foo|""|""|""|bar|
foo '' '' '' bar|foo|''|''|''|bar|
\""|\|""|
"\"|"\"|
"foo\ bar"|"foo\ bar"|
"foo\\ bar"|"foo\\ bar"|
"foo\\ bar\"|"foo\\ bar\"|
"foo\\" bar\""|"foo\\"|bar|\|""|
"foo\\ bar\" dfadf"|"foo\\ bar\"|dfadf"|
"foo\\\ bar\" dfadf"|"foo\\\ bar\"|dfadf"|
"foo\\\x bar\" dfadf"|"foo\\\x bar\"|dfadf"|
"foo\x bar\" dfadf"|"foo\x bar\"|dfadf"|
\''|\|''|
'foo\ bar'|'foo\ bar'|
'foo\\ bar'|'foo\\ bar'|
"foo\\\x bar\" df'a\ 'df'|"foo\\\x bar\"|df'a|\|'df'|
\"foo"|\|"foo"|
\"foo"\x|\|"foo"|\|x|
"foo\x"|"foo\x"|
"foo\ "|"foo\ "|
foo\ xx|foo|\|xx|
foo\ x\x|foo|\|x|\|x|
foo\ x\x\""|foo|\|x|\|x|\|""|
"foo\ x\x"|"foo\ x\x"|
"foo\ x\x\\"|"foo\ x\x\\"|
"foo\ x\x\\""foobar"|"foo\ x\x\\"|"foobar"|
"foo\ x\x\\"\''"foobar"|"foo\ x\x\\"|\|''|"foobar"|
"foo\ x\x\\"\'"fo'obar"|"foo\ x\x\\"|\|'"fo'|obar"|
"foo\ x\x\\"\'"fo'obar" 'don'\''t'|"foo\ x\x\\"|\|'"fo'|obar"|'don'|\|''|t'|
'foo\ bar'|'foo\ bar'|
'foo\\ bar'|'foo\\ bar'|
foo\ bar|foo|\|bar|
foo#bar\nbaz|foobaz|
:-) ;-)|:|-|)|;|-|)|
áéíóú|á|é|í|ó|ú|
"""
posix_data = r"""x|x|
foo bar|foo|bar|
foo bar|foo|bar|
foo bar |foo|bar|
foo bar bla fasel|foo|bar|bla|fasel|
x y z xxxx|x|y|z|xxxx|
\x bar|x|bar|
\ x bar| x|bar|
\ bar| bar|
foo \x bar|foo|x|bar|
foo \ x bar|foo| x|bar|
foo \ bar|foo| bar|
foo "bar" bla|foo|bar|bla|
"foo" "bar" "bla"|foo|bar|bla|
"foo" bar "bla"|foo|bar|bla|
"foo" bar bla|foo|bar|bla|
foo 'bar' bla|foo|bar|bla|
'foo' 'bar' 'bla'|foo|bar|bla|
'foo' bar 'bla'|foo|bar|bla|
'foo' bar bla|foo|bar|bla|
blurb foo"bar"bar"fasel" baz|blurb|foobarbarfasel|baz|
blurb foo'bar'bar'fasel' baz|blurb|foobarbarfasel|baz|
""||
''||
foo "" bar|foo||bar|
foo '' bar|foo||bar|
foo "" "" "" bar|foo||||bar|
foo '' '' '' bar|foo||||bar|
\"|"|
"\""|"|
"foo\ bar"|foo\ bar|
"foo\\ bar"|foo\ bar|
"foo\\ bar\""|foo\ bar"|
"foo\\" bar\"|foo\|bar"|
"foo\\ bar\" dfadf"|foo\ bar" dfadf|
"foo\\\ bar\" dfadf"|foo\\ bar" dfadf|
"foo\\\x bar\" dfadf"|foo\\x bar" dfadf|
"foo\x bar\" dfadf"|foo\x bar" dfadf|
\'|'|
'foo\ bar'|foo\ bar|
'foo\\ bar'|foo\\ bar|
"foo\\\x bar\" df'a\ 'df"|foo\\x bar" df'a\ 'df|
\"foo|"foo|
\"foo\x|"foox|
"foo\x"|foo\x|
"foo\ "|foo\ |
foo\ xx|foo xx|
foo\ x\x|foo xx|
foo\ x\x\"|foo xx"|
"foo\ x\x"|foo\ x\x|
"foo\ x\x\\"|foo\ x\x\|
"foo\ x\x\\""foobar"|foo\ x\x\foobar|
"foo\ x\x\\"\'"foobar"|foo\ x\x\'foobar|
"foo\ x\x\\"\'"fo'obar"|foo\ x\x\'fo'obar|
"foo\ x\x\\"\'"fo'obar" 'don'\''t'|foo\ x\x\'fo'obar|don't|
"foo\ x\x\\"\'"fo'obar" 'don'\''t' \\|foo\ x\x\'fo'obar|don't|\|
'foo\ bar'|foo\ bar|
'foo\\ bar'|foo\\ bar|
foo\ bar|foo bar|
foo#bar\nbaz|foo|baz|
:-) ;-)|:-)|;-)|
áéíóú|áéíóú|
"""
class ShlexTest(unittest.TestCase):
    def setUp(self):
        self.data = [x.split("|")[:-1]
                     for x in data.splitlines()]
        self.posix_data = [x.split("|")[:-1]
                           for x in posix_data.splitlines()]
        for item in self.data:
            item[0] = item[0].replace(r"\n", "\n")
        for item in self.posix_data:
            item[0] = item[0].replace(r"\n", "\n")

    def splitTest(self, data, posix, spaces):
        for i in range(len(data)):
            l = shlex.split(data[i][0], posix=posix, spaces=spaces)
            self.assertEqual(l, data[i][1:],
                             "%s: %s != %s" %
                             (data[i][0], l, data[i][1:]))

    def oldSplit(self, s):
        ret = []
        lex = shlex.shlex(StringIO(s))
        tok = lex.get_token()
        while tok:
            ret.append(tok)
            tok = lex.get_token()
        return ret

    def testSplit(self):
        """Test data splitting with non-posix parser"""
        self.splitTest(self.data, posix=0, spaces=0)

    def testSplitPosix(self):
        """Test data splitting with posix parser"""
        self.splitTest(self.posix_data, posix=1, spaces=1)

    def testCompat(self):
        """Test compatibility interface"""
        for i in range(len(self.data)):
            l = self.oldSplit(self.data[i][0])
            self.assertEqual(l, self.data[i][1:],
                             "%s: %s != %s" %
                             (self.data[i][0], l, self.data[i][1:]))

# Allow this test to be used with old shlex.py
if not getattr(shlex, "split", None):
    for methname in dir(ShlexTest):
        if methname.startswith("test") and methname != "testCompat":
            delattr(ShlexTest, methname)

if __name__ == "__main__":
    unittest.main()

Misc/NEWS

@@ -155,6 +155,10 @@ Library

 - New csv package makes it easy to read/write CSV files.

+- Module shlex has been extended to allow posix-like shell parsing,
+  including a split() function for easy splitting of quoted strings and
+  commands.  An iterator interface was also implemented.
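For illustration, the new interface can be exercised like this (a
sketch, not taken from the patch itself):

    import shlex

    print shlex.split('cp "source file" dest')
    # -> ['cp', 'source file', 'dest']

    for token in shlex.shlex('a "b c"', posix=1):
        print token       # prints 'a', then 'b c'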
 Tools/Demos
 -----------