Merge "search: stronger lexer; parse errors contain a position"

Brad Fitzpatrick 2014-04-10 02:32:07 +00:00 committed by Gerrit Code Review
commit 15e583f6ea
4 changed files with 952 additions and 822 deletions
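With this change every parse failure reports the byte offset of the offending token. A minimal sketch of the new error formatting, using a hand-built token inside package search (the token value and position here are invented for illustration):

t := token{typ: tokenClose, val: ")", start: 7}
err := newParseExpError(noMatchingOpening, t)
fmt.Println(err)
// Prints roughly: No matching opening parenthesis at position 7, token: ")"
// followed by the seeDocs pointer to doc/search-ui.txt.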


@@ -24,13 +24,14 @@ import (
"strconv"
"strings"
"time"
"unicode/utf8"
"camlistore.org/pkg/context"
"camlistore.org/pkg/geocode"
"camlistore.org/pkg/types"
)
const seeDocs = "\nSee: https://camlistore.googlesource.com/camlistore/+/master/doc/search-ui.txt"
var (
tagExpr = regexp.MustCompile(`^tag:(.+)$`)
titleExpr = regexp.MustCompile(`^title:(.+)$`)
@@ -48,12 +49,28 @@ var (
)
var (
errNoMatchingOpening = errors.New("No matching opening parenthesis")
errNoMatchingClosing = errors.New("No matching closing parenthesis")
errCannotStartBinaryOp = errors.New("Expression cannot start with a binary operator")
errExpectedAtom = errors.New("Expected an atom")
noMatchingOpening = "No matching opening parenthesis"
noMatchingClosing = "No matching closing parenthesis"
noLiteralSupport = "No support for literals yet"
noQuotedLiteralSupport = "No support for quoted literals yet"
expectedAtom = "Expected an atom"
predicateError = "Predicates do not start with a colon"
trailingTokens = "After parsing finished there is still input left"
)
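// parseExpError couples a parse error message with the token at which it occurred,
// so the reported error carries the input position.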
type parseExpError struct {
mesg string
t token
}
func (e parseExpError) Error() string {
return fmt.Sprintf("%s at position %d, token: %q %s", e.mesg, e.t.start, e.t.val, seeDocs)
}
func newParseExpError(mesg string, t token) error {
return parseExpError{mesg: mesg, t: t}
}
func andConst(a, b *Constraint) *Constraint {
return &Constraint{
Logical: &LogicalConstraint{
@@ -83,168 +100,171 @@ func notConst(a *Constraint) *Constraint {
}
}
func stripNot(tokens []string) (negated bool, rest []string) {
rest = tokens
for len(rest) > 0 {
if rest[0] != "-" {
return negated, rest
} else {
negated = !negated
rest = rest[1:]
}
}
return
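// parser pulls tokens from the lexer's channel and builds the Constraint tree.
// peeked holds at most one token of lookahead.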
type parser struct {
tokens chan token
peeked *token
}
func parseExp(ctx *context.Context, tokens []string) (c *Constraint, rest []string, err error) {
if len(tokens) == 0 {
func newParser(exp string) parser {
_, tokens := lex(exp)
return parser{tokens: tokens}
}
func (p *parser) next() *token {
if p.peeked != nil {
t := p.peeked
p.peeked = nil
return t
}
return p.readInternal()
}
func (p *parser) peek() *token {
if p.peeked == nil {
p.peeked = p.readInternal()
}
return p.peeked
}
// readInternal should not be called directly; use next or peek instead.
func (p *parser) readInternal() *token {
for t := range p.tokens {
return &t
}
return &token{tokenEOF, "", -1}
}
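// stripNot consumes any leading negation ('-') tokens and reports whether an odd
// number of them was seen.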
func (p *parser) stripNot() (negated bool) {
for {
switch p.peek().typ {
case tokenNot:
p.next()
negated = !negated
continue
}
return negated
}
}
func (p *parser) parseExp(ctx *context.Context) (c *Constraint, err error) {
if p.peek().typ == tokenEOF {
return
}
rest = tokens
c, rest, err = parseOperand(ctx, rest)
c, err = p.parseOperand(ctx)
if err != nil {
return
}
for len(rest) > 0 {
switch rest[0] {
case "and":
c, rest, err = parseConjunction(ctx, c, rest[1:])
if err != nil {
return
}
continue
case "or":
return parseDisjunction(ctx, c, rest[1:])
case ")":
for {
switch p.peek().typ {
case tokenAnd:
p.next()
case tokenOr:
p.next()
return p.parseOrRHS(ctx, c)
case tokenClose, tokenEOF:
return
}
c, rest, err = parseConjunction(ctx, c, rest)
c, err = p.parseAndRHS(ctx, c)
if err != nil {
return
}
}
return
}
func parseGroup(ctx *context.Context, tokens []string) (c *Constraint, rest []string, err error) {
rest = tokens
if rest[0] == "(" {
c, rest, err = parseExp(ctx, rest[1:])
func (p *parser) parseGroup(ctx *context.Context) (c *Constraint, err error) {
i := p.next()
switch i.typ {
case tokenOpen:
c, err = p.parseExp(ctx)
if err != nil {
return
}
if len(rest) > 0 && rest[0] == ")" {
rest = rest[1:]
if p.peek().typ == tokenClose {
p.next()
return
} else {
err = errNoMatchingClosing
err = newParseExpError(noMatchingClosing, *i)
return
}
} else {
err = errNoMatchingOpening
return
}
err = newParseExpError("internal: do not call parseGroup when not on a '('", *i)
return
}
func parseDisjunction(ctx *context.Context, lhs *Constraint, tokens []string) (c *Constraint, rest []string, err error) {
func (p *parser) parseOrRHS(ctx *context.Context, lhs *Constraint) (c *Constraint, err error) {
var rhs *Constraint
c = lhs
rest = tokens
for {
rhs, rest, err = parseEntireConjunction(ctx, rest)
rhs, err = p.parseAnd(ctx)
if err != nil {
return
}
c = orConst(c, rhs)
if len(rest) > 0 {
switch rest[0] {
case "or":
rest = rest[1:]
continue
case "and", ")":
return
}
return
} else {
switch p.peek().typ {
case tokenOr:
p.next()
case tokenAnd, tokenClose, tokenEOF:
return
}
}
return
}
func parseEntireConjunction(ctx *context.Context, tokens []string) (c *Constraint, rest []string, err error) {
rest = tokens
func (p *parser) parseAnd(ctx *context.Context) (c *Constraint, err error) {
for {
c, rest, err = parseOperand(ctx, rest)
c, err = p.parseOperand(ctx)
if err != nil {
return
}
if len(rest) > 0 {
switch rest[0] {
case "and":
return parseConjunction(ctx, c, rest[1:])
case ")", "or":
return
}
return parseConjunction(ctx, c, rest)
} else {
switch p.peek().typ {
case tokenAnd:
p.next()
case tokenOr, tokenClose, tokenEOF:
return
}
return p.parseAndRHS(ctx, c)
}
return
}
func parseConjunction(ctx *context.Context, lhs *Constraint, tokens []string) (c *Constraint, rest []string, err error) {
func (p *parser) parseAndRHS(ctx *context.Context, lhs *Constraint) (c *Constraint, err error) {
var rhs *Constraint
c = lhs
rest = tokens
for {
rhs, rest, err = parseOperand(ctx, rest)
rhs, err = p.parseOperand(ctx)
if err != nil {
return
}
c = andConst(c, rhs)
if len(rest) > 0 {
switch rest[0] {
case "or", ")":
return
case "and":
rest = rest[1:]
continue
}
} else {
switch p.peek().typ {
case tokenOr, tokenClose, tokenEOF:
return
case tokenAnd:
p.next()
continue
}
return
}
return
}
func parseOperand(ctx *context.Context, tokens []string) (c *Constraint, rest []string, err error) {
var negated bool
negated, rest = stripNot(tokens)
if len(rest) > 0 {
if rest[0] == "(" {
c, rest, err = parseGroup(ctx, rest)
if err != nil {
return
}
} else {
switch rest[0] {
case "and", "or":
err = errCannotStartBinaryOp
return
case ")":
err = errNoMatchingOpening
return
}
c, err = parseAtom(ctx, rest[0])
if err != nil {
return
}
rest = rest[1:]
}
} else {
return nil, nil, errExpectedAtom
func (p *parser) parseOperand(ctx *context.Context) (c *Constraint, err error) {
negated := p.stripNot()
i := p.peek()
switch i.typ {
case tokenError:
err = newParseExpError(i.val, *i)
return
case tokenEOF:
err = newParseExpError(expectedAtom, *i)
return
case tokenClose:
err = newParseExpError(noMatchingOpening, *i)
return
case tokenLiteral, tokenQuotedLiteral, tokenPredicate, tokenColon, tokenArg:
c, err = p.parseAtom(ctx)
case tokenOpen:
c, err = p.parseGroup(ctx)
}
if err != nil {
return
}
if negated {
c = notConst(c)
@@ -252,6 +272,66 @@ func parseOperand(ctx *context.Context, tokens []string) (c *Constraint, rest []
return
}
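// atomWord reassembles a predicate expression such as "attr:color:blue" from the
// predicate, colon, and (possibly quoted) argument tokens emitted by the lexer.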
func (p *parser) atomWord() (word string, err error) {
i := p.peek()
switch i.typ {
case tokenLiteral:
err = newParseExpError(noLiteralSupport, *i)
return
case tokenQuotedLiteral:
err = newParseExpError(noQuotedLiteralSupport, *i)
return
case tokenColon:
err = newParseExpError(predicateError, *i)
return
case tokenPredicate:
i := p.next()
word += i.val
}
for {
switch p.peek().typ {
case tokenColon:
p.next()
word += ":"
continue
case tokenArg:
i := p.next()
word += i.val
continue
case tokenQuotedArg:
i := p.next()
uq, err := strconv.Unquote(i.val)
if err != nil {
return "", err
}
word += uq
continue
}
return
}
}
func (p *parser) parseAtom(ctx *context.Context) (c *Constraint, err error) {
word, err := p.atomWord()
if err != nil {
return
}
c, err = parseCoreAtom(ctx, word)
if err == nil {
return c, nil
}
c, err = parseImageAtom(ctx, word)
if err == nil {
return c, nil
}
c, err = parseLocationAtom(ctx, word)
if err == nil {
return c, nil
}
log.Printf("Unknown search predicate %q", word)
return nil, errors.New(fmt.Sprintf("Unknown search predicate: %q", word))
}
func permOfFile(fc *FileConstraint) *Constraint {
return &Constraint{
Permanode: &PermanodeConstraint{
@@ -456,23 +536,6 @@ func parseLocationAtom(ctx *context.Context, word string) (*Constraint, error) {
return nil, errors.New(fmt.Sprintf("Not a location-atom: %v", word))
}
func parseAtom(ctx *context.Context, word string) (*Constraint, error) {
c, err := parseCoreAtom(ctx, word)
if err == nil {
return c, nil
}
c, err = parseImageAtom(ctx, word)
if err == nil {
return c, nil
}
c, err = parseLocationAtom(ctx, word)
if err == nil {
return c, nil
}
log.Printf("Unknown search expression word %q", word)
return nil, errors.New(fmt.Sprintf("Unknown search atom: %s", word))
}
func parseExpression(ctx *context.Context, exp string) (*SearchQuery, error) {
base := &Constraint{
Permanode: &PermanodeConstraint{
@@ -487,18 +550,24 @@ func parseExpression(ctx *context.Context, exp string) (*SearchQuery, error) {
if exp == "" {
return sq, nil
}
_, tokens := lex(exp)
p := parser{tokens: tokens}
words := splitExpr(exp)
c, rem, err := parseExp(ctx, words)
c, err := p.parseExp(ctx)
if err != nil {
return nil, err
}
lastToken := p.next()
if lastToken.typ != tokenEOF {
switch lastToken.typ {
case tokenClose:
return nil, newParseExpError(noMatchingOpening, *lastToken)
}
return nil, newParseExpError(trailingTokens, *lastToken)
}
if c != nil {
sq.Constraint = andConst(base, c)
}
if len(rem) > 0 {
return nil, errors.New("Trailing terms")
}
return sq, nil
}
@@ -539,132 +608,3 @@ func mimeFromFormat(v string) string {
}
return "???"
}
// Tokens are:
// literal
// foo: (for operators)
// "quoted string"
// "("
// ")"
// " " (for any amount of space)
// "-" negative sign
func tokenizeExpr(exp string) []string {
var tokens []string
for len(exp) > 0 {
var token string
token, exp = firstToken(exp)
tokens = append(tokens, token)
}
return tokens
}
func firstToken(s string) (token, rest string) {
isWordBound := func(r byte) bool {
if isSpace(r) {
return true
}
switch r {
case '(', ')', '-':
return true
}
return false
}
if s[0] == '-' {
return "-", s[1:]
}
if s[0] == '(' {
return "(", s[1:]
}
if s[0] == ')' {
return ")", s[1:]
}
if strings.HasPrefix(s, "and") && len(s) > 3 && isWordBound(s[3]) {
return "and", s[3:]
}
if strings.HasPrefix(s, "or") && len(s) > 2 && isWordBound(s[2]) {
return "or", s[2:]
}
if isSpace(s[0]) {
for len(s) > 0 && isSpace(s[0]) {
s = s[1:]
}
return " ", s
}
if s[0] == '"' {
quote := false
for i, r := range s[1:] {
if quote {
quote = false
continue
}
if r == '\\' {
quote = true
continue
}
if r == '"' {
return s[:i+2], s[i+2:]
}
}
}
for i, r := range s {
if r == ':' {
return s[:i+1], s[i+1:]
}
if r == '(' {
return s[:i], s[i:]
}
if r == ')' {
return s[:i], s[i:]
}
if r < utf8.RuneSelf && isSpace(byte(r)) {
return s[:i], s[i:]
}
}
return s, ""
}
func isSpace(b byte) bool {
switch b {
case ' ', '\n', '\r', '\t':
return true
}
return false
}
// Basically just strings.Fields for now but with de-quoting of quoted
// tokens after operators.
func splitExpr(exp string) []string {
tokens := tokenizeExpr(strings.TrimSpace(exp))
if len(tokens) == 0 {
return nil
}
// Turn any pair of ("operator:", `"quoted string"`) tokens into
// ("operator:", "quoted string"), unquoting the second.
for i, token := range tokens[:len(tokens)-1] {
nextToken := tokens[i+1]
if strings.HasSuffix(token, ":") && strings.HasPrefix(nextToken, "\"") {
if uq, err := strconv.Unquote(nextToken); err == nil {
tokens[i+1] = uq
}
}
}
// Split on space, ), ( tokens and concatenate tokens ending with :
// Not particularly efficient, though.
var f []string
var nextPasted bool
for _, token := range tokens {
if token == " " {
continue
} else if nextPasted {
f[len(f)-1] += token
nextPasted = false
} else {
f = append(f, token)
}
if strings.HasSuffix(token, ":") {
nextPasted = true
}
}
return f
}

File diff suppressed because it is too large

pkg/search/lexer.go (new file, 316 lines)

@@ -0,0 +1,316 @@
/*
Copyright 2014 The Camlistore Authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// This is the lexer for search expressions (see expr.go).
package search
import (
"fmt"
"strings"
"unicode"
"unicode/utf8"
)
type tokenType int
const (
tokenAnd tokenType = iota
tokenArg
tokenClose
tokenColon
tokenEOF
tokenError
tokenLiteral
tokenNot
tokenOpen
tokenOr
tokenPredicate
tokenQuotedArg
tokenQuotedLiteral
)
const (
eof = -1 // -1 is unused in utf8
whitespace = "\t\n\f\v\r "
opBound = whitespace + "("
)
// isSearchWordRune defines the runes that can be used in unquoted predicate arguments
// or unquoted literals. These are all Unicode letters, digits and punctuation,
// except for ':', which is used for predicate marking, and '(', ')', which are used
// for predicate grouping.
func isSearchWordRune(r rune) bool {
switch r {
case ':', ')', '(':
return false
}
return unicode.IsLetter(r) || unicode.IsDigit(r) || unicode.IsPunct(r)
}
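// token is a single lexeme: its type, its literal value, and the byte offset in the
// input at which it starts.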
type token struct {
typ tokenType
val string
start int
}
func (t token) String() string {
switch t.typ {
case tokenEOF:
return "EOF"
case tokenError:
return fmt.Sprintf("{err:%q at pos: %d}", t.val, t.start)
}
return fmt.Sprintf("{t:%v,%q (col: %d)}", t.typ, t.val, t.start)
}
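// lexer holds the scanning state: input is the expression being scanned, start and
// pos delimit the token currently being built, and tokens is the channel on which
// completed tokens are emitted.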
type lexer struct {
input string
start int
pos int
width int
tokens chan token
state stateFn
}
func (l *lexer) emit(typ tokenType) {
l.tokens <- token{typ, l.input[l.start:l.pos], l.start}
l.start = l.pos
}
func (l *lexer) next() (r rune) {
if l.pos >= len(l.input) {
l.width = 0
return eof
}
r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
l.pos += l.width
return
}
func (l *lexer) ignore() {
l.start = l.pos
}
func (l *lexer) backup() {
l.pos -= l.width
}
func (l *lexer) peek() rune {
r := l.next()
l.backup()
return r
}
func (l *lexer) accept(valid string) bool {
if strings.IndexRune(valid, l.next()) >= 0 {
return true
}
l.backup()
return false
}
func (l *lexer) acceptString(s string) bool {
for _, r := range s {
if l.next() != r {
l.backup()
return false
}
}
return true
}
func (l *lexer) acceptRun(valid string) {
for strings.IndexRune(valid, l.next()) >= 0 {
}
l.backup()
}
func (l *lexer) acceptRunFn(valid func(rune) bool) {
for valid(l.next()) {
}
l.backup()
}
func (l *lexer) errorf(format string, args ...interface{}) stateFn {
l.tokens <- token{
typ: tokenError,
val: fmt.Sprintf(format, args...),
start: l.start,
}
return nil
}
func lex(input string) (*lexer, chan token) {
l := &lexer{
input: input,
tokens: make(chan token),
state: readExp,
}
go l.run()
return l, l.tokens
}
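// run drives the state machine: each state function returns the next state, and a
// nil state ends the loop and closes the token channel.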
func (l *lexer) run() {
for {
if l.state == nil {
close(l.tokens)
return
}
l.state = l.state(l)
}
}
//
// State functions
//
type stateFn func(*lexer) stateFn
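// Each stateFn scans one construct (or dispatches to one that does), emits tokens,
// and returns the state for whatever may legally follow; this is the channel-and-
// state-function lexing pattern familiar from text/template.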
func readNeg(l *lexer) stateFn {
l.accept("-")
l.emit(tokenNot)
return readExp
}
func readClose(l *lexer) stateFn {
l.accept(")")
l.emit(tokenClose)
return readOperator
}
func readOpen(l *lexer) stateFn {
l.accept("(")
l.emit(tokenOpen)
return readExp
}
func readColon(l *lexer) stateFn {
l.accept(":")
l.emit(tokenColon)
return readArg
}
func readPredicate(l *lexer) stateFn {
l.acceptRunFn(unicode.IsLetter)
switch l.peek() {
case ':':
l.emit(tokenPredicate)
return readColon
}
return readLiteral
}
func readLiteral(l *lexer) stateFn {
l.acceptRunFn(isSearchWordRune)
l.emit(tokenLiteral)
return readOperator
}
func readArg(l *lexer) stateFn {
if l.peek() == '"' {
return readQuotedArg
}
l.acceptRunFn(isSearchWordRune)
l.emit(tokenArg)
if l.peek() == ':' {
return readColon
}
return readOperator
}
func readAND(l *lexer) stateFn {
if l.acceptString("and") && l.accept(opBound) {
l.backup()
l.emit(tokenAnd)
return readExp
} else {
return readPredicate
}
}
func readOR(l *lexer) stateFn {
if l.acceptString("or") && l.accept(opBound) {
l.backup()
l.emit(tokenOr)
return readExp
} else {
return readPredicate
}
}
func runQuoted(l *lexer) bool {
l.accept("\"")
for {
r := l.next()
switch r {
case eof:
return false
case '\\':
l.next()
case '"':
return true
}
}
}
func readQuotedLiteral(l *lexer) stateFn {
if !runQuoted(l) {
return l.errorf("Unclosed quote")
}
l.emit(tokenQuotedLiteral)
return readOperator
}
func readQuotedArg(l *lexer) stateFn {
if !runQuoted(l) {
return l.errorf("Unclosed quote")
}
l.emit(tokenQuotedArg)
if l.peek() == ':' {
return readColon
}
return readOperator
}
func readExp(l *lexer) stateFn {
l.acceptRun(whitespace)
l.ignore()
switch l.peek() {
case eof:
return nil
case '(':
return readOpen
case ')':
return readClose
case '-':
return readNeg
case '"':
return readQuotedLiteral
}
return readPredicate
}
func readOperator(l *lexer) stateFn {
l.acceptRun(whitespace)
l.ignore()
switch l.peek() {
case 'a':
return readAND
case 'o':
return readOR
}
return readExp
}

pkg/search/lexer_test.go (new file, 173 lines)

@@ -0,0 +1,173 @@
package search
import (
"reflect"
"testing"
)
const scaryQuote = `"\"Hi there\""`
var lexerTests = []struct {
in string
want []token
}{
{
in: "and and and",
want: []token{
{tokenLiteral, "and", 0},
{tokenAnd, "and", 4},
{tokenLiteral, "and", 8},
},
},
{
in: "and nd and",
want: []token{
{tokenLiteral, "and", 0},
{tokenLiteral, "nd", 4},
{tokenLiteral, "and", 7},
},
},
{
in: "or or or",
want: []token{
{tokenLiteral, "or", 0},
{tokenOr, "or", 3},
{tokenLiteral, "or", 6},
},
},
{
in: "or r or",
want: []token{
{tokenLiteral, "or", 0},
{tokenLiteral, "r", 3},
{tokenLiteral, "or", 5},
},
},
{
in: "(or or or) and or",
want: []token{
{tokenOpen, "(", 0},
{tokenLiteral, "or", 1},
{tokenOr, "or", 4},
{tokenLiteral, "or", 7},
{tokenClose, ")", 9},
{tokenAnd, "and", 11},
{tokenLiteral, "or", 15},
},
},
{
in: `(or or "or) and or`,
want: []token{
{tokenOpen, "(", 0},
{tokenLiteral, "or", 1},
{tokenOr, "or", 4},
{tokenError, "Unclosed quote", 7},
},
},
{
in: "bar and baz",
want: []token{{tokenLiteral, "bar", 0}, {tokenAnd, "and", 4}, {tokenLiteral, "baz", 8}},
},
{
in: "foo or bar",
want: []token{{tokenLiteral, "foo", 0}, {tokenOr, "or", 4}, {tokenLiteral, "bar", 7}},
},
{
in: "foo or (bar )",
want: []token{{tokenLiteral, "foo", 0}, {tokenOr, "or", 4}, {tokenOpen, "(", 7}, {tokenLiteral, "bar", 8}, {tokenClose, ")", 12}},
},
{
in: "foo or bar:foo:baz",
want: []token{
{tokenLiteral, "foo", 0},
{tokenOr, "or", 4},
{tokenPredicate, "bar", 7},
{tokenColon, ":", 10},
{tokenArg, "foo", 11},
{tokenColon, ":", 14},
{tokenArg, "baz", 15},
},
},
{
in: "--foo or - bar",
want: []token{
{tokenNot, "-", 0},
{tokenNot, "-", 1},
{tokenLiteral, "foo", 2},
{tokenOr, "or", 6},
{tokenNot, "-", 9},
{tokenLiteral, "bar", 11},
},
},
{
in: "foo:bar:baz or bar",
want: []token{
{tokenPredicate, "foo", 0},
{tokenColon, ":", 3},
{tokenArg, "bar", 4},
{tokenColon, ":", 7},
{tokenArg, "baz", 8},
{tokenOr, "or", 12},
{tokenLiteral, "bar", 15},
},
},
{
in: "is:pano or",
want: []token{
{tokenPredicate, "is", 0},
{tokenColon, ":", 2},
{tokenArg, "pano", 3},
{tokenLiteral, "or", 8},
},
},
{
in: "foo:" + scaryQuote + " or bar",
want: []token{
{tokenPredicate, "foo", 0},
{tokenColon, ":", 3},
{tokenQuotedArg, scaryQuote, 4},
{tokenOr, "or", 19},
{tokenLiteral, "bar", 22},
},
},
{
in: scaryQuote,
want: []token{
{tokenQuotedLiteral, scaryQuote, 0}},
},
}
func array(in string) (parsed []token) {
_, tokens := lex(in)
for token := range tokens {
if token.typ == tokenEOF {
break
}
parsed = append(parsed, token)
}
return
}
func TestLex(t *testing.T) {
for _, tt := range lexerTests {
tokens := array(tt.in)
if !reflect.DeepEqual(tokens, tt.want) {
t.Errorf("Got lex(%q)=%v expected %v", tt.in, tokens, tt.want)
}
}
}