From 11a17450341a2833842e602c464e759ddc0d40d4 Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Wed, 4 Dec 2013 14:18:52 +0100 Subject: [PATCH] Add KeyBytes and ValueBytes accessor to sorted.KeyValue.Iterator. Goal is to iterate faster (notably: for slurping the index to the in-memory corpus on start-up), by doing fewer copies and generating less garbage. Change-Id: I054b0de2b994eb1f2356aa8587a466bafeb6cf82 --- pkg/index/kvfile/kvfile.go | 39 ++++++++++++++++++++++----- pkg/index/mongo/mongoindex.go | 24 +++++++++++++---- pkg/index/sqlindex/sqlindex.go | 37 +++++++++++++++++++------- pkg/sorted/kv.go | 14 ++++++++++ pkg/sorted/mem.go | 44 ++++++++++++++++++++++++++----- server/appengine/camli/aeindex.go | 5 ++++ 6 files changed, 135 insertions(+), 28 deletions(-) diff --git a/pkg/index/kvfile/kvfile.go b/pkg/index/kvfile/kvfile.go index 9c4050d53..ea6bd731b 100644 --- a/pkg/index/kvfile/kvfile.go +++ b/pkg/index/kvfile/kvfile.go @@ -156,8 +156,9 @@ type iter struct { enum *kv.Enumerator - valid bool - key, val string + valid bool + key, val []byte + skey, sval *string // non-nil if valid err error closed bool @@ -168,24 +169,49 @@ func (it *iter) Close() error { return it.err } -func (it *iter) Key() string { +func (it *iter) KeyBytes() []byte { if !it.valid { panic("not valid") } return it.key } -func (it *iter) Value() string { +func (it *iter) Key() string { + if !it.valid { + panic("not valid") + } + if it.skey != nil { + return *it.skey + } + str := string(it.key) + it.skey = &str + return str +} + +func (it *iter) ValueBytes() []byte { if !it.valid { panic("not valid") } return it.val } +func (it *iter) Value() string { + if !it.valid { + panic("not valid") + } + if it.sval != nil { + return *it.sval + } + str := string(it.val) + it.sval = &str + return str +} + func (it *iter) Next() (ret bool) { if it.closed { panic("Next called after Next returned value") } + it.skey, it.sval = nil, nil defer func() { it.valid = ret if !ret { @@ -198,13 +224,12 @@ func (it *iter) Next() (ret bool) { return false } } - key, val, err := it.enum.Next() + var err error + it.key, it.val, err = it.enum.Next() if err == io.EOF { it.err = nil return false } - it.key = string(key) - it.val = string(val) return true } diff --git a/pkg/index/mongo/mongoindex.go b/pkg/index/mongo/mongoindex.go index 85441eb1b..ad35f3837 100644 --- a/pkg/index/mongo/mongoindex.go +++ b/pkg/index/mongo/mongoindex.go @@ -159,11 +159,11 @@ type mongoStrIterator struct { *mgo.Iter } -func (s mongoStrIterator) Next() bool { +func (s *mongoStrIterator) Next() bool { return s.Iter.Next(&s.res) } -func (s mongoStrIterator) Key() (key string) { +func (s *mongoStrIterator) Key() string { key, ok := (s.res[mgoKey]).(string) if !ok { return "" @@ -171,7 +171,14 @@ func (s mongoStrIterator) Key() (key string) { return key } -func (s mongoStrIterator) Value() (value string) { +func (s *mongoStrIterator) KeyBytes() []byte { + // TODO(bradfitz,mpl): this is less efficient than the string way. we should + // do better here, somehow, like all the other sorted.KeyValue iterators. + // For now: + return []byte(s.Key()) +} + +func (s *mongoStrIterator) Value() string { value, ok := (s.res[mgoValue]).(string) if !ok { return "" @@ -179,7 +186,14 @@ func (s mongoStrIterator) Value() (value string) { return value } -func (s mongoStrIterator) Close() error { +func (s *mongoStrIterator) ValueBytes() []byte { + // TODO(bradfitz,mpl): this is less efficient than the string way. we should + // do better here, somehow, like all the other sorted.KeyValue iterators. + // For now: + return []byte(s.Value()) +} + +func (s *mongoStrIterator) Close() error { // TODO(mpl): think about anything more to be done here. return nil } @@ -213,7 +227,7 @@ func (mk *mongoKeys) Find(key string) sorted.Iterator { // more suited if possible. cleanedKey := strings.Replace(key, "|", `\|`, -1) iter := mk.db.Find(&bson.M{mgoKey: &bson.M{"$regex": "^" + cleanedKey}}).Sort(mgoKey).Iter() - return mongoStrIterator{res: bson.M{}, Iter: iter} + return &mongoStrIterator{res: bson.M{}, Iter: iter} } func (mk *mongoKeys) Set(key, value string) error { diff --git a/pkg/index/sqlindex/sqlindex.go b/pkg/index/sqlindex/sqlindex.go index f3b8cdad6..a4701e50e 100644 --- a/pkg/index/sqlindex/sqlindex.go +++ b/pkg/index/sqlindex/sqlindex.go @@ -161,7 +161,7 @@ func (s *Storage) Close() error { return s.DB.Close() } func (s *Storage) Find(key string) sorted.Iterator { it := &iter{ s: s, - low: key, + low: []byte(key), op: ">=", closeCheck: leak.NewChecker(), } @@ -171,7 +171,7 @@ func (s *Storage) Find(key string) sorted.Iterator { // iter is a iterator over sorted key/value pairs in rows. type iter struct { s *Storage - low string + low []byte op string // ">=" initially, then ">" err error // accumulated error, returned at Close @@ -182,14 +182,32 @@ type iter struct { batchSize int // how big our LIMIT query was seen int // how many rows we've seen this query - key string - value string + key sql.RawBytes + val sql.RawBytes + skey, sval *string // if non-nil, it's been stringified } var errClosed = errors.New("mysqlindexer: Iterator already closed") -func (t *iter) Key() string { return t.key } -func (t *iter) Value() string { return t.value } +func (t *iter) KeyBytes() []byte { return t.key } +func (t *iter) Key() string { + if t.skey != nil { + return *t.skey + } + str := string(t.key) + t.skey = &str + return str +} + +func (t *iter) ValueBytes() []byte { return t.val } +func (t *iter) Value() string { + if t.sval != nil { + return *t.sval + } + str := string(t.val) + t.sval = &str + return str +} func (t *iter) Close() error { t.closeCheck.Close() @@ -205,6 +223,7 @@ func (t *iter) Next() bool { if t.err != nil { return false } + t.skey, t.sval = nil, nil if t.rows == nil { const batchSize = 50 t.batchSize = batchSize @@ -213,7 +232,7 @@ func (t *iter) Next() bool { } t.rows, t.err = t.s.DB.Query(t.s.sql( "SELECT k, v FROM rows WHERE k "+t.op+" ? ORDER BY k LIMIT "+strconv.Itoa(batchSize)), - t.low) + string(t.low)) if t.s.Serial { t.s.mu.Unlock() } @@ -232,12 +251,12 @@ func (t *iter) Next() bool { } return false } - t.err = t.rows.Scan(&t.key, &t.value) + t.err = t.rows.Scan(&t.key, &t.val) if t.err != nil { log.Printf("unexpected Scan error: %v", t.err) return false } - t.low = t.key + t.low = append(t.low[:0], t.key...) t.seen++ return true } diff --git a/pkg/sorted/kv.go b/pkg/sorted/kv.go index b2d9b98b6..de0a5e2ed 100644 --- a/pkg/sorted/kv.go +++ b/pkg/sorted/kv.go @@ -70,10 +70,24 @@ type Iterator interface { // Only valid after a call to Next returns true. Key() string + // KeyBytes returns the key as bytes. The returned bytes + // should not be written and are invalid after the next call + // to Next or Close. + // TODO(bradfitz): rename this and change it to return a + // mem.RO instead? + KeyBytes() []byte + // Value returns the value of the current key/value pair. // Only valid after a call to Next returns true. Value() string + // ValueBytes returns the value as bytes. The returned bytes + // should not be written and are invalid after the next call + // to Next or Close. + // TODO(bradfitz): rename this and change it to return a + // mem.RO instead? + ValueBytes() []byte + // Close closes the iterator and returns any accumulated error. Exhausting // all the key/value pairs in a table is not considered to be an error. // It is valid to call Close multiple times. Other methods should not be diff --git a/pkg/sorted/mem.go b/pkg/sorted/mem.go index b43a230bc..bb0de3980 100644 --- a/pkg/sorted/mem.go +++ b/pkg/sorted/mem.go @@ -43,15 +43,45 @@ type memKeys struct { // operates on []byte, to Camlistore's index.Iterator, which operates // on string. type stringIterator struct { - db.Iterator + lit db.Iterator // underlying leveldb iterator + k, v *string // if nil, not stringified yet } -func (s stringIterator) Key() string { - return string(s.Iterator.Key()) +func (s *stringIterator) Next() bool { + s.k, s.v = nil, nil + return s.lit.Next() } -func (s stringIterator) Value() string { - return string(s.Iterator.Value()) +func (s *stringIterator) Close() error { + err := s.lit.Close() + *s = stringIterator{} // to cause crashes on future access + return err +} + +func (s *stringIterator) KeyBytes() []byte { + return s.lit.Key() +} + +func (s *stringIterator) ValueBytes() []byte { + return s.lit.Value() +} + +func (s *stringIterator) Key() string { + if s.k != nil { + return *s.k + } + str := string(s.KeyBytes()) + s.k = &str + return str +} + +func (s *stringIterator) Value() string { + if s.v != nil { + return *s.v + } + str := string(s.ValueBytes()) + s.v = &str + return str } func (mk *memKeys) Get(key string) (string, error) { @@ -67,8 +97,8 @@ func (mk *memKeys) Get(key string) (string, error) { func (mk *memKeys) Find(key string) Iterator { mk.mu.Lock() defer mk.mu.Unlock() - dit := mk.db.Find([]byte(key), nil) - return stringIterator{dit} + lit := mk.db.Find([]byte(key), nil) + return &stringIterator{lit: lit} } func (mk *memKeys) Set(key, value string) error { diff --git a/server/appengine/camli/aeindex.go b/server/appengine/camli/aeindex.go index 9a81819ec..a92eb2589 100644 --- a/server/appengine/camli/aeindex.go +++ b/server/appengine/camli/aeindex.go @@ -189,6 +189,11 @@ func (it *iter) Next() bool { func (it *iter) Key() string { return it.key } func (it *iter) Value() string { return it.value } +// TODO(bradfit): optimize the string<->[]byte copies in this iterator, as done in the other +// sorted.KeyValue iterators. +func (it *iter) KeyBytes() []byte { return []byte(it.key) } +func (it *iter) ValueBytes() []byte { return []byte(it.value) } + func indexFromConfig(ld blobserver.Loader, config jsonconfig.Obj) (storage blobserver.Storage, err error) { is := &indexStorage{} var (