perkeep/pkg/index/enumstat.go

/*
Copyright 2011 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package index

import (
	"fmt"
	"strconv"
	"strings"

	"camlistore.org/pkg/blob"
	"camlistore.org/pkg/context"
	"camlistore.org/pkg/sorted"
)

// EnumerateBlobs sends to dest the blobs recorded in the index's "have:"
// rows whose key sorts strictly after "have:"+after, in the order the
// underlying iterator yields them, stopping once limit blobs have been sent.
//
// Rows whose key does not parse as a blob ref, or whose value does not parse
// as a size, are skipped and do not count toward limit.
//
// dest is always closed before EnumerateBlobs returns. If ctx is done while a
// send on dest is pending, context.ErrCanceled is returned. Otherwise the
// result is the error, if any, from closing the iterator.
func (ix *Index) EnumerateBlobs(ctx *context.Context, dest chan<- blob.SizedRef, after string, limit int) (err error) {
	defer close(dest)
	it := ix.s.Find("have:"+after, "have~")
	defer func() {
		// Surface the iterator's Close error unless another error is
		// already being returned.
		closeErr := it.Close()
		if err == nil {
			err = closeErr
		}
	}()

	afterKey := "have:" + after
	sent := 0
	for sent < limit && it.Next() {
		k := it.Key()
		if k <= afterKey {
			// after is an exclusive lower bound.
			continue
		}
		if !strings.HasPrefix(k, "have:") {
			break
		}
		br, ok := blob.Parse(k[len("have:"):])
		if !ok {
			continue
		}
		// Use a distinct name so the named result err, which the deferred
		// closure inspects, is not shadowed.
		size, parseErr := parseHaveVal(it.Value())
		if parseErr != nil {
			continue
		}
		select {
		case dest <- blob.SizedRef{br, uint32(size)}:
			// Only entries actually delivered count toward limit, so
			// malformed rows cannot make the enumeration end early.
			sent++
		case <-ctx.Done():
			return context.ErrCanceled
		}
	}
	return nil
}

// StatBlobs looks up the "have:" index row of every ref in blobs and sends
// the ref together with its recorded size to dest.
//
// Refs that have no row (sorted.ErrNotFound) are silently skipped. Any other
// lookup error, or a row whose value does not parse as a size, stops the
// scan and is returned as an error. StatBlobs does not close dest.
func (ix *Index) StatBlobs(dest chan<- blob.SizedRef, blobs []blob.Ref) error {
	for _, ref := range blobs {
		haveKey := "have:" + ref.String()

		val, getErr := ix.s.Get(haveKey)
		switch {
		case getErr == sorted.ErrNotFound:
			// Unknown blob: nothing to report for it.
			continue
		case getErr != nil:
			return fmt.Errorf("error looking up key %q: %v", haveKey, getErr)
		}

		sz, parseErr := parseHaveVal(val)
		if parseErr != nil {
			return fmt.Errorf("invalid size for key %q = %q", haveKey, val)
		}
		dest <- blob.SizedRef{ref, uint32(sz)}
	}
	return nil
}

// parseHaveVal extracts the blob size from the value part of a "have" index
// row. The size is the decimal number that precedes the first "|" in val, or
// the whole of val when it contains no "|". Examples:
// parseHaveVal("324|indexed") == 324
// parseHaveVal("654") == 654
// The size must fit in 32 bits; otherwise, or if the text is not a valid
// unsigned decimal number, the error from strconv.ParseUint is returned.
func parseHaveVal(val string) (size uint64, err error) {
	digits := val
	if sep := strings.IndexByte(val, '|'); sep != -1 {
		// Drop the "|indexed" style suffix and keep only the size.
		digits = val[:sep]
	}
	size, err = strconv.ParseUint(digits, 10, 32)
	return size, err
}
start of enumerate and stat for generic indexer Change-Id: Iabac9959a239d2418ab351e37895c7f0721e4e58 2011-11-10 17:47:32 +00:00			`/*`
			`Copyright 2011 Google Inc.`

			`Licensed under the Apache License, Version 2.0 (the "License");`
			`you may not use this file except in compliance with the License.`
			`You may obtain a copy of the License at`

			`http://www.apache.org/licenses/LICENSE-2.0`

			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS,`
			`WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`See the License for the specific language governing permissions and`
			`limitations under the License.`
			`*/`

			`package index`

			`import (`
generic index: untested stat support. Change-Id: Idff5ff19d7b18ef6581c6c8bc344230470c468ef 2011-11-11 02:58:04 +00:00			`"fmt"`
memory index enumerate support. untested. Change-Id: I28df97ed6385675545a74a6e01e779cc627ef033 2011-11-10 18:28:44 +00:00			`"strconv"`
			`"strings"`
start of enumerate and stat for generic indexer Change-Id: Iabac9959a239d2418ab351e37895c7f0721e4e58 2011-11-10 17:47:32 +00:00
all: delete pkg/blobref; convert all from *blobref.BlobRef to new blob.Ref Change-Id: Id2dfb7f19452bedf4f3c9310b36227fd8117b225 2013-08-04 02:54:30 +00:00			`"camlistore.org/pkg/blob"`
Start of new context package and *context.Context type. Will eventually be plumbed through lots of APIs, especially those requiring or benefiting from cancelation notification and/or those needing access to the HTTP context (e.g. App Engine). Change-Id: I591496725d620126e09d49eb07cade7707c7fc64 2013-12-02 21:20:51 +00:00			`"camlistore.org/pkg/context"`
Rename index.Storage to sorted.KeyValue and move it into a new package. Having index.Index and index.Storage both in the same package led to confusing discussions about "an index". Better names now, and smaller packages. 2013-11-23 07:24:54 +00:00			`"camlistore.org/pkg/sorted"`
start of enumerate and stat for generic indexer Change-Id: Iabac9959a239d2418ab351e37895c7f0721e4e58 2011-11-10 17:47:32 +00:00			`)`

Start of new context package and *context.Context type. Will eventually be plumbed through lots of APIs, especially those requiring or benefiting from cancelation notification and/or those needing access to the HTTP context (e.g. App Engine). Change-Id: I591496725d620126e09d49eb07cade7707c7fc64 2013-12-02 21:20:51 +00:00			`func (ix Index) EnumerateBlobs(ctx context.Context, dest chan<- blob.SizedRef, after string, limit int) (err error) {`
start of enumerate and stat for generic indexer Change-Id: Iabac9959a239d2418ab351e37895c7f0721e4e58 2011-11-10 17:47:32 +00:00			`defer close(dest)`
Change sorted.KeyValue.Find to take an optional end bound; add tests. The new package sorted/kvtest provides a generic KeyValue test for all implementations. Memory, SQLite, and kvfile now use it. This speeds up the index slurping start-up of my personal Camlistore server from 30 seconds (when it was doing 17,000+ queries in small windows) to now just 5 seconds. That 5 seconds can be improved yet further. Change-Id: Idd55ba9ccd3ed12a26868a41db1af676aff7b67b 2013-12-07 16:43:18 +00:00			`it := ix.s.Find("have:"+after, "have~")`
Start of new context package and *context.Context type. Will eventually be plumbed through lots of APIs, especially those requiring or benefiting from cancelation notification and/or those needing access to the HTTP context (e.g. App Engine). Change-Id: I591496725d620126e09d49eb07cade7707c7fc64 2013-12-02 21:20:51 +00:00			`defer func() {`
			`closeErr := it.Close()`
			`if err == nil {`
			`err = closeErr`
			`}`
			`}()`

index: actually reindex when out of order problem: the out-of-order mechanism based on the outOfOrderIndexerLoop was not working for some claims. Let C be a delete claim on permanode P. If C was received before P was, C was marked as being received with the "have" index row. However, for the deletion to be marked in the index, some information about P is needed (its meta row), so C could not be fully indexed upon reception. Then, when P was finally received, the outOfOrderIndexerLoop would kick in and retry indexing C. Which would fail, because a test based on the "have" row would (wrongly) detect that C is already indexed and return early. In this patch: -we introduce the "\|indexed" suffix to the "have" - value part - row (receive.go). If a blob is received but some of its dependencies are missing, the have row value is written without the suffix. Upon reception of a blob, we now test for the presence of the suffix in the have row. If missing, the reception continues instead of returning early. The existing mechanism that was detecting missing dependencies for file blobs has been adapted to work with this suffix too. -the index enumeration (enumstat.go), which relies on "have" rows, has been adapted to work with the new "have" row format, while staying compatible with the old format. And related tests have been added. http://camlistore.org/issue/454 Change-Id: I2559d08a12b2a4e0f0691fc7e31f1ed1f874625e 2014-07-03 16:07:08 +00:00			`afterKey := "have:" + after`
first pass with mgo as indexer fixed indexer so that it builds as a package added mgo as a third party fixed mgoindexer fixed (hackishly) tests so that they pass I ignored everything memindex for now Change-Id: I3c03ecfa859cf92b088a759239278b1c0edaf8aa 2012-02-20 12:32:46 +00:00			`n := int(0)`
memory index enumerate support. untested. Change-Id: I28df97ed6385675545a74a6e01e779cc627ef033 2011-11-10 18:28:44 +00:00			`for n < limit && it.Next() {`
			`k := it.Key()`
index: actually reindex when out of order problem: the out-of-order mechanism based on the outOfOrderIndexerLoop was not working for some claims. Let C be a delete claim on permanode P. If C was received before P was, C was marked as being received with the "have" index row. However, for the deletion to be marked in the index, some information about P is needed (its meta row), so C could not be fully indexed upon reception. Then, when P was finally received, the outOfOrderIndexerLoop would kick in and retry indexing C. Which would fail, because a test based on the "have" row would (wrongly) detect that C is already indexed and return early. In this patch: -we introduce the "\|indexed" suffix to the "have" - value part - row (receive.go). If a blob is received but some of its dependencies are missing, the have row value is written without the suffix. Upon reception of a blob, we now test for the presence of the suffix in the have row. If missing, the reception continues instead of returning early. The existing mechanism that was detecting missing dependencies for file blobs has been adapted to work with this suffix too. -the index enumeration (enumstat.go), which relies on "have" rows, has been adapted to work with the new "have" row format, while staying compatible with the old format. And related tests have been added. http://camlistore.org/issue/454 Change-Id: I2559d08a12b2a4e0f0691fc7e31f1ed1f874625e 2014-07-03 16:07:08 +00:00			`if k <= afterKey {`
Start of new context package and *context.Context type. Will eventually be plumbed through lots of APIs, especially those requiring or benefiting from cancelation notification and/or those needing access to the HTTP context (e.g. App Engine). Change-Id: I591496725d620126e09d49eb07cade7707c7fc64 2013-12-02 21:20:51 +00:00			`continue`
			`}`
memory index enumerate support. untested. Change-Id: I28df97ed6385675545a74a6e01e779cc627ef033 2011-11-10 18:28:44 +00:00			`if !strings.HasPrefix(k, "have:") {`
			`break`
			`}`
			`n++`
all: delete pkg/blobref; convert all from *blobref.BlobRef to new blob.Ref Change-Id: Id2dfb7f19452bedf4f3c9310b36227fd8117b225 2013-08-04 02:54:30 +00:00			`br, ok := blob.Parse(k[len("have:"):])`
index: actually reindex when out of order problem: the out-of-order mechanism based on the outOfOrderIndexerLoop was not working for some claims. Let C be a delete claim on permanode P. If C was received before P was, C was marked as being received with the "have" index row. However, for the deletion to be marked in the index, some information about P is needed (its meta row), so C could not be fully indexed upon reception. Then, when P was finally received, the outOfOrderIndexerLoop would kick in and retry indexing C. Which would fail, because a test based on the "have" row would (wrongly) detect that C is already indexed and return early. In this patch: -we introduce the "\|indexed" suffix to the "have" - value part - row (receive.go). If a blob is received but some of its dependencies are missing, the have row value is written without the suffix. Upon reception of a blob, we now test for the presence of the suffix in the have row. If missing, the reception continues instead of returning early. The existing mechanism that was detecting missing dependencies for file blobs has been adapted to work with this suffix too. -the index enumeration (enumstat.go), which relies on "have" rows, has been adapted to work with the new "have" row format, while staying compatible with the old format. And related tests have been added. http://camlistore.org/issue/454 Change-Id: I2559d08a12b2a4e0f0691fc7e31f1ed1f874625e 2014-07-03 16:07:08 +00:00			`if !ok {`
			`continue`
			`}`
			`size, err := parseHaveVal(it.Value())`
			`if err == nil {`
Start of new context package and *context.Context type. Will eventually be plumbed through lots of APIs, especially those requiring or benefiting from cancelation notification and/or those needing access to the HTTP context (e.g. App Engine). Change-Id: I591496725d620126e09d49eb07cade7707c7fc64 2013-12-02 21:20:51 +00:00			`select {`
Use 'uint32' instead of 'int64' for blob sizes everywhere. Not just in blob.SizedRef, but in blobserver.Fetch and blobserver.FetchStreaming, too. Blobs have a max size of 10-32 MB anyway, and the index.Corpus is now using uint32 to save memory. Change-Id: I1172445c2f9463fdaee55bfe0f1218d44be4aa53 2014-01-28 20:46:52 +00:00			`case dest <- blob.SizedRef{br, uint32(size)}:`
Start of new context package and *context.Context type. Will eventually be plumbed through lots of APIs, especially those requiring or benefiting from cancelation notification and/or those needing access to the HTTP context (e.g. App Engine). Change-Id: I591496725d620126e09d49eb07cade7707c7fc64 2013-12-02 21:20:51 +00:00			`case <-ctx.Done():`
			`return context.ErrCanceled`
			`}`
memory index enumerate support. untested. Change-Id: I28df97ed6385675545a74a6e01e779cc627ef033 2011-11-10 18:28:44 +00:00			`}`
			`}`
Start of new context package and *context.Context type. Will eventually be plumbed through lots of APIs, especially those requiring or benefiting from cancelation notification and/or those needing access to the HTTP context (e.g. App Engine). Change-Id: I591496725d620126e09d49eb07cade7707c7fc64 2013-12-02 21:20:51 +00:00			`return nil`
start of enumerate and stat for generic indexer Change-Id: Iabac9959a239d2418ab351e37895c7f0721e4e58 2011-11-10 17:47:32 +00:00			`}`

Cleanup: remove BlobHub and time.Duration waits from storage interface Move up a layer to the HTTP. Also, start to remove ContextWrapper stuff. We've done it differently for App Engine instead, and will do it differently yet moving forward. Also add blobserver.Receive and use it in most places, moving checksum verification up a layer. Bunch of other cleanup and TODO fixing too. Much simpler and cleaner. Change-Id: I12e56c5d4e53bfcf82bdd8fb0b6d57c248ff605c 2013-08-21 20:57:28 +00:00			`func (ix *Index) StatBlobs(dest chan<- blob.SizedRef, blobs []blob.Ref) error {`
generic index: untested stat support. Change-Id: Idff5ff19d7b18ef6581c6c8bc344230470c468ef 2011-11-11 02:58:04 +00:00			`for _, br := range blobs {`
			`key := "have:" + br.String()`
			`v, err := ix.s.Get(key)`
Rename index.Storage to sorted.KeyValue and move it into a new package. Having index.Index and index.Storage both in the same package led to confusing discussions about "an index". Better names now, and smaller packages. 2013-11-23 07:24:54 +00:00			`if err == sorted.ErrNotFound {`
generic index: untested stat support. Change-Id: Idff5ff19d7b18ef6581c6c8bc344230470c468ef 2011-11-11 02:58:04 +00:00			`continue`
			`}`
			`if err != nil {`
			`return fmt.Errorf("error looking up key %q: %v", key, err)`
			`}`
index: actually reindex when out of order problem: the out-of-order mechanism based on the outOfOrderIndexerLoop was not working for some claims. Let C be a delete claim on permanode P. If C was received before P was, C was marked as being received with the "have" index row. However, for the deletion to be marked in the index, some information about P is needed (its meta row), so C could not be fully indexed upon reception. Then, when P was finally received, the outOfOrderIndexerLoop would kick in and retry indexing C. Which would fail, because a test based on the "have" row would (wrongly) detect that C is already indexed and return early. In this patch: -we introduce the "\|indexed" suffix to the "have" - value part - row (receive.go). If a blob is received but some of its dependencies are missing, the have row value is written without the suffix. Upon reception of a blob, we now test for the presence of the suffix in the have row. If missing, the reception continues instead of returning early. The existing mechanism that was detecting missing dependencies for file blobs has been adapted to work with this suffix too. -the index enumeration (enumstat.go), which relies on "have" rows, has been adapted to work with the new "have" row format, while staying compatible with the old format. And related tests have been added. http://camlistore.org/issue/454 Change-Id: I2559d08a12b2a4e0f0691fc7e31f1ed1f874625e 2014-07-03 16:07:08 +00:00			`size, err := parseHaveVal(v)`
generic index: untested stat support. Change-Id: Idff5ff19d7b18ef6581c6c8bc344230470c468ef 2011-11-11 02:58:04 +00:00			`if err != nil {`
			`return fmt.Errorf("invalid size for key %q = %q", key, v)`
			`}`
Use 'uint32' instead of 'int64' for blob sizes everywhere. Not just in blob.SizedRef, but in blobserver.Fetch and blobserver.FetchStreaming, too. Blobs have a max size of 10-32 MB anyway, and the index.Corpus is now using uint32 to save memory. Change-Id: I1172445c2f9463fdaee55bfe0f1218d44be4aa53 2014-01-28 20:46:52 +00:00			`dest <- blob.SizedRef{br, uint32(size)}`
generic index: untested stat support. Change-Id: Idff5ff19d7b18ef6581c6c8bc344230470c468ef 2011-11-11 02:58:04 +00:00			`}`
start of enumerate and stat for generic indexer Change-Id: Iabac9959a239d2418ab351e37895c7f0721e4e58 2011-11-10 17:47:32 +00:00			`return nil`
			`}`
index: actually reindex when out of order problem: the out-of-order mechanism based on the outOfOrderIndexerLoop was not working for some claims. Let C be a delete claim on permanode P. If C was received before P was, C was marked as being received with the "have" index row. However, for the deletion to be marked in the index, some information about P is needed (its meta row), so C could not be fully indexed upon reception. Then, when P was finally received, the outOfOrderIndexerLoop would kick in and retry indexing C. Which would fail, because a test based on the "have" row would (wrongly) detect that C is already indexed and return early. In this patch: -we introduce the "\|indexed" suffix to the "have" - value part - row (receive.go). If a blob is received but some of its dependencies are missing, the have row value is written without the suffix. Upon reception of a blob, we now test for the presence of the suffix in the have row. If missing, the reception continues instead of returning early. The existing mechanism that was detecting missing dependencies for file blobs has been adapted to work with this suffix too. -the index enumeration (enumstat.go), which relies on "have" rows, has been adapted to work with the new "have" row format, while staying compatible with the old format. And related tests have been added. http://camlistore.org/issue/454 Change-Id: I2559d08a12b2a4e0f0691fc7e31f1ed1f874625e 2014-07-03 16:07:08 +00:00
			`// parseHaveVal takes the value part of an "have" index row and returns`
			`// the blob size found in that value. Examples:`
			`// parseHaveVal("324\|indexed") == 324`
			`// parseHaveVal("654") == 654`
			`func parseHaveVal(val string) (size uint64, err error) {`
			`pipei := strings.Index(val, "\|")`
			`if pipei >= 0 {`
			`// filter out the "indexed" suffix`
			`val = val[:pipei]`
			`}`
			`return strconv.ParseUint(val, 10, 32)`
			`}`