/*
Copyright 2011 The Perkeep Authors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package index

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	_ "image/gif"
	_ "image/jpeg"
	_ "image/png"
	"io"
	"log"
	"math"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/hjfreyer/taglib-go/taglib"
	"github.com/rwcarlsen/goexif/exif"
	"github.com/rwcarlsen/goexif/tiff"
	_ "go4.org/media/heif"
	"go4.org/readerutil"
	"go4.org/types"
	"perkeep.org/internal/images"
	"perkeep.org/internal/magic"
	"perkeep.org/internal/media"
	"perkeep.org/pkg/blob"
	"perkeep.org/pkg/blobserver"
	"perkeep.org/pkg/jsonsign"
	"perkeep.org/pkg/schema"
)

// init parses the msdosEpoch constant once at startup so that populateFile
// can compare file times against msdosEpochTime.
func init() {
	t, err := time.Parse(time.RFC3339, msdosEpoch)
	if err != nil {
		panic(fmt.Sprintf("Cannot parse MSDOS epoch: %v", err))
	}
	msdosEpochTime = t
}

type mutationMap struct {
	// When the mutations are from a claim, signerBlobRef is the signer of the
	// claim, and signerID is its matching GPG key ID. They are copied out of kv because,
	// when adding the corresponding entries in the corpus, the signerBlobRef-signerID
	// relation needs to be known before the claim mutations themselves, so we need to
	// make sure the keySignerKeyID entry is always added first.
	signerBlobRef blob.Ref
	signerID      string
	kv            map[string]string // the keys and values we populate

	// We record whether we got a delete claim, so we can update
	// the deletes cache right after committing the mutation.
	//
	// TODO(mpl): we only need to keep track of one claim so far,
	// but a slice leaves room for handling multiple claims later.
	deletes []schema.Claim
}

func (mm *mutationMap) Set(k, v string) {
	if mm.kv == nil {
		mm.kv = make(map[string]string)
	}
	mm.kv[k] = v
}

func (mm *mutationMap) noteDelete(deleteClaim schema.Claim) {
	mm.deletes = append(mm.deletes, deleteClaim)
}

// blobsFilteringOut returns v with all occurrences of x removed,
// reusing v's backing array.
func blobsFilteringOut(v []blob.Ref, x blob.Ref) []blob.Ref {
	switch len(v) {
	case 0:
		return nil
	case 1:
		if v[0] == x {
			return nil
		}
		return v
	}
	nl := v[:0]
	for _, vb := range v {
		if vb != x {
			nl = append(nl, vb)
		}
	}
	return nl
}

// indexBlob fetches br from the index's blob source and indexes it by
// handing it back to blobserver.Receive.
func (ix *Index) indexBlob(ctx context.Context, br blob.Ref) error {
	rc, _, err := ix.blobSource.Fetch(ctx, br)
	if err != nil {
		return fmt.Errorf("index: failed to fetch %v for reindexing: %v", br, err)
	}
	defer rc.Close()
	if _, err := blobserver.Receive(ctx, ix, br, rc); err != nil {
		return err
	}
	return nil
}

// DisableOutOfOrderIndexing should only be used for tests. It disables the
// asynchronous, out-of-order indexing, to demonstrate that e.g. reindexing
// fails without it.
func (ix *Index) DisableOutOfOrderIndexing() {
	ix.Lock()
	defer ix.Unlock()
	ix.oooDisabled = true
}

// indexReadyBlobs indexes blobs that have been recently marked as ready to be
// reindexed, after the blobs they depend on eventually were indexed.
func (ix *Index) indexReadyBlobs(ctx context.Context) {
	defer ix.reindexWg.Done()
	ix.RLock()
	// For tests
	if ix.oooDisabled {
		ix.RUnlock()
		return
	}
	ix.RUnlock()
	failed := make(map[blob.Ref]bool)
	for {
		ix.Lock()
		if len(ix.readyReindex) == 0 {
			ix.Unlock()
			return
		}
		var br blob.Ref
		for br = range ix.readyReindex {
			break
		}
		delete(ix.readyReindex, br)
		ix.Unlock()
		if err := ix.indexBlob(ctx, br); err != nil {
			log.Printf("out-of-order indexBlob(%v) = %v", br, err)
			failed[br] = true
		}
	}
	// TODO(aviau): This code is unreachable. Will fix this in a follow-up PR.
	/*
		ix.Lock()
		defer ix.Unlock()
		for br := range failed {
			ix.readyReindex[br] = true
		}
	*/
}

// noteBlobIndexed checks whether the recent indexing of br now allows the
// blobs that were depending on br to be indexed in turn. If yes, they're
// reindexed asynchronously by indexReadyBlobs.
func (ix *Index) noteBlobIndexed(br blob.Ref) {
	for _, needer := range ix.neededBy[br] {
		newNeeds := blobsFilteringOut(ix.needs[needer], br)
		if len(newNeeds) == 0 {
			ix.readyReindex[needer] = true
			delete(ix.needs, needer)
			ix.reindexWg.Add(1)
			go ix.indexReadyBlobs(context.Background())
		} else {
			ix.needs[needer] = newNeeds
		}
	}
	delete(ix.neededBy, br)
}
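
// Note: together, noteNeeded (called from ReceiveBlob), noteBlobIndexed, and
// indexReadyBlobs implement the out-of-order indexing scheme: ix.needs maps a
// blob to the blobs it is still waiting for, ix.neededBy is the reverse index,
// and ix.readyReindex holds blobs whose last missing dependency has arrived
// and which are then re-received asynchronously via indexBlob.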

// removeAllMissingEdges removes all keyMissing rows recorded for br.
func (ix *Index) removeAllMissingEdges(br blob.Ref) {
	var toDelete []string
	it := ix.queryPrefix(keyMissing, br)
	for it.Next() {
		toDelete = append(toDelete, it.Key())
	}
	if err := it.Close(); err != nil {
		// TODO: Care? Can lazily clean up later.
		log.Printf("Iterator close error: %v", err)
	}
	for _, k := range toDelete {
		if err := ix.s.Delete(k); err != nil {
			log.Printf("Error deleting key %s: %v", k, err)
		}
	}
}

func (ix *Index) ReceiveBlob(ctx context.Context, blobRef blob.Ref, source io.Reader) (blob.SizedRef, error) {
	// Read from source before acquiring ix.Lock (Issue 878):
	sniffer := NewBlobSniffer(blobRef)
	written, err := io.Copy(sniffer, source)
	if err != nil {
		return blob.SizedRef{}, err
	}
	sbr := blob.SizedRef{Ref: blobRef, Size: uint32(written)}

	ix.Lock()
	defer ix.Unlock()

	missingDeps := false
	defer func() {
		if err == nil {
			ix.noteBlobIndexed(blobRef)
			if !missingDeps {
				ix.removeAllMissingEdges(blobRef)
			}
		}
	}()

	// By default, return immediately if it looks like we already
	// have indexed this blob before. But if the user has
	// CAMLI_REDO_INDEX_ON_RECEIVE set in their environment,
	// always index it. This is generally only useful when working
	// on the indexing code and retroactively indexing a subset of
	// content without forcing a global reindexing.
	if haveVal, haveErr := ix.s.Get("have:" + blobRef.String()); haveErr == nil {
		if strings.HasSuffix(haveVal, "|indexed") {
			if allowReindex, _ := strconv.ParseBool(os.Getenv("CAMLI_REDO_INDEX_ON_RECEIVE")); allowReindex {
				if debugEnv {
					log.Printf("index: reindexing %v", sbr)
				}
			} else {
				if debugEnv {
					log.Printf("index: ignoring upload of already-indexed %v", sbr)
				}
				return sbr, nil
			}
		}
	}

	sniffer.Parse()

	fetcher := &missTrackFetcher{
		fetcher: ix.blobSource,
	}

	mm, err := ix.populateMutationMap(ctx, fetcher, blobRef, sniffer)
	if debugEnv {
		log.Printf("index of %v: mm=%v, err=%v", blobRef, mm, err)
	}
	if err != nil {
		if err != errMissingDep {
			return blob.SizedRef{}, err
		}
		fetcher.mu.Lock()
		defer fetcher.mu.Unlock()
		if len(fetcher.missing) == 0 {
			panic("errMissingDep happened, but no fetcher.missing recorded")
		}
		missingDeps = true
		allRecorded := true
		for _, missing := range fetcher.missing {
			if err := ix.noteNeeded(blobRef, missing); err != nil {
				allRecorded = false
			}
		}
		if allRecorded {
			// Lie and say things are good. We've
			// successfully recorded that the blob isn't
			// indexed, but we'll reindex it later once
			// the dependent blobs arrive.
			return sbr, nil
		}
		return blob.SizedRef{}, err
	}

	if err := ix.commit(mm); err != nil {
		return blob.SizedRef{}, err
	}

	if c := ix.corpus; c != nil {
		if err = c.addBlob(ctx, blobRef, mm); err != nil {
			return blob.SizedRef{}, err
		}
	}

	// TODO(bradfitz): log levels? These are generally noisy
	// (especially in tests, like search/handler_test), but I
	// could see it being useful in production. For now, disabled:
	//
	// mimeType := sniffer.MIMEType()
	// log.Printf("indexer: received %s; type=%v; truncated=%v", blobRef, mimeType, sniffer.IsTruncated())

	return blob.SizedRef{Ref: blobRef, Size: uint32(written)}, nil
}
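
// When populateMutationMap returns errMissingDep, ReceiveBlob records each
// missing blob with noteNeeded and reports success anyway: the blob's "have"
// row is written without the "|indexed" suffix, and the blob is re-indexed
// later by indexReadyBlobs once the missing dependencies arrive.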

// commit writes the contents of the mutationMap to a batch
// mutation and commits that batch. It also updates the deletes
// cache.
func (ix *Index) commit(mm *mutationMap) error {
	// We want the update of the deletes cache to be atomic
	// with the transaction commit, so we lock here instead
	// of within updateDeletesCache.
	ix.deletes.Lock()
	defer ix.deletes.Unlock()
	bm := ix.s.BeginBatch()
	for k, v := range mm.kv {
		bm.Set(k, v)
	}
	err := ix.s.CommitBatch(bm)
	if err != nil {
		return err
	}
	for _, cl := range mm.deletes {
		if err := ix.updateDeletesCache(cl); err != nil {
			return fmt.Errorf("Could not update the deletes cache after deletion from %v: %v", cl, err)
		}
	}
	return nil
}

// populateMutationMap populates, into the returned mutationMap, the keys &
// values that will be committed to the index.
//
// The blobref can be trusted at this point (it's been fully consumed
// and verified to match), and the sniffer has been populated.
func (ix *Index) populateMutationMap(ctx context.Context, fetcher *missTrackFetcher, br blob.Ref, sniffer *BlobSniffer) (*mutationMap, error) {
	mm := &mutationMap{
		kv: map[string]string{
			"meta:" + br.String(): fmt.Sprintf("%d|%s", sniffer.Size(), sniffer.MIMEType()),
		},
	}
	var err error
	if blob, ok := sniffer.SchemaBlob(); ok {
		switch blob.Type() {
		case schema.TypeClaim:
			err = ix.populateClaim(ctx, fetcher, blob, mm)
		case schema.TypeFile:
			err = ix.populateFile(ctx, fetcher, blob, mm)
		case schema.TypeDirectory:
			err = ix.populateDir(ctx, fetcher, blob, mm)
		}
	}
	if err != nil && err != errMissingDep {
		return nil, err
	}
	var haveVal string
	if err == errMissingDep {
		haveVal = fmt.Sprintf("%d", sniffer.Size())
	} else {
		haveVal = fmt.Sprintf("%d|indexed", sniffer.Size())
	}
	mm.kv["have:"+br.String()] = haveVal
	if len(fetcher.missing) == 0 {
		// If err == nil, we're good. Else (err == errMissingDep), we
		// know the error did not come from a fetching miss (because
		// len(fetcher.missing) == 0), but from an index miss. Therefore
		// we know the miss has already been noted and will be dealt with
		// later, so we can also pretend everything's fine.
		return mm, nil
	}
	return mm, err
}
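
// The "have:<blobref>" row written above has the value "<size>|indexed" once
// the blob has been fully indexed, and just "<size>" when some dependency was
// still missing; ReceiveBlob checks for the "|indexed" suffix to decide
// whether an incoming blob can be skipped.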

// keepFirstN keeps the first N bytes written to it in Bytes.
type keepFirstN struct {
	N     int
	Bytes []byte
}

func (w *keepFirstN) Write(p []byte) (n int, err error) {
	if n := w.N - len(w.Bytes); n > 0 {
		if n > len(p) {
			n = len(p)
		}
		w.Bytes = append(w.Bytes, p[:n]...)
	}
	return len(p), nil
}
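
// Write always reports the full len(p) and never returns an error, so that
// the io.MultiWriter in populateFile keeps feeding the whole stream to the
// content hash while only the first N bytes are buffered for image sniffing.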

// missTrackFetcher is a blob.Fetcher that records which blob(s) it
// failed to load from its underlying fetcher.
type missTrackFetcher struct {
	fetcher blob.Fetcher

	mu      sync.Mutex // guards missing
	missing []blob.Ref
}

func (f *missTrackFetcher) Fetch(ctx context.Context, br blob.Ref) (blob io.ReadCloser, size uint32, err error) {
	blob, size, err = f.fetcher.Fetch(ctx, br)
	if err == os.ErrNotExist {
		f.mu.Lock()
		defer f.mu.Unlock()
		f.missing = append(f.missing, br)
	}
	return
}

// trackErrorsFetcher is a blob.Fetcher that records to errs all Fetch errors.
type trackErrorsFetcher struct {
	mu   sync.RWMutex
	errs []error

	f blob.Fetcher
}

func (tf *trackErrorsFetcher) Fetch(ctx context.Context, br blob.Ref) (blob io.ReadCloser, size uint32, err error) {
	blob, size, err = tf.f.Fetch(ctx, br)
	if err != nil {
		tf.mu.Lock()
		defer tf.mu.Unlock()
		tf.errs = append(tf.errs, err)
	}
	return
}

// hasErrNotExist reports whether tf recorded any error and if all of them are
// os.ErrNotExist errors.
func (tf *trackErrorsFetcher) hasErrNotExist() bool {
	tf.mu.RLock()
	defer tf.mu.RUnlock()
	if len(tf.errs) == 0 {
		return false
	}
	for _, v := range tf.errs {
		if v != os.ErrNotExist {
			return false
		}
	}
	return true
}
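
// A single missTrackFetcher is shared by everything indexing one incoming
// blob, while populateFile and populateDir each wrap it in their own
// trackErrorsFetcher: the per-call errs slice lets them tell whether their
// own fetches failed with os.ErrNotExist (and so return errMissingDep)
// without being confused by other callers sharing the missTrackFetcher.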

// filePrefixReader is both a *bytes.Reader and a *schema.FileReader for use in readPrefixOrFile
type filePrefixReader interface {
	io.Reader
	io.ReaderAt
}

// readPrefixOrFile executes the given func with a reader on the passed prefix
// and falls back to passing a reader on the whole file if the func returns
// io.EOF or io.ErrUnexpectedEOF.
func readPrefixOrFile(prefix []byte, fetcher blob.Fetcher, b *schema.Blob, fn func(filePrefixReader) error) (err error) {
	pr := bytes.NewReader(prefix)
	err = fn(pr)
	if err == io.EOF || err == io.ErrUnexpectedEOF {
		var fr *schema.FileReader
		fr, err = b.NewFileReader(fetcher)
		if err == nil {
			err = fn(fr)
			fr.Close()
		}
	}
	return err
}
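
// populateFile below relies on this two-step strategy: image metadata
// (config, EXIF, file time) is first decoded from the buffered 512 KB prefix,
// and only when that prefix turns out to be too short is the whole file
// re-read from the fetcher.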

const msdosEpoch = "1980-01-01T00:00:00Z"

var (
	exifDebug, _ = strconv.ParseBool(os.Getenv("CAMLI_DEBUG_IMAGES"))
	debugEnv, _  = strconv.ParseBool(os.Getenv("CAMLI_DEBUG"))

	msdosEpochTime time.Time
)

// b: the parsed file schema blob
// mm: keys to populate
func (ix *Index) populateFile(ctx context.Context, fetcher blob.Fetcher, b *schema.Blob, mm *mutationMap) (err error) {
	var times []time.Time // all creation or mod times seen; may be zero
	times = append(times, b.ModTime())

	blobRef := b.BlobRef()
	tf := &trackErrorsFetcher{f: fetcher.(*missTrackFetcher)}
	fr, err := b.NewFileReader(tf)
	if err != nil {
		return err
	}
	defer fr.Close()
	mimeType, mr := magic.MIMETypeFromReader(fr)
	if mimeType == "" {
		mimeType = magic.MIMETypeByExtension(filepath.Ext(b.FileName()))
	}

	h := blob.NewHash()
	var copyDest io.Writer = h
	var imageBuf *keepFirstN // or nil
	if strings.HasPrefix(mimeType, "image/") {
		imageBuf = &keepFirstN{N: 512 << 10}
		copyDest = io.MultiWriter(copyDest, imageBuf)
	}
	size, err := io.Copy(copyDest, mr)
	if err != nil {
		if tf.hasErrNotExist() {
			return errMissingDep
		}
		return err
	}
	wholeRef := blob.RefFromHash(h)

	if imageBuf != nil {
		var conf images.Config
		decodeConfig := func(r filePrefixReader) error {
			conf, err = images.DecodeConfig(r)
			return err
		}
		if err := readPrefixOrFile(imageBuf.Bytes, fetcher, b, decodeConfig); err == nil {
			mm.Set(keyImageSize.Key(blobRef), keyImageSize.Val(fmt.Sprint(conf.Width), fmt.Sprint(conf.Height)))
		} else if debugEnv {
			log.Printf("index: WARNING: image decodeConfig: %v", err)
		}

		exifData := imageBuf.Bytes
		if conf.HEICEXIF != nil {
			exifData = conf.HEICEXIF
		}
		var ft time.Time
		fileTime := func(r filePrefixReader) error {
			ft, err = schema.FileTime(r)
			return err
		}

		if err = readPrefixOrFile(exifData, fetcher, b, fileTime); err == nil {
			times = append(times, ft)
		} else if debugEnv {
			log.Printf("index: WARNING: image fileTime: %v", err)
		}
		if exifDebug {
			log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err)
		}

		// TODO(mpl): find (generate?) more broken EXIF images to experiment with.
		indexEXIFData := func(r filePrefixReader) error {
			return indexEXIF(wholeRef, r, mm)
		}
		if err = readPrefixOrFile(exifData, fetcher, b, indexEXIFData); err != nil {
			if exifDebug {
				log.Printf("error parsing EXIF: %v", err)
			}
		}
	}

	var sortTimes []time.Time
	for _, t := range times {
		if !t.IsZero() {
			sortTimes = append(sortTimes, t)
		}
	}
	sort.Sort(types.ByTime(sortTimes))
	var time3339s string
	switch {
	case len(sortTimes) == 1:
		time3339s = types.Time3339(sortTimes[0]).String()
	case len(sortTimes) >= 2:
		oldest, newest := sortTimes[0], sortTimes[len(sortTimes)-1]
		// Common enough exception: unset creation time from an MSDOS
		// system (which is the default in zip files). So if we have
		// another time to use, just ignore the MSDOS epoch one.
		if oldest.After(msdosEpochTime) {
			time3339s = types.Time3339(oldest).String() + "," + types.Time3339(newest).String()
		} else {
			time3339s = types.Time3339(newest).String()
		}
	}

	mm.Set(keyWholeToFileRef.Key(wholeRef, blobRef), "1")
	mm.Set(keyFileInfo.Key(blobRef), keyFileInfo.Val(size, b.FileName(), mimeType, wholeRef))
	mm.Set(keyFileTimes.Key(blobRef), keyFileTimes.Val(time3339s))

	if strings.HasPrefix(mimeType, "audio/") {
		indexMusic(io.NewSectionReader(fr, 0, fr.Size()), wholeRef, mm)
	}

	return nil
}

func tagFormatString(tag *tiff.Tag) string {
	switch tag.Format() {
	case tiff.IntVal:
		return "int"
	case tiff.RatVal:
		return "rat"
	case tiff.FloatVal:
		return "float"
	case tiff.StringVal:
		return "string"
	}
	return ""
}

type exifWalkFunc func(name exif.FieldName, tag *tiff.Tag) error

func (f exifWalkFunc) Walk(name exif.FieldName, tag *tiff.Tag) error { return f(name, tag) }

var errEXIFPanic = errors.New("EXIF library panicked while walking fields")

// indexEXIF parses the EXIF data in r and records each EXIF tag (and any GPS
// position) in mm, keyed by wholeRef.
func indexEXIF(wholeRef blob.Ref, r io.Reader, mm *mutationMap) (err error) {
	var tiffErr error
	ex, err := exif.Decode(r)
	if err != nil {
		tiffErr = err
		if exif.IsCriticalError(err) {
			if exif.IsShortReadTagValueError(err) {
				return io.ErrUnexpectedEOF // trigger a retry with whole file
			}
			return
		}
		log.Printf("Non critical TIFF decoding error: %v", err)
	}
	defer func() {
		// The EXIF library panics if you access a field past
		// what the file contains. Be paranoid and just
		// recover here, instead of crashing on an invalid
		// EXIF file.
		if e := recover(); e != nil {
			err = errEXIFPanic
		}
	}()

	err = ex.Walk(exifWalkFunc(func(name exif.FieldName, tag *tiff.Tag) error {
		tagFmt := tagFormatString(tag)
		if tagFmt == "" {
			return nil
		}
		key := keyEXIFTag.Key(wholeRef, fmt.Sprintf("%04x", tag.Id))
		numComp := int(tag.Count)
		if tag.Format() == tiff.StringVal {
			numComp = 1
		}
		var val bytes.Buffer
		val.WriteString(keyEXIFTag.Val(tagFmt, numComp, ""))
		if tag.Format() == tiff.StringVal {
			str, err := tag.StringVal()
			if err != nil {
				log.Printf("Invalid EXIF string data: %v", err)
				return nil
			}
			if containsUnsafeRawStrByte(str) {
				val.WriteString(urle(str))
			} else {
				val.WriteString(str)
			}
		} else {
			for i := 0; i < int(tag.Count); i++ {
				if i > 0 {
					val.WriteByte('|')
				}
				switch tagFmt {
				case "int":
					v, err := tag.Int(i)
					if err != nil {
						log.Printf("Invalid EXIF int data: %v", err)
						return nil
					}
					fmt.Fprintf(&val, "%d", v)
				case "rat":
					n, d, err := tag.Rat2(i)
					if err != nil {
						log.Printf("Invalid EXIF rat data: %v", err)
						return nil
					}
					fmt.Fprintf(&val, "%d/%d", n, d)
				case "float":
					v, err := tag.Float(i)
					if err != nil {
						log.Printf("Invalid EXIF float data: %v", err)
						return nil
					}
					fmt.Fprintf(&val, "%v", v)
				default:
					panic("shouldn't get here")
				}
			}
		}
		valStr := val.String()
		mm.Set(key, valStr)
		return nil
	}))
	if err != nil {
		return
	}

	if exif.IsGPSError(tiffErr) {
		log.Printf("Invalid EXIF GPS data: %v", tiffErr)
		return nil
	}
	if lat, long, err := ex.LatLong(); err == nil {
		if math.Abs(long) > 180.0 || math.Abs(lat) > 90.0 {
			log.Printf("Long, lat outside allowed range: %v, %v", long, lat)
			return nil
		}
		if math.IsNaN(long) || math.IsNaN(lat) {
			log.Print("Latitude or Longitude is NaN")
			return nil
		}
		// index 7 places fixed precision (~10mm worst case at equator)
		// http://stackoverflow.com/a/1947615/114581
		mm.Set(keyEXIFGPS.Key(wholeRef), keyEXIFGPS.Val(fmt.Sprintf("%.7f", lat), fmt.Sprintf("%.7f", long)))
	} else if !exif.IsTagNotPresentError(err) {
		log.Printf("Invalid EXIF GPS data: %v", err)
	}
	return nil
}

// indexMusic adds mutations to index the wholeRef by attached metadata and other properties.
func indexMusic(r readerutil.SizeReaderAt, wholeRef blob.Ref, mm *mutationMap) {
	tag, err := taglib.Decode(r, r.Size())
	if err != nil {
		log.Print("index: error parsing tag: ", err)
		return
	}

	var footerLength int64 = 0
	if hasTag, err := media.HasID3v1Tag(r); err != nil {
		log.Print("index: unable to check for ID3v1 tag: ", err)
		return
	} else if hasTag {
		footerLength = media.ID3v1TagLength
	}

	// Generate a hash of the audio portion of the file (i.e. excluding ID3v1 and v2 tags).
	audioStart := int64(tag.TagSize())
	audioSize := r.Size() - audioStart - footerLength
	hash := blob.NewHash()
	if _, err := io.Copy(hash, io.NewSectionReader(r, audioStart, audioSize)); err != nil {
		log.Print("index: error generating hash of audio data: ", err)
		return
	}
	mediaRef := blob.RefFromHash(hash)

	duration, err := media.GetMPEGAudioDuration(io.NewSectionReader(r, audioStart, audioSize))
	if err != nil {
		log.Print("index: unable to calculate audio duration: ", err)
		duration = 0
	}

	var yearStr, trackStr, discStr, durationStr string
	if !tag.Year().IsZero() {
		const justYearLayout = "2006"
		yearStr = tag.Year().Format(justYearLayout)
	}
	if tag.Track() != 0 {
		trackStr = fmt.Sprintf("%d", tag.Track())
	}
	if tag.Disc() != 0 {
		discStr = fmt.Sprintf("%d", tag.Disc())
	}
	if duration != 0 {
		durationStr = fmt.Sprintf("%d", duration/time.Millisecond)
	}

	// Note: if you add to this map, please update
	// pkg/search/query.go's MediaTagConstraint Tag docs.
	tags := map[string]string{
		"title":              tag.Title(),
		"artist":             tag.Artist(),
		"album":              tag.Album(),
		"genre":              tag.Genre(),
		"musicbrainzalbumid": tag.CustomFrames()["MusicBrainz Album Id"],
		"year":               yearStr,
		"track":              trackStr,
		"disc":               discStr,
		"mediaref":           mediaRef.String(),
		"durationms":         durationStr,
	}

	for tag, value := range tags {
		if value != "" {
			mm.Set(keyMediaTag.Key(wholeRef, tag), keyMediaTag.Val(value))
		}
	}
}
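
// The "mediaref" value above is the hash of only the audio bytes (ID3v1/v2
// tags excluded), so it stays stable when a file's tags are edited.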
|
|
|
|
|
2013-11-14 23:49:52 +00:00
|
|
|
// b: the parsed file schema blob
|
|
|
|
// mm: keys to populate
|
2018-01-16 23:03:16 +00:00
|
|
|
func (ix *Index) populateDir(ctx context.Context, fetcher blob.Fetcher, b *schema.Blob, mm *mutationMap) error {
|
2013-08-04 02:54:30 +00:00
|
|
|
blobRef := b.BlobRef()
|
2013-02-19 05:31:41 +00:00
|
|
|
// TODO(bradfitz): move the NewDirReader and FileName method off *schema.Blob and onto
|
2014-03-14 16:14:44 +00:00
|
|
|
// StaticFile/StaticDirectory or something.
|
2013-01-22 17:32:40 +00:00
|
|
|
|
pkg/index: use missing dep mechanism for static sets too
We relied on missTrackFetcher to return errMissingDep when the
underlying Fetch() returned os.ErrNotExist. The caller could then know
how to act if some indexing operation failed because of an errMissingDep
error.
This was wrong for 2 reasons:
1) if a function fn(tf blob.Fetcher) error does:
if _, _, err := tf.Fetch(br); err != nil {
return fmt.Errorf("wrapping this error in a nicer error
message: %v", err)
}
when we call err := fn(tf), we lose the ability to directly determine
whether err is an errMissingDep. We'd have to parse the error string,
which is gross.
This is exactly what happens in populateDir, when we call
dr.StaticSet().
And in order to fix issue #738, we want to be able to tell when a call
to dr.StaticSet() failed because the underlying Fetch() operation
failed.
2) The blob.Fetcher interface specifically states that os.ErrNotExist
should be returned when a blob is not found. We were breaking that rule
by returning errMissingDep.
In order to address both 1) and 2), it seemed like we could add an err
field to missTrackFetcher to keep track of when an os.ErrNotExist
occurred during a Fetch, and let Fetch return an os.ErrNotExist.
However, that would not work, as a missTrackFetcher is used concurrently
by several callers, so a given caller wouldn't be able to tell whether
"its" Fetch failed or a Fetch from a concurrent caller failed.
Therefore, we introduce trackErrorsFetcher, that has such an error field,
and that wraps the missTrackFetcher. All the callers can keep on sharing
the missTrackFetcher, but each of them initialize their own
trackErrorsFetcher, and can check the errors field after a failed call to a
function is suspected to be the result of a failed Fetch.
Also added a test to demonstrate that issue #738 is fixed.
Fixes #738
Change-Id: Ia5c3081b71c77be1e8cff0bbc847ade68f019bf9
2016-12-12 21:19:35 +00:00
|
|
|
tf := &trackErrorsFetcher{f: fetcher.(*missTrackFetcher)}
|
2018-01-16 23:03:16 +00:00
|
|
|
dr, err := b.NewDirReader(ctx, tf)
|
2013-01-09 15:58:20 +00:00
|
|
|
if err != nil {
|
|
|
|
// TODO(bradfitz): propagate up a transient failure
|
|
|
|
// error type, so we can retry indexing files in the
|
|
|
|
// future if blobs are only temporarily unavailable.
|
|
|
|
log.Printf("index: error indexing directory, creating NewDirReader %s: %v", blobRef, err)
|
|
|
|
return nil
|
|
|
|
}
|
2018-01-16 23:03:16 +00:00
|
|
|
sts, err := dr.StaticSet(ctx)
|
2013-01-09 15:58:20 +00:00
|
|
|
if err != nil {
|
2016-12-12 21:19:35 +00:00
|
|
|
if tf.hasErrNotExist() {
|
|
|
|
return errMissingDep
|
|
|
|
}
|
2013-01-09 15:58:20 +00:00
|
|
|
log.Printf("index: error indexing directory: can't get StaticSet: %v\n", err)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-02-05 05:04:36 +00:00
|
|
|
mm.Set(keyFileInfo.Key(blobRef), keyFileInfo.Val(len(sts), b.FileName(), "", blob.Ref{}))
|
2013-09-10 20:14:53 +00:00
|
|
|
for _, br := range sts {
|
2013-11-14 23:49:52 +00:00
|
|
|
mm.Set(keyStaticDirChild.Key(blobRef, br.String()), "1")
|
2013-09-10 20:14:53 +00:00
|
|
|
}
|
2013-01-09 15:58:20 +00:00
|
|
|
return nil
|
|
|
|
}
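For orientation, a standalone sketch of the two kinds of rows the function above produces for a directory: a fileinfo row carrying the child count and directory name, and one child row per member of the static set. The key and value layouts are illustrative placeholders, not the real keyFileInfo/keyStaticDirChild encodings:

package indexsketch

import "fmt"

// dirRows mirrors populateDir's output shape: one fileinfo row for the
// directory blob and one child row per static-set member.
func dirRows(dirRef, dirName string, children []string) map[string]string {
	rows := map[string]string{
		fmt.Sprintf("fileinfo|%s", dirRef): fmt.Sprintf("%d|%s", len(children), dirName),
	}
	for _, child := range children {
		rows[fmt.Sprintf("dirchild|%s|%s", dirRef, child)] = "1"
	}
	return rows
}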
|
|
|
|
|
index: actually reindex when out of order
problem: the out-of-order mechanism based on the outOfOrderIndexerLoop
was not working for some claims.
Let C be a delete claim on permanode P. If C was received before P was,
C was marked as being received with the "have" index row. However, for
the deletion to be marked in the index, some information about P is
needed (its meta row), so C could not be fully indexed upon reception.
Then, when P was finally received, the outOfOrderIndexerLoop would kick
in and retry indexing C, which would fail because a test based on the
"have" row would (wrongly) detect that C is already indexed and return
early.
In this patch:
- we introduce the "|indexed" suffix to the value part of the "have" row
(receive.go). If a blob is received but some of its dependencies are
missing, the have row value is written without the suffix. Upon
reception of a blob, we now test for the presence of the suffix in the
have row; if it is missing, reception continues instead of returning
early. The existing mechanism that detected missing dependencies
for file blobs has been adapted to work with this suffix too.
- the index enumeration (enumstat.go), which relies on "have" rows, has
been adapted to work with the new "have" row format while staying
compatible with the old format. Related tests have been added.
http://camlistore.org/issue/454
Change-Id: I2559d08a12b2a4e0f0691fc7e31f1ed1f874625e
2014-07-03 16:07:08 +00:00
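A minimal sketch of the "|indexed" convention described in the commit message above: the stored "have" value (assumed here to be the blob size) only gains the suffix once indexing fully succeeded, and reception returns early only when the suffix is present:

package indexsketch

import (
	"strconv"
	"strings"
)

// haveVal builds the "have" row value: the blob size, plus "|indexed" once
// the blob has been indexed with all its dependencies present.
func haveVal(size uint32, fullyIndexed bool) string {
	v := strconv.FormatUint(uint64(size), 10)
	if fullyIndexed {
		v += "|indexed"
	}
	return v
}

// alreadyIndexed reports whether a stored "have" value means the blob was
// fully indexed; a bare size (old format, or missing deps) means reception
// should proceed and retry indexing the blob.
func alreadyIndexed(haveRowValue string) bool {
	return strings.HasSuffix(haveRowValue, "|indexed")
}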
|
|
|
var errMissingDep = errors.New("blob was not fully indexed because of a missing dependency")
|
|
|
|
|
2013-11-18 14:51:47 +00:00
|
|
|
// populateDeleteClaim adds to mm the entries resulting from the delete claim cl.
|
|
|
|
// It is assumed cl is a valid claim, and vr has already been verified.
|
2016-04-22 04:34:24 +00:00
|
|
|
func (ix *Index) populateDeleteClaim(ctx context.Context, cl schema.Claim, vr *jsonsign.VerifyRequest, mm *mutationMap) error {
|
2013-11-18 14:51:47 +00:00
|
|
|
br := cl.Blob().BlobRef()
|
|
|
|
target := cl.Target()
|
|
|
|
if !target.Valid() {
|
|
|
|
log.Print(fmt.Errorf("no valid target for delete claim %v", br))
|
2014-07-03 16:07:08 +00:00
|
|
|
return nil
|
2013-11-18 14:51:47 +00:00
|
|
|
}
|
2016-04-22 04:34:24 +00:00
|
|
|
meta, err := ix.GetBlobMeta(ctx, target)
|
2013-11-18 14:51:47 +00:00
|
|
|
if err != nil {
|
|
|
|
if err == os.ErrNotExist {
|
2014-07-03 16:07:08 +00:00
|
|
|
if err := ix.noteNeeded(br, target); err != nil {
|
|
|
|
return fmt.Errorf("could not note that delete claim %v depends on %v: %v", br, target, err)
|
|
|
|
}
|
|
|
|
return errMissingDep
|
2013-11-18 14:51:47 +00:00
|
|
|
}
|
|
|
|
log.Print(fmt.Errorf("Could not get mime type of target blob %v: %v", target, err))
|
2014-07-03 16:07:08 +00:00
|
|
|
return nil
|
2013-11-18 14:51:47 +00:00
|
|
|
}
|
2014-07-03 16:07:08 +00:00
|
|
|
|
2021-01-17 03:05:35 +00:00
|
|
|
if meta.CamliType != schema.TypePermanode && meta.CamliType != schema.TypeClaim {
|
2013-11-18 14:51:47 +00:00
|
|
|
log.Print(fmt.Errorf("delete claim target in %v is neither a permanode nor a claim: %v", br, meta.CamliType))
|
2014-07-03 16:07:08 +00:00
|
|
|
return nil
|
2013-11-18 14:51:47 +00:00
|
|
|
}
|
|
|
|
mm.Set(keyDeleted.Key(target, cl.ClaimDateString(), br), "")
|
2021-01-17 03:05:35 +00:00
|
|
|
if meta.CamliType == schema.TypeClaim {
|
2014-07-03 16:07:08 +00:00
|
|
|
return nil
|
2013-11-18 14:51:47 +00:00
|
|
|
}
|
|
|
|
recentKey := keyRecentPermanode.Key(vr.SignerKeyId, cl.ClaimDateString(), br)
|
|
|
|
mm.Set(recentKey, target.String())
|
|
|
|
attr, value := cl.Attribute(), cl.Value()
|
|
|
|
claimKey := keyPermanodeClaim.Key(target, vr.SignerKeyId, cl.ClaimDateString(), br)
|
|
|
|
mm.Set(claimKey, keyPermanodeClaim.Val(cl.ClaimType(), attr, value, vr.CamliSigner))
|
2014-07-03 16:07:08 +00:00
|
|
|
return nil
|
2013-11-18 14:51:47 +00:00
|
|
|
}
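As a rough sketch of the central row written above: a delete claim produces a keyDeleted-style entry keyed by the deleted target, the claim date, and the delete claim's own ref, with an empty value, so a prefix scan on the target answers whether (and when) it was deleted. The literal key format below is a placeholder:

package indexsketch

import "fmt"

// deletedRow mirrors mm.Set(keyDeleted.Key(target, claimDate, claimRef), "").
func deletedRow(target, claimDate, claimRef string) (key, val string) {
	return fmt.Sprintf("deleted|%s|%s|%s", target, claimDate, claimRef), ""
}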
|
|
|
|
|
2016-04-22 04:34:24 +00:00
|
|
|
func (ix *Index) populateClaim(ctx context.Context, fetcher *missTrackFetcher, b *schema.Blob, mm *mutationMap) error {
|
2013-08-04 02:54:30 +00:00
|
|
|
br := b.BlobRef()
|
2013-01-22 17:32:40 +00:00
|
|
|
|
2013-08-04 02:54:30 +00:00
|
|
|
claim, ok := b.AsClaim()
|
2013-01-22 17:32:40 +00:00
|
|
|
if !ok {
|
2011-11-27 23:21:26 +00:00
|
|
|
// Skip bogus claim with malformed permanode.
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2016-12-12 21:19:35 +00:00
|
|
|
tf := &trackErrorsFetcher{f: fetcher}
|
|
|
|
vr := jsonsign.NewVerificationRequest(b.JSON(), blob.NewSerialFetcher(ix.KeyFetcher, tf))
|
2018-01-16 23:03:16 +00:00
|
|
|
_, err := vr.Verify(ctx)
|
|
|
|
if err != nil {
|
2011-11-27 23:21:26 +00:00
|
|
|
// TODO(bradfitz): ask if the vr.Err.(jsonsign.Error).IsPermanent() and retry
|
|
|
|
// later if it's not permanent? or maybe do this up a level?
|
2018-01-16 23:03:16 +00:00
|
|
|
if tf.hasErrNotExist() {
|
|
|
|
return errMissingDep
|
2011-11-27 23:21:26 +00:00
|
|
|
}
|
2018-01-16 23:03:16 +00:00
|
|
|
return err
|
2011-11-27 23:21:26 +00:00
|
|
|
}
|
|
|
|
verifiedKeyId := vr.SignerKeyId
|
2018-01-17 18:02:03 +00:00
|
|
|
mm.signerID = verifiedKeyId
|
|
|
|
mm.signerBlobRef = vr.CamliSigner
|
2018-01-19 18:53:44 +00:00
|
|
|
mm.Set(keySignerKeyID.name+":"+vr.CamliSigner.String(), verifiedKeyId)
|
2011-11-27 23:21:26 +00:00
|
|
|
|
2013-11-18 14:51:47 +00:00
|
|
|
if claim.ClaimType() == string(schema.DeleteClaim) {
|
2016-04-22 04:34:24 +00:00
|
|
|
if err := ix.populateDeleteClaim(ctx, claim, vr, mm); err != nil {
|
2014-07-03 16:07:08 +00:00
|
|
|
return err
|
|
|
|
}
|
2013-11-27 16:35:21 +00:00
|
|
|
mm.noteDelete(claim)
|
2013-11-18 14:51:47 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
pnbr := claim.ModifiedPermanode()
|
|
|
|
if !pnbr.Valid() {
|
|
|
|
// A different type of claim; not modifying a permanode.
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
attr, value := claim.Attribute(), claim.Value()
|
2013-01-22 17:32:40 +00:00
|
|
|
recentKey := keyRecentPermanode.Key(verifiedKeyId, claim.ClaimDateString(), br)
|
2013-11-14 23:49:52 +00:00
|
|
|
mm.Set(recentKey, pnbr.String())
|
2013-11-07 00:09:11 +00:00
|
|
|
claimKey := keyPermanodeClaim.Key(pnbr, verifiedKeyId, claim.ClaimDateString(), br)
|
2013-11-18 00:52:51 +00:00
|
|
|
mm.Set(claimKey, keyPermanodeClaim.Val(claim.ClaimType(), attr, value, vr.CamliSigner))
|
2011-11-29 20:40:33 +00:00
|
|
|
|
2013-01-22 17:32:40 +00:00
|
|
|
if strings.HasPrefix(attr, "camliPath:") {
|
2013-08-04 02:54:30 +00:00
|
|
|
targetRef, ok := blob.Parse(value)
|
|
|
|
if ok {
|
2011-12-01 18:43:57 +00:00
|
|
|
// TODO: deal with set-attribute vs. del-attribute
|
|
|
|
// properly? I think we get it for free when
|
|
|
|
// del-attribute has no Value, but we need to deal
|
|
|
|
// with the case where they explicitly delete the
|
|
|
|
// current value.
|
2013-01-22 17:32:40 +00:00
|
|
|
suffix := attr[len("camliPath:"):]
|
2011-12-01 18:43:57 +00:00
|
|
|
active := "Y"
|
2013-01-22 17:32:40 +00:00
|
|
|
if claim.ClaimType() == "del-attribute" {
|
2011-12-01 18:43:57 +00:00
|
|
|
active = "N"
|
|
|
|
}
|
2011-12-02 02:06:25 +00:00
|
|
|
baseRef := pnbr
|
|
|
|
claimRef := br
|
|
|
|
|
|
|
|
key := keyPathBackward.Key(verifiedKeyId, targetRef, claimRef)
|
2013-01-22 17:32:40 +00:00
|
|
|
val := keyPathBackward.Val(claim.ClaimDateString(), baseRef, active, suffix)
|
2013-11-14 23:49:52 +00:00
|
|
|
mm.Set(key, val)
|
2011-12-02 02:06:25 +00:00
|
|
|
|
2013-01-22 17:32:40 +00:00
|
|
|
key = keyPathForward.Key(verifiedKeyId, baseRef, suffix, claim.ClaimDateString(), claimRef)
|
2011-12-02 02:06:25 +00:00
|
|
|
val = keyPathForward.Val(active, targetRef)
|
2013-11-14 23:49:52 +00:00
|
|
|
mm.Set(key, val)
|
2011-12-01 18:43:57 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-11-29 21:36:58 +00:00
|
|
|
if claim.ClaimType() != string(schema.DelAttributeClaim) && IsIndexedAttribute(attr) {
|
2013-01-22 17:32:40 +00:00
|
|
|
key := keySignerAttrValue.Key(verifiedKeyId, attr, value, claim.ClaimDateString(), br)
|
2013-11-14 23:49:52 +00:00
|
|
|
mm.Set(key, keySignerAttrValue.Val(pnbr))
|
2011-11-28 03:29:23 +00:00
|
|
|
}
|
2012-11-03 13:25:48 +00:00
|
|
|
|
2013-11-16 23:00:30 +00:00
|
|
|
if IsBlobReferenceAttribute(attr) {
|
2013-08-04 02:54:30 +00:00
|
|
|
targetRef, ok := blob.Parse(value)
|
|
|
|
if ok {
|
2012-11-03 13:25:48 +00:00
|
|
|
key := keyEdgeBackward.Key(targetRef, pnbr, br)
|
2013-11-14 23:49:52 +00:00
|
|
|
mm.Set(key, keyEdgeBackward.Val("permanode", ""))
|
2012-11-03 13:25:48 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-11-27 23:21:26 +00:00
|
|
|
return nil
|
|
|
|
}
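A standalone sketch of the camliPath handling in the function above: a single "camliPath:<suffix>" claim yields a backward row keyed by the target (answering "what paths point here?") and a forward row keyed by base permanode plus suffix (answering "what does base/suffix resolve to?"). The key/value layouts are simplified placeholders, not keyPathBackward/keyPathForward's exact encodings:

package indexsketch

import (
	"fmt"
	"strings"
)

// pathRows mirrors populateClaim's camliPath branch; it returns nil for
// attributes that are not camliPath claims.
func pathRows(signer, base, target, claimRef, claimDate, attr, claimType string) map[string]string {
	if !strings.HasPrefix(attr, "camliPath:") {
		return nil
	}
	suffix := strings.TrimPrefix(attr, "camliPath:")
	active := "Y"
	if claimType == "del-attribute" {
		active = "N"
	}
	backKey := fmt.Sprintf("pathback|%s|%s|%s", signer, target, claimRef)
	backVal := fmt.Sprintf("%s|%s|%s|%s", claimDate, base, active, suffix)
	fwdKey := fmt.Sprintf("pathfwd|%s|%s|%s|%s|%s", signer, base, suffix, claimDate, claimRef)
	fwdVal := fmt.Sprintf("%s|%s", active, target)
	return map[string]string{backKey: backVal, fwdKey: fwdVal}
}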
|
2011-11-29 20:40:33 +00:00
|
|
|
|
2013-11-27 16:35:21 +00:00
|
|
|
// updateDeletesCache updates the index deletes cache with the cl delete claim.
|
|
|
|
// deleteClaim is trusted to be a valid delete Claim.
|
2017-12-11 02:07:07 +00:00
|
|
|
func (ix *Index) updateDeletesCache(deleteClaim schema.Claim) error {
|
2013-11-27 16:35:21 +00:00
|
|
|
target := deleteClaim.Target()
|
|
|
|
deleter := deleteClaim.Blob()
|
|
|
|
when, err := deleter.ClaimDate()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("Could not get date of delete claim %v: %v", deleteClaim, err)
|
|
|
|
}
|
2017-12-11 02:07:07 +00:00
|
|
|
targetDeletions := append(ix.deletes.m[target],
|
2013-11-27 16:35:21 +00:00
|
|
|
deletion{
|
|
|
|
deleter: deleter.BlobRef(),
|
|
|
|
when: when,
|
|
|
|
})
|
|
|
|
sort.Sort(sort.Reverse(byDeletionDate(targetDeletions)))
|
2017-12-11 02:07:07 +00:00
|
|
|
ix.deletes.m[target] = targetDeletions
|
2013-11-27 16:35:21 +00:00
|
|
|
return nil
|
2011-11-29 20:40:33 +00:00
|
|
|
}
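Finally, a standalone sketch of the per-target cache manipulation above: appending a deletion and keeping the slice ordered newest-first via a byDeletionDate-style sort. The deletion field names here are assumptions based on the code, not the exact package types:

package indexsketch

import (
	"sort"
	"time"
)

// deletion mirrors the cache entry kept per deleted target: which delete
// claim did it, and when.
type deletion struct {
	deleter string // ref of the delete claim
	when    time.Time
}

// byDeletionDate sorts deletions by claim date.
type byDeletionDate []deletion

func (s byDeletionDate) Len() int           { return len(s) }
func (s byDeletionDate) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s byDeletionDate) Less(i, j int) bool { return s[i].when.Before(s[j].when) }

// addDeletion appends d and re-sorts newest-first, matching
// sort.Sort(sort.Reverse(byDeletionDate(...))) in updateDeletesCache.
func addDeletion(ds []deletion, d deletion) []deletion {
	ds = append(ds, d)
	sort.Sort(sort.Reverse(byDeletionDate(ds)))
	return ds
}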
|