search, index: rewrite, fix the "map" sort algorithm, index camliNodeType

Change-Id: Idb0e44c3f61bea9fc2cc76619223b86aa5aa4c58
This commit is contained in:
Brad Fitzpatrick 2018-04-25 19:10:00 -07:00
parent ad0b3918b7
commit 2b720aa101
10 changed files with 269 additions and 267 deletions

View File

@ -106,6 +106,11 @@ type Corpus struct {
permanodesByTime *lazySortedPermanodes // cache of permanodes sorted by creation time.
permanodesByModtime *lazySortedPermanodes // cache of permanodes sorted by modtime.
// permanodesSetByNodeType maps from a camliNodeType attribute
// value to the set of permanodes that ever had that
// value. The bool is always true.
permanodesSetByNodeType map[string]map[blob.Ref]bool
// scratch string slice
ss []string
}
@ -316,20 +321,21 @@ func (pm *PermanodeMeta) valuesAtSigner(at time.Time,
// newCorpus returns a new in-memory Corpus with all of its lookup
// maps pre-allocated. Callers populate it from the index rows.
func newCorpus() *Corpus {
	c := &Corpus{
		blobs:                   make(map[blob.Ref]*camtypes.BlobMeta),
		camBlobs:                make(map[string]map[blob.Ref]*camtypes.BlobMeta),
		files:                   make(map[blob.Ref]camtypes.FileInfo),
		permanodes:              make(map[blob.Ref]*PermanodeMeta),
		imageInfo:               make(map[blob.Ref]camtypes.ImageInfo),
		deletedBy:               make(map[blob.Ref]blob.Ref),
		keyId:                   make(map[blob.Ref]string),
		signerRefs:              make(map[string]SignerRefSet),
		brOfStr:                 make(map[string]blob.Ref),
		fileWholeRef:            make(map[blob.Ref]blob.Ref),
		gps:                     make(map[blob.Ref]latLong),
		mediaTags:               make(map[blob.Ref]map[string]string),
		deletes:                 make(map[blob.Ref][]deletion),
		claimBack:               make(map[blob.Ref][]*camtypes.Claim),
		permanodesSetByNodeType: make(map[string]map[blob.Ref]bool),
	}
c.permanodesByModtime = &lazySortedPermanodes{
c: c,
@ -713,6 +719,14 @@ func (c *Corpus) mergeClaimRow(k, v []byte) error {
if vbr, ok := blob.Parse(cl.Value); ok {
c.claimBack[vbr] = append(c.claimBack[vbr], &cl)
}
if cl.Attr == "camliNodeType" {
set := c.permanodesSetByNodeType[cl.Value]
if set == nil {
set = make(map[blob.Ref]bool)
c.permanodesSetByNodeType[cl.Value] = set
}
set[pn] = true
}
return nil
}
@ -1055,6 +1069,22 @@ func (c *Corpus) EnumerateSingleBlob(fn func(camtypes.BlobMeta) bool, br blob.Re
}
}
// EnumeratePermanodesByNodeTypes enumerates over all permanodes that might
// have one of the provided camliNodeType values, calling fn for each. If fn
// returns false, enumeration ends.
func (c *Corpus) EnumeratePermanodesByNodeTypes(fn func(camtypes.BlobMeta) bool, camliNodeTypes []string) {
	for _, nodeType := range camliNodeTypes {
		// Permanodes that ever had this camliNodeType value.
		for ref := range c.permanodesSetByNodeType[nodeType] {
			meta := c.blobs[ref]
			if meta == nil {
				continue
			}
			if !fn(*meta) {
				return
			}
		}
	}
}
func (c *Corpus) GetBlobMeta(ctx context.Context, br blob.Ref) (camtypes.BlobMeta, error) {
bm, ok := c.blobs[br]
if !ok {

View File

@ -331,6 +331,34 @@ func (c *Constraint) checkValid() error {
return nil
}
// matchesPermanodeTypes returns a set of valid permanode types that a matching
// permanode must have as its "camliNodeType" attribute.
// It returns a zero-length slice if this constraint might include things other
// than permanodes of those types.
func (c *Constraint) matchesPermanodeTypes() []string {
	if c == nil {
		return nil
	}
	if pc := c.Permanode; pc != nil && pc.Attr == "camliNodeType" && pc.Value != "" {
		return []string{pc.Value}
	}
	if lc := c.Logical; lc != nil {
		sa := lc.A.matchesPermanodeTypes()
		sb := lc.B.matchesPermanodeTypes()
		switch lc.Op {
		case "and":
			// A conjunction is at least as restrictive as either side, so
			// either side's non-empty type set is a valid candidate filter.
			if len(sa) != 0 {
				return sa
			}
			return sb
		case "or":
			// A disjunction can only be narrowed by type when BOTH sides are
			// type-restricted. If either side might match arbitrary
			// permanodes, no type filter is valid, so fall through to nil.
			if len(sa) != 0 && len(sb) != 0 {
				return append(sa, sb...)
			}
		}
	}
	return nil
}
// matchesAtMostOneBlob reports whether this constraint matches at most a single blob.
// If so, it returns that blob. Otherwise it returns a zero, invalid blob.Ref.
func (c *Constraint) matchesAtMostOneBlob() blob.Ref {
@ -904,13 +932,13 @@ func (h *Handler) Query(ctx context.Context, rawq *SearchQuery) (ret_ *SearchRes
if debugQuerySpeed {
t0 := time.Now()
jq, _ := json.Marshal(rawq)
log.Printf("Start %v, Doing search %s... ", t0.Format(time.RFC3339), jq)
log.Printf("[search=%p] Start %v, Doing search %s... ", rawq, t0.Format(time.RFC3339), jq)
defer func() {
d := time.Since(t0)
if ret_ != nil {
log.Printf("Start %v + %v = %v results", t0.Format(time.RFC3339), d, len(ret_.Blobs))
log.Printf("[search=%p] Start %v + %v = %v results", rawq, t0.Format(time.RFC3339), d, len(ret_.Blobs))
} else {
log.Printf("Start %v + %v = error", t0.Format(time.RFC3339), d)
log.Printf("[search=%p] Start %v + %v = error", rawq, t0.Format(time.RFC3339), d)
}
}()
}
@ -939,6 +967,9 @@ func (h *Handler) Query(ctx context.Context, rawq *SearchQuery) (ret_ *SearchRes
if candSourceHook != nil {
candSourceHook(cands.name)
}
if debugQuerySpeed {
log.Printf("[search=%p] using candidate source set %q", rawq, cands.name)
}
wantAround, foundAround := false, false
if q.Around.Valid() {
@ -1127,6 +1158,82 @@ func (h *Handler) Query(ctx context.Context, rawq *SearchQuery) (ret_ *SearchRes
return s.res, nil
}
// mapCell is which cell of an NxN cell grid of a map a point is in.
// The numbering is arbitrary but dense, starting with 0.
type mapCell int

// mapGrids contains 1 or 2 mapGrids, depending on whether the search
// area crosses the dateline: 2 when it does, so that each individual
// grid never spans the dateline.
type mapGrids []*mapGrid
// cellOf returns which cell of gs that loc falls in. Cells of the
// second grid (if any) are numbered after all cells of the first.
func (gs mapGrids) cellOf(loc camtypes.Location) mapCell {
	offset := 0
	for _, g := range gs {
		if cell, ok := g.cellOf(loc); ok {
			return cell + mapCell(offset)
		}
		offset += g.dim * g.dim
	}
	// Shouldn't happen, unless loc is malformed, in which case cell 0 is fine.
	return 0
}
// newMapGrids returns one grid over area, or two grids when area
// spans the date line (one per side), each of dim*dim cells.
func newMapGrids(area camtypes.LocationBounds, dim int) mapGrids {
	if !area.SpansDateLine() {
		return mapGrids{newMapGrid(area, dim)}
	}
	// Split the area at the antimeridian into two non-spanning halves.
	westHalf := camtypes.LocationBounds{
		North: area.North,
		South: area.South,
		West:  area.West,
		East:  180,
	}
	eastHalf := camtypes.LocationBounds{
		North: area.North,
		South: area.South,
		West:  -180,
		East:  area.East,
	}
	return mapGrids{newMapGrid(westHalf, dim), newMapGrid(eastHalf, dim)}
}
// mapGrid maps locations within area onto a dim*dim grid of cells.
type mapGrid struct {
	dim        int // grid is dim*dim cells
	area       camtypes.LocationBounds
	cellWidth  float64 // longitude degrees per cell: area.Width() / dim
	cellHeight float64 // latitude degrees per cell: (North - South) / dim
}
// newMapGrid returns a grid matcher over an area. The area must not
// span the date line. The mapGrid maps locations to a grid of (dim *
// dim) cells.
func newMapGrid(area camtypes.LocationBounds, dim int) *mapGrid {
	if area.SpansDateLine() {
		panic("invalid use of newMapGrid: must be called with bounds not overlapping date line")
	}
	g := &mapGrid{
		dim:  dim,
		area: area,
	}
	g.cellWidth = area.Width() / float64(dim)
	g.cellHeight = (area.North - area.South) / float64(dim)
	return g
}
// cellOf returns which cell of g's grid that loc is in, and whether
// loc is inside g's area at all.
func (g *mapGrid) cellOf(loc camtypes.Location) (c mapCell, ok bool) {
	inLat := loc.Latitude <= g.area.North && loc.Latitude >= g.area.South
	inLong := loc.Longitude >= g.area.West && loc.Longitude <= g.area.East
	if !inLat || !inLong {
		return 0, false
	}
	col := int((loc.Longitude - g.area.West) / g.cellWidth)
	row := int((g.area.North - loc.Latitude) / g.cellHeight)
	// A point exactly on the north or east edge would index one past
	// the final cell; clamp it back into the grid.
	if col > g.dim-1 {
		col = g.dim - 1
	}
	if row > g.dim-1 {
		row = g.dim - 1
	}
	return mapCell(row*g.dim + col), true
}
// bestByLocation conditionally modifies res.Blobs if the number of blobs
// is greater than limit. If so, it modifies res.Blobs so only `limit`
// blobs remain, selecting those such that the results are evenly spread
@ -1153,87 +1260,52 @@ func bestByLocation(res *SearchResult, locm map[blob.Ref]camtypes.Location, limi
// Not even one result node with a location was found.
return
}
area := res.LocationArea
// divide location area in a grid of ~limit cells, such as each cell is of the
// same proportion as the location area, i.e. equal number of lines and columns.
grid := make(map[camtypes.LocationBounds][]blob.Ref)
areaHeight := area.North - area.South
areaWidth := area.East - area.West
if area.West >= area.East {
// area is spanning over the antimeridian
areaWidth += 360
}
nbLines := math.Sqrt(float64(limit))
cellLat := areaHeight / nbLines
cellLong := areaWidth / nbLines
latZero := area.North
longZero := area.West
for _, v := range res.Blobs {
br := v.Blob
// Divide location area in a grid of (dim * dim) map cells,
// such that (dim * dim) is approximately the given limit,
// then track which search results are in which cell.
cellOccupants := make(map[mapCell][]blob.Ref)
dim := int(math.Round(math.Sqrt(float64(limit))))
if dim < 3 {
dim = 3
} else if dim > 100 {
dim = 100
}
grids := newMapGrids(*res.LocationArea, dim)
resBlob := map[blob.Ref]*SearchResultBlob{}
for _, srb := range res.Blobs {
br := srb.Blob
loc, ok := locm[br]
if !ok {
continue
}
relLat := latZero - loc.Latitude
relLong := loc.Longitude - longZero
if loc.Longitude < longZero {
// area is spanning over the antimeridian
relLong += 360
cellKey := grids.cellOf(loc)
occupants := cellOccupants[cellKey]
if len(occupants) >= limit {
// no sense in filling a cell to more than our overall limit
continue
}
line := int(relLat / cellLat)
col := int(relLong / cellLong)
cellKey := camtypes.LocationBounds{
North: latZero - float64(line)*cellLat,
West: camtypes.Longitude(longZero + float64(col)*cellLong).WrapTo180(),
South: latZero - float64(line+1)*cellLat,
East: camtypes.Longitude(longZero + float64(col+1)*cellLong).WrapTo180(),
}
var brs []blob.Ref
cell, ok := grid[cellKey]
if !ok {
// cell does not exist yet.
brs = []blob.Ref{br}
} else {
if len(cell) >= limit {
// no sense in filling a cell to more than our overall limit
continue
}
brs = append(cell, br)
}
grid[cellKey] = brs
cellOccupants[cellKey] = append(occupants, br)
resBlob[br] = srb
}
maxNodesPerCell := limit / len(grid)
if len(grid) > limit {
maxNodesPerCell = 1
}
var nodesKept []*SearchResultBlob
for _, v := range grid {
var brs []blob.Ref
if len(v) <= maxNodesPerCell {
brs = v
} else {
// TODO(mpl): remove the nodes that are the most clustered within a cell. For
// now simply do first found first picked, for each cell.
brs = v[:maxNodesPerCell]
}
for _, br := range brs {
// TODO(mpl): if grid was instead a
// map[camtypes.LocationBounds][]*SearchResultBlob from the start, then here we
// could instead do nodesKept = append(nodesKept, brs...), but I'm not sure that's a win?
nodesKept = append(nodesKept, &SearchResultBlob{
Blob: br,
})
for {
for cellKey, occupants := range cellOccupants {
nodesKept = append(nodesKept, resBlob[occupants[0]])
if len(nodesKept) == limit {
res.Blobs = nodesKept
return
}
if len(occupants) == 1 {
delete(cellOccupants, cellKey)
} else {
cellOccupants[cellKey] = occupants[1:]
}
}
}
res.Blobs = nodesKept
// TODO(mpl): we do not trim the described blobs, because some of the described
// are children of the kept blobs, and we wouldn't know whether to remove them or
// not. If we do care about the size of res.Describe, I suppose we should reissue a
// describe query on nodesKept.
}
// setResultContinue sets res.Continue if q is suitable for having a continue token.
@ -1315,6 +1387,14 @@ func (q *SearchQuery) pickCandidateSource(s *search) (src candidateSource) {
return
default:
src.sorted = false
if typs := c.matchesPermanodeTypes(); len(typs) != 0 {
src.name = "corpus_permanode_types"
src.send = func(ctx context.Context, s *search, fn func(camtypes.BlobMeta) bool) error {
corpus.EnumeratePermanodesByNodeTypes(fn, typs)
return nil
}
return
}
}
}
if br := c.matchesAtMostOneBlob(); br.Valid() {
@ -1633,18 +1713,22 @@ func (c *PermanodeConstraint) blobMatches(ctx context.Context, s *search, br blo
}
}
if c.Location != nil {
if c.Location != nil || s.q.Sort == MapSort {
l, err := s.h.lh.PermanodeLocation(ctx, br, c.At, s.h.owner)
if err != nil {
if err != os.ErrNotExist {
log.Printf("PermanodeLocation(ref %s): %v", br, err)
if c.Location != nil {
if err != nil {
if err != os.ErrNotExist {
log.Printf("PermanodeLocation(ref %s): %v", br, err)
}
return false, nil
}
if !c.Location.matchesLatLong(l.Latitude, l.Longitude) {
return false, nil
}
return false, nil
}
if !c.Location.matchesLatLong(l.Latitude, l.Longitude) {
return false, nil
if err == nil {
s.loc[br] = l
}
s.loc[br] = l
}
if cc := c.Continue; cc != nil {
@ -1835,6 +1919,13 @@ func (c *FileConstraint) blobMatches(ctx context.Context, s *search, br blob.Ref
Latitude: lat,
Longitude: long,
}
} else if s.q.Sort == MapSort {
if lat, long, found := corpus.FileLatLong(br); found {
s.loc[br] = camtypes.Location{
Latitude: lat,
Longitude: long,
}
}
}
// this makes sure, in conjunction with TestQueryFileLocation, that we only
// expand the location iff the location matched AND the whole constraint matched as

View File

@ -24,13 +24,10 @@ import (
"flag"
"fmt"
"image"
"image/color"
"image/jpeg"
"image/png"
"io/ioutil"
"log"
"math/rand"
"os"
"path/filepath"
"reflect"
"sort"
@ -1837,6 +1834,22 @@ func TestRefQuerySource_Logical(t *testing.T) {
})
}
// TestIsCheckinQuerySource verifies that a camliNodeType-restricted
// query ("is:checkin") is served from the permanode-type candidate
// source rather than a full corpus scan.
func TestIsCheckinQuerySource(t *testing.T) {
	testQueryTypes(t, memIndexTypes, func(qt *queryTest) {
		id := qt.id
		// One permanode whose camliNodeType marks it as a checkin.
		pn := id.NewPlannedPermanode("photo")
		id.SetAttribute(pn, "camliNodeType", "foursquare.com:checkin")
		sq := &SearchQuery{
			Expression: "is:checkin",
			Sort:       MapSort,
		}
		// Expect the type-indexed candidate source to be used, and the
		// permanode to be the (only) result.
		qt.candidateSource = "corpus_permanode_types"
		qt.wantRes(sq, pn)
	})
}
// BenchmarkLocationPredicate aims at measuring the impact of
// https://camlistore-review.googlesource.com/8049
// ( + https://camlistore-review.googlesource.com/8649)
@ -2138,160 +2151,30 @@ type locationPoints struct {
}
func TestBestByLocation(t *testing.T) {
if testing.Short() {
t.Skip()
res := &SearchResult{
LocationArea: &camtypes.LocationBounds{
North: 90,
South: -90,
East: 180,
West: -180,
},
}
data := make(map[string]locationPoints)
f, err := os.Open(filepath.Join("testdata", "locationPoints.json"))
if err != nil {
t.Fatal(err)
}
defer f.Close()
dec := json.NewDecoder(f)
if err := dec.Decode(&data); err != nil {
t.Fatal(err)
locm := map[blob.Ref]camtypes.Location{}
const numResults = 5000
const limit = 117
const scale = 1000
for i := 0; i < numResults; i++ {
br := blob.RefFromString(fmt.Sprintf("foo %d", i))
res.Blobs = append(res.Blobs, &SearchResultBlob{Blob: br})
locm[br] = camtypes.Location{
Latitude: float64(rand.Intn(360*scale) - 180*scale),
Longitude: float64(rand.Intn(180*scale) - 90*scale),
}
}
for _, v := range data {
testBestByLocation(t, v, false)
}
}
// call with generate=true to regenerate the png files int testdata/ from testdata/locationPoints.json
func testBestByLocation(t *testing.T, data locationPoints, generate bool) {
var res SearchResult
var blobs []*SearchResultBlob
meta := make(map[string]*DescribedBlob)
var area camtypes.LocationBounds
locm := make(map[blob.Ref]camtypes.Location)
for _, v := range data.Points {
br := blob.RefFromString(fmt.Sprintf("%v,%v", v.Latitude, v.Longitude))
blobs = append(blobs, &SearchResultBlob{
Blob: br,
})
loc := camtypes.Location{
Latitude: v.Latitude,
Longitude: v.Longitude,
}
meta[br.String()] = &DescribedBlob{
Location: &loc,
}
locm[br] = loc
area = area.Expand(loc)
}
res.Blobs = blobs
res.Describe = &DescribeResponse{
Meta: meta,
}
res.LocationArea = &area
var widthRatio, heightRatio float64
initImage := func() *image.RGBA {
maxRelLat := area.North - area.South
maxRelLong := area.East - area.West
if area.West >= area.East {
// area is spanning over the antimeridian
maxRelLong += 360
}
// draw it all on a 1000 px wide image
height := int(1000 * maxRelLat / maxRelLong)
img := image.NewRGBA(image.Rect(0, 0, 1000, height))
for i := 0; i < 1000; i++ {
for j := 0; j < 1000; j++ {
img.Set(i, j, image.White)
}
}
widthRatio = 1000. / maxRelLong
heightRatio = float64(height) / maxRelLat
return img
}
img := initImage()
for _, v := range data.Points {
// draw a little cross of 3x3, because 1px dot is not visible enough.
relLong := v.Longitude - area.West
if v.Longitude < area.West {
relLong += 360
}
crossX := int(relLong * widthRatio)
crossY := int((area.North - v.Latitude) * heightRatio)
for i := -1; i < 2; i++ {
img.Set(crossX+i, crossY, color.RGBA{127, 0, 0, 127})
}
for j := -1; j < 2; j++ {
img.Set(crossX, crossY+j, color.RGBA{127, 0, 0, 127})
}
}
cmpImage := func(img *image.RGBA, wantImgFile string) {
f, err := os.Open(wantImgFile)
if err != nil {
t.Fatal(err)
}
defer f.Close()
wantImg, err := png.Decode(f)
if err != nil {
t.Fatal(err)
}
for j := 0; j < wantImg.Bounds().Max.Y; j++ {
for i := 0; i < wantImg.Bounds().Max.X; i++ {
r1, g1, b1, a1 := wantImg.At(i, j).RGBA()
r2, g2, b2, a2 := img.At(i, j).RGBA()
if r1 != r2 || g1 != g2 || b1 != b2 || a1 != a2 {
t.Fatalf("%v different from %v", wantImg.At(i, j), img.At(i, j))
}
}
}
}
genPng := func(img *image.RGBA, name string) {
f, err := os.Create(name)
if err != nil {
t.Fatal(err)
}
defer f.Close()
if err := png.Encode(f, img); err != nil {
t.Fatal(err)
}
}
if generate {
genPng(img, filepath.Join("testdata", fmt.Sprintf("%v-beforeMapSort.png", data.Name)))
} else {
cmpImage(img, filepath.Join("testdata", fmt.Sprintf("%v-beforeMapSort.png", data.Name)))
}
ExportBestByLocation(&res, locm, 100)
// check that all longitudes are in the [-180,180] range
for _, v := range res.Blobs {
longitude := meta[v.Blob.String()].Location.Longitude
if longitude < -180. || longitude > 180. {
t.Errorf("out of range location: %v", longitude)
}
}
img = initImage()
for _, v := range res.Blobs {
loc := meta[v.Blob.String()].Location
longitude := loc.Longitude
latitude := loc.Latitude
// draw a little cross of 3x3, because 1px dot is not visible enough.
relLong := longitude - area.West
if longitude < area.West {
relLong += 360
}
crossX := int(relLong * widthRatio)
crossY := int((area.North - latitude) * heightRatio)
for i := -1; i < 2; i++ {
img.Set(crossX+i, crossY, color.RGBA{127, 0, 0, 127})
}
for j := -1; j < 2; j++ {
img.Set(crossX, crossY+j, color.RGBA{127, 0, 0, 127})
}
}
if generate {
genPng(img, filepath.Join("testdata", fmt.Sprintf("%v-afterMapSort.png", data.Name)))
} else {
cmpImage(img, filepath.Join("testdata", fmt.Sprintf("%v-afterMapSort.png", data.Name)))
ExportBestByLocation(res, locm, limit)
if got := len(res.Blobs); got != limit {
t.Errorf("got %d blobs; want %d", got, limit)
}
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 9.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

View File

@ -280,14 +280,24 @@ type LocationBounds struct {
East float64 `json:"east"`
}
func (b LocationBounds) isWithinLongitude(loc Location) bool {
if b.East < b.West {
// l is spanning over antimeridian
// SpansDateLine reports whether b spans the antimeridian international date line.
func (b LocationBounds) SpansDateLine() bool { return b.East < b.West }
// Contains reports whether loc is in the bounds b.
func (b LocationBounds) Contains(loc Location) bool {
if b.SpansDateLine() {
return loc.Longitude >= b.West || loc.Longitude <= b.East
}
return loc.Longitude >= b.West && loc.Longitude <= b.East
}
func (b LocationBounds) Width() float64 {
if !b.SpansDateLine() {
return b.East - b.West
}
return b.East - b.West + 360
}
// Expand returns a new LocationBounds nb. If either of loc coordinates is
// outside of b, nb is the dimensions of b expanded as little as possible in
// order to include loc. Otherwise, nb is just a copy of b.
@ -311,7 +321,7 @@ func (b LocationBounds) Expand(loc Location) LocationBounds {
} else if loc.Latitude < nb.South {
nb.South = loc.Latitude
}
if nb.isWithinLongitude(loc) {
if nb.Contains(loc) {
return nb
}
center := nb.center()

View File

@ -78,7 +78,7 @@ cam.MapAspect = React.createClass({
// (https://github.com/perkeep/perkeep/issues/937)
// However, the cluster plugin restricts the number of items displayed at the
// same time to a way lower number, allowing us to work-around these glitches.
QUERY_LIMIT_: 1000,
QUERY_LIMIT_: 250,
// ZOOM_COOLDOWN_ is how much time to wait, after we've stopped zooming/panning,
// before actually searching for new results.
ZOOM_COOLDOWN_: 500,