From 778f5cc6c401f69cdfbda8e82a6c325854dd303e Mon Sep 17 00:00:00 2001
From: Brad Fitzpatrick <brad@danga.com>
Date: Sat, 30 Nov 2013 12:04:04 -0800
Subject: [PATCH] index: show corpus building CPU usage, disabled blob.Parse
 cache, size maps

Change-Id: I089053e8de46978573e4b5fe5cdc14ccac0d54a2
---
 pkg/index/corpus.go | 58 +++++++++++++++++++++++++++++++++++++--------
 pkg/index/index.go  | 10 ++++----
 2 files changed, 53 insertions(+), 15 deletions(-)

diff --git a/pkg/index/corpus.go b/pkg/index/corpus.go
index f8527d369..5189af9de 100644
--- a/pkg/index/corpus.go
+++ b/pkg/index/corpus.go
@@ -13,6 +13,7 @@ import (
 	"time"
 
 	"camlistore.org/pkg/blob"
+	"camlistore.org/pkg/osutil"
 	"camlistore.org/pkg/schema"
 	"camlistore.org/pkg/sorted"
 	"camlistore.org/pkg/strutil"
@@ -33,8 +34,9 @@ type Corpus struct {
 	// It's used as a query cache invalidator.
 	gen int64
 
-	strs      map[string]string // interned strings
-	brInterns int64
+	strs      map[string]string   // interned strings
+	brOfStr   map[string]blob.Ref // blob.Parse fast path
+	brInterns int64               // blob.Ref -> blob.Ref, via br method
 
 	blobs        map[blob.Ref]*camtypes.BlobMeta
 	sumBlobBytes int64
@@ -70,6 +72,7 @@ func newCorpus() *Corpus {
 		imageInfo:  make(map[blob.Ref]camtypes.ImageInfo),
 		deletedBy:  make(map[blob.Ref]blob.Ref),
 		keyId:      make(map[blob.Ref]string),
+		brOfStr:    make(map[string]blob.Ref),
 	}
 }
 
@@ -128,19 +131,28 @@ func (c *Corpus) scanFromStorage(s sorted.KeyValue) error {
 	c.building = true
 
 	ms0 := memstats()
-
 	log.Printf("Slurping corpus to memory from index...")
+
+	// We scan the "meta" rows first, before the prefixes below, because they
+	// populate the blobs map (used for blobref interning) and the camBlobs
+	// map (used for hinting the size of other maps).
+	log.Printf("Slurping corpus to memory from index... (1/6: meta rows)")
+	if err := c.scanPrefix(s, "meta:"); err != nil {
+		return err
+	}
+	c.files = make(map[blob.Ref]camtypes.FileInfo, len(c.camBlobs["file"]))
+	c.permanodes = make(map[blob.Ref]*PermanodeMeta, len(c.camBlobs["permanode"]))
+	cpu0 := osutil.CPUUsage()
+
 	prefixes := []string{
-		"meta:", // should be first, for blobref interning
 		"signerkeyid:",
 		"claim|",
 		"fileinfo|",
 		"filetimes|",
 		"imagesize|",
 	}
-
 	for i, prefix := range prefixes {
-		log.Printf("Slurping corpus to memory from index... (%d/%d: prefix %q)", i+1, len(prefixes), prefix)
+		log.Printf("Slurping corpus to memory from index... (%d/%d: prefix %q)", i+2, len(prefixes)+1, prefix)
 		if err := c.scanPrefix(s, prefix); err != nil {
 			return err
 		}
@@ -161,9 +173,12 @@ func (c *Corpus) scanFromStorage(s sorted.KeyValue) error {
 		}
 
 	}
+	c.brOfStr = nil // drop this now.
 	c.building = false
 	// log.V(1).Printf("interned blob.Ref = %d", c.brInterns)
 
+	cpu := osutil.CPUUsage() - cpu0
+
 	ms1 := memstats()
 	memUsed := ms1.Alloc - ms0.Alloc
 	if ms1.Alloc < ms0.Alloc {
@@ -177,6 +192,7 @@ func (c *Corpus) scanFromStorage(s sorted.KeyValue) error {
 		len(c.permanodes),
 		len(c.files),
 		len(c.imageInfo))
+	log.Printf("Corpus scanning CPU usage: %v", cpu)
 	return nil
 }
 
@@ -226,6 +242,10 @@ func (c *Corpus) mergeMetaRow(k, v string) error {
 	if !ok {
 		return fmt.Errorf("bogus meta row: %q -> %q", k, v)
 	}
+	if useBlobParseCache && c.brOfStr != nil {
+		brstr := k[len("meta:"):]
+		c.brOfStr[brstr] = bm.Ref
+	}
 	if _, dup := c.blobs[bm.Ref]; dup {
 		// Um, shouldn't happen.  TODO(bradfitz): is it
 		// guaranteed elsewhere that duplicate blobs are never
@@ -259,7 +279,7 @@ func (c *Corpus) mergeSignerKeyIdRow(k, v string) error {
 }
 
 func (c *Corpus) mergeClaimRow(k, v string) error {
-	cl, ok := kvClaim(k, v)
+	cl, ok := kvClaim(k, v, c.blobParse)
 	if !ok || !cl.Permanode.Valid() {
 		return fmt.Errorf("bogus claim row: %q -> %q", k, v)
 	}
@@ -288,7 +308,7 @@ func (c *Corpus) mergeFileInfoRow(k, v string) error {
 	if len(c.ss) != 2 {
 		return fmt.Errorf("unexpected fileinfo key %q", k)
 	}
-	br, ok := blob.Parse(c.ss[1])
+	br, ok := c.blobParse(c.ss[1])
 	if !ok {
 		return fmt.Errorf("unexpected fileinfo blobref in key %q", k)
 	}
@@ -317,7 +337,7 @@ func (c *Corpus) mergeFileTimesRow(k, v string) error {
 	if len(c.ss) != 2 {
 		return fmt.Errorf("unexpected filetimes key %q", k)
 	}
-	br, ok := blob.Parse(c.ss[1])
+	br, ok := c.blobParse(c.ss[1])
 	if !ok {
 		return fmt.Errorf("unexpected filetimes blobref in key %q", k)
 	}
@@ -337,7 +357,7 @@ func (c *Corpus) mutateFileInfo(br blob.Ref, fn func(*camtypes.FileInfo)) {
 }
 
 func (c *Corpus) mergeImageSizeRow(k, v string) error {
-	br, okk := blob.Parse(k[len("imagesize|"):])
+	br, okk := c.blobParse(k[len("imagesize|"):])
 	ii, okv := kvImageInfo(v)
 	if !okk || !okv {
 		return fmt.Errorf("bogus row %q = %q", k, v)
@@ -347,6 +367,24 @@ func (c *Corpus) mergeImageSizeRow(k, v string) error {
 	return nil
 }
 
+// useBlobParseCache enables the blob.Parse fast-path cache, which reduces
+// CPU usage (by creating less garbage for the GC), but increases memory
+// usage, even though it shouldn't. The GC should fully discard the brOfStr
+// map (which we nil out at the end of scanFromStorage), but the Go GC
+// doesn't seem to reclaim all of it.
+// TODO: investigate / file bugs.
+const useBlobParseCache = false
+
+func (c *Corpus) blobParse(v string) (br blob.Ref, ok bool) {
+	if !useBlobParseCache {
+		return blob.Parse(v) // cache compiled out: always do a full parse
+	}
+	if cached, hit := c.brOfStr[v]; hit {
+		return cached, true // fast path: ref was recorded by mergeMetaRow
+	}
+	return blob.Parse(v)
+}
+
 // str returns s, interned.
 func (c *Corpus) str(s string) string {
 	if s == "" {
diff --git a/pkg/index/index.go b/pkg/index/index.go
index 6c407437d..63ef3a66c 100644
--- a/pkg/index/index.go
+++ b/pkg/index/index.go
@@ -378,7 +378,7 @@ func (x *Index) AppendClaims(dst []camtypes.Claim, permaNode blob.Ref,
 		if mustHave != "" && !strings.Contains(val, mustHave) {
 			continue
 		}
-		cl, ok := kvClaim(it.Key(), val)
+		cl, ok := kvClaim(it.Key(), val, blob.Parse)
 		if !ok {
 			continue
 		}
@@ -396,22 +396,22 @@ func (x *Index) AppendClaims(dst []camtypes.Claim, permaNode blob.Ref,
 	return dst, nil
 }
 
-func kvClaim(k, v string) (c camtypes.Claim, ok bool) {
+func kvClaim(k, v string, blobParse func(string) (blob.Ref, bool)) (c camtypes.Claim, ok bool) {
 	// TODO(bradfitz): remove the strings.Split calls to reduce allocations.
 	keyPart := strings.Split(k, "|")
 	valPart := strings.Split(v, "|")
 	if len(keyPart) < 5 || len(valPart) < 4 {
 		return
 	}
-	signerRef, ok := blob.Parse(valPart[3])
+	signerRef, ok := blobParse(valPart[3])
 	if !ok {
 		return
 	}
-	permaNode, ok := blob.Parse(keyPart[1])
+	permaNode, ok := blobParse(keyPart[1])
 	if !ok {
 		return
 	}
-	claimRef, ok := blob.Parse(keyPart[4])
+	claimRef, ok := blobParse(keyPart[4])
 	if !ok {
 		return
 	}