perkeep/pkg/blob/ref.go

/*
Copyright 2013 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package blob defines types to refer to and retrieve low-level Camlistore blobs.
package blob

import (
	"bytes"
	"crypto/sha1"
	"errors"
	"fmt"
	"hash"
	"io"
	"reflect"
	"regexp"
	"strings"
)

// Pattern is the regular expression which matches a blobref.
// It does not contain ^ or $.
const Pattern = `\b([a-z][a-z0-9]*)-([a-f0-9]+)\b`

// whole blobref pattern
var blobRefPattern = regexp.MustCompile("^" + Pattern + "$")

// Ref is a reference to a Camlistore blob.
// It is used as a value type and supports equality (with ==) and the ability
// to use it as a map key.
type Ref struct {
	digest digestType
}

// SizedRef is like a Ref but includes a size.
// It should also be used as a value type and supports equality.
type SizedRef struct {
	Ref
	Size int64
}

func (sr SizedRef) String() string {
	return fmt.Sprintf("[%s; %d bytes]", sr.Ref.String(), sr.Size)
}

// digestType is an interface type, but any type implementing it must
// be of concrete type [N]byte, so it supports equality with ==,
// which is a requirement for ref.
type digestType interface {
	bytes() []byte
	digestName() string
	newHash() hash.Hash
}

func (r Ref) String() string {
	if r.digest == nil {
		return "<invalid-blob.Ref>"
	}
	// TODO: maybe memoize this.
	dname := r.digest.digestName()
	bs := r.digest.bytes()
	buf := getBuf(len(dname) + 1 + len(bs)*2)[:0]
	defer putBuf(buf)
	return string(r.appendString(buf))
}

func (r Ref) appendString(buf []byte) []byte {
	dname := r.digest.digestName()
	bs := r.digest.bytes()
	buf = append(buf, dname...)
	buf = append(buf, '-')
	for _, b := range bs {
		buf = append(buf, hexDigit[b>>4], hexDigit[b&0xf])
	}
	if o, ok := r.digest.(otherDigest); ok && o.odd {
		buf = buf[:len(buf)-1]
	}
	return buf
}

// HashName returns the lowercase hash function name of the reference.
// It panics if r is zero.
func (r Ref) HashName() string {
	if r.digest == nil {
		panic("HashName called on invalid Ref")
	}
	return r.digest.digestName()
}

// Digest returns the lower hex digest of the blobref, without
// the e.g. "sha1-" prefix. It panics if r is zero.
func (r Ref) Digest() string {
	if r.digest == nil {
		panic("Digest called on invalid Ref")
	}
	bs := r.digest.bytes()
	buf := getBuf(len(bs) * 2)[:0]
	defer putBuf(buf)
	for _, b := range bs {
		buf = append(buf, hexDigit[b>>4], hexDigit[b&0xf])
	}
	if o, ok := r.digest.(otherDigest); ok && o.odd {
		buf = buf[:len(buf)-1]
	}
	return string(buf)
}

func (r Ref) DigestPrefix(digits int) string {
	v := r.Digest()
	if len(v) < digits {
		return v
	}
	return v[:digits]
}

func (r Ref) DomID() string {
	if !r.Valid() {
		return ""
	}
	return "camli-" + r.String()
}

func (r Ref) Sum32() uint32 {
	var v uint32
	for _, b := range r.digest.bytes()[:4] {
		v = v<<8 | uint32(b)
	}
	return v
}

func (r Ref) Sum64() uint64 {
	var v uint64
	for _, b := range r.digest.bytes()[:8] {
		v = v<<8 | uint64(b)
	}
	return v
}

// Hash returns a new hash.Hash of r's type.
// It panics if r is zero.
func (r Ref) Hash() hash.Hash {
	return r.digest.newHash()
}

func (r Ref) HashMatches(h hash.Hash) bool {
	if r.digest == nil {
		return false
	}
	return bytes.Equal(h.Sum(nil), r.digest.bytes())
}

const hexDigit = "0123456789abcdef"

func (r Ref) Valid() bool { return r.digest != nil }

func (r Ref) IsSupported() bool {
	if !r.Valid() {
		return false
	}
	_, ok := metaFromString[r.digest.digestName()]
	return ok
}

// Parse parse s as a blobref and returns the ref and whether it was
// parsed successfully.
func Parse(s string) (ref Ref, ok bool) {
	i := strings.Index(s, "-")
	if i < 0 {
		return
	}
	name := s[:i] // e.g. "sha1"
	hex := s[i+1:]
	meta, ok := metaFromString[name]
	if !ok {
		return parseUnknown(name, hex)
	}
	if len(hex) != meta.size*2 {
		ok = false
		return
	}
	dt, ok := meta.ctors(hex)
	if !ok {
		return
	}
	return Ref{dt}, true
}

// ParseBytes is like Parse, but parses from a byte slice.
func ParseBytes(s []byte) (ref Ref, ok bool) {
	i := bytes.IndexByte(s, '-')
	if i < 0 {
		return
	}
	name := s[:i] // e.g. "sha1"
	hex := s[i+1:]
	meta, ok := metaFromBytes(name)
	if !ok {
		return parseUnknown(string(name), string(hex))
	}
	if len(hex) != meta.size*2 {
		ok = false
		return
	}
	dt, ok := meta.ctorb(hex)
	if !ok {
		return
	}
	return Ref{dt}, true
}

// Parse parse s as a blobref. If s is invalid, a zero Ref is returned
// which can be tested with the Valid method.
func ParseOrZero(s string) Ref {
	ref, ok := Parse(s)
	if !ok {
		return Ref{}
	}
	return ref
}

// MustParse parse s as a blobref and panics on failure.
func MustParse(s string) Ref {
	ref, ok := Parse(s)
	if !ok {
		panic("Invalid blobref " + s)
	}
	return ref
}

// '0' => 0 ... 'f' => 15, else sets *bad to true.
func hexVal(b byte, bad *bool) byte {
	if '0' <= b && b <= '9' {
		return b - '0'
	}
	if 'a' <= b && b <= 'f' {
		return b - 'a' + 10
	}
	*bad = true
	return 0
}

func validDigestName(name string) bool {
	if name == "" {
		return false
	}
	for _, r := range name {
		if 'a' <= r && r <= 'z' {
			continue
		}
		if '0' <= r && r <= '9' {
			continue
		}
		return false
	}
	return true
}

// parseUnknown parses a blobref where the digest type isn't known to this server.
// e.g. ("foo-ababab")
func parseUnknown(digest, hex string) (ref Ref, ok bool) {
	if !validDigestName(digest) {
		return
	}

	// TODO: remove this short hack and don't allow odd numbers of hex digits.
	odd := false
	if len(hex)%2 != 0 {
		hex += "0"
		odd = true
	}

	if len(hex) < 2 || len(hex)%2 != 0 || len(hex) > maxOtherDigestLen*2 {
		return
	}
	o := otherDigest{
		name:   digest,
		sumLen: len(hex) / 2,
		odd:    odd,
	}
	bad := false
	for i := 0; i < len(hex); i += 2 {
		o.sum[i/2] = hexVal(hex[i], &bad)<<4 | hexVal(hex[i+1], &bad)
	}
	if bad {
		return
	}
	return Ref{o}, true
}

func sha1FromBinary(b []byte) digestType {
	var d sha1Digest
	if len(d) != len(b) {
		panic("bogus sha-1 length")
	}
	copy(d[:], b)
	return d
}

func sha1FromHexString(hex string) (digestType, bool) {
	var d sha1Digest
	var bad bool
	for i := 0; i < len(hex); i += 2 {
		d[i/2] = hexVal(hex[i], &bad)<<4 | hexVal(hex[i+1], &bad)
	}
	if bad {
		return nil, false
	}
	return d, true
}

// yawn. exact copy of sha1FromHexString.
func sha1FromHexBytes(hex []byte) (digestType, bool) {
	var d sha1Digest
	var bad bool
	for i := 0; i < len(hex); i += 2 {
		d[i/2] = hexVal(hex[i], &bad)<<4 | hexVal(hex[i+1], &bad)
	}
	if bad {
		return nil, false
	}
	return d, true
}

// RefFromHash returns a blobref representing the given hash.
// It panics if the hash isn't of a known type.
func RefFromHash(h hash.Hash) Ref {
	meta, ok := metaFromType[reflect.TypeOf(h)]
	if !ok {
		panic(fmt.Sprintf("Currently-unsupported hash type %T", h))
	}
	return Ref{meta.ctor(h.Sum(nil))}
}

// RefFromString returns a blobref from the given string, for the currently
// recommended hash function
func RefFromString(s string) Ref {
	return SHA1FromString(s)
}

// SHA1FromString returns a SHA-1 blobref of the provided string.
func SHA1FromString(s string) Ref {
	s1 := sha1.New()
	s1.Write([]byte(s))
	return RefFromHash(s1)
}

// SHA1FromBytes returns a SHA-1 blobref of the provided bytes.
func SHA1FromBytes(b []byte) Ref {
	s1 := sha1.New()
	s1.Write(b)
	return RefFromHash(s1)
}

type sha1Digest [20]byte

func (s sha1Digest) digestName() string { return "sha1" }
func (s sha1Digest) bytes() []byte      { return s[:] }
func (s sha1Digest) newHash() hash.Hash { return sha1.New() }

const maxOtherDigestLen = 128

type otherDigest struct {
	name   string
	sum    [maxOtherDigestLen]byte
	sumLen int  // bytes in sum that are valid
	odd    bool // odd number of hex digits in input
}

func (d otherDigest) digestName() string { return d.name }
func (d otherDigest) bytes() []byte      { return d.sum[:d.sumLen] }
func (d otherDigest) newHash() hash.Hash { return nil }

var sha1Meta = &digestMeta{
	ctor:  sha1FromBinary,
	ctors: sha1FromHexString,
	ctorb: sha1FromHexBytes,
	size:  sha1.Size,
}

var metaFromString = map[string]*digestMeta{
	"sha1": sha1Meta,
}

type blobTypeAndMeta struct {
	name []byte
	meta *digestMeta
}

var metas []blobTypeAndMeta

func metaFromBytes(name []byte) (meta *digestMeta, ok bool) {
	for _, bm := range metas {
		if bytes.Equal(name, bm.name) {
			return bm.meta, true
		}
	}
	return
}

func init() {
	for name, meta := range metaFromString {
		metas = append(metas, blobTypeAndMeta{
			name: []byte(name),
			meta: meta,
		})
	}
}

var sha1Type = reflect.TypeOf(sha1.New())

var metaFromType = map[reflect.Type]*digestMeta{
	sha1Type: sha1Meta,
}

type digestMeta struct {
	ctor  func(binary []byte) digestType
	ctors func(hex string) (digestType, bool)
	ctorb func(hex []byte) (digestType, bool)
	size  int // bytes of digest
}

var bufPool = make(chan []byte, 20)

func getBuf(size int) []byte {
	for {
		select {
		case b := <-bufPool:
			if cap(b) >= size {
				return b[:size]
			}
		default:
			return make([]byte, size)
		}
	}
}

func putBuf(b []byte) {
	select {
	case bufPool <- b:
	default:
	}
}

// NewHash returns a new hash.Hash of the currently recommended hash type.
// Currently this is just SHA-1, but will likely change within the next
// year or so.
func NewHash() hash.Hash {
	return sha1.New()
}

func ValidRefString(s string) bool {
	// TODO: optimize to not allocate
	return ParseOrZero(s).Valid()
}

var null = []byte(`null`)

func (r *Ref) UnmarshalJSON(d []byte) error {
	if r.digest != nil {
		return errors.New("Can't UnmarshalJSON into a non-zero Ref")
	}
	if len(d) == 0 || bytes.Equal(d, null) {
		return nil
	}
	if len(d) < 2 || d[0] != '"' || d[len(d)-1] != '"' {
		return fmt.Errorf("blob: expecting a JSON string to unmarshal, got %q", d)
	}
	d = d[1 : len(d)-1]
	p, ok := ParseBytes(d)
	if !ok {
		return fmt.Errorf("blobref: invalid blobref %q (%d)", d, len(d))
	}
	*r = p
	return nil
}

func (r Ref) MarshalJSON() ([]byte, error) {
	if !r.Valid() {
		return null, nil
	}
	dname := r.digest.digestName()
	bs := r.digest.bytes()
	buf := make([]byte, 0, 3+len(dname)+len(bs)*2)
	buf = append(buf, '"')
	buf = r.appendString(buf)
	buf = append(buf, '"')
	return buf, nil
}

// MarshalBinary implements Go's encoding.BinaryMarshaler interface.
func (r Ref) MarshalBinary() (data []byte, err error) {
	dname := r.digest.digestName()
	bs := r.digest.bytes()
	data = make([]byte, 0, len(dname)+1+len(bs))
	data = append(data, dname...)
	data = append(data, '-')
	data = append(data, bs...)
	return
}

// UnmarshalBinary implements Go's encoding.BinaryUnmarshaler interface.
func (r *Ref) UnmarshalBinary(data []byte) error {
	if r.digest != nil {
		return errors.New("Can't UnmarshalBinary into a non-zero Ref")
	}
	i := bytes.IndexByte(data, '-')
	if i < 1 {
		return errors.New("no digest name")
	}

	digName := string(data[:i])
	buf := data[i+1:]

	meta, ok := metaFromString[digName]
	if !ok {
		r2, ok := parseUnknown(digName, fmt.Sprintf("%x", buf))
		if !ok {
			return errors.New("invalid blobref binary data")
		}
		*r = r2
		return nil
	}
	if len(buf) != meta.size {
		return errors.New("wrong size of data for digest " + digName)
	}
	r.digest = meta.ctor(buf)
	return nil
}

// Less reports whether r sorts before o. Invalid references blobs sort first.
func (r Ref) Less(o Ref) bool {
	if r.Valid() != o.Valid() {
		return o.Valid()
	}
	if !r.Valid() {
		return false
	}
	if n1, n2 := r.digest.digestName(), o.digest.digestName(); n1 != n2 {
		return n1 < n2
	}
	return bytes.Compare(r.digest.bytes(), o.digest.bytes()) < 0
}

// ByRef sorts blob references.
type ByRef []Ref

func (s ByRef) Len() int           { return len(s) }
func (s ByRef) Less(i, j int) bool { return s[i].Less(s[j]) }
func (s ByRef) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }

// SizedByRef sorts SizedRefs by their blobref.
type SizedByRef []SizedRef

func (s SizedByRef) Len() int           { return len(s) }
func (s SizedByRef) Less(i, j int) bool { return s[i].Less(s[j].Ref) }
func (s SizedByRef) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }

// Blob represents a blob. Use the methods Size, SizedRef and
// Open to query and get data from Blob.
type Blob struct {
	ref       Ref
	size      uint32
	newReader func() io.ReadCloser
}

// NewBlob constructs a Blob from its Ref, size and a function that
// returns an io.ReadCloser from which the blob can be read. Any error
// in the function newReader when constructing the io.ReadCloser should
// be returned upon the first call to Read or Close.
func NewBlob(ref Ref, size uint32, newReader func() io.ReadCloser) Blob {
	return Blob{ref, size, newReader}
}

// Size returns the size of the blob (in bytes).
func (b Blob) Size() uint32 {
	return b.size
}

// SizedRef returns the SizedRef corresponding to the blob.
func (b Blob) SizedRef() SizedRef {
	return SizedRef{b.ref, int64(b.size)}
}

// Open returns an io.ReadCloser that can be used to read the blob
// data. The caller must close the io.ReadCloser when finished.
func (b Blob) Open() io.ReadCloser {
	return b.newReader()
}