Improve oshash code (#1829)

Reduce allocations. Don't create intermediary arrays which we then consume right after. Manually fuse the arrays and decode straight into the sum instead. Furthermore, don't invoke a Reader, but carve out the locations via a loop, directly. These two changes taken together speed up oshash computations by a factor of 10 according to the benchmark tests. The main reason for this change is a much lower memory allocation rate, which in turn reduces GC pressure. While here, add a benchmark for oshash computations and use it for testing the performance.
2021-10-12 02:59:51 +02:00 · 2021-10-12 02:59:51 +02:00 · 38384f2c60
parent e9d48683f8
commit 38384f2c60
2 changed files with 52 additions and 16 deletions
--- a/pkg/utils/oshash.go
+++ b/pkg/utils/oshash.go
@ -1,37 +1,44 @@
 package utils

 import (
-	"bytes"
 	"encoding/binary"
+	"errors"
 	"fmt"
 	"os"
 )

 const chunkSize int64 = 64 * 1024

-func oshash(size int64, head []byte, tail []byte) (string, error) {
-	// put the head and tail together
-	buf := append(head, tail...)
+var ErrOsHashLen = errors.New("buffer is not a multiple of 8")

-	// convert bytes into uint64
-	ints := make([]uint64, len(buf)/8)
-	reader := bytes.NewReader(buf)
-	err := binary.Read(reader, binary.LittleEndian, &ints)
-	if err != nil {
-		return "", err
+func sumBytes(buf []byte) (uint64, error) {
+	if len(buf)%8 != 0 {
+		return 0, ErrOsHashLen
 	}

-	// sum the integers
+	sz := len(buf) / 8
 	var sum uint64
-	for _, v := range ints {
-		sum += v
+	for j := 0; j < sz; j++ {
+		sum += binary.LittleEndian.Uint64(buf[8*j : 8*(j+1)])
 	}

-	// add the filesize
-	sum += uint64(size)
+	return sum, nil
+}

+func oshash(size int64, head []byte, tail []byte) (string, error) {
+	headSum, err := sumBytes(head)
+	if err != nil {
+		return "", fmt.Errorf("oshash head: %w", err)
+	}
+	tailSum, err := sumBytes(tail)
+	if err != nil {
+		return "", fmt.Errorf("oshash tail: %w", err)
+	}
+
+	// Compute the sum of the head, tail and file size
+	result := headSum + tailSum + uint64(size)
 	// output as hex
-	return fmt.Sprintf("%016x", sum), nil
+	return fmt.Sprintf("%016x", result), nil
 }

 // OSHashFromFilePath calculates the hash using the same algorithm that
--- a/pkg/utils/oshash_internal_test.go
+++ b/pkg/utils/oshash_internal_test.go
@ -1,6 +1,7 @@
 package utils

 import (
+	"math/rand"
 	"testing"
 )

@ -44,3 +45,31 @@ func TestOshashCollisions(t *testing.T) {
 		t.Errorf("TestOshashCollisions: oshash(n, k, ... %v) =! oshash(n, k, ... %v)", buf1, buf2)
 	}
 }
+
+func BenchmarkOsHash(b *testing.B) {
+	src := rand.NewSource(9999)
+	r := rand.New(src)
+
+	size := int64(1234567890)
+
+	head := make([]byte, 1024*64)
+	_, err := r.Read(head)
+	if err != nil {
+		b.Errorf("unable to generate head array: %v", err)
+	}
+
+	tail := make([]byte, 1024*64)
+	_, err = r.Read(tail)
+	if err != nil {
+		b.Errorf("unable to generate tail array: %v", err)
+	}
+
+	b.ResetTimer()
+
+	for n := 0; n < b.N; n++ {
+		_, err := oshash(size, head, tail)
+		if err != nil {
+			b.Errorf("unexpected error: %v", err)
+		}
+	}
+}