Fix a bunch of scanning / tagging bugs (#3154)

* Fix possible infinite loop/stack overflow with weird/broken zip files
* Fix path length calculation using bytes instead of characters (runes)
* Fix bug where oshash gets buffers whose size is not actually a multiple of 8
* Add oshash tests

Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
2022-12-01 15:48:04 +10:00 · 2022-12-01 15:48:04 +10:00 · 87cea80e7b
parent e614ca8d26
commit 87cea80e7b
5 changed files with 125 additions and 82 deletions
--- a/pkg/file/walk.go
+++ b/pkg/file/walk.go
@@ -125,7 +125,12 @@ func walkDir(f FS, path string, d fs.DirEntry, walkDirFn fs.WalkDirFunc) error {
 	}

 	for _, d1 := range dirs {
-		path1 := filepath.Join(path, d1.Name())
+		name := d1.Name()
+		// Prevent infinite loops; this can happen with certain FS implementations (e.g. ZipFS).
+		if name == "" || name == "." {
+			continue
+		}
+		path1 := filepath.Join(path, name)
 		if err := walkDir(f, path1, d1, walkDirFn); err != nil {
 			if errors.Is(err, fs.SkipDir) {
 				break
--- a/pkg/hash/oshash/oshash.go
+++ b/pkg/hash/oshash/oshash.go
@@ -46,15 +46,16 @@ func oshash(size int64, head []byte, tail []byte) (string, error) {
 	return fmt.Sprintf("%016x", result), nil
 }

-// FromFilePath calculates the hash reading from src.
+// FromReader calculates the hash reading from src.
 func FromReader(src io.ReadSeeker, fileSize int64) (string, error) {
-	if fileSize <= 0 {
-		return "", fmt.Errorf("cannot calculate oshash for empty file (size %d)", fileSize)
+	if fileSize <= 8 {
+		return "", fmt.Errorf("cannot calculate oshash where size < 8 (%d)", fileSize)
 	}

 	fileChunkSize := chunkSize
 	if fileSize < fileChunkSize {
-		fileChunkSize = fileSize
+		// Must be a multiple of 8.
+		fileChunkSize = (fileSize / 8) * 8
 	}

 	head := make([]byte, fileChunkSize)
@@ -67,7 +68,7 @@ func FromReader(src io.ReadSeeker, fileSize int64) (string, error) {
 	}

 	// seek to the end of the file - the chunk size
-	_, err = src.Seek(-fileChunkSize, 2)
+	_, err = src.Seek(-fileChunkSize, io.SeekEnd)
 	if err != nil {
 		return "", err
 	}
--- a/pkg/hash/oshash/oshash_internal_test.go
+++ b/pkg/hash/oshash/oshash_internal_test.go
@@ -1,75 +0,0 @@
-package oshash
-
-import (
-	"math/rand"
-	"testing"
-)
-
-// Note that the public API returns "" instead.
-func TestOshashEmpty(t *testing.T) {
-	var size int64
-	head := make([]byte, chunkSize)
-	tail := make([]byte, chunkSize)
-	want := "0000000000000000"
-	got, err := oshash(size, head, tail)
-	if err != nil {
-		t.Errorf("TestOshashEmpty: Error from oshash: %v", err)
-	}
-	if got != want {
-		t.Errorf("TestOshashEmpty: oshash(0, 0, 0) = %q; want %q", got, want)
-	}
-}
-
-// As oshash sums byte values, causing collisions is trivial.
-func TestOshashCollisions(t *testing.T) {
-	buf1 := []byte("this is dumb")
-	buf2 := []byte("dumb is this")
-	size := int64(len(buf1))
-	head := make([]byte, chunkSize)
-
-	tail1 := make([]byte, chunkSize)
-	copy(tail1[len(tail1)-len(buf1):], buf1)
-	hash1, err := oshash(size, head, tail1)
-	if err != nil {
-		t.Errorf("TestOshashCollisions: Error from oshash: %v", err)
-	}
-
-	tail2 := make([]byte, chunkSize)
-	copy(tail2[len(tail2)-len(buf2):], buf2)
-	hash2, err := oshash(size, head, tail2)
-	if err != nil {
-		t.Errorf("TestOshashCollisions: Error from oshash: %v", err)
-	}
-
-	if hash1 != hash2 {
-		t.Errorf("TestOshashCollisions: oshash(n, k, ... %v) =! oshash(n, k, ... %v)", buf1, buf2)
-	}
-}
-
-func BenchmarkOsHash(b *testing.B) {
-	src := rand.NewSource(9999)
-	r := rand.New(src)
-
-	size := int64(1234567890)
-
-	head := make([]byte, 1024*64)
-	_, err := r.Read(head)
-	if err != nil {
-		b.Errorf("unable to generate head array: %v", err)
-	}
-
-	tail := make([]byte, 1024*64)
-	_, err = r.Read(tail)
-	if err != nil {
-		b.Errorf("unable to generate tail array: %v", err)
-	}
-
-	b.ResetTimer()
-
-	for n := 0; n < b.N; n++ {
-		_, err := oshash(size, head, tail)
-		if err != nil {
-			b.Errorf("unexpected error: %v", err)
-		}
-	}
-}
--- a/pkg/hash/oshash/oshash_test.go
+++ b/pkg/hash/oshash/oshash_test.go
@@ -0,0 +1,111 @@
+package oshash
+
+import (
+	"bytes"
+	"math/rand"
+	"testing"
+)
+
+func BenchmarkOsHash(b *testing.B) {
+	src := rand.NewSource(9999)
+	r := rand.New(src)
+
+	size := int64(1234567890)
+
+	head := make([]byte, 1024*64)
+	_, err := r.Read(head)
+	if err != nil {
+		b.Errorf("unable to generate head array: %v", err)
+	}
+
+	tail := make([]byte, 1024*64)
+	_, err = r.Read(tail)
+	if err != nil {
+		b.Errorf("unable to generate tail array: %v", err)
+	}
+
+	b.ResetTimer()
+
+	for n := 0; n < b.N; n++ {
+		_, err := oshash(size, head, tail)
+		if err != nil {
+			b.Errorf("unexpected error: %v", err)
+		}
+	}
+}
+
+func TestFromReader(t *testing.T) {
+	makeByteArray := func(base []byte, mag int) []byte {
+		ret := base
+		for i := 0; i < mag; i++ {
+			ret = append(ret, ret...)
+		}
+		return ret
+	}
+
+	makeTailArray := func(base []byte, tail []byte) []byte {
+		ret := base
+		t := make([]byte, chunkSize)
+		copy(t[len(t)-len(tail):], tail)
+		ret = append(ret, t...)
+		return ret
+	}
+
+	tests := []struct {
+		name    string
+		data    []byte
+		want    string
+		wantErr bool
+	}{
+		{
+			"empty",
+			[]byte{},
+			"",
+			true,
+		},
+		{
+			"regular",
+			makeByteArray([]byte("this is a test"), 15),
+			"6a0eba04654d0b9b",
+			false,
+		},
+		{
+			"< chunk size",
+			[]byte("hello world"),
+			"d3e392dee38cd4df",
+			false,
+		},
+		{
+			"< 8",
+			[]byte("hello"),
+			"",
+			true,
+		},
+		{
+			"identical #1",
+			makeTailArray(make([]byte, chunkSize), []byte("this is dumb")),
+			"d5d6ddd820756920",
+			false,
+		},
+		{
+			"identical #2",
+			makeTailArray(make([]byte, chunkSize), []byte("dumb is this")),
+			"d5d6ddd820756920",
+			false,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			r := bytes.NewReader(tt.data)
+
+			got, err := FromReader(r, int64(len(tt.data)))
+			if (err != nil) != tt.wantErr {
+				t.Errorf("FromReader() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if got != tt.want {
+				t.Errorf("FromReader() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
--- a/pkg/match/path.go
+++ b/pkg/match/path.go
@@ -7,6 +7,7 @@ import (
 	"regexp"
 	"strings"
 	"unicode"
+	"unicode/utf8"

 	"github.com/stashapp/stash/pkg/gallery"
 	"github.com/stashapp/stash/pkg/image"
@@ -77,7 +78,7 @@ func getPathWords(path string, trimExt bool) []string {
 	// remove any single letter words
 	var ret []string
 	for _, w := range words {
-		if len(w) > 1 {
+		if utf8.RuneCountInString(w) > 1 {
 			// #1450 - we need to open up the criteria for matching so that we
 			// can match where path has no space between subject names -
 			// ie name = "foo bar" - path = "foobar"