Add folder rename detection (#3817)

This commit is contained in:
WithoutPants 2023-07-11 11:53:49 +10:00 committed by GitHub
parent 5c38836ade
commit 93b41fb650
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 287 additions and 13 deletions

View File

@ -154,10 +154,12 @@ type Getter interface {
FindByFingerprint(ctx context.Context, fp Fingerprint) ([]File, error)
FindByZipFileID(ctx context.Context, zipFileID ID) ([]File, error)
FindAllInPaths(ctx context.Context, p []string, limit, offset int) ([]File, error)
FindByFileInfo(ctx context.Context, info fs.FileInfo, size int64) ([]File, error)
}
// Counter provides methods to count Files.
type Counter interface {
	// CountAllInPaths returns a count for the given list of paths.
	// NOTE(review): presumably the number of files located under any of the
	// paths in p - confirm against the implementation.
	CountAllInPaths(ctx context.Context, p []string) (int, error)
	// CountByFolderID returns the number of files whose parent folder is the
	// folder with the given id.
	CountByFolderID(ctx context.Context, folderID FolderID) (int, error)
}
// Creator provides methods to create Files.

View File

@ -0,0 +1,195 @@
package file
import (
"context"
"errors"
"fmt"
"io/fs"
"github.com/stashapp/stash/pkg/logger"
)
// folderRenameCandidate records how well a folder already stored in the
// database matches the folder that is currently being scanned.
type folderRenameCandidate struct {
	// folder is the existing folder that may have been moved or renamed
	folder *Folder
	// found is the number of matches seen so far between files in the scanned
	// folder and files that belong to folder
	found int
	// files is the total number of files recorded against folder
	files int
}
// folderRenameDetector accumulates the state needed to decide whether a
// newly-seen folder is an existing folder that has been moved or renamed.
// Both maps must be initialised before setCandidate or reject are called.
type folderRenameDetector struct {
	// candidates maps a folder id to the candidate record for that folder,
	// which holds the folder, the number of matching files found so far and
	// the total number of files recorded for the folder
	candidates map[FolderID]folderRenameCandidate
	// rejects is the set of folder ids which have been ruled out as rename
	// candidates, for example because the folder was found to still exist
	rejects map[FolderID]struct{}
}
// isReject reports whether the folder with the given id has already been
// ruled out as a rename candidate.
func (d *folderRenameDetector) isReject(id FolderID) bool {
	_, rejected := d.rejects[id]
	return rejected
}
// getCandidate returns a pointer to a copy of the candidate stored for the
// given folder id, or nil if no candidate has been recorded for it.
// Because the result is a copy, changes made through the returned pointer are
// not visible in the detector until they are stored again with setCandidate.
func (d *folderRenameDetector) getCandidate(id FolderID) *folderRenameCandidate {
	if candidate, known := d.candidates[id]; known {
		return &candidate
	}

	return nil
}
// setCandidate stores c in the detector, keyed by the id of its folder. Any
// candidate previously stored under the same id is replaced.
// c.folder must not be nil.
func (d *folderRenameDetector) setCandidate(c folderRenameCandidate) {
	key := c.folder.ID
	d.candidates[key] = c
}
// reject adds the folder with the given id to the set of folders that are
// not rename candidates.
func (d *folderRenameDetector) reject(id FolderID) {
	var marker struct{}
	d.rejects[id] = marker
}
// bestCandidate returns the folder that is the best candidate for a rename,
// or nil if no candidate qualifies.
//
// A candidate qualifies only if at least half of its original files were
// found in the new location. Among qualifying candidates, the one with the
// most files found wins. If several candidates have the same number of files
// found, the one with the lowest folder id is chosen so that the result does
// not depend on map iteration order.
func (d *folderRenameDetector) bestCandidate() *Folder {
	var best *folderRenameCandidate

	for _, c := range d.candidates {
		// ignore folders that have less than 50% of their original files.
		// The counts are compared doubled rather than as c.files/2, because
		// integer division truncates and would let through, for example, a
		// folder with only 1 of its 3 files found.
		if c.found*2 < c.files {
			continue
		}

		better := best == nil ||
			c.found > best.found ||
			(c.found == best.found && c.folder.ID < best.folder.ID)
		if !better {
			continue
		}

		// copy before taking the address so that best does not alias the
		// loop variable
		cc := c
		best = &cc
	}

	if best == nil {
		return nil
	}

	return best.folder
}
// detectFolderMove determines whether the folder described by file is an
// existing folder that has been moved or renamed. It returns the existing
// folder it was most likely moved from, or nil if no such folder was found.
//
// For a folder to be considered moved, the existing folder must be missing
// from the filesystem and the majority of its files must be present,
// unchanged, in the new folder. Only the files directly inside the new folder
// are examined; sub-directories are skipped.
func (s *scanJob) detectFolderMove(ctx context.Context, file scanFile) (*Folder, error) {
	renames := folderRenameDetector{
		candidates: make(map[FolderID]folderRenameCandidate),
		rejects:    make(map[FolderID]struct{}),
	}

	visit := func(entryPath string, entry fs.DirEntry, walkErr error) error {
		switch {
		case walkErr != nil:
			// don't let errors prevent scanning
			logger.Errorf("error scanning %s: %v", entryPath, walkErr)
			return nil
		case entryPath == file.Path:
			// the root itself is not a file to match
			return nil
		case entry.IsDir():
			// only direct children are of interest
			return fs.SkipDir
		}

		info, err := entry.Info()
		if err != nil {
			return fmt.Errorf("reading info for %q: %w", entryPath, err)
		}

		if !s.acceptEntry(ctx, entryPath, info) {
			return nil
		}

		size, err := getFileSize(file.fs, entryPath, info)
		if err != nil {
			return fmt.Errorf("getting file size for %q: %w", entryPath, err)
		}

		// look for known files with the same basename, size and mod time
		matches, err := s.Repository.Store.FindByFileInfo(ctx, info, size)
		if err != nil {
			return fmt.Errorf("checking for existing file %q: %w", entryPath, err)
		}

		for _, match := range matches {
			base := match.Base()

			// files inside zip files cannot indicate a folder move
			if base.ZipFileID != nil {
				continue
			}

			folderID := base.ParentFolderID
			if renames.isReject(folderID) {
				// this folder has already been ruled out
				continue
			}

			candidate := renames.getCandidate(folderID)
			if candidate == nil {
				// first time this folder is seen: work out whether it can
				// be a candidate at all
				oldFolder, err := s.Repository.FolderStore.Find(ctx, folderID)
				if err != nil {
					return fmt.Errorf("getting parent folder %d: %w", folderID, err)
				}
				if oldFolder == nil {
					// shouldn't happen, but just in case
					continue
				}

				// the old folder must no longer exist on the filesystem
				_, statErr := file.fs.Lstat(oldFolder.Path)
				switch {
				case statErr == nil:
					// still exists, so it cannot have been moved
					renames.reject(folderID)
					continue
				case !errors.Is(statErr, fs.ErrNotExist):
					return fmt.Errorf("checking for parent folder %q: %w", oldFolder.Path, statErr)
				}

				// the old folder is missing; the total number of files it
				// held is needed to judge the match ratio later
				total, err := s.Repository.Store.CountByFolderID(ctx, folderID)
				if err != nil {
					return fmt.Errorf("counting files in folder %d: %w", folderID, err)
				}
				if total == 0 {
					// a folder without files can never be matched
					renames.reject(folderID)
					continue
				}

				candidate = &folderRenameCandidate{
					folder: oldFolder,
					files:  total,
				}
			}

			// getCandidate hands out a copy, so the updated count has to be
			// written back to the detector
			candidate.found++
			renames.setCandidate(*candidate)
		}

		return nil
	}

	if err := symWalk(file.fs, file.Path, visit); err != nil {
		return nil, fmt.Errorf("walking filesystem for folder rename detection: %w", err)
	}

	return renames.bestCandidate(), nil
}

View File

@ -215,19 +215,6 @@ func (s *scanJob) queueFileFunc(ctx context.Context, f FS, zipFile *scanFile) fs
return fmt.Errorf("reading info for %q: %w", path, err)
}
var size int64
// #2196/#3042 - replace size with target size if file is a symlink
if info.Mode()&os.ModeSymlink == os.ModeSymlink {
targetInfo, err := f.Stat(path)
if err != nil {
return fmt.Errorf("reading info for symlink %q: %w", path, err)
}
size = targetInfo.Size()
} else {
size = info.Size()
}
if !s.acceptEntry(ctx, path, info) {
if info.IsDir() {
return fs.SkipDir
@ -236,6 +223,11 @@ func (s *scanJob) queueFileFunc(ctx context.Context, f FS, zipFile *scanFile) fs
return nil
}
size, err := getFileSize(f, path, info)
if err != nil {
return err
}
ff := scanFile{
BaseFile: &BaseFile{
DirEntry: DirEntry{
@ -294,6 +286,19 @@ func (s *scanJob) queueFileFunc(ctx context.Context, f FS, zipFile *scanFile) fs
}
}
// getFileSize returns the size to record for the file at path.
// For a regular entry this is the size reported by info. If info describes a
// symlink, the size of the link target is returned instead, obtained by
// calling f.Stat on path (see #2196/#3042). An error is returned only if the
// link target cannot be stat'ed.
func getFileSize(f FS, path string, info fs.FileInfo) (int64, error) {
	isSymlink := info.Mode()&os.ModeSymlink != 0
	if !isSymlink {
		return info.Size(), nil
	}

	target, err := f.Stat(path)
	if err != nil {
		return 0, fmt.Errorf("reading info for symlink %q: %w", path, err)
	}

	return target.Size(), nil
}
func (s *scanJob) acceptEntry(ctx context.Context, path string, info fs.FileInfo) bool {
// always accept if there's no filters
accept := len(s.options.ScanFilters) == 0
@ -485,6 +490,15 @@ func (s *scanJob) handleFolder(ctx context.Context, file scanFile) error {
}
func (s *scanJob) onNewFolder(ctx context.Context, file scanFile) (*Folder, error) {
renamed, err := s.handleFolderRename(ctx, file)
if err != nil {
return nil, err
}
if renamed != nil {
return renamed, nil
}
now := time.Now()
toCreate := &Folder{
@ -522,6 +536,42 @@ func (s *scanJob) onNewFolder(ctx context.Context, file scanFile) (*Folder, erro
return toCreate, nil
}
// handleFolderRename checks whether the new folder described by file is an
// existing folder that was moved. If so, the existing folder is given the new
// path and the parent folder id returned by getFolderID for the new path's
// directory, the change is saved via the folder store, and the updated folder
// is returned. It returns nil, nil if the folder is inside a zip file or if
// no move was detected.
func (s *scanJob) handleFolderRename(ctx context.Context, file scanFile) (*Folder, error) {
	// folders inside zip files are never treated as moved
	if file.ZipFileID != nil {
		return nil, nil
	}

	moved, err := s.detectFolderMove(ctx, file)
	switch {
	case err != nil:
		return nil, fmt.Errorf("detecting folder move: %w", err)
	case moved == nil:
		// nothing was moved here
		return nil, nil
	}

	// log before overwriting the path so that the old location is reported
	logger.Infof("%s moved to %s. Updating path...", moved.Path, file.Path)
	moved.Path = file.Path

	// the folder may now live under a different parent
	newParentPath := filepath.Dir(file.Path)
	newParentID, err := s.getFolderID(ctx, newParentPath)
	if err != nil {
		return nil, fmt.Errorf("getting parent folder for %q: %w", file.Path, err)
	}
	moved.ParentFolderID = newParentID

	if err = s.Repository.FolderStore.Update(ctx, moved); err != nil {
		return nil, fmt.Errorf("updating folder for rename %q: %w", moved.Path, err)
	}

	return moved, nil
}
func (s *scanJob) onExistingFolder(ctx context.Context, f scanFile, existing *Folder) (*Folder, error) {
update := false

View File

@ -5,8 +5,10 @@ import (
"database/sql"
"errors"
"fmt"
"io/fs"
"path/filepath"
"strings"
"time"
"github.com/doug-martin/goqu/v9"
"github.com/doug-martin/goqu/v9/exp"
@ -713,6 +715,31 @@ func (qb *FileStore) FindByZipFileID(ctx context.Context, zipFileID file.ID) ([]
return qb.getMany(ctx, q)
}
// FindByFileInfo returns the files whose basename, size and modification time
// all match the supplied values. The basename and modification time are taken
// from info; the size is passed separately so that the caller decides which
// size to match on. The modification time is compared in its time.RFC3339
// representation, which has whole-second precision.
func (qb *FileStore) FindByFileInfo(ctx context.Context, info fs.FileInfo, size int64) ([]file.File, error) {
	files := qb.table()

	query := qb.selectDataset().Prepared(true).Where(
		files.Col("basename").Eq(info.Name()),
		files.Col("size").Eq(size),
		files.Col("mod_time").Eq(info.ModTime().Format(time.RFC3339)),
	)

	return qb.getMany(ctx, query)
}
// CountByFolderID returns the number of files whose parent folder is the
// folder with the given id.
func (qb *FileStore) CountByFolderID(ctx context.Context, folderID file.FolderID) (int, error) {
	inFolder := qb.table().Col("parent_folder_id").Eq(folderID)

	return count(ctx, qb.countDataset().Prepared(true).Where(inFolder))
}
func (qb *FileStore) IsPrimary(ctx context.Context, fileID file.ID) (bool, error) {
joinTables := []exp.IdentifierExpression{
scenesFilesJoinTable,