From 93b41fb6506d8ebe4fe432dd5b45f5bc36a63e01 Mon Sep 17 00:00:00 2001 From: WithoutPants <53250216+WithoutPants@users.noreply.github.com> Date: Tue, 11 Jul 2023 11:53:49 +1000 Subject: [PATCH] Add folder rename detection (#3817) --- pkg/file/file.go | 2 + pkg/file/folder_rename_detect.go | 195 +++++++++++++++++++++++++++++++ pkg/file/scan.go | 76 +++++++++--- pkg/sqlite/file.go | 27 +++++ 4 files changed, 287 insertions(+), 13 deletions(-) create mode 100644 pkg/file/folder_rename_detect.go diff --git a/pkg/file/file.go b/pkg/file/file.go index 5b6f8d447..50a2d6138 100644 --- a/pkg/file/file.go +++ b/pkg/file/file.go @@ -154,10 +154,12 @@ type Getter interface { FindByFingerprint(ctx context.Context, fp Fingerprint) ([]File, error) FindByZipFileID(ctx context.Context, zipFileID ID) ([]File, error) FindAllInPaths(ctx context.Context, p []string, limit, offset int) ([]File, error) + FindByFileInfo(ctx context.Context, info fs.FileInfo, size int64) ([]File, error) } type Counter interface { CountAllInPaths(ctx context.Context, p []string) (int, error) + CountByFolderID(ctx context.Context, folderID FolderID) (int, error) } // Creator provides methods to create Files. diff --git a/pkg/file/folder_rename_detect.go b/pkg/file/folder_rename_detect.go new file mode 100644 index 000000000..0e52eb785 --- /dev/null +++ b/pkg/file/folder_rename_detect.go @@ -0,0 +1,195 @@ +package file + +import ( + "context" + "errors" + "fmt" + "io/fs" + + "github.com/stashapp/stash/pkg/logger" +) + +type folderRenameCandidate struct { + folder *Folder + found int + files int +} + +type folderRenameDetector struct { + // candidates is a map of folder id to the number of files that match + candidates map[FolderID]folderRenameCandidate + // rejects is a set of folder ids which were found to still exist + rejects map[FolderID]struct{} +} + +func (d *folderRenameDetector) isReject(id FolderID) bool { + _, ok := d.rejects[id] + return ok +} + +func (d *folderRenameDetector) getCandidate(id FolderID) *folderRenameCandidate { + c, ok := d.candidates[id] + if !ok { + return nil + } + + return &c +} + +func (d *folderRenameDetector) setCandidate(c folderRenameCandidate) { + d.candidates[c.folder.ID] = c +} + +func (d *folderRenameDetector) reject(id FolderID) { + d.rejects[id] = struct{}{} +} + +// bestCandidate returns the folder that is the best candidate for a rename. +// This is the folder that has the largest number of its original files that +// are still present in the new location. +func (d *folderRenameDetector) bestCandidate() *Folder { + if len(d.candidates) == 0 { + return nil + } + + var best *folderRenameCandidate + + for _, c := range d.candidates { + // ignore folders that have less than 50% of their original files + if c.found < c.files/2 { + continue + } + + // prefer the folder with the most files if the ratio is the same + if best == nil || c.found > best.found { + cc := c + best = &cc + } + } + + if best == nil { + return nil + } + + return best.folder +} + +func (s *scanJob) detectFolderMove(ctx context.Context, file scanFile) (*Folder, error) { + // in order for a folder to be considered moved, the existing folder must be + // missing, and the majority of the old folder's files must be present, unchanged, + // in the new folder. + + detector := folderRenameDetector{ + candidates: make(map[FolderID]folderRenameCandidate), + rejects: make(map[FolderID]struct{}), + } + // rejects is a set of folder ids which were found to still exist + + if err := symWalk(file.fs, file.Path, func(path string, d fs.DirEntry, err error) error { + if err != nil { + // don't let errors prevent scanning + logger.Errorf("error scanning %s: %v", path, err) + return nil + } + + // ignore root + if path == file.Path { + return nil + } + + // ignore directories + if d.IsDir() { + return fs.SkipDir + } + + info, err := d.Info() + if err != nil { + return fmt.Errorf("reading info for %q: %w", path, err) + } + + if !s.acceptEntry(ctx, path, info) { + return nil + } + + size, err := getFileSize(file.fs, path, info) + if err != nil { + return fmt.Errorf("getting file size for %q: %w", path, err) + } + + // check if the file exists in the database based on basename, size and mod time + existing, err := s.Repository.Store.FindByFileInfo(ctx, info, size) + if err != nil { + return fmt.Errorf("checking for existing file %q: %w", path, err) + } + + for _, e := range existing { + // ignore files in zip files + if e.Base().ZipFileID != nil { + continue + } + + parentFolderID := e.Base().ParentFolderID + + if detector.isReject(parentFolderID) { + // folder was found to still exist, not a candidate + continue + } + + c := detector.getCandidate(parentFolderID) + + if c == nil { + // need to check if the folder exists in the filesystem + pf, err := s.Repository.FolderStore.Find(ctx, e.Base().ParentFolderID) + if err != nil { + return fmt.Errorf("getting parent folder %d: %w", e.Base().ParentFolderID, err) + } + + if pf == nil { + // shouldn't happen, but just in case + continue + } + + // parent folder must be missing + _, err = file.fs.Lstat(pf.Path) + if err == nil { + // parent folder exists, not a candidate + detector.reject(parentFolderID) + continue + } + + if !errors.Is(err, fs.ErrNotExist) { + return fmt.Errorf("checking for parent folder %q: %w", pf.Path, err) + } + + // parent folder is missing, possible candidate + // count the total number of files in the existing folder + count, err := s.Repository.Store.CountByFolderID(ctx, parentFolderID) + if err != nil { + return fmt.Errorf("counting files in folder %d: %w", parentFolderID, err) + } + + if count == 0 { + // no files in the folder, not a candidate + detector.reject(parentFolderID) + continue + } + + c = &folderRenameCandidate{ + folder: pf, + found: 0, + files: count, + } + } + + // increment the count and set it in the map + c.found++ + detector.setCandidate(*c) + } + + return nil + }); err != nil { + return nil, fmt.Errorf("walking filesystem for folder rename detection: %w", err) + } + + return detector.bestCandidate(), nil +} diff --git a/pkg/file/scan.go b/pkg/file/scan.go index dcd625ff6..badb5ab23 100644 --- a/pkg/file/scan.go +++ b/pkg/file/scan.go @@ -215,19 +215,6 @@ func (s *scanJob) queueFileFunc(ctx context.Context, f FS, zipFile *scanFile) fs return fmt.Errorf("reading info for %q: %w", path, err) } - var size int64 - - // #2196/#3042 - replace size with target size if file is a symlink - if info.Mode()&os.ModeSymlink == os.ModeSymlink { - targetInfo, err := f.Stat(path) - if err != nil { - return fmt.Errorf("reading info for symlink %q: %w", path, err) - } - size = targetInfo.Size() - } else { - size = info.Size() - } - if !s.acceptEntry(ctx, path, info) { if info.IsDir() { return fs.SkipDir @@ -236,6 +223,11 @@ func (s *scanJob) queueFileFunc(ctx context.Context, f FS, zipFile *scanFile) fs return nil } + size, err := getFileSize(f, path, info) + if err != nil { + return err + } + ff := scanFile{ BaseFile: &BaseFile{ DirEntry: DirEntry{ @@ -294,6 +286,19 @@ func (s *scanJob) queueFileFunc(ctx context.Context, f FS, zipFile *scanFile) fs } } +func getFileSize(f FS, path string, info fs.FileInfo) (int64, error) { + // #2196/#3042 - replace size with target size if file is a symlink + if info.Mode()&os.ModeSymlink == os.ModeSymlink { + targetInfo, err := f.Stat(path) + if err != nil { + return 0, fmt.Errorf("reading info for symlink %q: %w", path, err) + } + return targetInfo.Size(), nil + } + + return info.Size(), nil +} + func (s *scanJob) acceptEntry(ctx context.Context, path string, info fs.FileInfo) bool { // always accept if there's no filters accept := len(s.options.ScanFilters) == 0 @@ -485,6 +490,15 @@ func (s *scanJob) handleFolder(ctx context.Context, file scanFile) error { } func (s *scanJob) onNewFolder(ctx context.Context, file scanFile) (*Folder, error) { + renamed, err := s.handleFolderRename(ctx, file) + if err != nil { + return nil, err + } + + if renamed != nil { + return renamed, nil + } + now := time.Now() toCreate := &Folder{ @@ -522,6 +536,42 @@ func (s *scanJob) onNewFolder(ctx context.Context, file scanFile) (*Folder, erro return toCreate, nil } +func (s *scanJob) handleFolderRename(ctx context.Context, file scanFile) (*Folder, error) { + // ignore folders in zip files + if file.ZipFileID != nil { + return nil, nil + } + + // check if the folder was moved from elsewhere + renamedFrom, err := s.detectFolderMove(ctx, file) + if err != nil { + return nil, fmt.Errorf("detecting folder move: %w", err) + } + + if renamedFrom == nil { + return nil, nil + } + + // if the folder was moved, update the existing folder + logger.Infof("%s moved to %s. Updating path...", renamedFrom.Path, file.Path) + renamedFrom.Path = file.Path + + // update the parent folder ID + // find the parent folder + parentFolderID, err := s.getFolderID(ctx, filepath.Dir(file.Path)) + if err != nil { + return nil, fmt.Errorf("getting parent folder for %q: %w", file.Path, err) + } + + renamedFrom.ParentFolderID = parentFolderID + + if err := s.Repository.FolderStore.Update(ctx, renamedFrom); err != nil { + return nil, fmt.Errorf("updating folder for rename %q: %w", renamedFrom.Path, err) + } + + return renamedFrom, nil +} + func (s *scanJob) onExistingFolder(ctx context.Context, f scanFile, existing *Folder) (*Folder, error) { update := false diff --git a/pkg/sqlite/file.go b/pkg/sqlite/file.go index 87834a2df..760a77465 100644 --- a/pkg/sqlite/file.go +++ b/pkg/sqlite/file.go @@ -5,8 +5,10 @@ import ( "database/sql" "errors" "fmt" + "io/fs" "path/filepath" "strings" + "time" "github.com/doug-martin/goqu/v9" "github.com/doug-martin/goqu/v9/exp" @@ -713,6 +715,31 @@ func (qb *FileStore) FindByZipFileID(ctx context.Context, zipFileID file.ID) ([] return qb.getMany(ctx, q) } +// FindByFileInfo finds files that match the base name, size, and mod time of the given file. +func (qb *FileStore) FindByFileInfo(ctx context.Context, info fs.FileInfo, size int64) ([]file.File, error) { + table := qb.table() + + modTime := info.ModTime().Format(time.RFC3339) + + q := qb.selectDataset().Prepared(true).Where( + table.Col("basename").Eq(info.Name()), + table.Col("size").Eq(size), + table.Col("mod_time").Eq(modTime), + ) + + return qb.getMany(ctx, q) +} + +func (qb *FileStore) CountByFolderID(ctx context.Context, folderID file.FolderID) (int, error) { + table := qb.table() + + q := qb.countDataset().Prepared(true).Where( + table.Col("parent_folder_id").Eq(folderID), + ) + + return count(ctx, q) +} + func (qb *FileStore) IsPrimary(ctx context.Context, fileID file.ID) (bool, error) { joinTables := []exp.IdentifierExpression{ scenesFilesJoinTable,