stash/pkg/manager/task_scan.go

package manager

import (
	"context"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/remeh/sizedwaitgroup"

	"github.com/stashapp/stash/pkg/file"
	"github.com/stashapp/stash/pkg/job"
	"github.com/stashapp/stash/pkg/logger"
	"github.com/stashapp/stash/pkg/manager/config"
	"github.com/stashapp/stash/pkg/models"
	"github.com/stashapp/stash/pkg/utils"
)

// scanQueueSize is the capacity of the buffered channel through which
// queued files are handed from the file walker to the scan loop in
// ScanJob.Execute.
const scanQueueSize = 200000

// ScanJob is a job that walks the requested scan paths and runs a ScanTask
// for every file found.
//
// txnManager is passed on to each ScanTask and is used to check whether a
// path is already present in the database. input holds the scan options
// supplied with the request. subscriptions is notified once the scan has
// completed.
type ScanJob struct {
	txnManager    models.TransactionManager
	input         models.ScanMetadataInput
	subscriptions *subscriptionManager
}

// scanFile is a single entry in the scan queue: the path of a file to scan,
// the os.FileInfo obtained for it while walking, and the case-sensitivity
// that was detected for the stash path the file was found under.
type scanFile struct {
	path            string
	info            os.FileInfo
	caseSensitiveFs bool
}

// Execute runs the scan job.
//
// The files under the requested paths are queued by a background goroutine
// (see queueFiles) while this function consumes the queue and starts a
// ScanTask per file, running at most the configured number of parallel tasks
// at once. Once all tasks have finished the temporary directory is emptied
// and, unless the job was cancelled, every gallery file that was seen is
// passed through gallery association before the subscriptions are notified.
//
// Cancellation is checked before starting, before each queued file and again
// before gallery association.
func (j *ScanJob) Execute(ctx context.Context, progress *job.Progress) {
	input := j.input
	paths := getScanPaths(input.Paths)

	if job.IsCancelled(ctx) {
		logger.Info("Stopping due to user request")
		return
	}

	start := time.Now()
	// named cfg so that the config package is not shadowed
	cfg := config.GetInstance()
	parallelTasks := cfg.GetParallelTasksWithAutoDetection()

	logger.Infof("Scan started with %d parallel tasks", parallelTasks)

	// the queue is filled in the background so that scanning can begin
	// before the full file count is known
	fileQueue := make(chan scanFile, scanQueueSize)
	go func() {
		total, newFiles := j.queueFiles(ctx, paths, fileQueue, parallelTasks)

		if !job.IsCancelled(ctx) {
			progress.SetTotal(total)
			logger.Infof("Finished counting files. Total files to scan: %d, %d new files found", total, newFiles)
		}
	}()

	wg := sizedwaitgroup.New(parallelTasks)

	fileNamingAlgo := cfg.GetVideoFileNamingAlgorithm()
	calculateMD5 := cfg.IsCalculateMD5()

	// gallery files are remembered so they can be associated after the scan
	var galleries []string

	// a single mutex manager is shared by every task of this job
	mutexManager := utils.NewMutexManager()

	for f := range fileQueue {
		if job.IsCancelled(ctx) {
			break
		}

		if isGallery(f.path) {
			galleries = append(galleries, f.path)
		}

		if err := instance.Paths.Generated.EnsureTmpDir(); err != nil {
			logger.Warnf("couldn't create temporary directory: %v", err)
		}

		wg.Add()
		task := ScanTask{
			TxnManager:           j.txnManager,
			file:                 file.FSFile(f.path, f.info),
			UseFileMetadata:      utils.IsTrue(input.UseFileMetadata),
			StripFileExtension:   utils.IsTrue(input.StripFileExtension),
			fileNamingAlgorithm:  fileNamingAlgo,
			calculateMD5:         calculateMD5,
			GeneratePreview:      utils.IsTrue(input.ScanGeneratePreviews),
			GenerateImagePreview: utils.IsTrue(input.ScanGenerateImagePreviews),
			GenerateSprite:       utils.IsTrue(input.ScanGenerateSprites),
			GeneratePhash:        utils.IsTrue(input.ScanGeneratePhashes),
			GenerateThumbnails:   utils.IsTrue(input.ScanGenerateThumbnails),
			progress:             progress,
			CaseSensitiveFs:      f.caseSensitiveFs,
			ctx:                  ctx,
			mutexManager:         mutexManager,
		}

		go func() {
			// Done is deferred so that it runs after the progress increment:
			// once wg.Wait returns, every finished task has been counted.
			defer wg.Done()
			task.Start(ctx)
			progress.Increment()
		}()
	}

	wg.Wait()

	if err := instance.Paths.Generated.EmptyTmpDir(); err != nil {
		logger.Warnf("couldn't empty temporary directory: %v", err)
	}

	elapsed := time.Since(start)
	logger.Infof("Scan finished (%s)", elapsed)

	if job.IsCancelled(ctx) {
		logger.Info("Stopping due to user request")
		return
	}

	progress.ExecuteTask("Associating galleries", func() {
		for _, path := range galleries {
			wg.Add()
			task := ScanTask{
				TxnManager:      j.txnManager,
				file:            file.FSFile(path, nil), // hopefully info is not needed
				UseFileMetadata: false,
			}

			// waiting inside the loop means galleries are associated one at a time
			go task.associateGallery(&wg)
			wg.Wait()
		}
		logger.Info("Finished gallery association")
	})

	j.subscriptions.notify()
}

// queueFiles walks each of the given stash paths and sends every file that
// walkFilesToScan accepts, and that is neither a directory nor zero-length,
// to scanQueue. scanQueue is closed when the function returns.
//
// Each accepted file is handled in its own goroutine, with at most
// parallelTasks of them running at once. total is the number of files that
// were queued; newFiles is how many of those doesPathExist did not find in
// the database.
//
// Queuing stops at the first walk error. That includes cancellation, which
// is not logged as an error.
func (j *ScanJob) queueFiles(ctx context.Context, paths []*models.StashConfig, scanQueue chan<- scanFile, parallelTasks int) (total int, newFiles int) {
	defer close(scanQueue)

	wg := sizedwaitgroup.New(parallelTasks)

	// countMu guards total and newFiles, which are updated from the
	// concurrently running per-file goroutines below.
	var countMu sync.Mutex

	for _, sp := range paths {
		csFs, er := utils.IsFsPathCaseSensitive(sp.Path)
		if er != nil {
			logger.Warnf("Cannot determine fs case sensitivity: %s", er.Error())
		}

		err := walkFilesToScan(sp, func(path string, info os.FileInfo, err error) error {
			// check stop
			if job.IsCancelled(ctx) {
				return context.Canceled
			}

			wg.Add()

			go func() {
				defer wg.Done()

				// #1756 - skip zero length files and directories
				if info.IsDir() {
					return
				}

				if info.Size() == 0 {
					logger.Infof("Skipping zero-length file: %s", path)
					return
				}

				// do the database lookup outside the lock so that lookups
				// still run in parallel
				exists := j.doesPathExist(path)

				countMu.Lock()
				total++
				if !exists {
					newFiles++
				}
				countMu.Unlock()

				// Do not block forever on a full queue once the consumer has
				// stopped reading.
				// NOTE(review): assumes job cancellation is signalled through
				// ctx.Done() - confirm against job.IsCancelled.
				select {
				case scanQueue <- scanFile{
					path:            path,
					info:            info,
					caseSensitiveFs: csFs,
				}:
				case <-ctx.Done():
				}
			}()

			return nil
		})

		// all senders for this path must finish before the counters are read
		// or the queue is closed
		wg.Wait()

		if err != nil {
			if !errors.Is(err, context.Canceled) {
				logger.Errorf("Error encountered queuing files to scan: %s", err.Error())
			}

			// on cancellation there is no point walking the remaining paths
			return total, newFiles
		}
	}

	return total, newFiles
}

// doesPathExist reports whether the database already holds a gallery, scene
// or image for path. Which of the three is queried depends on the file
// extension, checked in the order gallery, video, image; a path matching
// none of them is reported as not existing.
//
// The error result of each lookup is discarded: only a non-nil record counts
// as existing. A failure of the read transaction itself is logged as a
// warning.
func (j *ScanJob) doesPathExist(path string) bool {
	cfg := config.GetInstance()
	videoExts := cfg.GetVideoExtensions()
	imageExts := cfg.GetImageExtensions()
	galleryExts := cfg.GetGalleryExtensions()

	found := false
	lookup := func(r models.ReaderRepository) error {
		switch {
		case utils.MatchExtension(path, galleryExts):
			gallery, _ := r.Gallery().FindByPath(path)
			if gallery != nil {
				found = true
			}
		case utils.MatchExtension(path, videoExts):
			scene, _ := r.Scene().FindByPath(path)
			if scene != nil {
				found = true
			}
		case utils.MatchExtension(path, imageExts):
			image, _ := r.Image().FindByPath(path)
			if image != nil {
				found = true
			}
		}

		return nil
	}

	if err := j.txnManager.WithReadTxn(context.TODO(), lookup); err != nil {
		logger.Warnf("error checking if file exists in database: %v", err)
	}

	return found
}

// ScanTask scans a single file and, for scenes, optionally runs follow-up
// generation tasks (see Start).
//
// file is the file being scanned; its path decides whether it is treated as
// a gallery, a video or an image. GenerateSprite, GeneratePhash and
// GeneratePreview select which generation tasks Start runs after a scene has
// been scanned; GenerateImagePreview is forwarded to the preview task.
// fileNamingAlgorithm is forwarded to each generation task and TxnManager to
// the phash task. progress is used to report the task currently executing.
//
// ctx and mutexManager are set by ScanJob.Execute, which shares one mutex
// manager between all tasks of a job. The remaining fields
// (UseFileMetadata, StripFileExtension, calculateMD5, GenerateThumbnails,
// zipGallery, CaseSensitiveFs) are read by scan code that lives outside this
// part of the file.
type ScanTask struct {
	ctx                  context.Context
	TxnManager           models.TransactionManager
	file                 file.SourceFile
	UseFileMetadata      bool
	StripFileExtension   bool
	calculateMD5         bool
	fileNamingAlgorithm  models.HashAlgorithm
	GenerateSprite       bool
	GeneratePhash        bool
	GeneratePreview      bool
	GenerateImagePreview bool
	GenerateThumbnails   bool
	zipGallery           *models.Gallery
	progress             *job.Progress
	CaseSensitiveFs      bool

	mutexManager *utils.MutexManager
}

// Start scans the task's file as a gallery, a video or an image, depending
// on what its path is recognised as. If scanning a video produced a scene,
// the generation tasks enabled on the task (sprites, phash, preview) are then
// run for it through a sized wait group of 2, and Start returns only once
// all of them have finished.
func (t *ScanTask) Start(ctx context.Context) {
	path := t.file.Path()

	var scene *models.Scene
	t.progress.ExecuteTask("Scanning "+path, func() {
		switch {
		case isGallery(path):
			t.scanGallery(ctx)
		case isVideo(path):
			scene = t.scanScene()
		case isImage(path):
			t.scanImage()
		}
	})

	// only scenes have follow-up generation work
	if scene == nil {
		return
	}

	generators := sizedwaitgroup.New(2)

	if t.GenerateSprite {
		generators.Add()

		go t.progress.ExecuteTask(fmt.Sprintf("Generating sprites for %s", path), func() {
			spriteTask := GenerateSpriteTask{
				Scene:               *scene,
				Overwrite:           false,
				fileNamingAlgorithm: t.fileNamingAlgorithm,
			}
			spriteTask.Start()
			generators.Done()
		})
	}

	if t.GeneratePhash {
		generators.Add()

		go t.progress.ExecuteTask(fmt.Sprintf("Generating phash for %s", path), func() {
			phashTask := GeneratePhashTask{
				Scene:               *scene,
				fileNamingAlgorithm: t.fileNamingAlgorithm,
				txnManager:          t.TxnManager,
			}
			phashTask.Start()
			generators.Done()
		})
	}

	if t.GeneratePreview {
		generators.Add()

		go t.progress.ExecuteTask(fmt.Sprintf("Generating preview for %s", path), func() {
			// preview settings are read from the configuration at the time
			// the preview is generated
			cfg := config.GetInstance()
			segmentDuration := cfg.GetPreviewSegmentDuration()
			segments := cfg.GetPreviewSegments()
			excludeStart := cfg.GetPreviewExcludeStart()
			excludeEnd := cfg.GetPreviewExcludeEnd()
			preset := cfg.GetPreviewPreset()

			// NOTE: the reuse of this model like this is painful.
			previewTask := GeneratePreviewTask{
				Scene:        *scene,
				ImagePreview: t.GenerateImagePreview,
				Options: models.GeneratePreviewOptionsInput{
					PreviewSegments:        &segments,
					PreviewSegmentDuration: &segmentDuration,
					PreviewExcludeStart:    &excludeStart,
					PreviewExcludeEnd:      &excludeEnd,
					PreviewPreset:          &preset,
				},
				Overwrite:           false,
				fileNamingAlgorithm: t.fileNamingAlgorithm,
			}
			previewTask.Start()
			generators.Done()
		})
	}

	generators.Wait()
}

// walkFilesToScan walks s.Path using utils.SymWalk and calls f for every file
// that should be scanned:
//
//   - video files, unless s.ExcludeVideo is set or the path matches a
//     configured video exclusion pattern;
//   - image and gallery files, unless s.ExcludeImage is set or the path
//     matches a configured image exclusion pattern.
//
// Directories inside the generated path are skipped, as are directories for
// which both videos and images are excluded. Walk errors reported for
// individual entries are logged as warnings and do not stop the walk. A zip
// path is never walked: a warning is logged and nil is returned. The value
// returned by f is passed back to the walker.
func walkFilesToScan(s *models.StashConfig, f filepath.WalkFunc) error {
	cfg := config.GetInstance()
	videoExts := cfg.GetVideoExtensions()
	imageExts := cfg.GetImageExtensions()
	galleryExts := cfg.GetGalleryExtensions()
	videoExcludes := generateRegexps(cfg.GetExcludes())
	imageExcludes := generateRegexps(cfg.GetImageExcludes())

	// don't scan zip images directly
	if file.IsZipPath(s.Path) {
		logger.Warnf("Cannot rescan zip image %s. Rescan zip gallery instead.", s.Path)
		return nil
	}

	generatedDir := cfg.GetGeneratedPath()

	visit := func(path string, info os.FileInfo, err error) error {
		if err != nil {
			logger.Warnf("error scanning %s: %s", path, err.Error())
			return nil
		}

		if info.IsDir() {
			// #1102 - ignore files in generated path
			if utils.IsPathInDir(generatedDir, path) {
				return filepath.SkipDir
			}

			// shortcut: a directory is skipped entirely only when it is excluded
			// for videos and for images alike. The trailing separator lets the
			// directory match patterns like path/.*
			dirPattern := path + string(filepath.Separator)

			videoExcluded := s.ExcludeVideo || matchFileRegex(dirPattern, videoExcludes)
			if !videoExcluded {
				return nil
			}

			imageExcluded := s.ExcludeImage || matchFileRegex(dirPattern, imageExcludes)
			if imageExcluded {
				return filepath.SkipDir
			}

			return nil
		}

		wantedVideo := !s.ExcludeVideo && utils.MatchExtension(path, videoExts) && !matchFileRegex(path, videoExcludes)
		if wantedVideo {
			return f(path, info, err)
		}

		if s.ExcludeImage {
			return nil
		}

		imageOrGallery := utils.MatchExtension(path, imageExts) || utils.MatchExtension(path, galleryExts)
		if imageOrGallery && !matchFileRegex(path, imageExcludes) {
			return f(path, info, err)
		}

		return nil
	}

	return utils.SymWalk(s.Path, visit)
}