stash/pkg/manager/task_scan.go

389 lines
9.6 KiB
Go
Raw Normal View History

2019-02-09 12:30:49 +00:00
package manager
import (
	"context"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/remeh/sizedwaitgroup"
	"github.com/stashapp/stash/pkg/file"
	"github.com/stashapp/stash/pkg/job"
	"github.com/stashapp/stash/pkg/logger"
	"github.com/stashapp/stash/pkg/manager/config"
	"github.com/stashapp/stash/pkg/models"
	"github.com/stashapp/stash/pkg/utils"
)
// scanQueueSize is the buffer capacity of the channel that carries files from
// queueFiles (the producer) to the scanning loop in ScanJob.Execute.
const scanQueueSize = 200000
2021-05-24 04:24:18 +00:00
// ScanJob is the job that walks the configured stash paths and scans every
// matching file it finds.
type ScanJob struct {
	// txnManager is handed to each ScanTask and is used for the read
	// transactions that check whether a path is already known.
	txnManager models.TransactionManager

	// input holds the scan options (paths to scan and generation flags).
	input models.ScanMetadataInput

	// subscriptions is notified once Execute has finished its work.
	subscriptions *subscriptionManager
}
// scanFile is a single entry on the scan queue: one file discovered while
// walking a stash path.
type scanFile struct {
	// path is the file's path as reported by the directory walk.
	path string

	// info is the file information reported by the directory walk.
	info os.FileInfo

	// caseSensitiveFs is the case-sensitivity result determined for the
	// stash path this file was found under.
	caseSensitiveFs bool
}
2021-05-24 04:24:18 +00:00
func (j *ScanJob) Execute(ctx context.Context, progress *job.Progress) {
input := j.input
paths := getScanPaths(input.Paths)
if job.IsCancelled(ctx) {
logger.Info("Stopping due to user request")
return
}
start := time.Now()
config := config.GetInstance()
parallelTasks := config.GetParallelTasksWithAutoDetection()
2021-05-24 04:24:18 +00:00
logger.Infof("Scan started with %d parallel tasks", parallelTasks)
fileQueue := make(chan scanFile, scanQueueSize)
go func() {
total, newFiles := j.queueFiles(ctx, paths, fileQueue, parallelTasks)
if !job.IsCancelled(ctx) {
progress.SetTotal(total)
logger.Infof("Finished counting files. Total files to scan: %d, %d new files found", total, newFiles)
}
}()
wg := sizedwaitgroup.New(parallelTasks)
2021-05-24 04:24:18 +00:00
fileNamingAlgo := config.GetVideoFileNamingAlgorithm()
calculateMD5 := config.IsCalculateMD5()
var err error
var galleries []string
mutexManager := utils.NewMutexManager()
2021-05-24 04:24:18 +00:00
for f := range fileQueue {
if job.IsCancelled(ctx) {
break
}
2021-05-24 04:24:18 +00:00
if isGallery(f.path) {
galleries = append(galleries, f.path)
}
2021-05-24 04:24:18 +00:00
if err := instance.Paths.Generated.EnsureTmpDir(); err != nil {
logger.Warnf("couldn't create temporary directory: %v", err)
2021-05-24 04:24:18 +00:00
}
wg.Add()
task := ScanTask{
TxnManager: j.txnManager,
file: file.FSFile(f.path, f.info),
UseFileMetadata: utils.IsTrue(input.UseFileMetadata),
StripFileExtension: utils.IsTrue(input.StripFileExtension),
fileNamingAlgorithm: fileNamingAlgo,
calculateMD5: calculateMD5,
GeneratePreview: utils.IsTrue(input.ScanGeneratePreviews),
GenerateImagePreview: utils.IsTrue(input.ScanGenerateImagePreviews),
GenerateSprite: utils.IsTrue(input.ScanGenerateSprites),
GeneratePhash: utils.IsTrue(input.ScanGeneratePhashes),
GenerateThumbnails: utils.IsTrue(input.ScanGenerateThumbnails),
progress: progress,
CaseSensitiveFs: f.caseSensitiveFs,
ctx: ctx,
mutexManager: mutexManager,
2021-05-24 04:24:18 +00:00
}
go func() {
task.Start(ctx)
wg.Done()
progress.Increment()
}()
2021-05-24 04:24:18 +00:00
}
wg.Wait()
Errcheck phase 1 (#1715) * Avoid redundant logging in migrations Return the error and let the caller handle the logging of the error if needed. While here, defer m.Close() to the function boundary. * Treat errors as values Use %v rather than %s and pass the errors directly. * Generate a wrapped error on stat-failure * Log 3 unchecked errors Rather than ignore errors, log them at the WARNING log level. The server has been functioning without these, so assume they are not at the ERROR level. * Propagate errors upward Failure in path generation was ignored. Propagate the errors upward the call stack, so it can be handled at the level of orchestration. * Warn on errors Log errors rather than quenching them. Errors are logged at the Warn-level for now. * Check error when creating test databases Use the builtin log package and stop the program fatally on error. * Add warnings to uncheck task errors Focus on the task system in a single commit, logging unchecked errors as warnings. * Warn-on-error in API routes Look through the API routes, and make sure errors are being logged if they occur. Prefer the Warn-log-level because none of these has proven to be fatal in the system up until now. * Propagate error when adding Util API * Propagate error on adding util API * Return unhandled error * JS log API: propagate and log errors * JS Plugins: log GQL addition failures. * Warn on failure to write to stdin * Warn on failure to stop task * Wrap viper.BindEnv The current viper code only errors if no name is provided, so it should never fail. Rewrite the code flow to factor through a panic-function. This removes error warnings from this part of the code. * Log errors in concurrency test If we can't initialize the configuration, treat the test as a failure. 
* Warn on errors in configuration code * Plug an unchecked error in gallery zip walking * Warn on screenshot serving failure * Warn on encoder screenshot failure * Warn on errors in path-handling code * Undo the errcheck on configurations for now. * Use one-line initializers where applicable rather than using err := f() if err!= nil { .. prefer the shorter if err := f(); err != nil { .. If f() isn't too long of a name, or wraps a function with a body.
2021-09-20 23:34:25 +00:00
if err := instance.Paths.Generated.EmptyTmpDir(); err != nil {
logger.Warnf("couldn't empty temporary directory: %v", err)
}
2021-05-24 04:24:18 +00:00
elapsed := time.Since(start)
logger.Info(fmt.Sprintf("Scan finished (%s)", elapsed))
if job.IsCancelled(ctx) {
logger.Info("Stopping due to user request")
return
}
if err != nil {
2021-05-24 04:24:18 +00:00
return
}
progress.ExecuteTask("Associating galleries", func() {
for _, path := range galleries {
wg.Add()
task := ScanTask{
TxnManager: j.txnManager,
file: file.FSFile(path, nil), // hopefully info is not needed
2021-05-24 04:24:18 +00:00
UseFileMetadata: false,
}
go task.associateGallery(&wg)
wg.Wait()
}
logger.Info("Finished gallery association")
})
j.subscriptions.notify()
}
func (j *ScanJob) queueFiles(ctx context.Context, paths []*models.StashConfig, scanQueue chan<- scanFile, parallelTasks int) (total int, newFiles int) {
defer close(scanQueue)
2021-05-24 04:24:18 +00:00
wg := sizedwaitgroup.New(parallelTasks)
2021-05-24 04:24:18 +00:00
for _, sp := range paths {
csFs, er := utils.IsFsPathCaseSensitive(sp.Path)
if er != nil {
logger.Warnf("Cannot determine fs case sensitivity: %s", er.Error())
}
2021-05-24 04:24:18 +00:00
err := walkFilesToScan(sp, func(path string, info os.FileInfo, err error) error {
2021-05-24 04:24:18 +00:00
// check stop
if job.IsCancelled(ctx) {
return context.Canceled
2021-05-24 04:24:18 +00:00
}
wg.Add()
go func() {
defer wg.Done()
// #1756 - skip zero length files and directories
if info.IsDir() {
return
}
if info.Size() == 0 {
logger.Infof("Skipping zero-length file: %s", path)
return
}
total++
if !j.doesPathExist(path) {
newFiles++
}
scanQueue <- scanFile{
path: path,
info: info,
caseSensitiveFs: csFs,
}
}()
2021-05-24 04:24:18 +00:00
return nil
})
wg.Wait()
if err != nil && !errors.Is(err, context.Canceled) {
logger.Errorf("Error encountered queuing files to scan: %s", err.Error())
return
2021-05-24 04:24:18 +00:00
}
}
2021-05-24 04:24:18 +00:00
return
}
func (j *ScanJob) doesPathExist(path string) bool {
config := config.GetInstance()
vidExt := config.GetVideoExtensions()
imgExt := config.GetImageExtensions()
gExt := config.GetGalleryExtensions()
ret := false
txnErr := j.txnManager.WithReadTxn(context.TODO(), func(r models.ReaderRepository) error {
if utils.MatchExtension(path, gExt) {
gallery, _ := r.Gallery().FindByPath(path)
if gallery != nil {
ret = true
}
} else if utils.MatchExtension(path, vidExt) {
s, _ := r.Scene().FindByPath(path)
if s != nil {
ret = true
}
} else if utils.MatchExtension(path, imgExt) {
i, _ := r.Image().FindByPath(path)
if i != nil {
ret = true
}
2021-05-24 04:24:18 +00:00
}
return nil
})
if txnErr != nil {
logger.Warnf("error checking if file exists in database: %v", txnErr)
2021-05-24 04:24:18 +00:00
}
return ret
2021-05-24 04:24:18 +00:00
}
2019-02-09 12:30:49 +00:00
type ScanTask struct {
ctx context.Context
TxnManager models.TransactionManager
file file.SourceFile
UseFileMetadata bool
StripFileExtension bool
calculateMD5 bool
fileNamingAlgorithm models.HashAlgorithm
GenerateSprite bool
GeneratePhash bool
GeneratePreview bool
GenerateImagePreview bool
GenerateThumbnails bool
zipGallery *models.Gallery
2021-05-24 04:24:18 +00:00
progress *job.Progress
CaseSensitiveFs bool
mutexManager *utils.MutexManager
2019-02-09 12:30:49 +00:00
}
Toward better context handling (#1835) * Use the request context The code uses context.Background() in a flow where there is a http.Request. Use the requests context instead. * Use a true context in the plugin example Let AddTag/RemoveTag take a context and use that context throughout the example. * Avoid the use of context.Background Prefer context.TODO over context.Background deep in the call chain. This marks the site as something which we need to context-handle later, and also makes it clear to the reader that the context is sort-of temporary in the code base. While here, be consistent in handling the `act` variable in each branch of the if .. { .. } .. check. * Prefer context.TODO over context.Background For the different scraping operations here, there is a context higher up the call chain, which we ought to use. Mark the call-sites as TODO for now, so we can come back later on a sweep of which parts can be context-lifted. * Thread context upwards Initialization requires context for transactions. Thread the context upward the call chain. At the intialization call, add a context.TODO since we can't break this yet. The singleton assumption prevents us from pulling it up into main for now. * make tasks context-aware Change the task interface to understand contexts. Pass the context down in some of the branches where it is needed. * Make QueryStashBoxScene context-aware This call naturally sits inside the request-context. Use it. * Introduce a context in the JS plugin code This allows us to use a context for HTTP calls inside the system. Mark the context with a TODO at top level for now. * Nitpick error formatting Use %v rather than %s for error interfaces. Do not begin an error strong with a capital letter. * Avoid the use of http.Get in FFMPEG download chain Since http.Get has no context, it isn't possible to break out or have policy induced. The call will block until the GET completes. Rewrite to use a http Request and provide a context. 
Thread the context through the call chain for now. provide context.TODO() at the top level of the initialization chain. * Make getRemoteCDPWSAddress aware of contexts Eliminate a call to http.Get and replace it with a context-aware variant. Push the context upwards in the call chain, but plug it before the scraper interface so we don't have to rewrite said interface yet. Plugged with context.TODO() * Scraper: make the getImage function context-aware Use a context, and pass it upwards. Plug it with context.TODO() up the chain before the rewrite gets too much out of hand for now. Minor tweaks along the way, remove a call to context.Background() deep in the call chain. * Make NOTIFY request context-aware The call sits inside a Request-handler. So it's natural to use the requests context as the context for the outgoing HTTP request. * Use a context in the url scraper code We are sitting in code which has a context, so utilize it for the request as well. * Use a context when checking versions When we check the version of stash on Github, use a context. Thread the context up to the initialization routine of the HTTP/GraphQL server and plug it with a context.TODO() for now. This paves the way for providing a context to the HTTP server code in a future patch. * Make utils func ReadImage context-aware In almost all of the cases, there is a context in the call chain which is a natural use. This is true for all the GraphQL mutations. The exception is in task_stash_box_tag, so plug that task with context.TODO() for now. * Make stash-box get context-aware Thread a context through the call chain until we hit the Client API. Plug it with context.TODO() there for now. * Enable the noctx linter The code is now free of any uncontexted HTTP request. This means we pass the noctx linter, and we can enable it in the code base.
2021-10-14 04:32:41 +00:00
func (t *ScanTask) Start(ctx context.Context) {
2021-05-24 04:24:18 +00:00
var s *models.Scene
path := t.file.Path()
t.progress.ExecuteTask("Scanning "+path, func() {
if isGallery(path) {
Toward better context handling (#1835) * Use the request context The code uses context.Background() in a flow where there is a http.Request. Use the requests context instead. * Use a true context in the plugin example Let AddTag/RemoveTag take a context and use that context throughout the example. * Avoid the use of context.Background Prefer context.TODO over context.Background deep in the call chain. This marks the site as something which we need to context-handle later, and also makes it clear to the reader that the context is sort-of temporary in the code base. While here, be consistent in handling the `act` variable in each branch of the if .. { .. } .. check. * Prefer context.TODO over context.Background For the different scraping operations here, there is a context higher up the call chain, which we ought to use. Mark the call-sites as TODO for now, so we can come back later on a sweep of which parts can be context-lifted. * Thread context upwards Initialization requires context for transactions. Thread the context upward the call chain. At the intialization call, add a context.TODO since we can't break this yet. The singleton assumption prevents us from pulling it up into main for now. * make tasks context-aware Change the task interface to understand contexts. Pass the context down in some of the branches where it is needed. * Make QueryStashBoxScene context-aware This call naturally sits inside the request-context. Use it. * Introduce a context in the JS plugin code This allows us to use a context for HTTP calls inside the system. Mark the context with a TODO at top level for now. * Nitpick error formatting Use %v rather than %s for error interfaces. Do not begin an error strong with a capital letter. * Avoid the use of http.Get in FFMPEG download chain Since http.Get has no context, it isn't possible to break out or have policy induced. The call will block until the GET completes. Rewrite to use a http Request and provide a context. 
Thread the context through the call chain for now. provide context.TODO() at the top level of the initialization chain. * Make getRemoteCDPWSAddress aware of contexts Eliminate a call to http.Get and replace it with a context-aware variant. Push the context upwards in the call chain, but plug it before the scraper interface so we don't have to rewrite said interface yet. Plugged with context.TODO() * Scraper: make the getImage function context-aware Use a context, and pass it upwards. Plug it with context.TODO() up the chain before the rewrite gets too much out of hand for now. Minor tweaks along the way, remove a call to context.Background() deep in the call chain. * Make NOTIFY request context-aware The call sits inside a Request-handler. So it's natural to use the requests context as the context for the outgoing HTTP request. * Use a context in the url scraper code We are sitting in code which has a context, so utilize it for the request as well. * Use a context when checking versions When we check the version of stash on Github, use a context. Thread the context up to the initialization routine of the HTTP/GraphQL server and plug it with a context.TODO() for now. This paves the way for providing a context to the HTTP server code in a future patch. * Make utils func ReadImage context-aware In almost all of the cases, there is a context in the call chain which is a natural use. This is true for all the GraphQL mutations. The exception is in task_stash_box_tag, so plug that task with context.TODO() for now. * Make stash-box get context-aware Thread a context through the call chain until we hit the Client API. Plug it with context.TODO() there for now. * Enable the noctx linter The code is now free of any uncontexted HTTP request. This means we pass the noctx linter, and we can enable it in the code base.
2021-10-14 04:32:41 +00:00
t.scanGallery(ctx)
} else if isVideo(path) {
2021-05-24 04:24:18 +00:00
s = t.scanScene()
} else if isImage(path) {
2021-05-24 04:24:18 +00:00
t.scanImage()
}
})
if s != nil {
iwg := sizedwaitgroup.New(2)
if t.GenerateSprite {
iwg.Add()
go t.progress.ExecuteTask(fmt.Sprintf("Generating sprites for %s", path), func() {
taskSprite := GenerateSpriteTask{
Scene: *s,
Overwrite: false,
fileNamingAlgorithm: t.fileNamingAlgorithm,
}
taskSprite.Start()
iwg.Done()
2021-05-24 04:24:18 +00:00
})
}
if t.GeneratePhash {
iwg.Add()
go t.progress.ExecuteTask(fmt.Sprintf("Generating phash for %s", path), func() {
taskPhash := GeneratePhashTask{
Scene: *s,
fileNamingAlgorithm: t.fileNamingAlgorithm,
txnManager: t.TxnManager,
}
taskPhash.Start()
iwg.Done()
2021-05-24 04:24:18 +00:00
})
}
2021-05-24 04:24:18 +00:00
if t.GeneratePreview {
iwg.Add()
go t.progress.ExecuteTask(fmt.Sprintf("Generating preview for %s", path), func() {
config := config.GetInstance()
var previewSegmentDuration = config.GetPreviewSegmentDuration()
var previewSegments = config.GetPreviewSegments()
var previewExcludeStart = config.GetPreviewExcludeStart()
var previewExcludeEnd = config.GetPreviewExcludeEnd()
var previewPresent = config.GetPreviewPreset()
// NOTE: the reuse of this model like this is painful.
previewOptions := models.GeneratePreviewOptionsInput{
PreviewSegments: &previewSegments,
PreviewSegmentDuration: &previewSegmentDuration,
PreviewExcludeStart: &previewExcludeStart,
PreviewExcludeEnd: &previewExcludeEnd,
PreviewPreset: &previewPresent,
}
taskPreview := GeneratePreviewTask{
Scene: *s,
ImagePreview: t.GenerateImagePreview,
Options: previewOptions,
Overwrite: false,
fileNamingAlgorithm: t.fileNamingAlgorithm,
}
taskPreview.Start()
iwg.Done()
2021-05-24 04:24:18 +00:00
})
}
2019-02-09 12:30:49 +00:00
2021-05-24 04:24:18 +00:00
iwg.Wait()
}
2019-02-09 12:30:49 +00:00
}
func walkFilesToScan(s *models.StashConfig, f filepath.WalkFunc) error {
config := config.GetInstance()
vidExt := config.GetVideoExtensions()
imgExt := config.GetImageExtensions()
gExt := config.GetGalleryExtensions()
excludeVidRegex := generateRegexps(config.GetExcludes())
excludeImgRegex := generateRegexps(config.GetImageExcludes())
// don't scan zip images directly
if file.IsZipPath(s.Path) {
logger.Warnf("Cannot rescan zip image %s. Rescan zip gallery instead.", s.Path)
return nil
}
generatedPath := config.GetGeneratedPath()
return utils.SymWalk(s.Path, func(path string, info os.FileInfo, err error) error {
2020-10-13 23:51:36 +00:00
if err != nil {
logger.Warnf("error scanning %s: %s", path, err.Error())
return nil
}
if info.IsDir() {
// #1102 - ignore files in generated path
if utils.IsPathInDir(generatedPath, path) {
return filepath.SkipDir
}
// shortcut: skip the directory entirely if it matches both exclusion patterns
// add a trailing separator so that it correctly matches against patterns like path/.*
pathExcludeTest := path + string(filepath.Separator)
if (s.ExcludeVideo || matchFileRegex(pathExcludeTest, excludeVidRegex)) && (s.ExcludeImage || matchFileRegex(pathExcludeTest, excludeImgRegex)) {
return filepath.SkipDir
}
return nil
}
if !s.ExcludeVideo && utils.MatchExtension(path, vidExt) && !matchFileRegex(path, excludeVidRegex) {
return f(path, info, err)
}
if !s.ExcludeImage {
if (utils.MatchExtension(path, imgExt) || utils.MatchExtension(path, gExt)) && !matchFileRegex(path, excludeImgRegex) {
return f(path, info, err)
}
}
return nil
})
}