stash/pkg/scraper/cache.go

package scraper

import (
	"context"
	"crypto/tls"
	"fmt"
	"net/http"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"github.com/stashapp/stash/pkg/fsutil"
	"github.com/stashapp/stash/pkg/logger"
	"github.com/stashapp/stash/pkg/match"
	"github.com/stashapp/stash/pkg/models"
	"github.com/stashapp/stash/pkg/txn"
)

const (
	// scrapeGetTimeout bounds every HTTP request issued through the scraper
	// client; time spent transferring the response counts towards it. If
	// finer control is ever needed, this value can be raised and combined
	// with per-request context timeouts.
	scrapeGetTimeout = 60 * time.Second

	// maxIdleConnsPerHost caps how many idle connections the HTTP client
	// keeps open to any single host.
	maxIdleConnsPerHost = 8

	// maxRedirects is the number of redirects the HTTP client follows
	// before it gives up.
	maxRedirects = 20
)

// GlobalConfig exposes the application-wide settings that the scraper
// subsystem reads.
type GlobalConfig interface {
	// GetScrapersPath returns the directory that ReloadScrapers walks for
	// scraper yml configs.
	GetScrapersPath() string
	// GetScraperCertCheck reports whether TLS certificates are verified for
	// scraper HTTP requests; when it returns false, verification is skipped.
	GetScraperCertCheck() bool
	// GetScraperCDPPath returns the configured CDP path. It may be an
	// http(s):// or ws:// address (see isCDPPathHTTP and isCDPPathWS).
	GetScraperCDPPath() string

	// The remaining settings are not read in this file.
	// NOTE(review): presumably consumed by the individual scraper
	// implementations — confirm against their callers.
	GetScraperUserAgent() string
	GetPythonPath() string
	GetProxy() string
}

// isCDPPathHTTP reports whether the configured CDP path starts with
// "http://" or "https://". The comparison is case-sensitive.
func isCDPPathHTTP(c GlobalConfig) bool {
	switch cdpPath := c.GetScraperCDPPath(); {
	case strings.HasPrefix(cdpPath, "http://"),
		strings.HasPrefix(cdpPath, "https://"):
		return true
	default:
		return false
	}
}

// isCDPPathWS reports whether the configured CDP path is a WebSocket
// address, i.e. starts with "ws://" or its TLS variant "wss://".
//
// Both schemes are accepted so that this check mirrors isCDPPathHTTP, which
// recognises the plain and the TLS form of its scheme. The comparison is
// case-sensitive.
func isCDPPathWS(c GlobalConfig) bool {
	cdpPath := c.GetScraperCDPPath()
	return strings.HasPrefix(cdpPath, "ws://") || strings.HasPrefix(cdpPath, "wss://")
}

// SceneFinder bundles the scene store interfaces the scraper package relies
// on. getScene uses it to find a scene by id and to load that scene's URLs
// and files.
type SceneFinder interface {
	models.SceneGetter
	models.VideoFileLoader
	models.URLLoader
}

type PerformerFinder interface {
	models.PerformerAutoTagQueryer
	match.PerformerFinder
}

// StudioFinder combines the studio lookup operations that the scraper
// package requires from the studio store.
type StudioFinder interface {
	// FindByStashID looks up studios by the given StashID.
	FindByStashID(ctx context.Context, stashID models.StashID) ([]*models.Studio, error)

	models.StudioAutoTagQueryer
}

// TagFinder combines the tag lookup interfaces that the scraper package
// requires from the tag store.
type TagFinder interface {
	models.TagAutoTagQueryer
	models.TagGetter
}

// GalleryFinder bundles the gallery store interfaces the scraper package
// relies on. getGallery uses it to find a gallery by id and to load that
// gallery's URLs and files.
type GalleryFinder interface {
	models.GalleryGetter
	models.URLLoader
	models.FileLoader
}

// Repository bundles the transaction manager and the store interfaces that
// the scraper package uses to read from the database.
type Repository struct {
	// TxnManager is the manager WithReadTxn runs read transactions with.
	TxnManager models.TxnManager

	// Store interfaces. NewRepository fills each one from the corresponding
	// store of a models.Repository.
	SceneFinder     SceneFinder
	GalleryFinder   GalleryFinder
	TagFinder       TagFinder
	PerformerFinder PerformerFinder
	GroupFinder     match.GroupNamesFinder
	StudioFinder    StudioFinder
}

// NewRepository builds a scraper Repository from the application-wide
// models.Repository, taking over its transaction manager and its scene,
// gallery, tag, performer, group and studio stores.
func NewRepository(repo models.Repository) Repository {
	var r Repository

	r.TxnManager = repo.TxnManager

	r.SceneFinder = repo.Scene
	r.GalleryFinder = repo.Gallery
	r.TagFinder = repo.Tag
	r.PerformerFinder = repo.Performer
	r.GroupFinder = repo.Group
	r.StudioFinder = repo.Studio

	return r
}

// WithReadTxn runs fn through txn.WithReadTxn using the repository's
// transaction manager and returns the error that call reports.
func (r *Repository) WithReadTxn(ctx context.Context, fn txn.TxnFunc) error {
	mgr := r.TxnManager
	return txn.WithReadTxn(ctx, mgr, fn)
}

// Cache stores the database of scrapers, together with the HTTP client,
// global configuration and repository that scrape calls need.
//
// NewCache leaves scrapers unset; ReloadScrapers populates it.
type Cache struct {
	// client is the HTTP client handed to a scraper on every scrape call.
	client       *http.Client
	scrapers     map[string]scraper // Scraper ID -> Scraper
	globalConfig GlobalConfig

	// repository is used to load scenes and galleries for ScrapeID and is
	// passed to the built-in auto-tag scraper.
	repository Repository
}

// newClient builds the scraper-local HTTP client used throughout the
// scraper subsystem.
//
// The client times out after scrapeGetTimeout, takes its proxy settings from
// the environment and follows at most maxRedirects redirects. TLS
// certificate verification follows gc.GetScraperCertCheck.
func newClient(gc GlobalConfig) *http.Client {
	transport := &http.Transport{
		// Certificates are only left unverified when the cert check
		// option is switched off.
		TLSClientConfig:     &tls.Config{InsecureSkipVerify: !gc.GetScraperCertCheck()},
		MaxIdleConnsPerHost: maxIdleConnsPerHost,
		Proxy:               http.ProxyFromEnvironment,
	}

	// Same policy as net/http's default redirect check, with the limit
	// changed from 10 to maxRedirects.
	checkRedirect := func(_ *http.Request, via []*http.Request) error {
		if len(via) < maxRedirects {
			return nil
		}
		return fmt.Errorf("%w: gave up after %d redirects", ErrMaxRedirects, maxRedirects)
	}

	return &http.Client{
		Transport:     transport,
		Timeout:       scrapeGetTimeout,
		CheckRedirect: checkRedirect,
	}
}

// NewCache creates a Cache bound to the given global configuration and
// repository.
//
// Scraper configurations live in yml files in the configured scrapers
// directory and its subdirectories. NewCache does not read them: the
// returned Cache holds no scrapers until ReloadScrapers is called
// explicitly.
func NewCache(globalConfig GlobalConfig, repo Repository) *Cache {
	cache := new(Cache)
	cache.globalConfig = globalConfig
	cache.repository = repo

	// A single HTTP client is built here and handed to the scrapers on
	// every scrape call made through this Cache.
	cache.client = newClient(globalConfig)

	return cache
}

// ReloadScrapers clears the scraper cache and reloads from the scraper path.
//
// The built-in scrapers are always registered. Every file with a .yml
// extension found while walking the scrapers path is then loaded as a
// scraper config. Loading is best-effort: if an entry cannot be read or a
// scraper cannot be loaded, the problem is logged and that entry is skipped.
// The new set replaces the previous one only once the walk has finished.
func (c *Cache) ReloadScrapers() {
	path := c.globalConfig.GetScrapersPath()
	scrapers := make(map[string]scraper)

	// Add built-in scrapers
	freeOnes := getFreeonesScraper(c.globalConfig)
	autoTag := getAutoTagScraper(c.repository, c.globalConfig)
	scrapers[freeOnes.spec().ID] = freeOnes
	scrapers[autoTag.spec().ID] = autoTag

	logger.Debugf("Reading scraper configs from %s", path)

	err := fsutil.SymWalk(path, func(fp string, _ os.FileInfo, err error) error {
		if err != nil {
			// The walk could not read this entry, so there is nothing to
			// load from it. Report it and return nil so one bad entry does
			// not stop the remaining scrapers from loading.
			// NOTE(review): assumes SymWalk follows the filepath.WalkFunc
			// contract for err — confirm.
			if os.IsNotExist(err) {
				// A scrapers directory that does not exist is not worth
				// an error-level message.
				logger.Debugf("Scraper path %s does not exist, skipping", fp)
			} else {
				logger.Errorf("Error reading scraper path %s: %v", fp, err)
			}
			return nil
		}

		if filepath.Ext(fp) != ".yml" {
			return nil
		}

		conf, err := loadConfigFromYAMLFile(fp)
		if err != nil {
			logger.Errorf("Error loading scraper %s: %v", fp, err)
			return nil
		}

		// Named s rather than scraper so the scraper type is not shadowed.
		s := newGroupScraper(*conf, c.globalConfig)
		scrapers[s.spec().ID] = s
		return nil
	})

	if err != nil {
		logger.Errorf("Error reading scraper configs: %v", err)
	}

	c.scrapers = scrapers
}

// ListScrapers lists scrapers matching one of the given types.
//
// A scraper that supports several of the requested types is listed once.
// The result is sorted by name, ignoring case; scrapers whose names are
// equal ignoring case are ordered by ID, so the order does not depend on
// map iteration. The result is nil when no scraper matches.
func (c Cache) ListScrapers(tys []ScrapeContentType) []*Scraper {
	var ret []*Scraper
	for _, s := range c.scrapers {
		for _, t := range tys {
			if s.supports(t) {
				spec := s.spec()
				ret = append(ret, &spec)
				break
			}
		}
	}

	sort.Slice(ret, func(i, j int) bool {
		ni, nj := strings.ToLower(ret[i].Name), strings.ToLower(ret[j].Name)
		if ni != nj {
			return ni < nj
		}
		// Tie-break on ID to keep the order stable between calls.
		return ret[i].ID < ret[j].ID
	})

	return ret
}

// GetScraper looks up the scraper registered under scraperID and returns a
// pointer to a copy of its spec, or nil when no such scraper is cached.
func (c Cache) GetScraper(scraperID string) *Scraper {
	found := c.findScraper(scraperID)
	if found == nil {
		return nil
	}

	spec := found.spec()
	return &spec
}

// findScraper returns the cached scraper registered under scraperID, or nil
// when there is none.
func (c Cache) findScraper(scraperID string) scraper {
	// Indexing a map with a missing key (or indexing a nil map) yields the
	// zero value, which for the scraper interface is nil.
	return c.scrapers[scraperID]
}

// ScrapeName runs a by-name query of content type ty against the scraper
// registered under id and returns the post-processed results.
//
// It returns an error wrapping ErrNotFound when no scraper has that id, and
// one wrapping ErrNotSupported when the scraper does not support ty or
// cannot scrape by name. Errors from the scrape itself and from
// post-processing are wrapped with the scraper id.
func (c Cache) ScrapeName(ctx context.Context, id, query string, ty ScrapeContentType) ([]ScrapedContent, error) {
	s := c.findScraper(id)
	switch {
	case s == nil:
		return nil, fmt.Errorf("%w: id %s", ErrNotFound, id)
	case !s.supports(ty):
		return nil, fmt.Errorf("%w: cannot use scraper %s as a %v scraper", ErrNotSupported, id, ty)
	}

	byName, isNameScraper := s.(nameScraper)
	if !isNameScraper {
		return nil, fmt.Errorf("%w: cannot use scraper %s to scrape by name", ErrNotSupported, id)
	}

	results, err := byName.viaName(ctx, c.client, query, ty)
	if err != nil {
		return nil, fmt.Errorf("error while name scraping with scraper %s: %w", id, err)
	}

	// Post-process every result in place; the first failure aborts the
	// whole call.
	for i := range results {
		results[i], err = c.postScrape(ctx, results[i])
		if err != nil {
			return nil, fmt.Errorf("error while post-scraping with scraper %s: %w", id, err)
		}
	}

	return results, nil
}

// ScrapeFragment scrapes using the given fragment input and the scraper
// registered under id, and returns the post-processed result.
//
// It returns an error wrapping ErrNotFound when no scraper has that id, and
// one wrapping ErrNotSupported when the scraper is not a fragment scraper.
func (c Cache) ScrapeFragment(ctx context.Context, id string, input Input) (ScrapedContent, error) {
	// Fill in the deprecated URL field when it has not been set.
	input.populateURL()

	s := c.findScraper(id)
	if s == nil {
		return nil, fmt.Errorf("%w: id %s", ErrNotFound, id)
	}

	byFragment, isFragmentScraper := s.(fragmentScraper)
	if !isFragmentScraper {
		return nil, fmt.Errorf("%w: cannot use scraper %s as a fragment scraper", ErrNotSupported, id)
	}

	scraped, err := byFragment.viaFragment(ctx, c.client, input)
	if err != nil {
		return nil, fmt.Errorf("error while fragment scraping with scraper %s: %w", id, err)
	}

	return c.postScrape(ctx, scraped)
}

// ScrapeURL scrapes a given url for the given content. It searches the
// scraper cache and uses the first scraper capable of scraping the url into
// the desired content.
//
// Scrapers are tried in ascending ID order, so when several scrapers support
// the same url the same one is picked on every call rather than whichever
// map iteration happens to visit first.
//
// It returns the post-processed content, or nil content with a nil error
// when no scraper supports the url or the chosen scraper found nothing. An
// error wrapping ErrNotSupported is returned when the chosen scraper cannot
// scrape by URL; an error from the scrape itself is returned as is.
func (c Cache) ScrapeURL(ctx context.Context, url string, ty ScrapeContentType) (ScrapedContent, error) {
	ids := make([]string, 0, len(c.scrapers))
	for id := range c.scrapers {
		ids = append(ids, id)
	}
	sort.Strings(ids)

	for _, id := range ids {
		s := c.scrapers[id]
		if !s.supportsURL(url, ty) {
			continue
		}

		ul, ok := s.(urlScraper)
		if !ok {
			return nil, fmt.Errorf("%w: cannot use scraper %s as an url scraper", ErrNotSupported, s.spec().ID)
		}

		ret, err := ul.viaURL(ctx, c.client, url, ty)
		if err != nil {
			return nil, err
		}

		// Nothing was scraped, so there is nothing to post-process.
		if ret == nil {
			return nil, nil
		}

		return c.postScrape(ctx, ret)
	}

	return nil, nil
}

// ScrapeID scrapes content of type ty for the stored object with the given
// id, using the scraper registered under scraperID, and returns the
// post-processed result.
//
// Scenes and galleries are handled here: the object is loaded from the
// repository and handed to the scraper. For any other content type nothing
// is scraped and a nil result is passed on to postScrape.
//
// It returns an error wrapping ErrNotFound when no scraper has that id, and
// one wrapping ErrNotSupported when the scraper does not support ty or is
// not a scene/gallery scraper. Load and scrape errors are wrapped with the
// scraper id.
func (c Cache) ScrapeID(ctx context.Context, scraperID string, id int, ty ScrapeContentType) (ScrapedContent, error) {
	s := c.findScraper(scraperID)
	switch {
	case s == nil:
		return nil, fmt.Errorf("%w: id %s", ErrNotFound, scraperID)
	case !s.supports(ty):
		return nil, fmt.Errorf("%w: cannot use scraper %s to scrape %v content", ErrNotSupported, scraperID, ty)
	}

	// result is only assigned when a scrape yields a non-nil value: storing
	// a nil concrete pointer in the interface would make nil detection
	// harder for whoever receives it.
	var result ScrapedContent

	switch ty {
	case ScrapeContentTypeScene:
		byScene, isSceneScraper := s.(sceneScraper)
		if !isSceneScraper {
			return nil, fmt.Errorf("%w: cannot use scraper %s as a scene scraper", ErrNotSupported, scraperID)
		}

		scene, err := c.getScene(ctx, id)
		if err != nil {
			return nil, fmt.Errorf("scraper %s: unable to load scene id %v: %w", scraperID, id, err)
		}

		scrapedScene, err := byScene.viaScene(ctx, c.client, scene)
		if err != nil {
			return nil, fmt.Errorf("scraper %s: %w", scraperID, err)
		}

		if scrapedScene != nil {
			result = scrapedScene
		}

	case ScrapeContentTypeGallery:
		byGallery, isGalleryScraper := s.(galleryScraper)
		if !isGalleryScraper {
			return nil, fmt.Errorf("%w: cannot use scraper %s as a gallery scraper", ErrNotSupported, scraperID)
		}

		gallery, err := c.getGallery(ctx, id)
		if err != nil {
			return nil, fmt.Errorf("scraper %s: unable to load gallery id %v: %w", scraperID, id, err)
		}

		scrapedGallery, err := byGallery.viaGallery(ctx, c.client, gallery)
		if err != nil {
			return nil, fmt.Errorf("scraper %s: %w", scraperID, err)
		}

		if scrapedGallery != nil {
			result = scrapedGallery
		}
	}

	return c.postScrape(ctx, result)
}

// getScene loads the scene with the given id inside a read transaction,
// together with its URLs and files.
//
// It returns an error when the lookup fails, when no scene has that id, or
// when loading the URLs or files fails; the returned scene is nil in every
// error case.
func (c Cache) getScene(ctx context.Context, sceneID int) (*models.Scene, error) {
	var scene *models.Scene

	repo := c.repository
	load := func(ctx context.Context) error {
		finder := repo.SceneFinder

		var err error
		scene, err = finder.Find(ctx, sceneID)
		switch {
		case err != nil:
			return err
		case scene == nil:
			return fmt.Errorf("scene with id %d not found", sceneID)
		}

		if err := scene.LoadURLs(ctx, finder); err != nil {
			return err
		}

		return scene.LoadFiles(ctx, finder)
	}

	if err := repo.WithReadTxn(ctx, load); err != nil {
		return nil, err
	}

	return scene, nil
}

// getGallery loads the gallery with the given id inside a read transaction,
// together with its URLs and files.
//
// It returns an error when the lookup fails, when no gallery has that id,
// or when loading the URLs or files fails; the returned gallery is nil in
// every error case.
func (c Cache) getGallery(ctx context.Context, galleryID int) (*models.Gallery, error) {
	var gallery *models.Gallery

	repo := c.repository
	load := func(ctx context.Context) error {
		finder := repo.GalleryFinder

		var err error
		gallery, err = finder.Find(ctx, galleryID)
		switch {
		case err != nil:
			return err
		case gallery == nil:
			return fmt.Errorf("gallery with id %d not found", galleryID)
		}

		if err := gallery.LoadURLs(ctx, finder); err != nil {
			return err
		}

		return gallery.LoadFiles(ctx, finder)
	}

	if err := repo.WithReadTxn(ctx, load); err != nil {
		return nil, err
	}

	return gallery, nil
}