package scraper

import (
	"context"
	"crypto/tls"
	"fmt"
	"net/http"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"github.com/stashapp/stash/pkg/fsutil"
	"github.com/stashapp/stash/pkg/logger"
	"github.com/stashapp/stash/pkg/match"
	"github.com/stashapp/stash/pkg/models"
	"github.com/stashapp/stash/pkg/txn"
)

const (
	// scrapeGetTimeout is the timeout for scraper HTTP requests. Includes transfer time.
	// We may want to bump this at some point and use local context-timeouts if more granularity
	// is needed.
	scrapeGetTimeout = time.Second * 60

	// maxIdleConnsPerHost is the maximum number of idle connections the HTTP client will
	// keep on a per-host basis.
	maxIdleConnsPerHost = 8

	// maxRedirects defines the maximum number of redirects the HTTP client will follow
	maxRedirects = 20
)

// GlobalConfig contains the global scraper options.
type GlobalConfig interface {
	GetScraperUserAgent() string
	GetScrapersPath() string
	GetScraperCDPPath() string
	GetScraperCertCheck() bool
	GetPythonPath() string
	GetProxy() string
}

func isCDPPathHTTP(c GlobalConfig) bool {
	return strings.HasPrefix(c.GetScraperCDPPath(), "http://") || strings.HasPrefix(c.GetScraperCDPPath(), "https://")
}

func isCDPPathWS(c GlobalConfig) bool {
	return strings.HasPrefix(c.GetScraperCDPPath(), "ws://")
}

type SceneFinder interface {
	models.SceneGetter
	models.URLLoader
	models.VideoFileLoader
}

type PerformerFinder interface {
	models.PerformerAutoTagQueryer
	match.PerformerFinder
}

type StudioFinder interface {
	models.StudioAutoTagQueryer
	FindByStashID(ctx context.Context, stashID models.StashID) ([]*models.Studio, error)
}

type TagFinder interface {
	models.TagGetter
	models.TagAutoTagQueryer
}

type GalleryFinder interface {
	models.GalleryGetter
	models.FileLoader
	models.URLLoader
}

type Repository struct {
	TxnManager models.TxnManager

	SceneFinder     SceneFinder
	GalleryFinder   GalleryFinder
	TagFinder       TagFinder
	PerformerFinder PerformerFinder
	GroupFinder     match.GroupNamesFinder
	StudioFinder    StudioFinder
}

func NewRepository(repo models.Repository) Repository {
	return Repository{
		TxnManager:      repo.TxnManager,
		SceneFinder:     repo.Scene,
		GalleryFinder:   repo.Gallery,
		TagFinder:       repo.Tag,
		PerformerFinder: repo.Performer,
		GroupFinder:     repo.Group,
		StudioFinder:    repo.Studio,
	}
}

func (r *Repository) WithReadTxn(ctx context.Context, fn txn.TxnFunc) error {
	return txn.WithReadTxn(ctx, r.TxnManager, fn)
}

// Cache stores the database of scrapers
type Cache struct {
	client       *http.Client
	scrapers     map[string]scraper // Scraper ID -> Scraper
	globalConfig GlobalConfig

	repository Repository
}

// newClient creates a scraper-local http client we use throughout the scraper subsystem.
func newClient(gc GlobalConfig) *http.Client {
	client := &http.Client{
		Transport: &http.Transport{ // ignore insecure certificates
			TLSClientConfig:     &tls.Config{InsecureSkipVerify: !gc.GetScraperCertCheck()},
			MaxIdleConnsPerHost: maxIdleConnsPerHost,
			Proxy:               http.ProxyFromEnvironment,
		},
		Timeout: scrapeGetTimeout,
		// defaultCheckRedirect code with max changed from 10 to maxRedirects
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			if len(via) >= maxRedirects {
				return fmt.Errorf("%w: gave up after %d redirects", ErrMaxRedirects, maxRedirects)
			}
			return nil
		},
	}

	return client
}

// NewCache returns a new Cache.
//
// Scraper configurations are loaded from yml files in the scrapers
// directory in the config and any subdirectories.
//
// Does not load scrapers. Scrapers will need to be
// loaded explicitly using ReloadScrapers.
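//
// A minimal wiring sketch (cfg and repo are hypothetical stand-ins for a
// GlobalConfig implementation and a models.Repository; neither is defined in
// this package):
//
//	cache := NewCache(cfg, NewRepository(repo))
//	cache.ReloadScrapers() // must be called before the cache is queried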
func NewCache(globalConfig GlobalConfig, repo Repository) *Cache {
	// HTTP Client setup
	client := newClient(globalConfig)

	return &Cache{
		client:       client,
		globalConfig: globalConfig,
		repository:   repo,
	}
}

// ReloadScrapers clears the scraper cache and reloads from the scraper path.
// If a scraper cannot be loaded, an error is logged and the scraper is skipped.
func (c *Cache) ReloadScrapers() {
	path := c.globalConfig.GetScrapersPath()
	scrapers := make(map[string]scraper)

	// Add built-in scrapers
	freeOnes := getFreeonesScraper(c.globalConfig)
	autoTag := getAutoTagScraper(c.repository, c.globalConfig)
	scrapers[freeOnes.spec().ID] = freeOnes
	scrapers[autoTag.spec().ID] = autoTag

	logger.Debugf("Reading scraper configs from %s", path)

	err := fsutil.SymWalk(path, func(fp string, f os.FileInfo, err error) error {
		if filepath.Ext(fp) == ".yml" {
			conf, err := loadConfigFromYAMLFile(fp)
			if err != nil {
				logger.Errorf("Error loading scraper %s: %v", fp, err)
			} else {
				scraper := newGroupScraper(*conf, c.globalConfig)
				scrapers[scraper.spec().ID] = scraper
			}
		}
		return nil
	})

	if err != nil {
		logger.Errorf("Error reading scraper configs: %v", err)
	}

	c.scrapers = scrapers
}

// ListScrapers lists scrapers matching one of the given types.
// Returns a list of scrapers, sorted by their name.
func (c Cache) ListScrapers(tys []ScrapeContentType) []*Scraper {
	var ret []*Scraper
	for _, s := range c.scrapers {
		for _, t := range tys {
			if s.supports(t) {
				spec := s.spec()
				ret = append(ret, &spec)
				break
			}
		}
	}

	sort.Slice(ret, func(i, j int) bool {
		return strings.ToLower(ret[i].Name) < strings.ToLower(ret[j].Name)
	})

	return ret
}

// GetScraper returns the scraper matching the provided id.
func (c Cache) GetScraper(scraperID string) *Scraper {
	s := c.findScraper(scraperID)
	if s != nil {
		spec := s.spec()
		return &spec
	}

	return nil
}

func (c Cache) findScraper(scraperID string) scraper {
	s, ok := c.scrapers[scraperID]
	if ok {
		return s
	}

	return nil
}

func (c Cache) ScrapeName(ctx context.Context, id, query string, ty ScrapeContentType) ([]ScrapedContent, error) {
	// find scraper with the provided id
	s := c.findScraper(id)
	if s == nil {
		return nil, fmt.Errorf("%w: id %s", ErrNotFound, id)
	}
	if !s.supports(ty) {
		return nil, fmt.Errorf("%w: cannot use scraper %s as a %v scraper", ErrNotSupported, id, ty)
	}

	ns, ok := s.(nameScraper)
	if !ok {
		return nil, fmt.Errorf("%w: cannot use scraper %s to scrape by name", ErrNotSupported, id)
	}

	content, err := ns.viaName(ctx, c.client, query, ty)
	if err != nil {
		return nil, fmt.Errorf("error while name scraping with scraper %s: %w", id, err)
	}

	for i, cc := range content {
		content[i], err = c.postScrape(ctx, cc)
		if err != nil {
			return nil, fmt.Errorf("error while post-scraping with scraper %s: %w", id, err)
		}
	}

	return content, nil
}

// ScrapeFragment uses the given fragment input to scrape
func (c Cache) ScrapeFragment(ctx context.Context, id string, input Input) (ScrapedContent, error) {
	// set the deprecated URL field if it's not set
	input.populateURL()

	s := c.findScraper(id)
	if s == nil {
		return nil, fmt.Errorf("%w: id %s", ErrNotFound, id)
	}

	fs, ok := s.(fragmentScraper)
	if !ok {
		return nil, fmt.Errorf("%w: cannot use scraper %s as a fragment scraper", ErrNotSupported, id)
	}

	content, err := fs.viaFragment(ctx, c.client, input)
	if err != nil {
		return nil, fmt.Errorf("error while fragment scraping with scraper %s: %w", id, err)
	}

	return c.postScrape(ctx, content)
}
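// A fragment-scrape error-handling sketch (cache, ctx, and input are
// hypothetical values and "example" is a placeholder scraper id; the sentinel
// errors ErrNotFound and ErrNotSupported are the ones wrapped above):
//
//	content, err := cache.ScrapeFragment(ctx, "example", input)
//	switch {
//	case errors.Is(err, ErrNotFound):
//		// no scraper with id "example" is loaded
//	case errors.Is(err, ErrNotSupported):
//		// the scraper exists but is not a fragment scraper
//	}
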
// ScrapeURL scrapes a given url for the given content. Searches the scraper
// cache and picks the first scraper capable of scraping the given url into the
// desired content. Returns the scraped content or an error if the scrape fails.
func (c Cache) ScrapeURL(ctx context.Context, url string, ty ScrapeContentType) (ScrapedContent, error) {
	for _, s := range c.scrapers {
		if s.supportsURL(url, ty) {
			ul, ok := s.(urlScraper)
			if !ok {
				return nil, fmt.Errorf("%w: cannot use scraper %s as an url scraper", ErrNotSupported, s.spec().ID)
			}
			ret, err := ul.viaURL(ctx, c.client, url, ty)
			if err != nil {
				return nil, err
			}

			if ret == nil {
				return ret, nil
			}

			return c.postScrape(ctx, ret)
		}
	}

	return nil, nil
}

func (c Cache) ScrapeID(ctx context.Context, scraperID string, id int, ty ScrapeContentType) (ScrapedContent, error) {
	s := c.findScraper(scraperID)
	if s == nil {
		return nil, fmt.Errorf("%w: id %s", ErrNotFound, scraperID)
	}

	if !s.supports(ty) {
		return nil, fmt.Errorf("%w: cannot use scraper %s to scrape %v content", ErrNotSupported, scraperID, ty)
	}

	var ret ScrapedContent
	switch ty {
	case ScrapeContentTypeScene:
		ss, ok := s.(sceneScraper)
		if !ok {
			return nil, fmt.Errorf("%w: cannot use scraper %s as a scene scraper", ErrNotSupported, scraperID)
		}

		scene, err := c.getScene(ctx, id)
		if err != nil {
			return nil, fmt.Errorf("scraper %s: unable to load scene id %v: %w", scraperID, id, err)
		}

		// don't assign nil concrete pointer to ret interface, otherwise nil
		// detection is harder
		scraped, err := ss.viaScene(ctx, c.client, scene)
		if err != nil {
			return nil, fmt.Errorf("scraper %s: %w", scraperID, err)
		}

		if scraped != nil {
			ret = scraped
		}
	case ScrapeContentTypeGallery:
		gs, ok := s.(galleryScraper)
		if !ok {
			return nil, fmt.Errorf("%w: cannot use scraper %s as a gallery scraper", ErrNotSupported, scraperID)
		}

		gallery, err := c.getGallery(ctx, id)
		if err != nil {
			return nil, fmt.Errorf("scraper %s: unable to load gallery id %v: %w", scraperID, id, err)
		}

		// don't assign nil concrete pointer to ret interface, otherwise nil
		// detection is harder
		scraped, err := gs.viaGallery(ctx, c.client, gallery)
		if err != nil {
			return nil, fmt.Errorf("scraper %s: %w", scraperID, err)
		}

		if scraped != nil {
			ret = scraped
		}
	}

	return c.postScrape(ctx, ret)
}

func (c Cache) getScene(ctx context.Context, sceneID int) (*models.Scene, error) {
	var ret *models.Scene

	r := c.repository
	if err := r.WithReadTxn(ctx, func(ctx context.Context) error {
		qb := r.SceneFinder

		var err error
		ret, err = qb.Find(ctx, sceneID)
		if err != nil {
			return err
		}

		if ret == nil {
			return fmt.Errorf("scene with id %d not found", sceneID)
		}

		if err := ret.LoadURLs(ctx, qb); err != nil {
			return err
		}

		if err := ret.LoadFiles(ctx, qb); err != nil {
			return err
		}

		return nil
	}); err != nil {
		return nil, err
	}

	return ret, nil
}

func (c Cache) getGallery(ctx context.Context, galleryID int) (*models.Gallery, error) {
	var ret *models.Gallery

	r := c.repository
	if err := r.WithReadTxn(ctx, func(ctx context.Context) error {
		qb := r.GalleryFinder

		var err error
		ret, err = qb.Find(ctx, galleryID)
		if err != nil {
			return err
		}

		if ret == nil {
			return fmt.Errorf("gallery with id %d not found", galleryID)
		}

		if err := ret.LoadURLs(ctx, qb); err != nil {
			return err
		}

		if err := ret.LoadFiles(ctx, qb); err != nil {
			return err
		}

		return nil
	}); err != nil {
		return nil, err
	}

	return ret, nil
}
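
// A URL-scrape sketch (cache, ctx, and url are hypothetical values). ScrapeURL
// returns nil content and a nil error when no cached scraper supports the URL
// for the requested type, so callers should check for nil explicitly:
//
//	content, err := cache.ScrapeURL(ctx, url, ScrapeContentTypeScene)
//	if err != nil {
//		// the matching scraper failed while scraping
//	} else if content == nil {
//		// no scraper in the cache supports this URL
//	}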