package scraper

import (
	"errors"
	"os"
	"path/filepath"
	"strconv"
	"strings"

	"github.com/stashapp/stash/pkg/logger"
	"github.com/stashapp/stash/pkg/models"
)

// GlobalConfig contains the global scraper options.
type GlobalConfig struct {
	// User Agent used when scraping using http.
	UserAgent string

	// Path (file or remote address) to a Chrome CDP instance.
	CDPPath string
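
	// Path to the scrapers directory.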
	Path string
}
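
// isCDPPathHTTP returns true when the CDP path is a remote http or https address.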
func (c GlobalConfig) isCDPPathHTTP() bool {
	return strings.HasPrefix(c.CDPPath, "http://") || strings.HasPrefix(c.CDPPath, "https://")
}
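
// isCDPPathWS returns true when the CDP path is a websocket (ws://) address.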
func (c GlobalConfig) isCDPPathWS() bool {
	return strings.HasPrefix(c.CDPPath, "ws://")
}

// Cache stores scraper details.
type Cache struct {
	scrapers     []config
	globalConfig GlobalConfig
}

// NewCache returns a new Cache loading scraper configurations from the
// scraper path provided in the global config object. It returns a new
// instance and an error if the scraper directory could not be loaded.
//
// Scraper configurations are loaded from yml files in the provided scrapers
// directory and any subdirectories.
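//
// A minimal usage sketch (the scrapers path is illustrative):
//
//	cache, err := NewCache(GlobalConfig{Path: "./scrapers"})
//	if err != nil {
//		return err
//	}
//	performerScrapers := cache.ListPerformerScrapers()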
func NewCache(globalConfig GlobalConfig) (*Cache, error) {
	scrapers, err := loadScrapers(globalConfig.Path)
	if err != nil {
		return nil, err
	}

	return &Cache{
		globalConfig: globalConfig,
		scrapers:     scrapers,
	}, nil
}
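
// loadScrapers loads scraper configurations from all yml files under path,
// then appends the built-in freeones scraper. Files that fail to load are
// logged and skipped.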
func loadScrapers(path string) ([]config, error) {
	scrapers := make([]config, 0)

	logger.Debugf("Reading scraper configs from %s", path)
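	// walk the scraper path, collecting yml files from the directory and any subdirectories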
	scraperFiles := []string{}
	err := filepath.Walk(path, func(fp string, f os.FileInfo, err error) error {
		if filepath.Ext(fp) == ".yml" {
			scraperFiles = append(scraperFiles, fp)
		}
		return nil
	})

	if err != nil {
		logger.Errorf("Error reading scraper configs: %s", err.Error())
		return nil, err
	}

	// add built-in freeones scraper
	scrapers = append(scrapers, getFreeonesScraper())

	for _, file := range scraperFiles {
		scraper, err := loadScraperFromYAMLFile(file)
		if err != nil {
			logger.Errorf("Error loading scraper %s: %s", file, err.Error())
		} else {
			scrapers = append(scrapers, *scraper)
		}
	}

	return scrapers, nil
}

// ReloadScrapers clears the scraper cache and reloads from the scraper path.
// In the event of an error during loading, the cache will be left empty.
func (c *Cache) ReloadScrapers() error {
	c.scrapers = nil
	scrapers, err := loadScrapers(c.globalConfig.Path)
	if err != nil {
		return err
	}

	c.scrapers = scrapers
	return nil
}

// UpdateConfig updates the global config for the cache. If the scraper path
// has changed, ReloadScrapers will need to be called separately.
func (c *Cache) UpdateConfig(globalConfig GlobalConfig) {
	c.globalConfig = globalConfig
}

// ListPerformerScrapers returns a list of scrapers that are capable of
// scraping performers.
func (c Cache) ListPerformerScrapers() []*models.Scraper {
	var ret []*models.Scraper
	for _, s := range c.scrapers {
		// filter on type
		if s.supportsPerformers() {
			ret = append(ret, s.toScraper())
		}
	}

	return ret
}

// ListSceneScrapers returns a list of scrapers that are capable of
// scraping scenes.
func (c Cache) ListSceneScrapers() []*models.Scraper {
	var ret []*models.Scraper
	for _, s := range c.scrapers {
		// filter on type
		if s.supportsScenes() {
			ret = append(ret, s.toScraper())
		}
	}

	return ret
}
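
// findScraper returns the scraper configuration with the given ID, or nil
// if no such scraper exists. The returned pointer refers to a copy of the
// cached configuration.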
func (c Cache) findScraper(scraperID string) *config {
	for _, s := range c.scrapers {
		if s.ID == scraperID {
			return &s
		}
	}

	return nil
}

// ScrapePerformerList uses the scraper with the provided ID to query for
// performers using the provided query string. It returns a list of
// scraped performer data.
func (c Cache) ScrapePerformerList(scraperID string, query string) ([]*models.ScrapedPerformer, error) {
	// find scraper with the provided id
	s := c.findScraper(scraperID)
	if s != nil {
		return s.ScrapePerformerNames(query, c.globalConfig)
	}

	return nil, errors.New("Scraper with ID " + scraperID + " not found")
}

// ScrapePerformer uses the scraper with the provided ID to scrape a
// performer using the provided performer fragment.
func (c Cache) ScrapePerformer(scraperID string, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
	// find scraper with the provided id
	s := c.findScraper(scraperID)
	if s != nil {
		ret, err := s.ScrapePerformer(scrapedPerformer, c.globalConfig)
		if err != nil {
			return nil, err
		}

		// post-process - set the image if applicable
		if err := setPerformerImage(ret, c.globalConfig); err != nil {
			logger.Warnf("Could not set image using URL %s: %s", *ret.Image, err.Error())
		}

		return ret, nil
	}

	return nil, errors.New("Scraper with ID " + scraperID + " not found")
}

// ScrapePerformerURL uses the first scraper it finds that matches the URL
// provided to scrape a performer. If no scrapers match the URL, then nil
// is returned.
func (c Cache) ScrapePerformerURL(url string) (*models.ScrapedPerformer, error) {
	for _, s := range c.scrapers {
		if s.matchesPerformerURL(url) {
			ret, err := s.ScrapePerformerURL(url, c.globalConfig)
			if err != nil {
				return nil, err
			}

			// post-process - set the image if applicable
			if err := setPerformerImage(ret, c.globalConfig); err != nil {
				logger.Warnf("Could not set image using URL %s: %s", *ret.Image, err.Error())
			}

			return ret, nil
		}
	}

	return nil, nil
}
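
// matchPerformer attempts to match the scraped performer to an existing
// performer by name, setting its ID if exactly one performer matches.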
func matchPerformer(p *models.ScrapedScenePerformer) error {
	qb := models.NewPerformerQueryBuilder()

	performers, err := qb.FindByNames([]string{p.Name}, nil, true)

	if err != nil {
		return err
	}

	if len(performers) != 1 {
		// ignore - cannot match
		return nil
	}

	id := strconv.Itoa(performers[0].ID)
	p.ID = &id
	return nil
}
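
// matchStudio attempts to match the scraped studio to an existing studio
// by name, setting its ID if a match is found.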
func matchStudio(s *models.ScrapedSceneStudio) error {
	qb := models.NewStudioQueryBuilder()

	studio, err := qb.FindByName(s.Name, nil, true)

	if err != nil {
		return err
	}

	if studio == nil {
		// ignore - cannot match
		return nil
	}

	id := strconv.Itoa(studio.ID)
	s.ID = &id
	return nil
}
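
// matchMovie attempts to match the scraped movie to an existing movie by
// name, setting its ID if exactly one movie matches.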
func matchMovie(m *models.ScrapedSceneMovie) error {
	qb := models.NewMovieQueryBuilder()

	movies, err := qb.FindByNames([]string{m.Name}, nil, true)

	if err != nil {
		return err
	}

	if len(movies) != 1 {
		// ignore - cannot match
		return nil
	}

	id := strconv.Itoa(movies[0].ID)
	m.ID = &id
	return nil
}
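
// matchTag attempts to match the scraped tag to an existing tag by name,
// setting its ID if a match is found.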
func matchTag(s *models.ScrapedSceneTag) error {
	qb := models.NewTagQueryBuilder()

	tag, err := qb.FindByName(s.Name, nil, true)

	if err != nil {
		return err
	}

	if tag == nil {
		// ignore - cannot match
		return nil
	}

	id := strconv.Itoa(tag.ID)
	s.ID = &id
	return nil
}
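
// postScrapeScene matches the scraped scene's performers, movies, tags and
// studio against existing records, then sets the scene image if applicable.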
func (c Cache) postScrapeScene(ret *models.ScrapedScene) error {
	for _, p := range ret.Performers {
		err := matchPerformer(p)
		if err != nil {
			return err
		}
	}

	for _, p := range ret.Movies {
		err := matchMovie(p)
		if err != nil {
			return err
		}
	}

	for _, t := range ret.Tags {
		err := matchTag(t)
		if err != nil {
			return err
		}
	}

	if ret.Studio != nil {
		err := matchStudio(ret.Studio)
		if err != nil {
			return err
		}
	}

	// post-process - set the image if applicable
	if err := setSceneImage(ret, c.globalConfig); err != nil {
		logger.Warnf("Could not set image using URL %s: %s", *ret.Image, err.Error())
	}

	return nil
}

// ScrapeScene uses the scraper with the provided ID to scrape a scene.
func (c Cache) ScrapeScene(scraperID string, scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
	// find scraper with the provided id
	s := c.findScraper(scraperID)
	if s != nil {
		ret, err := s.ScrapeScene(scene, c.globalConfig)

		if err != nil {
			return nil, err
		}

		if ret != nil {
			err = c.postScrapeScene(ret)
			if err != nil {
				return nil, err
			}
		}

		return ret, nil
	}

	return nil, errors.New("Scraper with ID " + scraperID + " not found")
}

// ScrapeSceneURL uses the first scraper it finds that matches the URL
// provided to scrape a scene. If no scrapers match the URL, then nil is
// returned.
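//
// A minimal usage sketch (the URL is illustrative):
//
//	scene, err := cache.ScrapeSceneURL("https://example.com/scenes/123")
//	if err != nil {
//		return err
//	}
//	if scene == nil {
//		// no scraper matched the URL
//	}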
func (c Cache) ScrapeSceneURL(url string) (*models.ScrapedScene, error) {
	for _, s := range c.scrapers {
		if s.matchesSceneURL(url) {
			ret, err := s.ScrapeSceneURL(url, c.globalConfig)
			if err != nil {
				return nil, err
			}

			err = c.postScrapeScene(ret)
			if err != nil {
				return nil, err
			}

			return ret, nil
		}
	}

	return nil, nil
}