stash/pkg/scraper/scraper.go

// Package scraper provides interfaces to interact with the scraper subsystem.
// The [Cache] type is the main entry point to the scraper subsystem.
package scraper

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/http"
	"strconv"

	"github.com/stashapp/stash/pkg/models"
)

type Source struct {
	// Index of the configured stash-box instance to use. Should be unset if scraper_id is set
	StashBoxIndex *int `json:"stash_box_index"`
	// Stash-box endpoint
	StashBoxEndpoint *string `json:"stash_box_endpoint"`
	// Scraper ID to scrape with. Should be unset if stash_box_index is set
	ScraperID *string `json:"scraper_id"`
}

// Scraped Content is the forming union over the different scrapers
type ScrapedContent interface {
	IsScrapedContent()
}

// Type of the content a scraper generates
type ScrapeContentType string

const (
	ScrapeContentTypeGallery   ScrapeContentType = "GALLERY"
	ScrapeContentTypeMovie     ScrapeContentType = "MOVIE"
	ScrapeContentTypeGroup     ScrapeContentType = "GROUP"
	ScrapeContentTypePerformer ScrapeContentType = "PERFORMER"
	ScrapeContentTypeScene     ScrapeContentType = "SCENE"
)

var AllScrapeContentType = []ScrapeContentType{
	ScrapeContentTypeGallery,
	ScrapeContentTypeMovie,
	ScrapeContentTypeGroup,
	ScrapeContentTypePerformer,
	ScrapeContentTypeScene,
}

func (e ScrapeContentType) IsValid() bool {
	switch e {
	case ScrapeContentTypeGallery, ScrapeContentTypeMovie, ScrapeContentTypeGroup, ScrapeContentTypePerformer, ScrapeContentTypeScene:
		return true
	}
	return false
}

func (e ScrapeContentType) String() string {
	return string(e)
}

func (e *ScrapeContentType) UnmarshalGQL(v interface{}) error {
	str, ok := v.(string)
	if !ok {
		return fmt.Errorf("enums must be strings")
	}

	*e = ScrapeContentType(str)
	if !e.IsValid() {
		return fmt.Errorf("%s is not a valid ScrapeContentType", str)
	}
	return nil
}

func (e ScrapeContentType) MarshalGQL(w io.Writer) {
	fmt.Fprint(w, strconv.Quote(e.String()))
}

type Scraper struct {
	ID   string `json:"id"`
	Name string `json:"name"`
	// Details for performer scraper
	Performer *ScraperSpec `json:"performer"`
	// Details for scene scraper
	Scene *ScraperSpec `json:"scene"`
	// Details for gallery scraper
	Gallery *ScraperSpec `json:"gallery"`
	// Details for movie scraper
	Group *ScraperSpec `json:"group"`
	// Details for movie scraper
	Movie *ScraperSpec `json:"movie"`
}

type ScraperSpec struct {
	// URLs matching these can be scraped with
	Urls             []string     `json:"urls"`
	SupportedScrapes []ScrapeType `json:"supported_scrapes"`
}

type ScrapeType string

const (
	// From text query
	ScrapeTypeName ScrapeType = "NAME"
	// From existing object
	ScrapeTypeFragment ScrapeType = "FRAGMENT"
	// From URL
	ScrapeTypeURL ScrapeType = "URL"
)

var AllScrapeType = []ScrapeType{
	ScrapeTypeName,
	ScrapeTypeFragment,
	ScrapeTypeURL,
}

func (e ScrapeType) IsValid() bool {
	switch e {
	case ScrapeTypeName, ScrapeTypeFragment, ScrapeTypeURL:
		return true
	}
	return false
}

func (e ScrapeType) String() string {
	return string(e)
}

func (e *ScrapeType) UnmarshalGQL(v interface{}) error {
	str, ok := v.(string)
	if !ok {
		return fmt.Errorf("enums must be strings")
	}

	*e = ScrapeType(str)
	if !e.IsValid() {
		return fmt.Errorf("%s is not a valid ScrapeType", str)
	}
	return nil
}

func (e ScrapeType) MarshalGQL(w io.Writer) {
	fmt.Fprint(w, strconv.Quote(e.String()))
}

var (
	// ErrMaxRedirects is returned if the max number of HTTP redirects are reached.
	ErrMaxRedirects = errors.New("maximum number of HTTP redirects reached")

	// ErrNotFound is returned when an entity isn't found
	ErrNotFound = errors.New("scraper not found")

	// ErrNotSupported is returned when a given invocation isn't supported, and there
	// is a guard function which should be able to guard against it.
	ErrNotSupported = errors.New("scraper operation not supported")
)

// Input coalesces inputs of different types into a single structure.
// The system expects one of these to be set, and the remaining to be
// set to nil.
type Input struct {
	Performer *ScrapedPerformerInput
	Scene     *ScrapedSceneInput
	Gallery   *ScrapedGalleryInput
}

// populateURL populates the URL field of the input based on the
// URLs field of the input. Does nothing if the URL field is already set.
func (i *Input) populateURL() {
	if i.Scene != nil && i.Scene.URL == nil && len(i.Scene.URLs) > 0 {
		i.Scene.URL = &i.Scene.URLs[0]
	}
	if i.Gallery != nil && i.Gallery.URL == nil && len(i.Gallery.URLs) > 0 {
		i.Gallery.URL = &i.Gallery.URLs[0]
	}
	if i.Performer != nil && i.Performer.URL == nil && len(i.Performer.URLs) > 0 {
		i.Performer.URL = &i.Performer.URLs[0]
	}
}

// simple type definitions that can help customize
// actions per query
type QueryType int

const (
	// for now only SearchQuery is needed
	SearchQuery QueryType = iota + 1
)

// scraper is the generic interface to the scraper subsystems
type scraper interface {
	// spec returns the scraper specification, suitable for graphql
	spec() Scraper
	// supports tests if the scraper supports a given content type
	supports(ScrapeContentType) bool
	// supportsURL tests if the scraper supports scrapes of a given url, producing a given content type
	supportsURL(url string, ty ScrapeContentType) bool
}

// urlScraper is the interface of scrapers supporting url loads
type urlScraper interface {
	scraper

	viaURL(ctx context.Context, client *http.Client, url string, ty ScrapeContentType) (ScrapedContent, error)
}

// nameScraper is the interface of scrapers supporting name loads
type nameScraper interface {
	scraper

	viaName(ctx context.Context, client *http.Client, name string, ty ScrapeContentType) ([]ScrapedContent, error)
}

// fragmentScraper is the interface of scrapers supporting fragment loads
type fragmentScraper interface {
	scraper

	viaFragment(ctx context.Context, client *http.Client, input Input) (ScrapedContent, error)
}

// sceneScraper is a scraper which supports scene scrapes with
// scene data as the input.
type sceneScraper interface {
	scraper

	viaScene(ctx context.Context, client *http.Client, scene *models.Scene) (*ScrapedScene, error)
}

// galleryScraper is a scraper which supports gallery scrapes with
// gallery data as the input.
type galleryScraper interface {
	scraper

	viaGallery(ctx context.Context, client *http.Client, gallery *models.Gallery) (*ScrapedGallery, error)
}