stash/pkg/scraper/scraper.go

235 lines
6.2 KiB
Go

// Package scraper provides interfaces to interact with the scraper subsystem.
// The [Cache] type is the main entry point to the scraper subsystem.
package scraper
import (
"context"
"errors"
"fmt"
"io"
"net/http"
"strconv"
"github.com/stashapp/stash/pkg/models"
)
// Source identifies what a scrape should be run against: either a stash-box
// instance or a scraper. Per the field notes below, StashBoxIndex and
// ScraperID are meant to be mutually exclusive.
type Source struct {
// Index of the configured stash-box instance to use. Should be unset if scraper_id is set
StashBoxIndex *int `json:"stash_box_index"`
// Stash-box endpoint
StashBoxEndpoint *string `json:"stash_box_endpoint"`
// Scraper ID to scrape with. Should be unset if stash_box_index is set
ScraperID *string `json:"scraper_id"`
}
// ScrapedContent is the union formed over the content produced by the
// different scrapers. A type joins the union by implementing the
// IsScrapedContent marker method, which takes no arguments and returns nothing.
type ScrapedContent interface {
IsScrapedContent()
}
// ScrapeContentType is the type of the content a scraper generates.
type ScrapeContentType string

// The declared ScrapeContentType values.
const (
	ScrapeContentTypeGallery   ScrapeContentType = "GALLERY"
	ScrapeContentTypeMovie     ScrapeContentType = "MOVIE"
	ScrapeContentTypeGroup     ScrapeContentType = "GROUP"
	ScrapeContentTypePerformer ScrapeContentType = "PERFORMER"
	ScrapeContentTypeScene     ScrapeContentType = "SCENE"
)

// AllScrapeContentType lists every declared ScrapeContentType value.
var AllScrapeContentType = []ScrapeContentType{
	ScrapeContentTypeGallery,
	ScrapeContentTypeMovie,
	ScrapeContentTypeGroup,
	ScrapeContentTypePerformer,
	ScrapeContentTypeScene,
}

// IsValid reports whether e is one of the declared ScrapeContentType values.
func (e ScrapeContentType) IsValid() bool {
	switch e {
	case ScrapeContentTypeGallery, ScrapeContentTypeMovie, ScrapeContentTypeGroup, ScrapeContentTypePerformer, ScrapeContentTypeScene:
		return true
	}
	return false
}

// String returns the underlying string value of e.
func (e ScrapeContentType) String() string {
	return string(e)
}

// UnmarshalGQL sets e from v, which must be a string naming one of the
// declared ScrapeContentType values. It returns an error if v is not a string
// or does not name a valid value.
//
// The receiver is only assigned once the value has been validated, so a
// failed call leaves *e unchanged instead of holding an invalid value.
func (e *ScrapeContentType) UnmarshalGQL(v interface{}) error {
	str, ok := v.(string)
	if !ok {
		return errors.New("enums must be strings")
	}

	parsed := ScrapeContentType(str)
	if !parsed.IsValid() {
		return fmt.Errorf("%s is not a valid ScrapeContentType", str)
	}

	*e = parsed
	return nil
}

// MarshalGQL writes e to w as a double-quoted string.
func (e ScrapeContentType) MarshalGQL(w io.Writer) {
	// The method has no error return, so a write failure cannot be reported
	// to the caller; the result is discarded explicitly.
	_, _ = fmt.Fprint(w, strconv.Quote(e.String()))
}
// Scraper describes a single scraper: its ID and name, plus one ScraperSpec
// pointer per kind of content.
type Scraper struct {
ID string `json:"id"`
Name string `json:"name"`
// Details for performer scraper
Performer *ScraperSpec `json:"performer"`
// Details for scene scraper
Scene *ScraperSpec `json:"scene"`
// Details for gallery scraper
Gallery *ScraperSpec `json:"gallery"`
// Details for group scraper
Group *ScraperSpec `json:"group"`
// Details for movie scraper
Movie *ScraperSpec `json:"movie"`
}
// ScraperSpec holds the details of a scraper for one kind of content.
type ScraperSpec struct {
// URLs matching these can be scraped with this scraper
Urls []string `json:"urls"`
// The scrape types (see ScrapeType) this scraper supports
SupportedScrapes []ScrapeType `json:"supported_scrapes"`
}
// ScrapeType names the kind of input a scrape is performed from.
type ScrapeType string

// The declared ScrapeType values.
const (
	// From text query
	ScrapeTypeName ScrapeType = "NAME"
	// From existing object
	ScrapeTypeFragment ScrapeType = "FRAGMENT"
	// From URL
	ScrapeTypeURL ScrapeType = "URL"
)

// AllScrapeType lists every declared ScrapeType value.
var AllScrapeType = []ScrapeType{
	ScrapeTypeName,
	ScrapeTypeFragment,
	ScrapeTypeURL,
}

// IsValid reports whether e is one of the declared ScrapeType values.
func (e ScrapeType) IsValid() bool {
	switch e {
	case ScrapeTypeName, ScrapeTypeFragment, ScrapeTypeURL:
		return true
	}
	return false
}

// String returns the underlying string value of e.
func (e ScrapeType) String() string {
	return string(e)
}

// UnmarshalGQL sets e from v, which must be a string naming one of the
// declared ScrapeType values. It returns an error if v is not a string or
// does not name a valid value.
//
// The receiver is only assigned once the value has been validated, so a
// failed call leaves *e unchanged instead of holding an invalid value.
func (e *ScrapeType) UnmarshalGQL(v interface{}) error {
	str, ok := v.(string)
	if !ok {
		return errors.New("enums must be strings")
	}

	parsed := ScrapeType(str)
	if !parsed.IsValid() {
		return fmt.Errorf("%s is not a valid ScrapeType", str)
	}

	*e = parsed
	return nil
}

// MarshalGQL writes e to w as a double-quoted string.
func (e ScrapeType) MarshalGQL(w io.Writer) {
	// The method has no error return, so a write failure cannot be reported
	// to the caller; the result is discarded explicitly.
	_, _ = fmt.Fprint(w, strconv.Quote(e.String()))
}
// Sentinel errors of the scraper package.
var (
// ErrMaxRedirects is returned if the maximum number of HTTP redirects is reached.
ErrMaxRedirects = errors.New("maximum number of HTTP redirects reached")
// ErrNotFound is returned when an entity isn't found
ErrNotFound = errors.New("scraper not found")
// ErrNotSupported is returned when a given invocation isn't supported, and there
// is a guard function which should be able to guard against it.
ErrNotSupported = errors.New("scraper operation not supported")
)
// Input coalesces inputs of different types into a single structure.
// The system expects exactly one of these fields to be set, and the
// remaining ones to be nil.
type Input struct {
// Performer is the performer input, or nil.
Performer *ScrapedPerformerInput
// Scene is the scene input, or nil.
Scene *ScrapedSceneInput
// Gallery is the gallery input, or nil.
Gallery *ScrapedGalleryInput
}
// populateURL fills in the singular URL of every input that is present,
// pointing it at the first entry of that input's URLs. An input whose URL is
// already set, or whose URLs is empty, is left untouched.
func (i *Input) populateURL() {
	if performer := i.Performer; performer != nil {
		if performer.URL == nil && len(performer.URLs) > 0 {
			performer.URL = &performer.URLs[0]
		}
	}

	if scene := i.Scene; scene != nil {
		if scene.URL == nil && len(scene.URLs) > 0 {
			scene.URL = &scene.URLs[0]
		}
	}

	if gallery := i.Gallery; gallery != nil {
		if gallery.URL == nil && len(gallery.URLs) > 0 {
			gallery.URL = &gallery.URLs[0]
		}
	}
}
// QueryType is a simple type definition that can help customize
// actions per query.
type QueryType int
const (
// SearchQuery is the only query type needed for now. Its value is 1
// (iota + 1), so the zero value of QueryType matches no declared constant.
SearchQuery QueryType = iota + 1
)
// scraper is the generic interface to the scraper subsystems. It is embedded
// by the more specific scraper interfaces declared in this file.
type scraper interface {
// spec returns the scraper specification, suitable for graphql
spec() Scraper
// supports tests if the scraper supports a given content type
supports(ScrapeContentType) bool
// supportsURL tests if the scraper supports scrapes of a given url, producing a given content type
supportsURL(url string, ty ScrapeContentType) bool
}
// urlScraper is the interface of scrapers supporting url loads
type urlScraper interface {
scraper
// viaURL takes the url to load and the content type ty to produce, and
// returns a single piece of scraped content or an error. The HTTP client
// is supplied by the caller.
viaURL(ctx context.Context, client *http.Client, url string, ty ScrapeContentType) (ScrapedContent, error)
}
// nameScraper is the interface of scrapers supporting name loads
type nameScraper interface {
scraper
// viaName takes a name and the content type ty to produce, and returns a
// slice of scraped content or an error. The HTTP client is supplied by
// the caller.
viaName(ctx context.Context, client *http.Client, name string, ty ScrapeContentType) ([]ScrapedContent, error)
}
// fragmentScraper is the interface of scrapers supporting fragment loads
type fragmentScraper interface {
scraper
// viaFragment takes an Input value and returns a single piece of scraped
// content or an error. The HTTP client is supplied by the caller.
viaFragment(ctx context.Context, client *http.Client, input Input) (ScrapedContent, error)
}
// sceneScraper is a scraper which supports scene scrapes with
// scene data as the input.
type sceneScraper interface {
scraper
// viaScene takes an existing scene model and returns a scraped scene or
// an error. The HTTP client is supplied by the caller.
viaScene(ctx context.Context, client *http.Client, scene *models.Scene) (*ScrapedScene, error)
}
// galleryScraper is a scraper which supports gallery scrapes with
// gallery data as the input.
type galleryScraper interface {
scraper
// viaGallery takes an existing gallery model and returns a scraped gallery
// or an error. The HTTP client is supplied by the caller.
viaGallery(ctx context.Context, client *http.Client, gallery *models.Gallery) (*ScrapedGallery, error)
}