2019-12-12 19:27:44 +00:00
|
|
|
package scraper
|
|
|
|
|
|
|
|
import (
|
2020-07-21 04:06:25 +00:00
|
|
|
"errors"
|
|
|
|
"fmt"
|
2020-05-15 10:10:20 +00:00
|
|
|
"io"
|
2019-12-12 19:27:44 +00:00
|
|
|
"os"
|
|
|
|
"path/filepath"
|
|
|
|
"strings"
|
|
|
|
|
|
|
|
"gopkg.in/yaml.v2"
|
|
|
|
|
|
|
|
"github.com/stashapp/stash/pkg/models"
|
|
|
|
)
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
// config describes a single scraper as parsed from its YAML definition
// file. Each non-nil (or non-empty) section enables one scraping
// capability; validate() checks internal consistency after loading.
type config struct {
	// ID is the scraper's identifier; set from the filename when loaded
	// via loadScraperFromYAMLFile, otherwise supplied by the caller.
	ID string
	// path records the YAML file this config was loaded from.
	path string

	// The name of the scraper. This is displayed in the UI.
	Name string `yaml:"name"`

	// Configuration for querying performers by name
	PerformerByName *scraperTypeConfig `yaml:"performerByName"`

	// Configuration for querying performers by a Performer fragment
	PerformerByFragment *scraperTypeConfig `yaml:"performerByFragment"`

	// Configuration for querying a performer by a URL
	PerformerByURL []*scrapeByURLConfig `yaml:"performerByURL"`

	// Configuration for querying scenes by a Scene fragment
	SceneByFragment *scraperTypeConfig `yaml:"sceneByFragment"`

	// Configuration for querying a scene by a URL
	SceneByURL []*scrapeByURLConfig `yaml:"sceneByURL"`

	// Configuration for querying a movie by a URL
	MovieByURL []*scrapeByURLConfig `yaml:"movieByURL"`

	// Scraper debugging options
	DebugOptions *scraperDebugOptions `yaml:"debug"`

	// Stash server configuration
	StashServer *stashServer `yaml:"stashServer"`

	// Xpath scraping configurations
	XPathScrapers mappedScrapers `yaml:"xPathScrapers"`

	// Json scraping configurations
	JsonScrapers mappedScrapers `yaml:"jsonScrapers"`

	// Scraping driver options
	DriverOptions *scraperDriverOptions `yaml:"driver"`
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func (c config) validate() error {
|
|
|
|
if strings.TrimSpace(c.Name) == "" {
|
|
|
|
return errors.New("name must not be empty")
|
2019-12-12 19:27:44 +00:00
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
if c.PerformerByName != nil {
|
|
|
|
if err := c.PerformerByName.validate(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2019-12-12 19:27:44 +00:00
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
if c.PerformerByFragment != nil {
|
|
|
|
if err := c.PerformerByFragment.validate(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2019-12-12 19:27:44 +00:00
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
if c.SceneByFragment != nil {
|
|
|
|
if err := c.SceneByFragment.validate(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2019-12-12 19:27:44 +00:00
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
for _, s := range c.PerformerByURL {
|
|
|
|
if err := s.validate(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2019-12-16 01:35:34 +00:00
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
for _, s := range c.SceneByURL {
|
|
|
|
if err := s.validate(); err != nil {
|
|
|
|
return err
|
2019-12-16 01:35:34 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-08-10 05:34:15 +00:00
|
|
|
for _, s := range c.MovieByURL {
|
|
|
|
if err := s.validate(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// stashServer holds the details needed to scrape from another stash
// server instance.
type stashServer struct {
	// URL is the remote stash server's address.
	URL string `yaml:"url"`
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
// scraperTypeConfig configures a single scraping operation: the action
// to perform plus the action-specific settings it needs.
type scraperTypeConfig struct {
	// Action selects how the scrape is performed; it must satisfy
	// Action.IsValid() (see validate).
	Action scraperAction `yaml:"action"`
	// Script is the command to run for the script action; mandatory
	// when Action is scraperActionScript (see validate).
	Script []string `yaml:"script,flow"`
	// Scraper names a scraper configuration to delegate to —
	// presumably one of the mapped scrapers; confirm in the action
	// implementations.
	Scraper string `yaml:"scraper"`

	// for xpath name scraper only
	QueryURL string `yaml:"queryURL"`
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func (c scraperTypeConfig) validate() error {
|
|
|
|
if !c.Action.IsValid() {
|
|
|
|
return fmt.Errorf("%s is not a valid scraper action", c.Action)
|
2019-12-12 19:27:44 +00:00
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
if c.Action == scraperActionScript && len(c.Script) == 0 {
|
|
|
|
return errors.New("script is mandatory for script scraper action")
|
|
|
|
}
|
2019-12-16 01:35:34 +00:00
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
return nil
|
2019-12-16 01:35:34 +00:00
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
// scrapeByURLConfig is a scraperTypeConfig extended with the URL
// patterns the scraper handles; see matchesURL.
type scrapeByURLConfig struct {
	scraperTypeConfig `yaml:",inline"`
	// URL lists substring patterns; a candidate URL matches when it
	// contains any of them. At least one is required (see validate).
	URL []string `yaml:"url,flow"`
}
|
2019-12-12 19:27:44 +00:00
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func (c scrapeByURLConfig) validate() error {
|
|
|
|
if len(c.URL) == 0 {
|
|
|
|
return errors.New("url is mandatory for scrape by url scrapers")
|
|
|
|
}
|
2019-12-16 01:35:34 +00:00
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
return c.scraperTypeConfig.validate()
|
2019-12-16 01:35:34 +00:00
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func (c scrapeByURLConfig) matchesURL(url string) bool {
|
|
|
|
for _, thisURL := range c.URL {
|
|
|
|
if strings.Contains(url, thisURL) {
|
|
|
|
return true
|
|
|
|
}
|
2019-12-16 01:35:34 +00:00
|
|
|
}
|
2020-07-21 04:06:25 +00:00
|
|
|
|
|
|
|
return false
|
2019-12-12 19:27:44 +00:00
|
|
|
}
|
|
|
|
|
2020-03-20 21:55:15 +00:00
|
|
|
// scraperDebugOptions are optional debugging switches for a scraper.
type scraperDebugOptions struct {
	// PrintHTML enables dumping of scraped HTML — presumably to the
	// log; confirm where the flag is consumed.
	PrintHTML bool `yaml:"printHTML"`
}
|
|
|
|
|
2020-08-04 00:42:40 +00:00
|
|
|
// scraperDriverOptions configures how pages are fetched for scraping.
type scraperDriverOptions struct {
	// UseCDP switches fetching to CDP (Chrome DevTools Protocol) —
	// NOTE(review): semantics not visible here; confirm in driver code.
	UseCDP bool `yaml:"useCDP"`
	// Sleep is a delay applied by the driver; units not visible here —
	// confirm (likely seconds) where the value is consumed.
	Sleep int `yaml:"sleep"`
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func loadScraperFromYAML(id string, reader io.Reader) (*config, error) {
|
|
|
|
ret := &config{}
|
2019-12-12 19:27:44 +00:00
|
|
|
|
2020-05-15 10:10:20 +00:00
|
|
|
parser := yaml.NewDecoder(reader)
|
2019-12-12 19:27:44 +00:00
|
|
|
parser.SetStrict(true)
|
2020-05-15 10:10:20 +00:00
|
|
|
err := parser.Decode(&ret)
|
2019-12-12 19:27:44 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
ret.ID = id
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
if err := ret.validate(); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2019-12-12 19:27:44 +00:00
|
|
|
|
|
|
|
return ret, nil
|
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func loadScraperFromYAMLFile(path string) (*config, error) {
|
2020-05-15 10:10:20 +00:00
|
|
|
file, err := os.Open(path)
|
|
|
|
defer file.Close()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// set id to the filename
|
|
|
|
id := filepath.Base(path)
|
|
|
|
id = id[:strings.LastIndex(id, ".")]
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
ret, err := loadScraperFromYAML(id, file)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
2019-12-12 19:27:44 +00:00
|
|
|
}
|
2019-12-16 01:35:34 +00:00
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
ret.path = path
|
|
|
|
|
|
|
|
return ret, nil
|
2019-12-12 19:27:44 +00:00
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func (c config) toScraper() *models.Scraper {
|
2019-12-12 19:27:44 +00:00
|
|
|
ret := models.Scraper{
|
|
|
|
ID: c.ID,
|
|
|
|
Name: c.Name,
|
|
|
|
}
|
|
|
|
|
|
|
|
performer := models.ScraperSpec{}
|
|
|
|
if c.PerformerByName != nil {
|
|
|
|
performer.SupportedScrapes = append(performer.SupportedScrapes, models.ScrapeTypeName)
|
|
|
|
}
|
|
|
|
if c.PerformerByFragment != nil {
|
|
|
|
performer.SupportedScrapes = append(performer.SupportedScrapes, models.ScrapeTypeFragment)
|
|
|
|
}
|
|
|
|
if len(c.PerformerByURL) > 0 {
|
|
|
|
performer.SupportedScrapes = append(performer.SupportedScrapes, models.ScrapeTypeURL)
|
|
|
|
for _, v := range c.PerformerByURL {
|
|
|
|
performer.Urls = append(performer.Urls, v.URL...)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(performer.SupportedScrapes) > 0 {
|
|
|
|
ret.Performer = &performer
|
|
|
|
}
|
|
|
|
|
2019-12-16 01:35:34 +00:00
|
|
|
scene := models.ScraperSpec{}
|
|
|
|
if c.SceneByFragment != nil {
|
|
|
|
scene.SupportedScrapes = append(scene.SupportedScrapes, models.ScrapeTypeFragment)
|
|
|
|
}
|
|
|
|
if len(c.SceneByURL) > 0 {
|
|
|
|
scene.SupportedScrapes = append(scene.SupportedScrapes, models.ScrapeTypeURL)
|
|
|
|
for _, v := range c.SceneByURL {
|
|
|
|
scene.Urls = append(scene.Urls, v.URL...)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(scene.SupportedScrapes) > 0 {
|
|
|
|
ret.Scene = &scene
|
|
|
|
}
|
|
|
|
|
2020-08-10 05:34:15 +00:00
|
|
|
movie := models.ScraperSpec{}
|
|
|
|
if len(c.MovieByURL) > 0 {
|
|
|
|
movie.SupportedScrapes = append(movie.SupportedScrapes, models.ScrapeTypeURL)
|
|
|
|
for _, v := range c.MovieByURL {
|
|
|
|
movie.Urls = append(movie.Urls, v.URL...)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(movie.SupportedScrapes) > 0 {
|
|
|
|
ret.Movie = &movie
|
|
|
|
}
|
|
|
|
|
2019-12-12 19:27:44 +00:00
|
|
|
return &ret
|
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func (c config) supportsPerformers() bool {
|
2019-12-12 19:27:44 +00:00
|
|
|
return c.PerformerByName != nil || c.PerformerByFragment != nil || len(c.PerformerByURL) > 0
|
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func (c config) matchesPerformerURL(url string) bool {
|
2019-12-12 19:27:44 +00:00
|
|
|
for _, scraper := range c.PerformerByURL {
|
|
|
|
if scraper.matchesURL(url) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func (c config) ScrapePerformerNames(name string, globalConfig GlobalConfig) ([]*models.ScrapedPerformer, error) {
|
|
|
|
if c.PerformerByName != nil {
|
|
|
|
s := getScraper(*c.PerformerByName, c, globalConfig)
|
|
|
|
return s.scrapePerformersByName(name)
|
2019-12-12 19:27:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func (c config) ScrapePerformer(scrapedPerformer models.ScrapedPerformerInput, globalConfig GlobalConfig) (*models.ScrapedPerformer, error) {
|
|
|
|
if c.PerformerByFragment != nil {
|
|
|
|
s := getScraper(*c.PerformerByFragment, c, globalConfig)
|
|
|
|
return s.scrapePerformerByFragment(scrapedPerformer)
|
2019-12-12 19:27:44 +00:00
|
|
|
}
|
|
|
|
|
2020-01-31 22:17:40 +00:00
|
|
|
// try to match against URL if present
|
|
|
|
if scrapedPerformer.URL != nil && *scrapedPerformer.URL != "" {
|
2020-07-21 04:06:25 +00:00
|
|
|
return c.ScrapePerformerURL(*scrapedPerformer.URL, globalConfig)
|
2020-01-31 22:17:40 +00:00
|
|
|
}
|
|
|
|
|
2019-12-12 19:27:44 +00:00
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func (c config) ScrapePerformerURL(url string, globalConfig GlobalConfig) (*models.ScrapedPerformer, error) {
|
2019-12-12 19:27:44 +00:00
|
|
|
for _, scraper := range c.PerformerByURL {
|
2020-07-21 04:06:25 +00:00
|
|
|
if scraper.matchesURL(url) {
|
|
|
|
s := getScraper(scraper.scraperTypeConfig, c, globalConfig)
|
|
|
|
ret, err := s.scrapePerformerByURL(url)
|
2019-12-12 19:27:44 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if ret != nil {
|
|
|
|
return ret, nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil, nil
|
|
|
|
}
|
2019-12-16 01:35:34 +00:00
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func (c config) supportsScenes() bool {
|
2019-12-16 01:35:34 +00:00
|
|
|
return c.SceneByFragment != nil || len(c.SceneByURL) > 0
|
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func (c config) matchesSceneURL(url string) bool {
|
2019-12-16 01:35:34 +00:00
|
|
|
for _, scraper := range c.SceneByURL {
|
|
|
|
if scraper.matchesURL(url) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2020-08-10 05:34:15 +00:00
|
|
|
func (c config) supportsMovies() bool {
|
|
|
|
return len(c.MovieByURL) > 0
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c config) matchesMovieURL(url string) bool {
|
|
|
|
for _, scraper := range c.MovieByURL {
|
|
|
|
if scraper.matchesURL(url) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func (c config) ScrapeScene(scene models.SceneUpdateInput, globalConfig GlobalConfig) (*models.ScrapedScene, error) {
|
|
|
|
if c.SceneByFragment != nil {
|
|
|
|
s := getScraper(*c.SceneByFragment, c, globalConfig)
|
|
|
|
return s.scrapeSceneByFragment(scene)
|
2019-12-16 01:35:34 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
2020-07-21 04:06:25 +00:00
|
|
|
func (c config) ScrapeSceneURL(url string, globalConfig GlobalConfig) (*models.ScrapedScene, error) {
|
2019-12-16 01:35:34 +00:00
|
|
|
for _, scraper := range c.SceneByURL {
|
2020-07-21 04:06:25 +00:00
|
|
|
if scraper.matchesURL(url) {
|
|
|
|
s := getScraper(scraper.scraperTypeConfig, c, globalConfig)
|
|
|
|
ret, err := s.scrapeSceneByURL(url)
|
2019-12-16 01:35:34 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if ret != nil {
|
|
|
|
return ret, nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil, nil
|
|
|
|
}
|
2020-08-10 05:34:15 +00:00
|
|
|
|
|
|
|
func (c config) ScrapeMovieURL(url string, globalConfig GlobalConfig) (*models.ScrapedMovie, error) {
|
|
|
|
for _, scraper := range c.MovieByURL {
|
|
|
|
if scraper.matchesURL(url) {
|
|
|
|
s := getScraper(scraper.scraperTypeConfig, c, globalConfig)
|
|
|
|
ret, err := s.scrapeMovieByURL(url)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if ret != nil {
|
|
|
|
return ret, nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil, nil
|
|
|
|
}
|