package scraper

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"os/exec"
	"path/filepath"
	"strings"

	"github.com/stashapp/stash/pkg/desktop"
	"github.com/stashapp/stash/pkg/logger"
	"github.com/stashapp/stash/pkg/models"
)

// ErrScraperScript is wrapped into the error returned by runScraperScript
// when the script process does not exit successfully.
var ErrScraperScript = errors.New("scraper script error")

// scriptScraper scrapes content by running an external script, sending it
// a JSON document on stdin and decoding a JSON document from its stdout.
type scriptScraper struct {
	scraper      scraperTypeConfig
	config       config
	globalConfig GlobalConfig
}

// newScriptScraper returns a scriptScraper holding the given configuration.
func newScriptScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) *scriptScraper {
	return &scriptScraper{
		scraper:      scraper,
		config:       config,
		globalConfig: globalConfig,
	}
}

// scriptInputJSON encodes a single key/value pair as a JSON object, so that
// quotes, backslashes and control characters in value are escaped correctly
// instead of corrupting the document handed to the script.
func scriptInputJSON(key, value string) (string, error) {
	var sb strings.Builder
	enc := json.NewEncoder(&sb)
	// Keep characters such as '&' (common in URLs) literal rather than
	// emitting them as \u0026 escapes.
	enc.SetEscapeHTML(false)
	if err := enc.Encode(map[string]string{key: value}); err != nil {
		return "", err
	}
	// Encode terminates the value with a newline; drop it.
	return strings.TrimSuffix(sb.String(), "\n"), nil
}

// runScraperScript runs the configured script in the directory of the
// scraper config file, writes inString to its stdin and decodes the JSON
// written to its stdout into out.
//
// The output is first decoded with unknown fields disallowed. If that fails,
// the bytes read so far are decoded again leniently; when the lenient decode
// succeeds the strict error is only logged as a warning.
//
// It returns an error if the command is empty, the pipes cannot be created,
// the process cannot be started, the output cannot be decoded, or the script
// exits unsuccessfully (in which case the error wraps ErrScraperScript).
func (s *scriptScraper) runScraperScript(inString string, out interface{}) error {
	if len(s.scraper.Script) == 0 {
		return errors.New("scraper script command is empty")
	}

	// Work on a copy so that substituting the python executable below does
	// not rewrite the shared scraper configuration.
	command := append([]string(nil), s.scraper.Script...)
	if command[0] == "python" || command[0] == "python3" {
		// Best effort: if no python executable is found, the configured
		// name is used unchanged.
		if executable, err := findPythonExecutable(); err == nil {
			command[0] = executable
		}
	}

	cmd := exec.Command(command[0], command[1:]...)
	cmd.Dir = filepath.Dir(s.config.path)

	stdin, err := cmd.StdinPipe()
	if err != nil {
		return err
	}

	// Without stderr/stdout readers the code below would operate on nil
	// readers, so treat a missing pipe as a hard failure.
	stderr, err := cmd.StderrPipe()
	if err != nil {
		_ = stdin.Close() // already failing; the close error adds nothing
		logger.Error("Scraper stderr not available: " + err.Error())
		return fmt.Errorf("scraper stderr not available: %w", err)
	}

	stdout, err := cmd.StdoutPipe()
	if err != nil {
		_ = stdin.Close()  // already failing; the close error adds nothing
		_ = stderr.Close() // already failing; the close error adds nothing
		logger.Error("Scraper stdout not available: " + err.Error())
		return fmt.Errorf("scraper stdout not available: %w", err)
	}

	desktop.HideExecShell(cmd)

	if err = cmd.Start(); err != nil {
		logger.Error("Error running scraper script: " + err.Error())
		return errors.New("error running scraper script")
	}

	// Feed the input only once the process exists, so the writer cannot be
	// left blocked on a pipe that nothing will ever read.
	go func() {
		defer stdin.Close()

		if n, err := io.WriteString(stdin, inString); err != nil {
			logger.Warnf("failure to write full input to script (wrote %v bytes out of %v): %v", n, len(inString), err)
		}
	}()

	go handleScraperStderr(s.config.Name, stderr)

	logger.Debugf("Scraper script <%s> started", strings.Join(cmd.Args, " "))

	// TODO - add a timeout here

	// Make a copy of stdout here. This allows us to decode it twice.
	var sb strings.Builder
	tr := io.TeeReader(stdout, &sb)

	// First, perform a decode where unknown fields are disallowed.
	d := json.NewDecoder(tr)
	d.DisallowUnknownFields()

	if strictErr := d.Decode(out); strictErr != nil {
		// The decode failed for some reason, use the built string
		// and allow unknown fields in the decode.
		lenientErr := json.NewDecoder(strings.NewReader(sb.String())).Decode(out)
		if lenientErr != nil {
			// The error is genuine, so return it.
			logger.Errorf("could not unmarshal json from script output: %v", lenientErr)

			// The output is unusable: stop the script and reap it so the
			// process and its pipes are not leaked. Both errors are
			// ignored because the decode error is the one worth reporting.
			_ = cmd.Process.Kill()
			_ = cmd.Wait()

			return fmt.Errorf("could not unmarshal json from script output: %w", lenientErr)
		}

		// Lenient decode succeeded, print a warning, but use the decode.
		logger.Warnf("reading script result: %v", strictErr)
	}

	err = cmd.Wait()
	logger.Debugf("Scraper script finished")

	if err != nil {
		return fmt.Errorf("%w: %v", ErrScraperScript, err)
	}

	return nil
}

// scrapeByName runs the script with a {"name": ...} input and returns the
// decoded list of performers or scenes, depending on ty. Any other content
// type yields ErrNotSupported.
func (s *scriptScraper) scrapeByName(ctx context.Context, name string, ty models.ScrapeContentType) ([]models.ScrapedContent, error) {
	input, err := scriptInputJSON("name", name)
	if err != nil {
		return nil, err
	}

	var ret []models.ScrapedContent

	switch ty {
	case models.ScrapeContentTypePerformer:
		var performers []models.ScrapedPerformer
		err = s.runScraperScript(input, &performers)
		if err == nil {
			for i := range performers {
				ret = append(ret, &performers[i])
			}
		}
	case models.ScrapeContentTypeScene:
		var scenes []models.ScrapedScene
		err = s.runScraperScript(input, &scenes)
		if err == nil {
			for i := range scenes {
				ret = append(ret, &scenes[i])
			}
		}
	default:
		return nil, ErrNotSupported
	}

	return ret, err
}

// scrapeByFragment marshals whichever of the performer, gallery or scene
// fragments is set on input (checked in that order) and runs the script
// with it. If none is set, the content type stays empty and scrape reports
// ErrNotSupported.
func (s *scriptScraper) scrapeByFragment(ctx context.Context, input Input) (models.ScrapedContent, error) {
	var (
		inString []byte
		err      error
		ty       models.ScrapeContentType
	)

	switch {
	case input.Performer != nil:
		inString, err = json.Marshal(*input.Performer)
		ty = models.ScrapeContentTypePerformer
	case input.Gallery != nil:
		inString, err = json.Marshal(*input.Gallery)
		ty = models.ScrapeContentTypeGallery
	case input.Scene != nil:
		inString, err = json.Marshal(*input.Scene)
		ty = models.ScrapeContentTypeScene
	}

	if err != nil {
		return nil, err
	}

	return s.scrape(ctx, string(inString), ty)
}

// scrapeByURL runs the script with a {"url": ...} input and decodes the
// result as the content type ty.
func (s *scriptScraper) scrapeByURL(ctx context.Context, url string, ty models.ScrapeContentType) (models.ScrapedContent, error) {
	input, err := scriptInputJSON("url", url)
	if err != nil {
		return nil, err
	}

	return s.scrape(ctx, input, ty)
}

// scrape runs the script with the given JSON input and decodes its output
// into the scraped type matching ty. Content types other than performer,
// gallery, scene and movie yield ErrNotSupported.
func (s *scriptScraper) scrape(ctx context.Context, input string, ty models.ScrapeContentType) (models.ScrapedContent, error) {
	switch ty {
	case models.ScrapeContentTypePerformer:
		var performer models.ScrapedPerformer
		err := s.runScraperScript(input, &performer)
		return &performer, err
	case models.ScrapeContentTypeGallery:
		var gallery models.ScrapedGallery
		err := s.runScraperScript(input, &gallery)
		return &gallery, err
	case models.ScrapeContentTypeScene:
		var scene models.ScrapedScene
		err := s.runScraperScript(input, &scene)
		return &scene, err
	case models.ScrapeContentTypeMovie:
		var movie models.ScrapedMovie
		err := s.runScraperScript(input, &movie)
		return &movie, err
	}

	return nil, ErrNotSupported
}

// scrapeSceneByScene marshals the result of sceneToUpdateInput(scene) to
// JSON, runs the script with it and decodes the output as a scraped scene.
func (s *scriptScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scene) (*models.ScrapedScene, error) {
	inString, err := json.Marshal(sceneToUpdateInput(scene))
	if err != nil {
		return nil, err
	}

	var ret models.ScrapedScene
	err = s.runScraperScript(string(inString), &ret)

	return &ret, err
}

// scrapeGalleryByGallery marshals the result of galleryToUpdateInput(gallery)
// to JSON, runs the script with it and decodes the output as a scraped
// gallery.
func (s *scriptScraper) scrapeGalleryByGallery(ctx context.Context, gallery *models.Gallery) (*models.ScrapedGallery, error) {
	inString, err := json.Marshal(galleryToUpdateInput(gallery))
	if err != nil {
		return nil, err
	}

	var ret models.ScrapedGallery
	err = s.runScraperScript(string(inString), &ret)

	return &ret, err
}

// findPythonExecutable returns "python3" if it can be found on the PATH,
// otherwise "python". If neither is found, the error from the "python"
// lookup is returned.
func findPythonExecutable() (string, error) {
	var err error
	for _, name := range []string{"python3", "python"} {
		if _, err = exec.LookPath(name); err == nil {
			return name, nil
		}
	}

	return "", err
}

// handleScraperStderr passes the script's stderr stream to a plugin logger
// whose messages are prefixed with the scraper name and which defaults to
// the error log level.
func handleScraperStderr(name string, scraperOutputReader io.ReadCloser) {
	const scraperPrefix = "[Scrape / %s] "

	lgr := logger.PluginLogger{
		Prefix:          fmt.Sprintf(scraperPrefix, name),
		DefaultLogLevel: &logger.ErrorLevel,
	}
	lgr.HandlePluginStdErr(scraperOutputReader)
}