package scraper import ( "bytes" "errors" "net/url" "regexp" "strings" "github.com/antchfx/htmlquery" "golang.org/x/net/html" "github.com/stashapp/stash/pkg/logger" "github.com/stashapp/stash/pkg/models" ) type xpathScraper struct { scraper scraperTypeConfig config config globalConfig GlobalConfig txnManager models.TransactionManager } func newXpathScraper(scraper scraperTypeConfig, txnManager models.TransactionManager, config config, globalConfig GlobalConfig) *xpathScraper { return &xpathScraper{ scraper: scraper, config: config, globalConfig: globalConfig, txnManager: txnManager, } } func (s *xpathScraper) getXpathScraper() *mappedScraper { return s.config.XPathScrapers[s.scraper.Scraper] } func (s *xpathScraper) scrapeURL(url string) (*html.Node, *mappedScraper, error) { scraper := s.getXpathScraper() if scraper == nil { return nil, nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config") } doc, err := s.loadURL(url) if err != nil { return nil, nil, err } return doc, scraper, nil } func (s *xpathScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) { u := replaceURL(url, s.scraper) // allow a URL Replace for performer by URL queries doc, scraper, err := s.scrapeURL(u) if err != nil { return nil, err } q := s.getXPathQuery(doc) return scraper.scrapePerformer(q) } func (s *xpathScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) { u := replaceURL(url, s.scraper) // allow a URL Replace for scene by URL queries doc, scraper, err := s.scrapeURL(u) if err != nil { return nil, err } q := s.getXPathQuery(doc) return scraper.scrapeScene(q) } func (s *xpathScraper) scrapeGalleryByURL(url string) (*models.ScrapedGallery, error) { u := replaceURL(url, s.scraper) // allow a URL Replace for gallery by URL queries doc, scraper, err := s.scrapeURL(u) if err != nil { return nil, err } q := s.getXPathQuery(doc) return scraper.scrapeGallery(q) } func (s *xpathScraper) scrapeMovieByURL(url string) (*models.ScrapedMovie, error) { u := replaceURL(url, s.scraper) // allow a URL Replace for movie by URL queries doc, scraper, err := s.scrapeURL(u) if err != nil { return nil, err } q := s.getXPathQuery(doc) return scraper.scrapeMovie(q) } func (s *xpathScraper) scrapePerformersByName(name string) ([]*models.ScrapedPerformer, error) { scraper := s.getXpathScraper() if scraper == nil { return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config") } const placeholder = "{}" // replace the placeholder string with the URL-escaped name escapedName := url.QueryEscape(name) url := s.scraper.QueryURL url = strings.Replace(url, placeholder, escapedName, -1) doc, err := s.loadURL(url) if err != nil { return nil, err } q := s.getXPathQuery(doc) return scraper.scrapePerformers(q) } func (s *xpathScraper) scrapePerformerByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) { return nil, errors.New("scrapePerformerByFragment not supported for xpath scraper") } func (s *xpathScraper) scrapeSceneByFragment(scene models.SceneUpdateInput) (*models.ScrapedScene, error) { storedScene, err := sceneFromUpdateFragment(scene, s.txnManager) if err != nil { return nil, err } if storedScene == nil { return nil, errors.New("no scene found") } // construct the URL queryURL := queryURLParametersFromScene(storedScene) if s.scraper.QueryURLReplacements != nil { queryURL.applyReplacements(s.scraper.QueryURLReplacements) } url := queryURL.constructURL(s.scraper.QueryURL) scraper := s.getXpathScraper() if scraper == nil { return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config") } doc, err := s.loadURL(url) if err != nil { return nil, err } q := s.getXPathQuery(doc) return scraper.scrapeScene(q) } func (s *xpathScraper) scrapeGalleryByFragment(gallery models.GalleryUpdateInput) (*models.ScrapedGallery, error) { storedGallery, err := galleryFromUpdateFragment(gallery, s.txnManager) if err != nil { return nil, err } if storedGallery == nil { return nil, errors.New("no scene found") } // construct the URL queryURL := queryURLParametersFromGallery(storedGallery) if s.scraper.QueryURLReplacements != nil { queryURL.applyReplacements(s.scraper.QueryURLReplacements) } url := queryURL.constructURL(s.scraper.QueryURL) scraper := s.getXpathScraper() if scraper == nil { return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config") } doc, err := s.loadURL(url) if err != nil { return nil, err } q := s.getXPathQuery(doc) return scraper.scrapeGallery(q) } func (s *xpathScraper) loadURL(url string) (*html.Node, error) { r, err := loadURL(url, s.config, s.globalConfig) if err != nil { return nil, err } ret, err := html.Parse(r) if err == nil && s.config.DebugOptions != nil && s.config.DebugOptions.PrintHTML { var b bytes.Buffer html.Render(&b, ret) logger.Infof("loadURL (%s) response: \n%s", url, b.String()) } return ret, err } func (s *xpathScraper) getXPathQuery(doc *html.Node) *xpathQuery { return &xpathQuery{ doc: doc, scraper: s, } } type xpathQuery struct { doc *html.Node scraper *xpathScraper } func (q *xpathQuery) runQuery(selector string) []string { found, err := htmlquery.QueryAll(q.doc, selector) if err != nil { logger.Warnf("Error parsing xpath expression '%s': %s", selector, err.Error()) return nil } var ret []string for _, n := range found { // don't add empty strings nodeText := q.nodeText(n) if nodeText != "" { ret = append(ret, q.nodeText(n)) } } return ret } func (q *xpathQuery) nodeText(n *html.Node) string { var ret string if n != nil && n.Type == html.CommentNode { ret = htmlquery.OutputHTML(n, true) } else { ret = htmlquery.InnerText(n) } // trim all leading and trailing whitespace ret = strings.TrimSpace(ret) // remove multiple whitespace re := regexp.MustCompile(" +") ret = re.ReplaceAllString(ret, " ") // TODO - make this optional re = regexp.MustCompile("\n") ret = re.ReplaceAllString(ret, "") return ret } func (q *xpathQuery) subScrape(value string) mappedQuery { doc, err := q.scraper.loadURL(value) if err != nil { logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error()) return nil } return q.scraper.getXPathQuery(doc) }