package scraper
import (
"bytes"
"errors"
"net/url"
"regexp"
"strings"
"github.com/antchfx/htmlquery"
"golang.org/x/net/html"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/models"
)
// xpathScraper scrapes HTML pages using the XPath mappings of a named
// mappedScraper from the configuration.
type xpathScraper struct {
scraper scraperTypeConfig // per-type scraper settings (query URL, replacements, scraper name)
config config // owning scraper config; holds the XPathScrapers map
globalConfig GlobalConfig // global scraper settings passed through to URL loading
txnManager models.TransactionManager // transaction manager made available to scrape operations
}
// newXpathScraper constructs an xpathScraper from the given per-type scraper
// configuration, transaction manager, owning config and global config.
func newXpathScraper(scraper scraperTypeConfig, txnManager models.TransactionManager, config config, globalConfig GlobalConfig) *xpathScraper {
	ret := xpathScraper{
		scraper:      scraper,
		config:       config,
		globalConfig: globalConfig,
		txnManager:   txnManager,
	}
	return &ret
}
// getXpathScraper looks up this scraper's mappedScraper by name in the
// configuration. Returns nil when no scraper with that name is configured.
func (s *xpathScraper) getXpathScraper() *mappedScraper {
return s.config.XPathScrapers[s.scraper.Scraper]
}
// scrapeURL loads and parses the given URL, returning the document together
// with the mappedScraper that should process it. Errors when the named
// scraper is not present in the configuration.
func (s *xpathScraper) scrapeURL(url string) (*html.Node, *mappedScraper, error) {
	ms := s.getXpathScraper()
	if ms == nil {
		return nil, nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
	}

	document, err := s.loadURL(url)
	if err != nil {
		return nil, nil, err
	}

	return document, ms, nil
}
// scrapePerformerByURL scrapes performer details from the given URL,
// applying any configured URL replacement first.
func (s *xpathScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) {
	// allow a URL Replace for performer by URL queries
	target := replaceURL(url, s.scraper)

	doc, scraper, err := s.scrapeURL(target)
	if err != nil {
		return nil, err
	}

	return scraper.scrapePerformer(s.getXPathQuery(doc))
}
// scrapeSceneByURL scrapes scene details from the given URL, applying any
// configured URL replacement first.
func (s *xpathScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) {
	// allow a URL Replace for scene by URL queries
	target := replaceURL(url, s.scraper)

	doc, scraper, err := s.scrapeURL(target)
	if err != nil {
		return nil, err
	}

	return scraper.scrapeScene(s.getXPathQuery(doc))
}
2020-10-20 22:24:32 +00:00
func (s *xpathScraper) scrapeGalleryByURL(url string) (*models.ScrapedGallery, error) {
u := replaceURL(url, s.scraper) // allow a URL Replace for gallery by URL queries
doc, scraper, err := s.scrapeURL(u)
2020-10-20 22:24:32 +00:00
if err != nil {
return nil, err
}
q := s.getXPathQuery(doc)
return scraper.scrapeGallery(q)
}
// scrapeMovieByURL scrapes movie details from the given URL, applying any
// configured URL replacement first.
func (s *xpathScraper) scrapeMovieByURL(url string) (*models.ScrapedMovie, error) {
	// allow a URL Replace for movie by URL queries
	target := replaceURL(url, s.scraper)

	doc, scraper, err := s.scrapeURL(target)
	if err != nil {
		return nil, err
	}

	return scraper.scrapeMovie(s.getXPathQuery(doc))
}
// scrapePerformersByName searches for performers matching name by querying
// the configured QueryURL, in which the "{}" placeholder is replaced with
// the URL-escaped name.
func (s *xpathScraper) scrapePerformersByName(name string) ([]*models.ScrapedPerformer, error) {
	scraper := s.getXpathScraper()

	if scraper == nil {
		return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
	}

	const placeholder = "{}"

	// replace the placeholder string with the URL-escaped name.
	// NOTE: the local is named queryURL (not url) so the net/url package
	// is not shadowed.
	queryURL := strings.Replace(s.scraper.QueryURL, placeholder, url.QueryEscape(name), -1)

	doc, err := s.loadURL(queryURL)
	if err != nil {
		return nil, err
	}

	q := s.getXPathQuery(doc)
	return scraper.scrapePerformers(q)
}
// scrapePerformerByFragment is part of the scraper interface but is not
// implemented for xpath scrapers; it always returns an error.
func (s *xpathScraper) scrapePerformerByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
return nil, errors.New("scrapePerformerByFragment not supported for xpath scraper")
}
// scrapeScenesByName searches for scenes matching name by querying the
// configured QueryURL, in which the "{}" placeholder is replaced with the
// URL-escaped name.
func (s *xpathScraper) scrapeScenesByName(name string) ([]*models.ScrapedScene, error) {
	scraper := s.getXpathScraper()

	if scraper == nil {
		return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
	}

	const placeholder = "{}"

	// replace the placeholder string with the URL-escaped name.
	// NOTE: the local is named queryURL (not url) so the net/url package
	// is not shadowed.
	queryURL := strings.Replace(s.scraper.QueryURL, placeholder, url.QueryEscape(name), -1)

	doc, err := s.loadURL(queryURL)
	if err != nil {
		return nil, err
	}

	q := s.getXPathQuery(doc)
	return scraper.scrapeScenes(q)
}
func (s *xpathScraper) scrapeSceneByScene(scene *models.Scene) (*models.ScrapedScene, error) {
// construct the URL
queryURL := queryURLParametersFromScene(scene)
2020-10-22 00:56:04 +00:00
if s.scraper.QueryURLReplacements != nil {
queryURL.applyReplacements(s.scraper.QueryURLReplacements)
}
url := queryURL.constructURL(s.scraper.QueryURL)
scraper := s.getXpathScraper()
if scraper == nil {
return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
}
doc, err := s.loadURL(url)
if err != nil {
return nil, err
}
q := s.getXPathQuery(doc)
return scraper.scrapeScene(q)
}
// scrapeSceneByFragment scrapes scene metadata by building a query URL from
// the scraped scene input (with any configured replacements applied) and
// scraping the resulting page.
func (s *xpathScraper) scrapeSceneByFragment(scene models.ScrapedSceneInput) (*models.ScrapedScene, error) {
	// construct the URL from the fragment's attributes
	params := queryURLParametersFromScrapedScene(scene)
	if s.scraper.QueryURLReplacements != nil {
		params.applyReplacements(s.scraper.QueryURLReplacements)
	}
	target := params.constructURL(s.scraper.QueryURL)

	ms := s.getXpathScraper()
	if ms == nil {
		return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
	}

	doc, err := s.loadURL(target)
	if err != nil {
		return nil, err
	}

	return ms.scrapeScene(s.getXPathQuery(doc))
}
2020-10-20 22:24:32 +00:00
func (s *xpathScraper) scrapeGalleryByGallery(gallery *models.Gallery) (*models.ScrapedGallery, error) {
2020-10-20 22:24:32 +00:00
// construct the URL
queryURL := queryURLParametersFromGallery(gallery)
2020-10-22 00:56:04 +00:00
if s.scraper.QueryURLReplacements != nil {
queryURL.applyReplacements(s.scraper.QueryURLReplacements)
}
url := queryURL.constructURL(s.scraper.QueryURL)
2020-10-20 22:24:32 +00:00
scraper := s.getXpathScraper()
if scraper == nil {
return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
}
doc, err := s.loadURL(url)
if err != nil {
return nil, err
}
q := s.getXPathQuery(doc)
return scraper.scrapeGallery(q)
}
// scrapeGalleryByFragment is part of the scraper interface but is not
// implemented for xpath scrapers; it always returns an error.
func (s *xpathScraper) scrapeGalleryByFragment(gallery models.ScrapedGalleryInput) (*models.ScrapedGallery, error) {
return nil, errors.New("scrapeGalleryByFragment not supported for xpath scraper")
}
func (s *xpathScraper) loadURL(url string) (*html.Node, error) {
r, err := loadURL(url, s.config, s.globalConfig)
if err != nil {
return nil, err
}
ret, err := html.Parse(r)
if err == nil && s.config.DebugOptions != nil && s.config.DebugOptions.PrintHTML {
var b bytes.Buffer
Lint checks phase 2 (#1747) * Log 3 unchecked errors Rather than ignore errors, log them at the WARNING log level. The server has been functioning without these, so assume they are not at the ERROR level. * Log errors in concurrency test If we can't initialize the configuration, treat the test as a failure. * Undo the errcheck on configurations for now. * Handle unchecked errors in pkg/manager * Resolve unchecked errors * Handle DLNA/DMS unchecked errors * Handle error checking in concurrency test Generalize config initialization, so we can initialize a configuration without writing it to disk. Use this in the test case, since otherwise the test fails to write. * Handle the remaining unchecked errors * Heed gosimple in update test * Use one-line if-initializer statements While here, fix a wrong variable capture error. * testing.T doesn't support %w use %v instead which is supported. * Remove unused query builder functions The Int/String criterion handler functions are now generalized. Thus, there's no need to keep these functions around anymore. * Mark filterBuilder.addRecursiveWith nolint The function is useful in the future and no other refactors are looking nice. Keep the function around, but tell the linter to ignore it. * Remove utils.Btoi There are no users of this utility function * Return error on scan failure If we fail to scan the row when looking for the unique checksum index, then report the error upwards. * Fix comments on exported functions * Fix typos * Fix startup error
2021-09-23 07:15:50 +00:00
if err := html.Render(&b, ret); err != nil {
logger.Warnf("could not render HTML: %v", err)
}
logger.Infof("loadURL (%s) response: \n%s", url, b.String())
}
return ret, err
}
// getXPathQuery wraps a parsed document in an xpathQuery bound to this
// scraper, for use by the mappedScraper machinery.
func (s *xpathScraper) getXPathQuery(doc *html.Node) *xpathQuery {
return &xpathQuery{
doc: doc,
scraper: s,
}
}
// xpathQuery runs XPath selectors against a parsed HTML document. It keeps a
// reference to its owning scraper so sub-scrapes can load further URLs.
type xpathQuery struct {
doc *html.Node // parsed document the selectors are evaluated against
scraper *xpathScraper // owning scraper, used by subScrape to load nested pages
}
// runQuery evaluates the xpath selector against the document and returns the
// cleaned, non-empty text of each matched node. An invalid expression is
// logged as a warning and yields nil.
func (q *xpathQuery) runQuery(selector string) []string {
	found, err := htmlquery.QueryAll(q.doc, selector)
	if err != nil {
		logger.Warnf("Error parsing xpath expression '%s': %s", selector, err.Error())
		return nil
	}

	var ret []string
	for _, n := range found {
		// don't add empty strings; reuse the extracted text instead of
		// recomputing nodeText (which trims and regex-normalises) twice
		nodeText := q.nodeText(n)
		if nodeText != "" {
			ret = append(ret, nodeText)
		}
	}

	return ret
}
// multiSpaceRe collapses runs of spaces into a single space. Compiled once at
// package scope so nodeText doesn't recompile it on every call.
var multiSpaceRe = regexp.MustCompile(" +")

// nodeText returns the normalised text of a node: comment nodes are rendered
// as HTML, all other nodes yield their inner text. The result is trimmed of
// surrounding whitespace, space runs are collapsed, and newlines removed.
func (q *xpathQuery) nodeText(n *html.Node) string {
	var ret string
	if n != nil && n.Type == html.CommentNode {
		ret = htmlquery.OutputHTML(n, true)
	} else {
		ret = htmlquery.InnerText(n)
	}

	// trim all leading and trailing whitespace
	ret = strings.TrimSpace(ret)

	// remove multiple whitespace
	ret = multiSpaceRe.ReplaceAllString(ret, " ")

	// TODO - make this optional
	// a fixed substring removal doesn't need a regexp
	ret = strings.Replace(ret, "\n", "", -1)

	return ret
}
func (q *xpathQuery) subScrape(value string) mappedQuery {
doc, err := q.scraper.loadURL(value)
if err != nil {
logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
return nil
}
return q.scraper.getXPathQuery(doc)
2020-05-18 02:26:20 +00:00
}