// stash/pkg/scraper/xpath.go

package scraper
import (
"bytes"
"errors"
"net/http"
"net/url"
"reflect"
"regexp"
"strings"
"time"
"github.com/antchfx/htmlquery"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models"
)
// scrapeGetTimeout is the timeout for the scrape http request, including
// transfer time. May want to make this configurable at some point.
const scrapeGetTimeout = time.Second * 30
// commonXPathConfig maps placeholder tokens to the xpath fragments they
// expand to, letting selectors share common sub-expressions.
type commonXPathConfig map[string]string

// applyCommon expands every placeholder token found in src and returns the
// resulting xpath expression. Tokens not present in src are ignored.
func (c commonXPathConfig) applyCommon(src string) string {
	ret := src
	for commonKey, commonVal := range c {
		// strings.Replace is a no-op when the key is absent, so no
		// Contains pre-check is needed
		ret = strings.Replace(ret, commonKey, commonVal, -1)
	}
	return ret
}
// xpathScraperConfig maps field names to either a plain xpath string or a
// nested attribute configuration map.
type xpathScraperConfig map[string]interface{}

// createXPathScraperConfig converts a raw yaml-decoded map into an
// xpathScraperConfig, keeping only entries whose keys are strings.
func createXPathScraperConfig(src map[interface{}]interface{}) xpathScraperConfig {
	ret := make(xpathScraperConfig)
	// ranging over a nil map is a no-op, so no explicit nil check is needed
	for k, v := range src {
		if keyStr, isStr := k.(string); isStr {
			ret[keyStr] = v
		}
	}
	return ret
}
// xpathRegexConfig is a single regex replacement step of the form
// {"regex": <pattern>, "with": <replacement>}.
type xpathRegexConfig map[interface{}]interface{}

// xpathRegexConfigs is an ordered list of regex replacement steps.
type xpathRegexConfigs []xpathRegexConfig

// apply performs this config's regex replacement on value. If the pattern is
// missing, empty, or fails to compile, value is returned unchanged.
func (c xpathRegexConfig) apply(value string) string {
	// a failed type assertion (including a nil/missing entry) yields ""
	regex, _ := c["regex"].(string)
	with, _ := c["with"].(string)
	if regex == "" {
		return value
	}
	re, err := regexp.Compile(regex)
	if err != nil {
		logger.Warnf("Error compiling regex '%s': %s", regex, err.Error())
		return value
	}
	ret := re.ReplaceAllString(value, with)
	logger.Debugf(`Replace: '%s' with '%s'`, regex, with)
	logger.Debugf("Before: %s", value)
	logger.Debugf("After: %s", ret)
	return ret
}
// apply runs each replacement step in order, then trims and collapses
// whitespace on the final result.
func (c xpathRegexConfigs) apply(value string) string {
	for _, step := range c {
		value = step.apply(value)
	}
	// remove whitespace again
	return commonPostProcess(value)
}
// xpathScraperAttrConfig is the nested form of a field configuration,
// supporting keys such as selector, concat, parseDate, replace and subScraper.
type xpathScraperAttrConfig map[interface{}]interface{}

// getString returns the string stored under key, or "" when the key is
// missing or the value is not a string.
func (c xpathScraperAttrConfig) getString(key string) string {
	// a failed type assertion (including a nil value) yields ""
	asStr, _ := c[key].(string)
	return asStr
}
// getSelector returns the "selector" entry of this config.
func (c xpathScraperAttrConfig) getSelector() string {
	return c.getString("selector")
}

// getConcat returns the "concat" separator used to join multiple results.
func (c xpathScraperAttrConfig) getConcat() string {
	return c.getString("concat")
}

// hasConcat reports whether a concat separator has been configured.
func (c xpathScraperAttrConfig) hasConcat() bool {
	return len(c.getConcat()) > 0
}

// getParseDate returns the "parseDate" time layout string, if any.
func (c xpathScraperAttrConfig) getParseDate() string {
	return c.getString("parseDate")
}
// getReplace returns the ordered list of regex replacement configs stored
// under the "replace" key, or nil when none are configured or the value is
// not a slice.
func (c xpathScraperAttrConfig) getReplace() xpathRegexConfigs {
	var ret xpathRegexConfigs
	// a failed assertion yields a nil slice, which ranges zero times
	asSlice, _ := c["replace"].([]interface{})
	for _, v := range asSlice {
		asMap, _ := v.(map[interface{}]interface{})
		ret = append(ret, xpathRegexConfig(asMap))
	}
	return ret
}
// getSubScraper returns the nested "subScraper" configuration, or nil when
// it is absent or not a map.
func (c xpathScraperAttrConfig) getSubScraper() xpathScraperAttrConfig {
	// a failed assertion yields a nil map
	asMap, _ := c["subScraper"].(map[interface{}]interface{})
	if asMap == nil {
		return nil
	}
	return xpathScraperAttrConfig(asMap)
}
func (c xpathScraperAttrConfig) concatenateResults(nodes []*html.Node) string {
separator := c.getConcat()
result := []string{}
for _, elem := range nodes {
2020-05-18 02:26:20 +00:00
text := NodeText(elem)
text = commonPostProcess(text)
result = append(result, text)
}
return strings.Join(result, separator)
}
// parseDate converts value into the internal YYYY-MM-DD date format using
// the configured parseDate layout. When no layout is configured, or parsing
// fails, the original value is returned unchanged.
func (c xpathScraperAttrConfig) parseDate(value string) string {
	layout := c.getParseDate()
	if layout == "" {
		return value
	}
	// try to parse the date using the pattern; fall back to the original
	// value on failure
	parsed, err := time.Parse(layout, value)
	if err != nil {
		logger.Warnf("Error parsing date string '%s' using format '%s': %s", value, layout, err.Error())
		return value
	}
	// convert it into our internal date format
	return parsed.Format("2006-01-02")
}
// replaceRegex applies the configured "replace" regex steps to value.
func (c xpathScraperAttrConfig) replaceRegex(value string) string {
	return c.getReplace().apply(value)
}
func (c xpathScraperAttrConfig) applySubScraper(value string) string {
subScraper := c.getSubScraper()
if subScraper == nil {
return value
}
logger.Debugf("Sub-scraping for: %s", value)
doc, err := loadURL(value, nil)
if err != nil {
logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
return ""
}
found := runXPathQuery(doc, subScraper.getSelector(), nil)
if len(found) > 0 {
// check if we're concatenating the results into a single result
var result string
if subScraper.hasConcat() {
result = subScraper.concatenateResults(found)
} else {
2020-05-18 02:26:20 +00:00
result = NodeText(found[0])
result = commonPostProcess(result)
}
result = subScraper.postProcess(result)
return result
}
return ""
}
// postProcess runs the configured transformations in a fixed order:
// regex replacements first, then date parsing, then sub-scraping.
func (c xpathScraperAttrConfig) postProcess(value string) string {
	value = c.replaceRegex(value)
	value = c.parseDate(value)
	return c.applySubScraper(value)
}
// multiSpaceRE collapses runs of spaces. Compiled once at package level
// rather than on every call, since commonPostProcess runs for every scraped
// value.
var multiSpaceRE = regexp.MustCompile(" +")

// commonPostProcess trims surrounding whitespace, strips embedded newlines,
// and collapses repeated spaces into a single space.
func commonPostProcess(value string) string {
	value = strings.TrimSpace(value)
	// newline removal is a plain substring replace; no regex needed
	value = strings.Replace(value, "\n", "", -1)
	return multiSpaceRE.ReplaceAllString(value, " ")
}
// runXPathQuery expands common placeholder fragments in the xpath expression
// and evaluates it against doc, returning all matching nodes. On a malformed
// expression it logs a warning and returns nil.
func runXPathQuery(doc *html.Node, xpath string, common commonXPathConfig) []*html.Node {
	// substitute any common placeholder fragments first
	if common != nil {
		xpath = common.applyCommon(xpath)
	}
	nodes, err := htmlquery.QueryAll(doc, xpath)
	if err == nil {
		return nodes
	}
	logger.Warnf("Error parsing xpath expression '%s': %s", xpath, err.Error())
	return nil
}
func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) xPathResults {
var ret xPathResults
for k, value := range s {
switch v := value.(type) {
case string:
found := runXPathQuery(doc, v, common)
if len(found) > 0 {
for i, elem := range found {
2020-05-18 02:26:20 +00:00
text := NodeText(elem)
text = commonPostProcess(text)
ret = ret.setKey(i, k, text)
}
}
case map[interface{}]interface{}:
attrConfig := xpathScraperAttrConfig(v)
found := runXPathQuery(doc, attrConfig.getSelector(), common)
if len(found) > 0 {
// check if we're concatenating the results into a single result
if attrConfig.hasConcat() {
result := attrConfig.concatenateResults(found)
result = attrConfig.postProcess(result)
const i = 0
ret = ret.setKey(i, k, result)
} else {
for i, elem := range found {
2020-05-18 02:26:20 +00:00
text := NodeText(elem)
text = commonPostProcess(text)
text = attrConfig.postProcess(text)
ret = ret.setKey(i, k, text)
}
}
}
}
}
return ret
}
// xpathScrapers maps scraper names to their parsed configurations.
type xpathScrapers map[string]*xpathScraper

// xpathScraper is a single scraper definition from the yaml config: shared
// common xpath fragments plus per-object field configurations.
type xpathScraper struct {
	Common    commonXPathConfig  `yaml:"common"`
	Scene     xpathScraperConfig `yaml:"scene"`
	Performer xpathScraperConfig `yaml:"performer"`
}
// Keys within the scene config whose values are nested sub-configs rather
// than simple field selectors; they are excluded from GetSceneSimple.
const (
	XPathScraperConfigSceneTags       = "Tags"
	XPathScraperConfigScenePerformers = "Performers"
	XPathScraperConfigSceneStudio     = "Studio"
	XPathScraperConfigSceneMovies     = "Movies"
)
// GetSceneSimple returns the scene config with the complex sub-configs
// (tags, performers, studio, movies) filtered out.
func (s xpathScraper) GetSceneSimple() xpathScraperConfig {
	ret := make(xpathScraperConfig)
	for k, v := range s.Scene {
		switch k {
		case XPathScraperConfigSceneTags,
			XPathScraperConfigScenePerformers,
			XPathScraperConfigSceneStudio,
			XPathScraperConfigSceneMovies:
			// exclude the complex sub-configs
		default:
			ret[k] = v
		}
	}
	return ret
}
// getSceneSubMap extracts the named sub-config from the scene config and
// converts it into an xpathScraperConfig. Returns nil when the key is absent
// or its value is not a map.
func (s xpathScraper) getSceneSubMap(key string) xpathScraperConfig {
	if s.Scene == nil {
		return nil
	}
	v, ok := s.Scene[key]
	if !ok {
		return nil
	}
	subMap, _ := v.(map[interface{}]interface{})
	if subMap == nil {
		return nil
	}
	return createXPathScraperConfig(subMap)
}
// GetScenePerformers returns the "Performers" sub-config of the scene config.
func (s xpathScraper) GetScenePerformers() xpathScraperConfig {
	return s.getSceneSubMap(XPathScraperConfigScenePerformers)
}

// GetSceneTags returns the "Tags" sub-config of the scene config.
func (s xpathScraper) GetSceneTags() xpathScraperConfig {
	return s.getSceneSubMap(XPathScraperConfigSceneTags)
}

// GetSceneStudio returns the "Studio" sub-config of the scene config.
func (s xpathScraper) GetSceneStudio() xpathScraperConfig {
	return s.getSceneSubMap(XPathScraperConfigSceneStudio)
}

// GetSceneMovies returns the "Movies" sub-config of the scene config.
func (s xpathScraper) GetSceneMovies() xpathScraperConfig {
	return s.getSceneSubMap(XPathScraperConfigSceneMovies)
}
// scrapePerformer scrapes a single performer from doc using the first result
// row. Returns (nil, nil) when no performer config is present.
func (s xpathScraper) scrapePerformer(doc *html.Node) (*models.ScrapedPerformer, error) {
	if s.Performer == nil {
		return nil, nil
	}
	var ret models.ScrapedPerformer
	if results := s.Performer.process(doc, s.Common); len(results) > 0 {
		results[0].apply(&ret)
	}
	return &ret, nil
}
// scrapePerformers scrapes a list of performers from doc, one per result
// row. Returns (nil, nil) when no performer config is present.
func (s xpathScraper) scrapePerformers(doc *html.Node) ([]*models.ScrapedPerformer, error) {
	if s.Performer == nil {
		return nil, nil
	}
	var ret []*models.ScrapedPerformer
	for _, row := range s.Performer.process(doc, s.Common) {
		// each row becomes its own performer
		performer := &models.ScrapedPerformer{}
		row.apply(performer)
		ret = append(ret, performer)
	}
	return ret, nil
}
// scrapeScene scrapes a scene from doc. The simple scene fields are
// processed first; the performer/tag/studio/movie sub-configs are only
// processed when the simple fields produced at least one result row.
func (s xpathScraper) scrapeScene(doc *html.Node) (*models.ScrapedScene, error) {
	var ret models.ScrapedScene
	sceneMap := s.GetSceneSimple()
	// NOTE(review): GetSceneSimple always returns a non-nil map, so this
	// branch appears unreachable — confirm before relying on a nil return.
	if sceneMap == nil {
		return nil, nil
	}
	scenePerformersMap := s.GetScenePerformers()
	sceneTagsMap := s.GetSceneTags()
	sceneStudioMap := s.GetSceneStudio()
	sceneMoviesMap := s.GetSceneMovies()
	logger.Debug(`Processing scene:`)
	results := sceneMap.process(doc, s.Common)
	if len(results) > 0 {
		// only the first result row populates the scene's simple fields
		results[0].apply(&ret)
		// now apply the performers and tags
		if scenePerformersMap != nil {
			logger.Debug(`Processing scene performers:`)
			performerResults := scenePerformersMap.process(doc, s.Common)
			for _, p := range performerResults {
				performer := &models.ScrapedScenePerformer{}
				p.apply(performer)
				ret.Performers = append(ret.Performers, performer)
			}
		}
		if sceneTagsMap != nil {
			logger.Debug(`Processing scene tags:`)
			tagResults := sceneTagsMap.process(doc, s.Common)
			for _, p := range tagResults {
				tag := &models.ScrapedSceneTag{}
				p.apply(tag)
				ret.Tags = append(ret.Tags, tag)
			}
		}
		if sceneStudioMap != nil {
			logger.Debug(`Processing scene studio:`)
			studioResults := sceneStudioMap.process(doc, s.Common)
			// a scene has a single studio; use the first result only
			if len(studioResults) > 0 {
				studio := &models.ScrapedSceneStudio{}
				studioResults[0].apply(studio)
				ret.Studio = studio
			}
		}
		if sceneMoviesMap != nil {
			logger.Debug(`Processing scene movies:`)
			movieResults := sceneMoviesMap.process(doc, s.Common)
			for _, p := range movieResults {
				movie := &models.ScrapedSceneMovie{}
				p.apply(movie)
				ret.Movies = append(ret.Movies, movie)
			}
		}
	}
	return &ret, nil
}
// xPathResult is a single scraped result row: struct field name -> value.
type xPathResult map[string]string

// xPathResults is an ordered list of result rows, one per matched node.
type xPathResults []xPathResult

// apply copies each key/value of the result into the correspondingly named
// field of dest via reflection. dest must be a pointer to a struct.
// NOTE(review): values are strings, so a target field that is neither string
// nor pointer-to-string would panic in Set — confirm the models structs only
// use those kinds.
func (r xPathResult) apply(dest interface{}) {
	destVal := reflect.ValueOf(dest)
	// dest should be a pointer
	destVal = destVal.Elem()
	for key, value := range r {
		field := destVal.FieldByName(key)
		if field.IsValid() {
			var reflectValue reflect.Value
			if field.Kind() == reflect.Ptr {
				// need to copy the value, otherwise everything is set to the
				// same pointer
				localValue := value
				reflectValue = reflect.ValueOf(&localValue)
			} else {
				reflectValue = reflect.ValueOf(value)
			}
			field.Set(reflectValue)
		} else {
			// config referenced a field the destination type doesn't have
			logger.Errorf("Field %s does not exist in %T", key, dest)
		}
	}
}
// setKey stores value under key in result row index, appending one new empty
// row first when index is past the end, and returns the (possibly
// reallocated) results slice.
// NOTE(review): only a single row is appended, so this assumes callers fill
// rows with sequentially increasing indexes — which process() does.
func (r xPathResults) setKey(index int, key string, value string) xPathResults {
	if index >= len(r) {
		r = append(r, make(xPathResult))
	}
	logger.Debugf(`[%d][%s] = %s`, index, key, value)
	r[index][key] = value
	return r
}
// loadURL fetches the given URL (setting the configured scraper user agent,
// if any), decodes the response body according to its declared charset, and
// parses it into an HTML document. When the scraper config enables the
// PrintHTML debug option, the rendered document is also logged.
// NOTE(review): the HTTP status code is not checked, so non-2xx error pages
// are parsed like any other response — confirm whether that is intended.
func loadURL(url string, c *scraperConfig) (*html.Node, error) {
	client := &http.Client{
		Timeout: scrapeGetTimeout,
	}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	userAgent := config.GetScraperUserAgent()
	if userAgent != "" {
		req.Header.Set("User-Agent", userAgent)
	}
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	// convert the body to UTF-8 based on the Content-Type charset
	r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
	if err != nil {
		return nil, err
	}
	ret, err := html.Parse(r)
	if err == nil && c != nil && c.DebugOptions != nil && c.DebugOptions.PrintHTML {
		var b bytes.Buffer
		html.Render(&b, ret)
		logger.Infof("loadURL (%s) response: \n%s", url, b.String())
	}
	return ret, err
}
// scrapePerformerURLXpath scrapes a performer from the given URL using the
// named xpath scraper from the config.
func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
	scraper, ok := c.scraperConfig.XPathScrapers[c.Scraper]
	if !ok || scraper == nil {
		return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
	}
	doc, err := loadURL(url, c.scraperConfig)
	if err != nil {
		return nil, err
	}
	return scraper.scrapePerformer(doc)
}
// scrapeSceneURLXPath scrapes a scene from the given URL using the named
// xpath scraper from the config.
func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene, error) {
	scraper, ok := c.scraperConfig.XPathScrapers[c.Scraper]
	if !ok || scraper == nil {
		return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
	}
	doc, err := loadURL(url, c.scraperConfig)
	if err != nil {
		return nil, err
	}
	return scraper.scrapeScene(doc)
}
// scrapePerformerNamesXPath runs a performer name search: every "{}"
// placeholder in the query URL is replaced with the URL-escaped name, the
// page is loaded, and all performers on it are scraped.
func scrapePerformerNamesXPath(c scraperTypeConfig, name string) ([]*models.ScrapedPerformer, error) {
	scraper := c.scraperConfig.XPathScrapers[c.Scraper]
	if scraper == nil {
		return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
	}
	// substitute the URL-escaped name into the query URL template
	const placeholder = "{}"
	queryURL := strings.Replace(c.QueryURL, placeholder, url.QueryEscape(name), -1)
	doc, err := loadURL(queryURL, c.scraperConfig)
	if err != nil {
		return nil, err
	}
	return scraper.scrapePerformers(doc)
}
2020-05-18 02:26:20 +00:00
func NodeText(n *html.Node) string {
if n != nil && n.Type == html.CommentNode {
return htmlquery.OutputHTML(n, true)
}
return htmlquery.InnerText(n)
}