// stash/pkg/scraper/xpath.go
//
// 268 lines
// 5.6 KiB
// Go

package scraper
import (
	"errors"
	"reflect"
	"regexp"
	"sort"
	"strings"

	"github.com/antchfx/htmlquery"
	"golang.org/x/net/html"

	"github.com/stashapp/stash/pkg/logger"
	"github.com/stashapp/stash/pkg/models"
)
// commonXPathConfig maps placeholder strings to the text that replaces them
// inside xpath expressions before those expressions are evaluated.
type commonXPathConfig map[string]string

// applyCommon returns src with every occurrence of each key of c replaced by
// the corresponding value.
//
// Keys are substituted one after another in a fixed order: longest key first,
// ties broken lexically. Ranging over the map directly would make the result
// depend on Go's randomised iteration order whenever one key is a prefix of
// another (for example "$p" and "$perf") or a replacement value itself
// contains another key. A nil or empty config returns src unchanged.
func (c commonXPathConfig) applyCommon(src string) string {
	if len(c) == 0 {
		return src
	}

	keys := make([]string, 0, len(c))
	for key := range c {
		keys = append(keys, key)
	}
	sort.Slice(keys, func(i, j int) bool {
		if len(keys[i]) != len(keys[j]) {
			// longer keys first so a short key cannot clobber part of a longer one
			return len(keys[i]) > len(keys[j])
		}
		return keys[i] < keys[j]
	})

	ret := src
	for _, key := range keys {
		// strings.Replace returns its input unchanged when key is absent, so
		// no separate Contains check is needed.
		ret = strings.Replace(ret, key, c[key], -1)
	}
	return ret
}
// xpathScraperConfig maps a field name to its configuration value. Values are
// either strings or nested maps; see process and getSceneSubMap for how each
// is consumed.
type xpathScraperConfig map[string]interface{}

// createXPathScraperConfig copies the string-keyed entries of src into a new
// xpathScraperConfig. Entries whose key is not a string are dropped. The
// result is always non-nil, even when src is nil or empty.
func createXPathScraperConfig(src map[interface{}]interface{}) xpathScraperConfig {
	cfg := make(xpathScraperConfig)
	// ranging over a nil map performs no iterations, so no nil guard is needed
	for rawKey, val := range src {
		if name, ok := rawKey.(string); ok {
			cfg[name] = val
		}
	}
	return cfg
}
// process evaluates every string-valued entry of s as an xpath expression
// against doc and groups the matched nodes by match position: the i-th node
// matched by an expression is stored under that entry's key in the i-th
// element of the returned slice. Entries whose value is not a string are
// skipped. When common is non-nil its replacements are applied to each
// expression before it is evaluated.
//
// The returned slice is nil when no expression matched anything.
//
// NOTE(review): htmlquery.Find is believed to panic on a malformed
// expression — confirm against the vendored htmlquery version.
func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) []xPathResult {
	var results []xPathResult

	for field, raw := range s {
		expr, ok := raw.(string)
		if !ok {
			// TODO - handle map type
			continue
		}

		// apply common
		if common != nil {
			expr = common.applyCommon(expr)
		}

		for idx, node := range htmlquery.Find(doc, expr) {
			// grow the result list on demand; idx never skips ahead of len(results)
			if idx >= len(results) {
				results = append(results, make(xPathResult))
			}
			results[idx][field] = node
		}
	}

	return results
}
// xpathScrapers holds xpath scraper definitions keyed by a string.
// NOTE(review): presumably the key is the scraper name that
// scraperTypeConfig.Scraper refers to — confirm against the config loader.
type xpathScrapers map[string]*xpathScraper

// xpathScraper is a single xpath scraper definition, populated from YAML.
type xpathScraper struct {
// Common holds the replacements applied to xpath expressions before they
// are evaluated.
Common commonXPathConfig `yaml:"common"`
// Scene maps scene field names to xpath expressions. The Tags, Performers
// and Studio keys are read as nested sub-configs instead.
Scene xpathScraperConfig `yaml:"scene"`
// Performer maps performer field names to xpath expressions.
Performer xpathScraperConfig `yaml:"performer"`
}
// Keys of the scene config that GetSceneSimple excludes and that
// getSceneSubMap reads as nested maps.
const (
// XPathScraperConfigSceneTags is the scene config key of the tags sub-config.
XPathScraperConfigSceneTags = "Tags"
// XPathScraperConfigScenePerformers is the scene config key of the
// performers sub-config.
XPathScraperConfigScenePerformers = "Performers"
// XPathScraperConfigSceneStudio is the scene config key of the studio
// sub-config.
XPathScraperConfigSceneStudio = "Studio"
)
// GetSceneSimple returns a copy of the scene config without the Tags,
// Performers and Studio entries. The result is always non-nil, even when the
// scraper has no scene config.
func (s xpathScraper) GetSceneSimple() xpathScraperConfig {
	simple := make(xpathScraperConfig)

	for field, expr := range s.Scene {
		switch field {
		case XPathScraperConfigSceneTags,
			XPathScraperConfigScenePerformers,
			XPathScraperConfigSceneStudio:
			// exclude the complex sub-configs
			continue
		}
		simple[field] = expr
	}

	return simple
}
// getSceneSubMap returns the nested sub-config stored under key in the scene
// config, converted with createXPathScraperConfig. It returns nil when the
// scene config is nil, the key is absent, the value is not a
// map[interface{}]interface{}, or the value is a nil map of that type.
func (s xpathScraper) getSceneSubMap(key string) xpathScraperConfig {
	// Indexing a nil map and asserting on a nil interface both yield the zero
	// value, so every "not available" case collapses into sub == nil.
	sub, _ := s.Scene[key].(map[interface{}]interface{})
	if sub == nil {
		return nil
	}
	return createXPathScraperConfig(sub)
}
// GetScenePerformers returns the scene sub-config stored under the
// XPathScraperConfigScenePerformers key, or nil when getSceneSubMap finds no
// usable map there.
func (s xpathScraper) GetScenePerformers() xpathScraperConfig {
return s.getSceneSubMap(XPathScraperConfigScenePerformers)
}
// GetSceneTags returns the scene sub-config stored under the
// XPathScraperConfigSceneTags key, or nil when getSceneSubMap finds no usable
// map there.
func (s xpathScraper) GetSceneTags() xpathScraperConfig {
return s.getSceneSubMap(XPathScraperConfigSceneTags)
}
// GetSceneStudio returns the scene sub-config stored under the
// XPathScraperConfigSceneStudio key, or nil when getSceneSubMap finds no
// usable map there.
func (s xpathScraper) GetSceneStudio() xpathScraperConfig {
return s.getSceneSubMap(XPathScraperConfigSceneStudio)
}
// scrapePerformer runs the performer config against doc and fills a
// ScrapedPerformer from the first result.
//
// It returns (nil, nil) when the scraper has no performer config. Otherwise it
// returns a non-nil performer, which is left at its zero value when nothing
// in doc matched. The error result is always nil.
func (s xpathScraper) scrapePerformer(doc *html.Node) (*models.ScrapedPerformer, error) {
	if s.Performer == nil {
		return nil, nil
	}

	performer := new(models.ScrapedPerformer)
	// only the first result is used for a performer page
	if matches := s.Performer.process(doc, s.Common); len(matches) > 0 {
		matches[0].apply(performer)
	}

	return performer, nil
}
// scrapeScene runs the scene config against doc and assembles a ScrapedScene.
//
// The simple (non-nested) scene fields are applied from the first result.
// Only when at least one simple field matched are the nested sub-configs
// processed: every performer and tag result is appended to the scene, and the
// first studio result is set as its studio. The error result is always nil.
func (s xpathScraper) scrapeScene(doc *html.Node) (*models.ScrapedScene, error) {
	simpleCfg := s.GetSceneSimple()
	// NOTE(review): GetSceneSimple builds its result with make, so this guard
	// is not expected to trigger as the code stands.
	if simpleCfg == nil {
		return nil, nil
	}

	scene := new(models.ScrapedScene)

	matches := simpleCfg.process(doc, s.Common)
	if len(matches) == 0 {
		// nothing matched the simple fields; the sub-configs are not consulted
		return scene, nil
	}
	matches[0].apply(scene)

	// now apply the performers and tags
	if performersCfg := s.GetScenePerformers(); performersCfg != nil {
		for _, match := range performersCfg.process(doc, s.Common) {
			performer := &models.ScrapedScenePerformer{}
			match.apply(performer)
			scene.Performers = append(scene.Performers, performer)
		}
	}

	if tagsCfg := s.GetSceneTags(); tagsCfg != nil {
		for _, match := range tagsCfg.process(doc, s.Common) {
			tag := &models.ScrapedSceneTag{}
			match.apply(tag)
			scene.Tags = append(scene.Tags, tag)
		}
	}

	if studioCfg := s.GetSceneStudio(); studioCfg != nil {
		// a scene has a single studio, so only the first result is used
		if studioMatches := studioCfg.process(doc, s.Common); len(studioMatches) > 0 {
			studio := &models.ScrapedSceneStudio{}
			studioMatches[0].apply(studio)
			scene.Studio = studio
		}
	}

	return scene, nil
}
// xPathResult maps a destination struct field name to the HTML node whose
// text should be written into that field.
type xPathResult map[string]*html.Node

// xpathMultiSpaceRegexp matches runs of spaces. It is compiled once at package
// scope instead of once per field on every apply call.
var xpathMultiSpaceRegexp = regexp.MustCompile(" +")

// apply writes the cleaned inner text of every node in r into the field of
// dest that carries the same name as the node's key.
//
// dest must be a non-nil pointer to a struct. The text of each node is
// trimmed, has its newlines removed and its runs of spaces collapsed to a
// single space. Fields of pointer kind receive a pointer to the string, all
// other fields receive the string itself.
//
// Problems are logged rather than allowed to panic inside reflect: an
// unusable dest aborts the call, while a missing, unsettable or incompatibly
// typed field is skipped and the remaining fields are still applied.
func (r xPathResult) apply(dest interface{}) {
	destVal := reflect.ValueOf(dest)

	// dest should be a pointer
	if destVal.Kind() != reflect.Ptr || destVal.IsNil() {
		logger.Errorf("Cannot apply xpath result to %T: destination must be a non-nil pointer", dest)
		return
	}
	destVal = destVal.Elem()
	if destVal.Kind() != reflect.Struct {
		// FieldByName panics on anything but a struct
		logger.Errorf("Cannot apply xpath result to %T: destination must point to a struct", dest)
		return
	}

	for key, node := range r {
		field := destVal.FieldByName(key)
		if !field.IsValid() {
			logger.Errorf("Field %s does not exist in %T", key, dest)
			continue
		}

		// remove multiple whitespace and end lines
		value := strings.TrimSpace(htmlquery.InnerText(node))
		value = strings.Replace(value, "\n", "", -1)
		value = xpathMultiSpaceRegexp.ReplaceAllString(value, " ")

		// value is declared per iteration, so taking its address is safe
		var fieldValue reflect.Value
		if field.Kind() == reflect.Ptr {
			fieldValue = reflect.ValueOf(&value)
		} else {
			fieldValue = reflect.ValueOf(value)
		}

		// Value.Set panics on unexported fields and on type mismatches
		if !field.CanSet() || !fieldValue.Type().AssignableTo(field.Type()) {
			logger.Errorf("Field %s in %T cannot be set from a scraped string", key, dest)
			continue
		}
		field.Set(fieldValue)
	}
}
// scrapePerformerURLXpath scrapes a performer from the page at url using the
// xpath scraper that c.Scraper names in c.scraperConfig.XPathScrapers.
//
// It returns an error when no such scraper is configured or when the page
// cannot be loaded; otherwise it returns the result of scrapePerformer.
func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
	xs := c.scraperConfig.XPathScrapers[c.Scraper]
	if xs == nil {
		return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
	}

	page, loadErr := htmlquery.LoadURL(url)
	if loadErr != nil {
		return nil, loadErr
	}

	return xs.scrapePerformer(page)
}
// scrapeSceneURLXPath scrapes a scene from the page at url using the xpath
// scraper that c.Scraper names in c.scraperConfig.XPathScrapers.
//
// It returns an error when no such scraper is configured or when the page
// cannot be loaded; otherwise it returns the result of scrapeScene.
func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene, error) {
	xs := c.scraperConfig.XPathScrapers[c.Scraper]
	if xs == nil {
		return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
	}

	page, loadErr := htmlquery.LoadURL(url)
	if loadErr != nil {
		return nil, loadErr
	}

	return xs.scrapeScene(page)
}