mirror of https://github.com/stashapp/stash.git
268 lines
5.6 KiB
Go
268 lines
5.6 KiB
Go
package scraper
|
|
|
|
import (
|
|
"errors"
|
|
"reflect"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"github.com/antchfx/htmlquery"
|
|
"golang.org/x/net/html"
|
|
|
|
"github.com/stashapp/stash/pkg/logger"
|
|
"github.com/stashapp/stash/pkg/models"
|
|
)
|
|
|
|
type commonXPathConfig map[string]string
|
|
|
|
func (c commonXPathConfig) applyCommon(src string) string {
|
|
ret := src
|
|
for commonKey, commonVal := range c {
|
|
if strings.Contains(ret, commonKey) {
|
|
ret = strings.Replace(ret, commonKey, commonVal, -1)
|
|
}
|
|
}
|
|
|
|
return ret
|
|
}
|
|
|
|
type xpathScraperConfig map[string]interface{}
|
|
|
|
func createXPathScraperConfig(src map[interface{}]interface{}) xpathScraperConfig {
|
|
ret := make(xpathScraperConfig)
|
|
|
|
if src != nil {
|
|
for k, v := range src {
|
|
keyStr, isStr := k.(string)
|
|
if isStr {
|
|
ret[keyStr] = v
|
|
}
|
|
}
|
|
}
|
|
|
|
return ret
|
|
}
|
|
|
|
func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) []xPathResult {
|
|
var ret []xPathResult
|
|
|
|
for k, v := range s {
|
|
asStr, isStr := v.(string)
|
|
|
|
if isStr {
|
|
// apply common
|
|
if common != nil {
|
|
asStr = common.applyCommon(asStr)
|
|
}
|
|
|
|
found := htmlquery.Find(doc, asStr)
|
|
if len(found) > 0 {
|
|
for i, elem := range found {
|
|
if i >= len(ret) {
|
|
ret = append(ret, make(xPathResult))
|
|
}
|
|
|
|
ret[i][k] = elem
|
|
}
|
|
}
|
|
}
|
|
// TODO - handle map type
|
|
}
|
|
|
|
return ret
|
|
}
|
|
|
|
type xpathScrapers map[string]*xpathScraper
|
|
|
|
type xpathScraper struct {
|
|
Common commonXPathConfig `yaml:"common"`
|
|
Scene xpathScraperConfig `yaml:"scene"`
|
|
Performer xpathScraperConfig `yaml:"performer"`
|
|
}
|
|
|
|
const (
|
|
XPathScraperConfigSceneTags = "Tags"
|
|
XPathScraperConfigScenePerformers = "Performers"
|
|
XPathScraperConfigSceneStudio = "Studio"
|
|
)
|
|
|
|
func (s xpathScraper) GetSceneSimple() xpathScraperConfig {
|
|
// exclude the complex sub-configs
|
|
ret := make(xpathScraperConfig)
|
|
mapped := s.Scene
|
|
|
|
if mapped != nil {
|
|
for k, v := range mapped {
|
|
if k != XPathScraperConfigSceneTags && k != XPathScraperConfigScenePerformers && k != XPathScraperConfigSceneStudio {
|
|
ret[k] = v
|
|
}
|
|
}
|
|
}
|
|
|
|
return ret
|
|
}
|
|
|
|
func (s xpathScraper) getSceneSubMap(key string) xpathScraperConfig {
|
|
var ret map[interface{}]interface{}
|
|
mapped := s.Scene
|
|
|
|
if mapped != nil {
|
|
v, ok := mapped[key]
|
|
if ok {
|
|
ret, _ = v.(map[interface{}]interface{})
|
|
}
|
|
}
|
|
|
|
if ret != nil {
|
|
return createXPathScraperConfig(ret)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (s xpathScraper) GetScenePerformers() xpathScraperConfig {
|
|
return s.getSceneSubMap(XPathScraperConfigScenePerformers)
|
|
}
|
|
|
|
func (s xpathScraper) GetSceneTags() xpathScraperConfig {
|
|
return s.getSceneSubMap(XPathScraperConfigSceneTags)
|
|
}
|
|
|
|
func (s xpathScraper) GetSceneStudio() xpathScraperConfig {
|
|
return s.getSceneSubMap(XPathScraperConfigSceneStudio)
|
|
}
|
|
|
|
func (s xpathScraper) scrapePerformer(doc *html.Node) (*models.ScrapedPerformer, error) {
|
|
var ret models.ScrapedPerformer
|
|
|
|
performerMap := s.Performer
|
|
if performerMap == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
results := performerMap.process(doc, s.Common)
|
|
if len(results) > 0 {
|
|
results[0].apply(&ret)
|
|
}
|
|
|
|
return &ret, nil
|
|
}
|
|
|
|
func (s xpathScraper) scrapeScene(doc *html.Node) (*models.ScrapedScene, error) {
|
|
var ret models.ScrapedScene
|
|
|
|
sceneMap := s.GetSceneSimple()
|
|
if sceneMap == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
scenePerformersMap := s.GetScenePerformers()
|
|
sceneTagsMap := s.GetSceneTags()
|
|
sceneStudioMap := s.GetSceneStudio()
|
|
|
|
results := sceneMap.process(doc, s.Common)
|
|
if len(results) > 0 {
|
|
results[0].apply(&ret)
|
|
|
|
// now apply the performers and tags
|
|
if scenePerformersMap != nil {
|
|
performerResults := scenePerformersMap.process(doc, s.Common)
|
|
|
|
for _, p := range performerResults {
|
|
performer := &models.ScrapedScenePerformer{}
|
|
p.apply(performer)
|
|
ret.Performers = append(ret.Performers, performer)
|
|
}
|
|
}
|
|
|
|
if sceneTagsMap != nil {
|
|
tagResults := sceneTagsMap.process(doc, s.Common)
|
|
|
|
for _, p := range tagResults {
|
|
tag := &models.ScrapedSceneTag{}
|
|
p.apply(tag)
|
|
ret.Tags = append(ret.Tags, tag)
|
|
}
|
|
}
|
|
|
|
if sceneStudioMap != nil {
|
|
studioResults := sceneStudioMap.process(doc, s.Common)
|
|
|
|
if len(studioResults) > 0 {
|
|
studio := &models.ScrapedSceneStudio{}
|
|
studioResults[0].apply(studio)
|
|
ret.Studio = studio
|
|
}
|
|
}
|
|
}
|
|
|
|
return &ret, nil
|
|
}
|
|
|
|
type xPathResult map[string]*html.Node
|
|
|
|
func (r xPathResult) apply(dest interface{}) {
|
|
destVal := reflect.ValueOf(dest)
|
|
|
|
// dest should be a pointer
|
|
destVal = destVal.Elem()
|
|
|
|
for key, v := range r {
|
|
field := destVal.FieldByName(key)
|
|
|
|
if field.IsValid() {
|
|
value := htmlquery.InnerText(v)
|
|
value = strings.TrimSpace(value)
|
|
|
|
// remove multiple whitespace and end lines
|
|
re := regexp.MustCompile("\n")
|
|
value = re.ReplaceAllString(value, "")
|
|
re = regexp.MustCompile(" +")
|
|
value = re.ReplaceAllString(value, " ")
|
|
|
|
var reflectValue reflect.Value
|
|
if field.Kind() == reflect.Ptr {
|
|
reflectValue = reflect.ValueOf(&value)
|
|
} else {
|
|
reflectValue = reflect.ValueOf(value)
|
|
}
|
|
|
|
field.Set(reflectValue)
|
|
} else {
|
|
logger.Errorf("Field %s does not exist in %T", key, dest)
|
|
}
|
|
}
|
|
}
|
|
|
|
func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
|
|
scraper := c.scraperConfig.XPathScrapers[c.Scraper]
|
|
|
|
if scraper == nil {
|
|
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
|
|
}
|
|
|
|
doc, err := htmlquery.LoadURL(url)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return scraper.scrapePerformer(doc)
|
|
}
|
|
|
|
func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene, error) {
|
|
scraper := c.scraperConfig.XPathScrapers[c.Scraper]
|
|
|
|
if scraper == nil {
|
|
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
|
|
}
|
|
|
|
doc, err := htmlquery.LoadURL(url)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return scraper.scrapeScene(doc)
|
|
}
|