package scraper

import (
	"bytes"
	"errors"
	"net/http"
	"net/url"
	"reflect"
	"regexp"
	"strings"
	"time"

	"github.com/antchfx/htmlquery"
	"golang.org/x/net/html"
	"golang.org/x/net/html/charset"

	"github.com/stashapp/stash/pkg/logger"
	"github.com/stashapp/stash/pkg/manager/config"
	"github.com/stashapp/stash/pkg/models"
)

// Timeout for the scrape http request. Includes transfer time. May want to make this
// configurable at some point.
const scrapeGetTimeout = time.Second * 30

// commonXPathConfig maps a common fragment key to the selector text that is
// substituted for it wherever the key appears in an xpath expression.
type commonXPathConfig map[string]string

func (c commonXPathConfig) applyCommon(src string) string {
	ret := src
	for commonKey, commonVal := range c {
		if strings.Contains(ret, commonKey) {
			ret = strings.Replace(ret, commonKey, commonVal, -1)
		}
	}

	return ret
}
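
// exampleApplyCommon is an illustrative sketch (hypothetical selectors, not
// used by the scraper itself) of how common fragments are substituted into a
// query before it is run.
func exampleApplyCommon() string {
	common := commonXPathConfig{
		"$performerElem": `//div[@class="performer"]`,
	}

	// expands to `//div[@class="performer"]//span[@class="name"]`
	return common.applyCommon(`$performerElem//span[@class="name"]`)
}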

// xpathScraperConfig maps a scraped field name to either a plain xpath string
// or an attribute map (see xpathScraperAttrConfig below).
type xpathScraperConfig map[string]interface{}

func createXPathScraperConfig(src map[interface{}]interface{}) xpathScraperConfig {
	ret := make(xpathScraperConfig)

	if src != nil {
		for k, v := range src {
			keyStr, isStr := k.(string)
			if isStr {
				ret[keyStr] = v
			}
		}
	}

	return ret
}

type xpathRegexConfig map[interface{}]interface{}
type xpathRegexConfigs []xpathRegexConfig

func (c xpathRegexConfig) apply(value string) string {
	regex := ""
	with := ""

	if regexI, _ := c["regex"]; regexI != nil {
		regex, _ = regexI.(string)
	}
	if withI, _ := c["with"]; withI != nil {
		with, _ = withI.(string)
	}

	if regex != "" {
		re, err := regexp.Compile(regex)
		if err != nil {
			logger.Warnf("Error compiling regex '%s': %s", regex, err.Error())
			return value
		}

		ret := re.ReplaceAllString(value, with)

		logger.Debugf(`Replace: '%s' with '%s'`, regex, with)
		logger.Debugf("Before: %s", value)
		logger.Debugf("After: %s", ret)
		return ret
	}

	return value
}

func (c xpathRegexConfigs) apply(value string) string {
	// apply regex in order
	for _, config := range c {
		value = config.apply(value)
	}

	// remove whitespace again
	value = commonPostProcess(value)

	return value
}
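
// exampleRegexReplace is an illustrative sketch (hypothetical pattern, not used
// by the scraper itself) of how a "replace" entry rewrites a scraped value.
func exampleRegexReplace() string {
	cfg := xpathRegexConfigs{
		{"regex": `\s*\(.*\)$`, "with": ""},
	}

	// strips the trailing parenthesised part: "Jane Doe (performer)" -> "Jane Doe"
	return cfg.apply("Jane Doe (performer)")
}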

// xpathScraperAttrConfig is the attribute form of a field mapping: a selector
// plus optional concat, parseDate, replace and subScraper settings.
type xpathScraperAttrConfig map[interface{}]interface{}

func (c xpathScraperAttrConfig) getString(key string) string {
	ret, _ := c[key]

	if ret == nil {
		return ""
	}

	asStr, _ := ret.(string)
	return asStr
}

func (c xpathScraperAttrConfig) getSelector() string {
	const selectorKey = "selector"
	return c.getString(selectorKey)
}

func (c xpathScraperAttrConfig) getConcat() string {
	const concatKey = "concat"
	return c.getString(concatKey)
}

func (c xpathScraperAttrConfig) hasConcat() bool {
	return c.getConcat() != ""
}

func (c xpathScraperAttrConfig) getParseDate() string {
	const parseDateKey = "parseDate"
	return c.getString(parseDateKey)
}

func (c xpathScraperAttrConfig) getReplace() xpathRegexConfigs {
	const replaceKey = "replace"
	val, _ := c[replaceKey]

	var ret xpathRegexConfigs
	if val == nil {
		return ret
	}

	asSlice, _ := val.([]interface{})

	for _, v := range asSlice {
		asMap, _ := v.(map[interface{}]interface{})
		ret = append(ret, xpathRegexConfig(asMap))
	}

	return ret
}

func (c xpathScraperAttrConfig) getSubScraper() xpathScraperAttrConfig {
	const subScraperKey = "subScraper"
	val, _ := c[subScraperKey]

	if val == nil {
		return nil
	}

	asMap, _ := val.(map[interface{}]interface{})
	if asMap != nil {
		return xpathScraperAttrConfig(asMap)
	}

	return nil
}

func (c xpathScraperAttrConfig) concatenateResults(nodes []*html.Node) string {
	separator := c.getConcat()
	result := []string{}

	for _, elem := range nodes {
		text := NodeText(elem)
		text = commonPostProcess(text)

		result = append(result, text)
	}

	return strings.Join(result, separator)
}

func (c xpathScraperAttrConfig) parseDate(value string) string {
	parseDate := c.getParseDate()

	if parseDate == "" {
		return value
	}

	// try to parse the date using the pattern
	// if it fails, then just fall back to the original value
	parsedValue, err := time.Parse(parseDate, value)
	if err != nil {
		logger.Warnf("Error parsing date string '%s' using format '%s': %s", value, parseDate, err.Error())
		return value
	}

	// convert it into our date format
	const internalDateFormat = "2006-01-02"
	return parsedValue.Format(internalDateFormat)
}
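
// exampleParseDate is an illustrative sketch (not used by the scraper itself):
// parseDate layouts use Go's reference time, so a layout of "2 January 2006"
// turns a scraped "30 May 2020" into "2020-05-30".
func exampleParseDate() string {
	attr := xpathScraperAttrConfig{"parseDate": "2 January 2006"}
	return attr.parseDate("30 May 2020")
}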

func (c xpathScraperAttrConfig) replaceRegex(value string) string {
	replace := c.getReplace()
	return replace.apply(value)
}

func (c xpathScraperAttrConfig) applySubScraper(value string) string {
	subScraper := c.getSubScraper()

	if subScraper == nil {
		return value
	}

	logger.Debugf("Sub-scraping for: %s", value)
	doc, err := loadURL(value, nil)

	if err != nil {
		logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
		return ""
	}

	found := runXPathQuery(doc, subScraper.getSelector(), nil)

	if len(found) > 0 {
		// check if we're concatenating the results into a single result
		var result string
		if subScraper.hasConcat() {
			result = subScraper.concatenateResults(found)
		} else {
			result = NodeText(found[0])
			result = commonPostProcess(result)
		}

		result = subScraper.postProcess(result)
		return result
	}

	return ""
}

func (c xpathScraperAttrConfig) postProcess(value string) string {
	// perform regex replacements first
	value = c.replaceRegex(value)
	value = c.applySubScraper(value)
	value = c.parseDate(value)

	return value
}

func commonPostProcess(value string) string {
	value = strings.TrimSpace(value)

	// remove newlines and collapse runs of spaces
	re := regexp.MustCompile("\n")
	value = re.ReplaceAllString(value, "")
	re = regexp.MustCompile(" +")
	value = re.ReplaceAllString(value, " ")

	return value
}
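
// examplePostProcess is an illustrative sketch (not used by the scraper itself)
// of the whitespace normalisation applied to every scraped value:
// "  Jane \n  Doe  " becomes "Jane Doe".
func examplePostProcess() string {
	return commonPostProcess("  Jane \n  Doe  ")
}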

func runXPathQuery(doc *html.Node, xpath string, common commonXPathConfig) []*html.Node {
	// apply common
	if common != nil {
		xpath = common.applyCommon(xpath)
	}

	found, err := htmlquery.QueryAll(doc, xpath)
	if err != nil {
		logger.Warnf("Error parsing xpath expression '%s': %s", xpath, err.Error())
		return nil
	}

	return found
}

func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) xPathResults {
	var ret xPathResults

	for k, value := range s {
		switch v := value.(type) {
		case string:
			found := runXPathQuery(doc, v, common)

			if len(found) > 0 {
				for i, elem := range found {
					text := NodeText(elem)
					text = commonPostProcess(text)

					ret = ret.setKey(i, k, text)
				}
			}
		case map[interface{}]interface{}:
			attrConfig := xpathScraperAttrConfig(v)

			found := runXPathQuery(doc, attrConfig.getSelector(), common)

			if len(found) > 0 {
				// check if we're concatenating the results into a single result
				if attrConfig.hasConcat() {
					result := attrConfig.concatenateResults(found)
					result = attrConfig.postProcess(result)
					const i = 0
					ret = ret.setKey(i, k, result)
				} else {
					for i, elem := range found {
						text := NodeText(elem)
						text = commonPostProcess(text)
						text = attrConfig.postProcess(text)

						ret = ret.setKey(i, k, text)
					}
				}
			}
		}
	}

	return ret
}

type xpathScrapers map[string]*xpathScraper
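
// An xpathScraper is populated from the xpath scraper YAML configuration. An
// illustrative sketch of the expected shape (hypothetical selectors and field
// names):
//
//	common:
//	  $performerElem: //div[@class="performer"]
//	performer:
//	  Name: $performerElem/h1/text()
//	  Birthdate:
//	    selector: $performerElem//span[@class="dob"]/text()
//	    parseDate: 2 January 2006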

type xpathScraper struct {
	Common    commonXPathConfig  `yaml:"common"`
	Scene     xpathScraperConfig `yaml:"scene"`
	Performer xpathScraperConfig `yaml:"performer"`
}

const (
	XPathScraperConfigSceneTags       = "Tags"
	XPathScraperConfigScenePerformers = "Performers"
	XPathScraperConfigSceneStudio     = "Studio"
	XPathScraperConfigSceneMovies     = "Movies"
)
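
// These scene keys are excluded from GetSceneSimple below and are instead
// processed as sub-maps (see getSceneSubMap), since Tags, Performers, Studio
// and Movies each carry their own field mappings.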

func (s xpathScraper) GetSceneSimple() xpathScraperConfig {
	// exclude the complex sub-configs
	ret := make(xpathScraperConfig)
	mapped := s.Scene

	if mapped != nil {
		for k, v := range mapped {
			if k != XPathScraperConfigSceneTags && k != XPathScraperConfigScenePerformers && k != XPathScraperConfigSceneStudio && k != XPathScraperConfigSceneMovies {
				ret[k] = v
			}
		}
	}

	return ret
}

func (s xpathScraper) getSceneSubMap(key string) xpathScraperConfig {
	var ret map[interface{}]interface{}
	mapped := s.Scene

	if mapped != nil {
		v, ok := mapped[key]
		if ok {
			ret, _ = v.(map[interface{}]interface{})
		}
	}

	if ret != nil {
		return createXPathScraperConfig(ret)
	}

	return nil
}

func (s xpathScraper) GetScenePerformers() xpathScraperConfig {
	return s.getSceneSubMap(XPathScraperConfigScenePerformers)
}

func (s xpathScraper) GetSceneTags() xpathScraperConfig {
	return s.getSceneSubMap(XPathScraperConfigSceneTags)
}

func (s xpathScraper) GetSceneStudio() xpathScraperConfig {
	return s.getSceneSubMap(XPathScraperConfigSceneStudio)
}

func (s xpathScraper) GetSceneMovies() xpathScraperConfig {
	return s.getSceneSubMap(XPathScraperConfigSceneMovies)
}

func (s xpathScraper) scrapePerformer(doc *html.Node) (*models.ScrapedPerformer, error) {
	var ret models.ScrapedPerformer

	performerMap := s.Performer
	if performerMap == nil {
		return nil, nil
	}

	results := performerMap.process(doc, s.Common)
	if len(results) > 0 {
		results[0].apply(&ret)
	}

	return &ret, nil
}

func (s xpathScraper) scrapePerformers(doc *html.Node) ([]*models.ScrapedPerformer, error) {
	var ret []*models.ScrapedPerformer

	performerMap := s.Performer
	if performerMap == nil {
		return nil, nil
	}

	results := performerMap.process(doc, s.Common)
	for _, r := range results {
		var p models.ScrapedPerformer
		r.apply(&p)
		ret = append(ret, &p)
	}

	return ret, nil
}

func (s xpathScraper) scrapeScene(doc *html.Node) (*models.ScrapedScene, error) {
	var ret models.ScrapedScene

	sceneMap := s.GetSceneSimple()
	if sceneMap == nil {
		return nil, nil
	}

	scenePerformersMap := s.GetScenePerformers()
	sceneTagsMap := s.GetSceneTags()
	sceneStudioMap := s.GetSceneStudio()
	sceneMoviesMap := s.GetSceneMovies()

	logger.Debug(`Processing scene:`)
	results := sceneMap.process(doc, s.Common)
	if len(results) > 0 {
		results[0].apply(&ret)

		// now apply the performers and tags
		if scenePerformersMap != nil {
			logger.Debug(`Processing scene performers:`)
			performerResults := scenePerformersMap.process(doc, s.Common)

			for _, p := range performerResults {
				performer := &models.ScrapedScenePerformer{}
				p.apply(performer)
				ret.Performers = append(ret.Performers, performer)
			}
		}

		if sceneTagsMap != nil {
			logger.Debug(`Processing scene tags:`)
			tagResults := sceneTagsMap.process(doc, s.Common)

			for _, p := range tagResults {
				tag := &models.ScrapedSceneTag{}
				p.apply(tag)
				ret.Tags = append(ret.Tags, tag)
			}
		}

		if sceneStudioMap != nil {
			logger.Debug(`Processing scene studio:`)
			studioResults := sceneStudioMap.process(doc, s.Common)

			if len(studioResults) > 0 {
				studio := &models.ScrapedSceneStudio{}
				studioResults[0].apply(studio)
				ret.Studio = studio
			}
		}

		if sceneMoviesMap != nil {
			logger.Debug(`Processing scene movies:`)
			movieResults := sceneMoviesMap.process(doc, s.Common)

			for _, p := range movieResults {
				movie := &models.ScrapedSceneMovie{}
				p.apply(movie)
				ret.Movies = append(ret.Movies, movie)
			}
		}
	}

	return &ret, nil
}

type xPathResult map[string]string
type xPathResults []xPathResult

func (r xPathResult) apply(dest interface{}) {
	destVal := reflect.ValueOf(dest)

	// dest should be a pointer
	destVal = destVal.Elem()

	for key, value := range r {
		field := destVal.FieldByName(key)

		if field.IsValid() {
			var reflectValue reflect.Value
			if field.Kind() == reflect.Ptr {
				// need to copy the value, otherwise everything is set to the
				// same pointer
				localValue := value
				reflectValue = reflect.ValueOf(&localValue)
			} else {
				reflectValue = reflect.ValueOf(value)
			}

			field.Set(reflectValue)
		} else {
			logger.Errorf("Field %s does not exist in %T", key, dest)
		}
	}
}
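
// Keys in an xPathResult must match field names on the destination struct
// passed to apply above (for example a "Name" key sets a Name field); keys
// with no matching field are logged as errors rather than silently dropped.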

func (r xPathResults) setKey(index int, key string, value string) xPathResults {
	if index >= len(r) {
		r = append(r, make(xPathResult))
	}

	logger.Debugf(`[%d][%s] = %s`, index, key, value)
	r[index][key] = value
	return r
}

func loadURL(url string, c *scraperConfig) (*html.Node, error) {
	client := &http.Client{
		Timeout: scrapeGetTimeout,
	}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}

	userAgent := config.GetScraperUserAgent()
	if userAgent != "" {
		req.Header.Set("User-Agent", userAgent)
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
	if err != nil {
		return nil, err
	}

	ret, err := html.Parse(r)

	if err == nil && c != nil && c.DebugOptions != nil && c.DebugOptions.PrintHTML {
		var b bytes.Buffer
		html.Render(&b, ret)
		logger.Infof("loadURL (%s) response: \n%s", url, b.String())
	}

	return ret, err
}

func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
	scraper := c.scraperConfig.XPathScrapers[c.Scraper]

	if scraper == nil {
		return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
	}

	doc, err := loadURL(url, c.scraperConfig)

	if err != nil {
		return nil, err
	}

	return scraper.scrapePerformer(doc)
}

func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene, error) {
	scraper := c.scraperConfig.XPathScrapers[c.Scraper]

	if scraper == nil {
		return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
	}

	doc, err := loadURL(url, c.scraperConfig)

	if err != nil {
		return nil, err
	}

	return scraper.scrapeScene(doc)
}

func scrapePerformerNamesXPath(c scraperTypeConfig, name string) ([]*models.ScrapedPerformer, error) {
	scraper := c.scraperConfig.XPathScrapers[c.Scraper]

	if scraper == nil {
		return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
	}

	const placeholder = "{}"

	// replace the placeholder string with the URL-escaped name
	escapedName := url.QueryEscape(name)

	u := c.QueryURL
	u = strings.Replace(u, placeholder, escapedName, -1)

	doc, err := loadURL(u, c.scraperConfig)

	if err != nil {
		return nil, err
	}

	return scraper.scrapePerformers(doc)
}

// NodeText returns the text of the given node, rendering comment nodes with
// OutputHTML rather than InnerText.
func NodeText(n *html.Node) string {
	if n != nil && n.Type == html.CommentNode {
		return htmlquery.OutputHTML(n, true)
	}
	return htmlquery.InnerText(n)
}