From abf2b49803a5c315ae689dd70e217fed70e015c9 Mon Sep 17 00:00:00 2001
From: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
Date: Sat, 21 Mar 2020 08:55:15 +1100
Subject: [PATCH] Configurable scraper user agent string (#409)

* Add debug scrape option.

Co-authored-by: HiddenPants255 <>
---
 graphql/documents/data/config.graphql        |  1 +
 graphql/schema/types/config.graphql          |  4 ++
 pkg/api/resolver_mutation_configure.go       |  4 ++
 pkg/api/resolver_query_configuration.go      |  4 +-
 pkg/manager/config/config.go                 |  9 +++-
 pkg/scraper/config.go                        |  9 +++-
 pkg/scraper/image.go                         | 13 ++++-
 pkg/scraper/xpath.go                         | 52 +++++++++++++++++--
 .../Settings/SettingsConfigurationPanel.tsx  | 21 +++++++-
 .../Settings/SettingsConfigurationPanel.tsx  | 16 +++++-
 10 files changed, 122 insertions(+), 11 deletions(-)

diff --git a/graphql/documents/data/config.graphql b/graphql/documents/data/config.graphql
index e36594c50..13a15e48b 100644
--- a/graphql/documents/data/config.graphql
+++ b/graphql/documents/data/config.graphql
@@ -11,6 +11,7 @@ fragment ConfigGeneralData on ConfigGeneralResult {
   logLevel
   logAccess
   excludes
+  scraperUserAgent
 }
 
 fragment ConfigInterfaceData on ConfigInterfaceResult {
diff --git a/graphql/schema/types/config.graphql b/graphql/schema/types/config.graphql
index 8054798ea..24a4334dd 100644
--- a/graphql/schema/types/config.graphql
+++ b/graphql/schema/types/config.graphql
@@ -32,6 +32,8 @@ input ConfigGeneralInput {
   logAccess: Boolean!
   """Array of file regexp to exclude from Scan"""
   excludes: [String!]
+  """Scraper user agent string"""
+  scraperUserAgent: String
 }
 
 type ConfigGeneralResult {
@@ -59,6 +61,8 @@ type ConfigGeneralResult {
   logAccess: Boolean!
   """Array of file regexp to exclude from Scan"""
   excludes: [String!]!
+  """Scraper user agent string"""
+  scraperUserAgent: String
 }
 
 input ConfigInterfaceInput {
diff --git a/pkg/api/resolver_mutation_configure.go b/pkg/api/resolver_mutation_configure.go
index 5cec957b1..f3020fdc8 100644
--- a/pkg/api/resolver_mutation_configure.go
+++ b/pkg/api/resolver_mutation_configure.go
@@ -76,6 +76,10 @@ func (r *mutationResolver) ConfigureGeneral(ctx context.Context, input models.Co
         config.Set(config.Exclude, input.Excludes)
     }
 
+    if input.ScraperUserAgent != nil {
+        config.Set(config.ScraperUserAgent, input.ScraperUserAgent)
+    }
+
     if err := config.Write(); err != nil {
         return makeConfigGeneralResult(), err
     }
diff --git a/pkg/api/resolver_query_configuration.go b/pkg/api/resolver_query_configuration.go
index d9865f1c7..abee07ecb 100644
--- a/pkg/api/resolver_query_configuration.go
+++ b/pkg/api/resolver_query_configuration.go
@@ -33,6 +33,8 @@ func makeConfigGeneralResult() *models.ConfigGeneralResult {
     maxTranscodeSize := config.GetMaxTranscodeSize()
     maxStreamingTranscodeSize := config.GetMaxStreamingTranscodeSize()
 
+    scraperUserAgent := config.GetScraperUserAgent()
+
     return &models.ConfigGeneralResult{
         Stashes:      config.GetStashPaths(),
         DatabasePath: config.GetDatabasePath(),
@@ -46,6 +48,7 @@ func makeConfigGeneralResult() *models.ConfigGeneralResult {
         LogLevel:  config.GetLogLevel(),
         LogAccess: config.GetLogAccess(),
         Excludes:  config.GetExcludes(),
+        ScraperUserAgent: &scraperUserAgent,
     }
 }
 
@@ -59,7 +62,6 @@ func makeConfigInterfaceResult() *models.ConfigInterfaceResult {
     css := config.GetCSS()
     cssEnabled := config.GetCSSEnabled()
     language := config.GetLanguage()
-
     return &models.ConfigInterfaceResult{
         SoundOnPreview: &soundOnPreview,
         WallShowTitle:  &wallShowTitle,
diff --git a/pkg/manager/config/config.go b/pkg/manager/config/config.go
index e5fa50f51..fa0a70b03 100644
--- a/pkg/manager/config/config.go
+++ b/pkg/manager/config/config.go
@@ -22,7 +22,6 @@ const Password = "password"
 
 const Database = "database"
 
-const ScrapersPath = "scrapers_path"
 const Exclude = "exclude"
 
 const MaxTranscodeSize = "max_transcode_size"
@@ -32,6 +31,10 @@ const Host = "host"
 const Port = "port"
 const ExternalHost = "external_host"
 
+// scraping options
+const ScrapersPath = "scrapers_path"
+const ScraperUserAgent = "scraper_user_agent"
+
 // i18n
 const Language = "language"
 
@@ -115,6 +118,10 @@ func GetScrapersPath() string {
     return viper.GetString(ScrapersPath)
 }
 
+func GetScraperUserAgent() string {
+    return viper.GetString(ScraperUserAgent)
+}
+
 func GetHost() string {
     return viper.GetString(Host)
 }
diff --git a/pkg/scraper/config.go b/pkg/scraper/config.go
index bfeb1e1bc..4e1a84557 100644
--- a/pkg/scraper/config.go
+++ b/pkg/scraper/config.go
@@ -139,6 +139,10 @@ func (c *scrapeSceneByURLConfig) resolveFn() {
     }
 }
 
+type scraperDebugOptions struct {
+    PrintHTML bool `yaml:"printHTML"`
+}
+
 type scraperConfig struct {
     ID   string
     Name string `yaml:"name"`
@@ -148,8 +152,9 @@ type scraperConfig struct {
     SceneByFragment *sceneByFragmentConfig    `yaml:"sceneByFragment"`
     SceneByURL      []*scrapeSceneByURLConfig `yaml:"sceneByURL"`
 
-    StashServer   *stashServer  `yaml:"stashServer"`
-    XPathScrapers xpathScrapers `yaml:"xPathScrapers"`
+    DebugOptions  *scraperDebugOptions `yaml:"debug"`
+    StashServer   *stashServer         `yaml:"stashServer"`
+    XPathScrapers xpathScrapers        `yaml:"xPathScrapers"`
 }
 
 func loadScraperFromYAML(path string) (*scraperConfig, error) {
diff --git a/pkg/scraper/image.go b/pkg/scraper/image.go
index c44295100..4cdd691c1 100644
--- a/pkg/scraper/image.go
+++ b/pkg/scraper/image.go
@@ -6,6 +6,7 @@ import (
     "strings"
     "time"
 
+    "github.com/stashapp/stash/pkg/manager/config"
     "github.com/stashapp/stash/pkg/models"
     "github.com/stashapp/stash/pkg/utils"
 )
@@ -52,8 +53,18 @@ func getImage(url string) (*string, error) {
         Timeout: imageGetTimeout,
     }
 
+    req, err := http.NewRequest("GET", url, nil)
+    if err != nil {
+        return nil, err
+    }
+
+    userAgent := config.GetScraperUserAgent()
+    if userAgent != "" {
+        req.Header.Set("User-Agent", userAgent)
+    }
+
     // assume is a URL for now
-    resp, err := client.Get(url)
+    resp, err := client.Do(req)
     if err != nil {
         return nil, err
     }
diff --git a/pkg/scraper/xpath.go b/pkg/scraper/xpath.go
index 9f063454c..b3eb25777 100644
--- a/pkg/scraper/xpath.go
+++ b/pkg/scraper/xpath.go
@@ -1,7 +1,9 @@
 package scraper
 
 import (
+    "bytes"
     "errors"
+    "net/http"
     "net/url"
     "reflect"
     "regexp"
@@ -10,11 +12,17 @@ import (
 
     "github.com/antchfx/htmlquery"
     "golang.org/x/net/html"
+    "golang.org/x/net/html/charset"
 
     "github.com/stashapp/stash/pkg/logger"
+    "github.com/stashapp/stash/pkg/manager/config"
     "github.com/stashapp/stash/pkg/models"
 )
 
+// Timeout for the scrape http request. Includes transfer time. May want to make this
+// configurable at some point.
+const scrapeGetTimeout = time.Second * 30
+
 type commonXPathConfig map[string]string
 
 func (c commonXPathConfig) applyCommon(src string) string {
@@ -197,7 +205,7 @@ func (c xpathScraperAttrConfig) applySubScraper(value string) string {
         return value
     }
 
-    doc, err := htmlquery.LoadURL(value)
+    doc, err := loadURL(value, nil)
 
     if err != nil {
         logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
@@ -504,6 +512,42 @@ func (r xPathResults) setKey(index int, key string, value string) xPathResults {
     return r
 }
 
+func loadURL(url string, c *scraperConfig) (*html.Node, error) {
+    client := &http.Client{
+        Timeout: scrapeGetTimeout,
+    }
+    req, err := http.NewRequest("GET", url, nil)
+    if err != nil {
+        return nil, err
+    }
+
+    userAgent := config.GetScraperUserAgent()
+    if userAgent != "" {
+        req.Header.Set("User-Agent", userAgent)
+    }
+
+    resp, err := client.Do(req)
+    if err != nil {
+        return nil, err
+    }
+    defer resp.Body.Close()
+
+    r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
+    if err != nil {
+        return nil, err
+    }
+
+    ret, err := html.Parse(r)
+
+    if err == nil && c != nil && c.DebugOptions != nil && c.DebugOptions.PrintHTML {
+        var b bytes.Buffer
+        html.Render(&b, ret)
+        logger.Infof("loadURL (%s) response: \n%s", url, b.String())
+    }
+
+    return ret, err
+}
+
 func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
     scraper := c.scraperConfig.XPathScrapers[c.Scraper]
 
@@ -511,7 +555,7 @@ func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPe
         return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
     }
 
-    doc, err := htmlquery.LoadURL(url)
+    doc, err := loadURL(url, c.scraperConfig)
 
     if err != nil {
         return nil, err
@@ -527,7 +571,7 @@ func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene,
         return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
     }
 
-    doc, err := htmlquery.LoadURL(url)
+    doc, err := loadURL(url, c.scraperConfig)
 
     if err != nil {
         return nil, err
@@ -551,7 +595,7 @@ func scrapePerformerNamesXPath(c scraperTypeConfig, name string) ([]*models.Scra
     u := c.QueryURL
     u = strings.Replace(u, placeholder, escapedName, -1)
 
-    doc, err := htmlquery.LoadURL(u)
+    doc, err := loadURL(u, c.scraperConfig)
 
     if err != nil {
         return nil, err
diff --git a/ui/v2.5/src/components/Settings/SettingsConfigurationPanel.tsx b/ui/v2.5/src/components/Settings/SettingsConfigurationPanel.tsx
index 22b0942d8..8e46cacfb 100644
--- a/ui/v2.5/src/components/Settings/SettingsConfigurationPanel.tsx
+++ b/ui/v2.5/src/components/Settings/SettingsConfigurationPanel.tsx
@@ -29,6 +29,7 @@ export const SettingsConfigurationPanel: React.FC = () => {
   const [logLevel, setLogLevel] = useState<string>("Info");
   const [logAccess, setLogAccess] = useState<boolean>(true);
   const [excludes, setExcludes] = useState<string[]>([]);
+  const [scraperUserAgent, setScraperUserAgent] = useState<string | undefined>(undefined);
 
   const { data, error, loading } = StashService.useConfiguration();
 
@@ -44,7 +45,8 @@ export const SettingsConfigurationPanel: React.FC = () => {
     logOut,
     logLevel,
     logAccess,
-    excludes
+    excludes,
+    scraperUserAgent
   });
 
   useEffect(() => {
@@ -66,6 +68,7 @@ export const SettingsConfigurationPanel: React.FC = () => {
       setLogLevel(conf.general.logLevel);
       setLogAccess(conf.general.logAccess);
       setExcludes(conf.general.excludes);
+      setScraperUserAgent(conf.general.scraperUserAgent ?? undefined);
     }
   }, [data, error]);
 
@@ -289,6 +292,22 @@ export const SettingsConfigurationPanel: React.FC = () => {
         </Form.Group>
       </Form.Group>
 
+      <hr />
+      <h4>Scraping</h4>
+      <Form.Group id="scraperUserAgent">
+        <h6>Scraper User Agent</h6>
+        <Form.Control
+          className="col col-sm-6 text-input"
+          defaultValue={scraperUserAgent}
+          onChange={(e: React.FormEvent<HTMLInputElement>) =>
+            setScraperUserAgent(e.currentTarget.value)
+          }
+        />
+        <Form.Text className="text-muted">
+          User-Agent string used during scrape http requests
+        </Form.Text>
+      </Form.Group>
+
       <hr />
 
       <h4>Authentication</h4>
diff --git a/ui/v2/src/components/Settings/SettingsConfigurationPanel.tsx b/ui/v2/src/components/Settings/SettingsConfigurationPanel.tsx
index ce3de1c17..30638fc88 100644
--- a/ui/v2/src/components/Settings/SettingsConfigurationPanel.tsx
+++ b/ui/v2/src/components/Settings/SettingsConfigurationPanel.tsx
@@ -32,6 +32,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
   const [logLevel, setLogLevel] = useState<string>("Info");
   const [logAccess, setLogAccess] = useState<boolean>(true);
   const [excludes, setExcludes] = useState<(string)[]>([]);
+  const [scraperUserAgent, setScraperUserAgent] = useState<string | undefined>(undefined);
 
   const { data, error, loading } = StashService.useConfiguration();
 
@@ -48,7 +49,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
     logLevel,
     logAccess,
     excludes,
-
+    scraperUserAgent,
   });
 
   useEffect(() => {
@@ -67,6 +68,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
       setLogLevel(conf.general.logLevel);
      setLogAccess(conf.general.logAccess);
       setExcludes(conf.general.excludes);
+      setScraperUserAgent(conf.general.scraperUserAgent);
     }
   }, [data, error]);
 
@@ -229,6 +231,18 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
         </FormGroup>
       </FormGroup>
 
+      <hr/>
+      <h4>Scraping</h4>
+      <FormGroup
+        label="Scraper User Agent"
+        helperText="User-Agent string used during scrape http requests"
+      >
+        <InputGroup
+          value={scraperUserAgent}
+          onChange={(e: React.ChangeEvent<HTMLInputElement>) => setScraperUserAgent(e.target.value)}
+        />
+      </FormGroup>
+
      <hr/>

      <h4>Authentication</h4>
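
Notes:

The core of the change is the new loadURL helper in pkg/scraper/xpath.go,
which every xpath scrape path now goes through. For reference, here is a
standalone sketch of the same request flow (optional User-Agent override,
charset-aware decoding, HTML parse); the URL and UA string in main are
placeholders, and the fixed 30-second timeout mirrors scrapeGetTimeout:

    package main

    import (
        "fmt"
        "net/http"
        "os"
        "time"

        "golang.org/x/net/html"
        "golang.org/x/net/html/charset"
    )

    // fetchDocument mirrors the patch's loadURL: set the configured
    // User-Agent when present, transcode the body based on Content-Type,
    // and parse the result into an HTML node tree.
    func fetchDocument(url, userAgent string) (*html.Node, error) {
        client := &http.Client{Timeout: 30 * time.Second}

        req, err := http.NewRequest("GET", url, nil)
        if err != nil {
            return nil, err
        }
        // Only override Go's default user agent when one is configured.
        if userAgent != "" {
            req.Header.Set("User-Agent", userAgent)
        }

        resp, err := client.Do(req)
        if err != nil {
            return nil, err
        }
        defer resp.Body.Close()

        // charset.NewReader picks a decoder from the Content-Type header
        // (or the document itself), so non-UTF-8 pages parse correctly.
        r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
        if err != nil {
            return nil, err
        }
        return html.Parse(r)
    }

    func main() {
        doc, err := fetchDocument("https://example.com", "Mozilla/5.0 (compatible; stash)")
        if err != nil {
            fmt.Fprintln(os.Stderr, err)
            os.Exit(1)
        }
        fmt.Println("parsed, first child:", doc.FirstChild.Data)
    }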
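The new debug block in a scraper's YAML maps onto scraperDebugOptions via
the yaml tags above; printHTML is the only flag this patch defines. A
minimal sketch of that mapping, using local mirror structs and a made-up
scraper name:

    package main

    import (
        "fmt"

        "gopkg.in/yaml.v2"
    )

    // Local mirrors of scraperDebugOptions / scraperConfig from
    // pkg/scraper/config.go, reduced to the fields relevant here.
    type debugOptions struct {
        PrintHTML bool `yaml:"printHTML"`
    }

    type scraperConfig struct {
        Name  string        `yaml:"name"`
        Debug *debugOptions `yaml:"debug"`
    }

    // Made-up scraper document demonstrating the new debug block.
    const doc = "name: ExampleScraper\ndebug:\n  printHTML: true\n"

    func main() {
        var cfg scraperConfig
        if err := yaml.Unmarshal([]byte(doc), &cfg); err != nil {
            panic(err)
        }
        // With printHTML set, loadURL logs the fetched page at info level.
        fmt.Printf("%s: printHTML=%v\n", cfg.Name, cfg.Debug.PrintHTML)
    }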
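Besides the settings UI, clients can set the new option through the
configureGeneral mutation, per the ConfigGeneralInput change above. A
sketch, assuming a local stash instance on the default port and its
standard /graphql endpoint (adjust both to your setup; the UA value is an
example):

    package main

    import (
        "bytes"
        "encoding/json"
        "fmt"
        "io/ioutil"
        "net/http"
    )

    // The input field matches ConfigGeneralInput from the schema change.
    const mutation = `mutation {
      configureGeneral(input: { scraperUserAgent: "Mozilla/5.0 (compatible; stash)" }) {
        scraperUserAgent
      }
    }`

    func main() {
        body, err := json.Marshal(map[string]string{"query": mutation})
        if err != nil {
            panic(err)
        }
        // Assumed local instance and endpoint.
        resp, err := http.Post("http://localhost:9999/graphql", "application/json", bytes.NewReader(body))
        if err != nil {
            panic(err)
        }
        defer resp.Body.Close()

        out, _ := ioutil.ReadAll(resp.Body)
        fmt.Println(resp.Status, string(out))
    }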