Configurable scraper user agent string (#409)

* Add debug scrape option.

Co-authored-by: HiddenPants255 <>
Authored by WithoutPants on 2020-03-21 08:55:15 +11:00; committed by GitHub
parent ff495361d9
commit abf2b49803
10 changed files with 122 additions and 11 deletions


@@ -11,6 +11,7 @@ fragment ConfigGeneralData on ConfigGeneralResult {
logLevel
logAccess
excludes
scraperUserAgent
}
fragment ConfigInterfaceData on ConfigInterfaceResult {


@@ -32,6 +32,8 @@ input ConfigGeneralInput {
logAccess: Boolean!
"""Array of file regexp to exclude from Scan"""
excludes: [String!]
"""Scraper user agent string"""
scraperUserAgent: String
}
type ConfigGeneralResult {
@@ -59,6 +61,8 @@ type ConfigGeneralResult {
logAccess: Boolean!
"""Array of file regexp to exclude from Scan"""
excludes: [String!]!
"""Scraper user agent string"""
scraperUserAgent: String
}
input ConfigInterfaceInput {


@@ -76,6 +76,10 @@ func (r *mutationResolver) ConfigureGeneral(ctx context.Context, input models.Co
config.Set(config.Exclude, input.Excludes)
}
if input.ScraperUserAgent != nil {
config.Set(config.ScraperUserAgent, input.ScraperUserAgent)
}
if err := config.Write(); err != nil {
return makeConfigGeneralResult(), err
}
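
The resolver only writes the key when the input field is non-nil, so a mutation that omits scraperUserAgent leaves the stored value untouched. A minimal standalone sketch of that pattern, assuming viper-backed storage as in the config package below (not the project's exact code):

package main

import (
    "fmt"

    "github.com/spf13/viper"
)

// configureGeneral mirrors the nil-check pattern: only non-nil optional
// inputs overwrite the stored setting.
func configureGeneral(scraperUserAgent *string) {
    if scraperUserAgent != nil {
        viper.Set("scraper_user_agent", *scraperUserAgent)
    }
}

func main() {
    ua := "Mozilla/5.0 (compatible; stash)"
    configureGeneral(&ua)
    configureGeneral(nil) // omitted field: previous value is kept
    fmt.Println(viper.GetString("scraper_user_agent"))
}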


@@ -33,6 +33,8 @@ func makeConfigGeneralResult() *models.ConfigGeneralResult {
maxTranscodeSize := config.GetMaxTranscodeSize()
maxStreamingTranscodeSize := config.GetMaxStreamingTranscodeSize()
scraperUserAgent := config.GetScraperUserAgent()
return &models.ConfigGeneralResult{
Stashes: config.GetStashPaths(),
DatabasePath: config.GetDatabasePath(),
@@ -46,6 +48,7 @@ func makeConfigGeneralResult() *models.ConfigGeneralResult {
LogLevel: config.GetLogLevel(),
LogAccess: config.GetLogAccess(),
Excludes: config.GetExcludes(),
ScraperUserAgent: &scraperUserAgent,
}
}
@@ -59,7 +62,6 @@ func makeConfigInterfaceResult() *models.ConfigInterfaceResult {
cssEnabled := config.GetCSSEnabled()
language := config.GetLanguage()
return &models.ConfigInterfaceResult{
SoundOnPreview: &soundOnPreview,
WallShowTitle: &wallShowTitle,
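
Both result builders take the address of a local (&scraperUserAgent, &soundOnPreview) because nullable GraphQL fields map to pointer fields in the generated Go models. A simplified sketch with a hypothetical stand-in struct:

package main

import "fmt"

// ConfigGeneralResult stands in for the generated model: a nullable
// GraphQL String becomes a *string field in Go.
type ConfigGeneralResult struct {
    ScraperUserAgent *string
}

func main() {
    scraperUserAgent := "" // GetScraperUserAgent returns "" when unset
    r := ConfigGeneralResult{ScraperUserAgent: &scraperUserAgent}
    fmt.Println(r.ScraperUserAgent != nil, *r.ScraperUserAgent) // true ""
}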


@@ -22,7 +22,6 @@ const Password = "password"
const Database = "database"
const ScrapersPath = "scrapers_path"
const Exclude = "exclude"
const MaxTranscodeSize = "max_transcode_size"
@@ -32,6 +31,10 @@ const Host = "host"
const Port = "port"
const ExternalHost = "external_host"
// scraping options
const ScrapersPath = "scrapers_path"
const ScraperUserAgent = "scraper_user_agent"
// i18n
const Language = "language"
@@ -115,6 +118,10 @@ func GetScrapersPath() string {
return viper.GetString(ScrapersPath)
}
func GetScraperUserAgent() string {
return viper.GetString(ScraperUserAgent)
}
func GetHost() string {
return viper.GetString(Host)
}
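
GetScraperUserAgent sets no explicit default, which works because viper.GetString returns the zero value "" for unset keys; callers can therefore treat the empty string as "no custom User-Agent configured". A standalone sketch of that behaviour:

package main

import (
    "fmt"

    "github.com/spf13/viper"
)

const ScraperUserAgent = "scraper_user_agent"

func main() {
    fmt.Printf("unset: %q\n", viper.GetString(ScraperUserAgent)) // ""
    viper.Set(ScraperUserAgent, "Mozilla/5.0 (compatible; stash)")
    fmt.Printf("set:   %q\n", viper.GetString(ScraperUserAgent))
}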


@@ -139,6 +139,10 @@ func (c *scrapeSceneByURLConfig) resolveFn() {
}
}
type scraperDebugOptions struct {
PrintHTML bool `yaml:"printHTML"`
}
type scraperConfig struct {
ID string
Name string `yaml:"name"`
@@ -148,8 +152,9 @@ type scraperConfig struct {
SceneByFragment *sceneByFragmentConfig `yaml:"sceneByFragment"`
SceneByURL []*scrapeSceneByURLConfig `yaml:"sceneByURL"`
StashServer *stashServer `yaml:"stashServer"`
XPathScrapers xpathScrapers `yaml:"xPathScrapers"`
DebugOptions *scraperDebugOptions `yaml:"debug"`
StashServer *stashServer `yaml:"stashServer"`
XPathScrapers xpathScrapers `yaml:"xPathScrapers"`
}
func loadScraperFromYAML(path string) (*scraperConfig, error) {
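
With the new DebugOptions field, a scraper definition can opt into HTML dumping via a debug block. A hedged sketch of the mapping, assuming a gopkg.in/yaml.v2 decoder and an illustrative (not real) scraper document:

package main

import (
    "fmt"

    "gopkg.in/yaml.v2"
)

type scraperDebugOptions struct {
    PrintHTML bool `yaml:"printHTML"`
}

type scraperConfig struct {
    Name         string               `yaml:"name"`
    DebugOptions *scraperDebugOptions `yaml:"debug"`
}

// doc is a made-up example of the debug block in a scraper YAML.
const doc = `
name: Example Scraper
debug:
  printHTML: true
`

func main() {
    var c scraperConfig
    if err := yaml.Unmarshal([]byte(doc), &c); err != nil {
        panic(err)
    }
    fmt.Println(c.Name, c.DebugOptions.PrintHTML) // Example Scraper true
}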


@@ -6,6 +6,7 @@ import (
"strings"
"time"
"github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models"
"github.com/stashapp/stash/pkg/utils"
)
@@ -52,8 +53,18 @@ func getImage(url string) (*string, error) {
Timeout: imageGetTimeout,
}
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, err
}
userAgent := config.GetScraperUserAgent()
if userAgent != "" {
req.Header.Set("User-Agent", userAgent)
}
// assume is a URL for now
resp, err := client.Get(url)
resp, err := client.Do(req)
if err != nil {
return nil, err
}
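
The switch from client.Get(url) to client.Do(req) is what makes the header configurable: Get offers no way to attach headers, while a hand-built request does. A self-contained sketch against a local echo server (the User-Agent value here is illustrative only):

package main

import (
    "fmt"
    "io"
    "net/http"
    "net/http/httptest"
)

func main() {
    // test server that echoes back the User-Agent it received
    srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        fmt.Fprint(w, r.UserAgent())
    }))
    defer srv.Close()

    req, err := http.NewRequest("GET", srv.URL, nil)
    if err != nil {
        panic(err)
    }
    req.Header.Set("User-Agent", "stash-scraper/1.0")

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    body, _ := io.ReadAll(resp.Body)
    fmt.Printf("server saw User-Agent: %s\n", body) // stash-scraper/1.0
}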


@@ -1,7 +1,9 @@
package scraper
import (
"bytes"
"errors"
"net/http"
"net/url"
"reflect"
"regexp"
@@ -10,11 +12,17 @@ import (
"github.com/antchfx/htmlquery"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models"
)
// Timeout for the scrape http request. Includes transfer time. May want to make this
// configurable at some point.
const scrapeGetTimeout = time.Second * 30
type commonXPathConfig map[string]string
func (c commonXPathConfig) applyCommon(src string) string {
@@ -197,7 +205,7 @@ func (c xpathScraperAttrConfig) applySubScraper(value string) string {
return value
}
doc, err := htmlquery.LoadURL(value)
doc, err := loadURL(value, nil)
if err != nil {
logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
@@ -504,6 +512,42 @@ func (r xPathResults) setKey(index int, key string, value string) xPathResults {
return r
}
func loadURL(url string, c *scraperConfig) (*html.Node, error) {
client := &http.Client{
Timeout: scrapeGetTimeout,
}
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, err
}
userAgent := config.GetScraperUserAgent()
if userAgent != "" {
req.Header.Set("User-Agent", userAgent)
}
resp, err := client.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
if err != nil {
return nil, err
}
ret, err := html.Parse(r)
if err == nil && c != nil && c.DebugOptions != nil && c.DebugOptions.PrintHTML {
var b bytes.Buffer
html.Render(&b, ret)
logger.Infof("loadURL (%s) response: \n%s", url, b.String())
}
return ret, err
}
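
When printHTML is enabled, loadURL re-renders the parsed tree rather than logging the raw body, so the log shows exactly what the XPath queries will run against (after charset normalisation and HTML repair). The parse/render round-trip in isolation, using the same golang.org/x/net/html calls:

package main

import (
    "bytes"
    "fmt"
    "strings"

    "golang.org/x/net/html"
)

func main() {
    // html.Parse repairs the fragment into a full document tree...
    doc, err := html.Parse(strings.NewReader("<h1>hi</h1>"))
    if err != nil {
        panic(err)
    }
    // ...and html.Render serialises that tree back to markup for logging.
    var b bytes.Buffer
    if err := html.Render(&b, doc); err != nil {
        panic(err)
    }
    fmt.Println(b.String()) // <html><head></head><body><h1>hi</h1></body></html>
}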
func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
scraper := c.scraperConfig.XPathScrapers[c.Scraper]
@@ -511,7 +555,7 @@ func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPe
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
}
doc, err := htmlquery.LoadURL(url)
doc, err := loadURL(url, c.scraperConfig)
if err != nil {
return nil, err
@@ -527,7 +571,7 @@ func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene,
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
}
doc, err := htmlquery.LoadURL(url)
doc, err := loadURL(url, c.scraperConfig)
if err != nil {
return nil, err
@@ -551,7 +595,7 @@ func scrapePerformerNamesXPath(c scraperTypeConfig, name string) ([]*models.Scra
u := c.QueryURL
u = strings.Replace(u, placeholder, escapedName, -1)
doc, err := htmlquery.LoadURL(u)
doc, err := loadURL(u, c.scraperConfig)
if err != nil {
return nil, err
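
Each call site above only swaps its document source from htmlquery.LoadURL to the new loadURL; the querying itself is untouched, since htmlquery operates on any *html.Node. A standalone illustration (the markup and XPath expression are made up for the example):

package main

import (
    "fmt"
    "strings"

    "github.com/antchfx/htmlquery"
)

func main() {
    doc, err := htmlquery.Parse(strings.NewReader(
        `<html><body><span class="name">Jane Doe</span></body></html>`))
    if err != nil {
        panic(err)
    }
    node := htmlquery.FindOne(doc, `//span[@class="name"]`)
    fmt.Println(htmlquery.InnerText(node)) // Jane Doe
}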


@@ -29,6 +29,7 @@ export const SettingsConfigurationPanel: React.FC = () => {
const [logLevel, setLogLevel] = useState<string>("Info");
const [logAccess, setLogAccess] = useState<boolean>(true);
const [excludes, setExcludes] = useState<string[]>([]);
const [scraperUserAgent, setScraperUserAgent] = useState<string | undefined>(undefined);
const { data, error, loading } = StashService.useConfiguration();
@@ -44,7 +45,8 @@ export const SettingsConfigurationPanel: React.FC = () => {
logOut,
logLevel,
logAccess,
excludes
excludes,
scraperUserAgent
});
useEffect(() => {
@@ -66,6 +68,7 @@ export const SettingsConfigurationPanel: React.FC = () => {
setLogLevel(conf.general.logLevel);
setLogAccess(conf.general.logAccess);
setExcludes(conf.general.excludes);
setScraperUserAgent(conf.general.scraperUserAgent ?? undefined);
}
}, [data, error]);
@@ -289,6 +292,22 @@ export const SettingsConfigurationPanel: React.FC = () => {
<hr />
<Form.Group id="generated-path">
<h6>Scraping</h6>
<Form.Control
className="col col-sm-6 text-input"
defaultValue={scraperUserAgent}
onChange={(e: React.FormEvent<HTMLInputElement>) =>
setScraperUserAgent(e.currentTarget.value)
}
/>
<Form.Text className="text-muted">
User-Agent string used during scrape http requests
</Form.Text>
</Form.Group>
<hr />
<Form.Group>
<h4>Authentication</h4>
<Form.Group id="username">


@@ -32,6 +32,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
const [logLevel, setLogLevel] = useState<string>("Info");
const [logAccess, setLogAccess] = useState<boolean>(true);
const [excludes, setExcludes] = useState<(string)[]>([]);
const [scraperUserAgent, setScraperUserAgent] = useState<string | undefined>(undefined);
const { data, error, loading } = StashService.useConfiguration();
@@ -48,7 +49,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
logLevel,
logAccess,
excludes,
scraperUserAgent,
});
useEffect(() => {
@@ -67,6 +68,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
setLogLevel(conf.general.logLevel);
setLogAccess(conf.general.logAccess);
setExcludes(conf.general.excludes);
setScraperUserAgent(conf.general.scraperUserAgent);
}
}, [data, error]);
@@ -229,6 +231,18 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
</FormGroup>
<Divider />
<FormGroup>
<H4>Scraping</H4>
<FormGroup
label="Scraper User-Agent string"
helperText="User-Agent string used during scrape http requests"
>
<InputGroup value={scraperUserAgent} onChange={(e: any) => setScraperUserAgent(e.target.value)} />
</FormGroup>
</FormGroup>
<Divider />
<FormGroup>
<H4>Authentication</H4>
<FormGroup