From dfd55346b225d2d892b7891a6e60397f17fbc51d Mon Sep 17 00:00:00 2001
From: gitgiggety <79809426+gitgiggety@users.noreply.github.com>
Date: Tue, 10 Aug 2021 06:07:01 +0200
Subject: [PATCH] Scrape tag exclusions (#1617)

* Add config option for scraper tag exclusion patterns

Add a config option for excluding tags / tag patterns from the scraper
results.

* Handle tag exclusion patterns during scraping
---
 graphql/documents/data/config.graphql          |  1 +
 graphql/schema/types/config.graphql            |  4 +
 pkg/api/resolver_mutation_configure.go         |  4 +
 pkg/api/resolver_query_configuration.go        |  7 +-
 pkg/manager/config/config.go                   | 10 +++
 pkg/scraper/scrapers.go                        | 77 ++++++++++++++-----
 .../src/components/Changelog/versions/v090.md  |  1 +
 .../Settings/SettingsConfigurationPanel.tsx    | 13 +++-
 .../Settings/SettingsScrapingPanel.tsx         | 22 ++++++
 ui/v2.5/src/locales/en-GB.json                 |  2 +
 10 files changed, 114 insertions(+), 27 deletions(-)

diff --git a/graphql/documents/data/config.graphql b/graphql/documents/data/config.graphql
index 992bafa44..5511dc535 100644
--- a/graphql/documents/data/config.graphql
+++ b/graphql/documents/data/config.graphql
@@ -69,6 +69,7 @@ fragment ConfigScrapingData on ConfigScrapingResult {
   scraperUserAgent
   scraperCertCheck
   scraperCDPPath
+  excludeTagPatterns
 }
 
 fragment ConfigData on ConfigResult {
diff --git a/graphql/schema/types/config.graphql b/graphql/schema/types/config.graphql
index 89143fb7f..cec708d16 100644
--- a/graphql/schema/types/config.graphql
+++ b/graphql/schema/types/config.graphql
@@ -255,6 +255,8 @@ input ConfigScrapingInput {
   scraperCDPPath: String
   """Whether the scraper should check for invalid certificates"""
   scraperCertCheck: Boolean!
+  """Tags blacklist during scraping"""
+  excludeTagPatterns: [String!]
 }
 
 type ConfigScrapingResult {
@@ -264,6 +266,8 @@ type ConfigScrapingResult {
   scraperCDPPath: String
   """Whether the scraper should check for invalid certificates"""
   scraperCertCheck: Boolean!
+  """Tags blacklist during scraping"""
+  excludeTagPatterns: [String!]!
 }
 
 """All configuration settings"""
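The two schema hunks above add `excludeTagPatterns` to both the scraping input and the scraping result, and the data fragment fetches it so the UI sees the saved value. As a rough orientation for the Go changes that follow — this is a hand-written sketch, not the actual gqlgen-generated file, and the existing fields are elided — the regenerated models end up with approximately these fields:

```go
// Sketch of the regenerated gqlgen models (illustrative only). The optional
// [String!] on the input maps to a nil-able slice, which is why the resolver
// below can guard on `input.ExcludeTagPatterns != nil`; the non-null
// [String!]! on the result maps to a plain []string.
package models

type ConfigScrapingInput struct {
	// ... existing scraper fields (user agent, CDP path, cert check) ...
	ExcludeTagPatterns []string `json:"excludeTagPatterns"`
}

type ConfigScrapingResult struct {
	// ... existing scraper fields ...
	ExcludeTagPatterns []string `json:"excludeTagPatterns"`
}
```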
} """All configuration settings""" diff --git a/pkg/api/resolver_mutation_configure.go b/pkg/api/resolver_mutation_configure.go index f489a23a1..a6083022f 100644 --- a/pkg/api/resolver_mutation_configure.go +++ b/pkg/api/resolver_mutation_configure.go @@ -312,6 +312,10 @@ func (r *mutationResolver) ConfigureScraping(ctx context.Context, input models.C refreshScraperCache = true } + if input.ExcludeTagPatterns != nil { + c.Set(config.ScraperExcludeTagPatterns, input.ExcludeTagPatterns) + } + c.Set(config.ScraperCertCheck, input.ScraperCertCheck) if refreshScraperCache { manager.GetInstance().RefreshScraperCache() diff --git a/pkg/api/resolver_query_configuration.go b/pkg/api/resolver_query_configuration.go index 53ab886bb..c8897c882 100644 --- a/pkg/api/resolver_query_configuration.go +++ b/pkg/api/resolver_query_configuration.go @@ -144,8 +144,9 @@ func makeConfigScrapingResult() *models.ConfigScrapingResult { scraperCDPPath := config.GetScraperCDPPath() return &models.ConfigScrapingResult{ - ScraperUserAgent: &scraperUserAgent, - ScraperCertCheck: config.GetScraperCertCheck(), - ScraperCDPPath: &scraperCDPPath, + ScraperUserAgent: &scraperUserAgent, + ScraperCertCheck: config.GetScraperCertCheck(), + ScraperCDPPath: &scraperCDPPath, + ExcludeTagPatterns: config.GetScraperExcludeTagPatterns(), } } diff --git a/pkg/manager/config/config.go b/pkg/manager/config/config.go index 8e1c29209..4a62c7cbc 100644 --- a/pkg/manager/config/config.go +++ b/pkg/manager/config/config.go @@ -95,6 +95,7 @@ const ScrapersPath = "scrapers_path" const ScraperUserAgent = "scraper_user_agent" const ScraperCertCheck = "scraper_cert_check" const ScraperCDPPath = "scraper_cdp_path" +const ScraperExcludeTagPatterns = "scraper_exclude_tag_patterns" // stash-box options const StashBoxes = "stash_boxes" @@ -368,6 +369,15 @@ func (i *Instance) GetScraperCertCheck() bool { return ret } +func (i *Instance) GetScraperExcludeTagPatterns() []string { + var ret []string + if viper.IsSet(ScraperExcludeTagPatterns) { + ret = viper.GetStringSlice(ScraperExcludeTagPatterns) + } + + return ret +} + func (i *Instance) GetStashBoxes() []*models.StashBox { var boxes []*models.StashBox viper.UnmarshalKey(StashBoxes, &boxes) diff --git a/pkg/scraper/scrapers.go b/pkg/scraper/scrapers.go index a6ad82bf9..6c8d6e09d 100644 --- a/pkg/scraper/scrapers.go +++ b/pkg/scraper/scrapers.go @@ -5,10 +5,12 @@ import ( "errors" "os" "path/filepath" + "regexp" "strconv" "strings" "github.com/stashapp/stash/pkg/logger" + stash_config "github.com/stashapp/stash/pkg/manager/config" "github.com/stashapp/stash/pkg/models" "github.com/stashapp/stash/pkg/utils" ) @@ -239,12 +241,11 @@ func (c Cache) postScrapePerformer(ret *models.ScrapedPerformer) error { if err := c.txnManager.WithReadTxn(context.TODO(), func(r models.ReaderRepository) error { tqb := r.Tag() - for _, t := range ret.Tags { - err := MatchScrapedSceneTag(tqb, t) - if err != nil { - return err - } + tags, err := postProcessTags(tqb, ret.Tags) + if err != nil { + return err } + ret.Tags = tags return nil }); err != nil { @@ -263,12 +264,11 @@ func (c Cache) postScrapeScenePerformer(ret *models.ScrapedScenePerformer) error if err := c.txnManager.WithReadTxn(context.TODO(), func(r models.ReaderRepository) error { tqb := r.Tag() - for _, t := range ret.Tags { - err := MatchScrapedSceneTag(tqb, t) - if err != nil { - return err - } + tags, err := postProcessTags(tqb, ret.Tags) + if err != nil { + return err } + ret.Tags = tags return nil }); err != nil { @@ -302,12 +302,11 @@ func (c Cache) 
diff --git a/pkg/scraper/scrapers.go b/pkg/scraper/scrapers.go
index a6ad82bf9..6c8d6e09d 100644
--- a/pkg/scraper/scrapers.go
+++ b/pkg/scraper/scrapers.go
@@ -5,10 +5,12 @@ import (
 	"errors"
 	"os"
 	"path/filepath"
+	"regexp"
 	"strconv"
 	"strings"
 
 	"github.com/stashapp/stash/pkg/logger"
+	stash_config "github.com/stashapp/stash/pkg/manager/config"
 	"github.com/stashapp/stash/pkg/models"
 	"github.com/stashapp/stash/pkg/utils"
 )
@@ -239,12 +241,11 @@ func (c Cache) postScrapePerformer(ret *models.ScrapedPerformer) error {
 	if err := c.txnManager.WithReadTxn(context.TODO(), func(r models.ReaderRepository) error {
 		tqb := r.Tag()
 
-		for _, t := range ret.Tags {
-			err := MatchScrapedSceneTag(tqb, t)
-			if err != nil {
-				return err
-			}
+		tags, err := postProcessTags(tqb, ret.Tags)
+		if err != nil {
+			return err
 		}
+		ret.Tags = tags
 
 		return nil
 	}); err != nil {
@@ -263,12 +264,11 @@ func (c Cache) postScrapeScenePerformer(ret *models.ScrapedScenePerformer) error
 	if err := c.txnManager.WithReadTxn(context.TODO(), func(r models.ReaderRepository) error {
 		tqb := r.Tag()
 
-		for _, t := range ret.Tags {
-			err := MatchScrapedSceneTag(tqb, t)
-			if err != nil {
-				return err
-			}
+		tags, err := postProcessTags(tqb, ret.Tags)
+		if err != nil {
+			return err
 		}
+		ret.Tags = tags
 
 		return nil
 	}); err != nil {
@@ -302,12 +302,11 @@ func (c Cache) postScrapeScene(ret *models.ScrapedScene) error {
 			}
 		}
 
-		for _, t := range ret.Tags {
-			err := MatchScrapedSceneTag(tqb, t)
-			if err != nil {
-				return err
-			}
+		tags, err := postProcessTags(tqb, ret.Tags)
+		if err != nil {
+			return err
 		}
+		ret.Tags = tags
 
 		if ret.Studio != nil {
 			err := MatchScrapedSceneStudio(sqb, ret.Studio)
@@ -342,12 +341,11 @@ func (c Cache) postScrapeGallery(ret *models.ScrapedGallery) error {
 			}
 		}
 
-		for _, t := range ret.Tags {
-			err := MatchScrapedSceneTag(tqb, t)
-			if err != nil {
-				return err
-			}
+		tags, err := postProcessTags(tqb, ret.Tags)
+		if err != nil {
+			return err
 		}
+		ret.Tags = tags
 
 		if ret.Studio != nil {
 			err := MatchScrapedSceneStudio(sqb, ret.Studio)
@@ -509,3 +507,42 @@ func (c Cache) ScrapeMovieURL(url string) (*models.ScrapedMovie, error) {
 
 	return nil, nil
 }
+
+func postProcessTags(tqb models.TagReader, scrapedTags []*models.ScrapedSceneTag) ([]*models.ScrapedSceneTag, error) {
+	var ret []*models.ScrapedSceneTag
+
+	excludePatterns := stash_config.GetInstance().GetScraperExcludeTagPatterns()
+	var excludeRegexps []*regexp.Regexp
+
+	for _, excludePattern := range excludePatterns {
+		reg, err := regexp.Compile(strings.ToLower(excludePattern))
+		if err != nil {
+			logger.Errorf("Invalid tag exclusion pattern: %v", err)
+		} else {
+			excludeRegexps = append(excludeRegexps, reg)
+		}
+	}
+
+	var ignoredTags []string
+ScrapeTag:
+	for _, t := range scrapedTags {
+		for _, reg := range excludeRegexps {
+			if reg.MatchString(strings.ToLower(t.Name)) {
+				ignoredTags = append(ignoredTags, t.Name)
+				continue ScrapeTag
+			}
+		}
+
+		err := MatchScrapedSceneTag(tqb, t)
+		if err != nil {
+			return nil, err
+		}
+		ret = append(ret, t)
+	}
+
+	if len(ignoredTags) > 0 {
+		logger.Infof("Scraping ignored tags: %s", strings.Join(ignoredTags, ", "))
+	}
+
+	return ret, nil
+}
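`postProcessTags` is where the exclusion happens: every configured pattern is compiled as a regular expression after being lower-cased and tested against the lower-cased tag name, so matching is effectively case-insensitive, and because `MatchString` is unanchored a pattern matches anywhere in the name. Excluded tags are dropped before `MatchScrapedSceneTag` runs and are logged once per scrape; invalid patterns are logged and skipped rather than failing the scrape. A small standalone sketch of just that matching rule — the `isExcluded` helper, the patterns, and the tag names are made up for illustration:

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// isExcluded mirrors the matching logic in postProcessTags: each pattern is
// compiled in lower case and matched against the lower-cased tag name.
func isExcluded(tagName string, patterns []string) bool {
	for _, p := range patterns {
		re, err := regexp.Compile(strings.ToLower(p))
		if err != nil {
			continue // invalid patterns are logged and skipped in the real code
		}
		if re.MatchString(strings.ToLower(tagName)) {
			return true
		}
	}
	return false
}

func main() {
	// Hypothetical patterns; the ^...$ anchors keep "4K" from also
	// excluding "4K Remaster".
	patterns := []string{"^4k$", "watermark"}

	for _, tag := range []string{"4K", "4K Remaster", "Watermarked", "Hardcore"} {
		fmt.Printf("%-12s excluded=%v\n", tag, isExcluded(tag, patterns))
	}
}
```

Note the unanchored behaviour: the `watermark` pattern above also excludes `Watermarked`, so exact-name exclusions should be written as `^name$`.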
diff --git a/ui/v2.5/src/components/Changelog/versions/v090.md b/ui/v2.5/src/components/Changelog/versions/v090.md
index 48641fc4b..47ad2e112 100644
--- a/ui/v2.5/src/components/Changelog/versions/v090.md
+++ b/ui/v2.5/src/components/Changelog/versions/v090.md
@@ -1,4 +1,5 @@
 ### ✨ New Features
+* Support excluding tag patterns when scraping. ([#1617](https://github.com/stashapp/stash/pull/1617))
 * Support setting a custom directory for default performer images. ([#1489](https://github.com/stashapp/stash/pull/1489))
 * Added filtering and sorting on scene marker count for tags. ([#1603](https://github.com/stashapp/stash/pull/1603))
 * Support excluding fields and editing tags when saving from scene tagger view. ([#1605](https://github.com/stashapp/stash/pull/1605))
diff --git a/ui/v2.5/src/components/Settings/SettingsConfigurationPanel.tsx b/ui/v2.5/src/components/Settings/SettingsConfigurationPanel.tsx
index bd8f4406f..dcd2d153b 100644
--- a/ui/v2.5/src/components/Settings/SettingsConfigurationPanel.tsx
+++ b/ui/v2.5/src/components/Settings/SettingsConfigurationPanel.tsx
@@ -17,9 +17,10 @@ import StashConfiguration from "./StashConfiguration";
 interface IExclusionPatternsProps {
   excludes: string[];
   setExcludes: (value: string[]) => void;
+  demo: string;
 }
 
-const ExclusionPatterns: React.FC<IExclusionPatternsProps> = (props) => {
+export const ExclusionPatterns: React.FC<IExclusionPatternsProps> = (props) => {
   function excludeRegexChanged(idx: number, value: string) {
     const newExcludes = props.excludes.map((regex, i) => {
       const ret = idx !== i ? regex : value;
@@ -35,8 +36,7 @@ const ExclusionPatterns: React.FC<IExclusionPatternsProps> = (props) => {
   }
 
   function excludeAddRegex() {
-    const demo = "sample\\.mp4$";
-    const newExcludes = props.excludes.concat(demo);
+    const newExcludes = props.excludes.concat(props.demo);
     props.setExcludes(newExcludes);
   }
 
@@ -490,7 +490,11 @@ export const SettingsConfigurationPanel: React.FC = () => {
             id: "config.general.excluded_video_patterns_head",
           })}
-
+
           {intl.formatMessage({
             id: "config.general.excluded_video_patterns_desc",
@@ -514,6 +518,7 @@ export const SettingsConfigurationPanel: React.FC = () => {
           {intl.formatMessage({
diff --git a/ui/v2.5/src/components/Settings/SettingsScrapingPanel.tsx b/ui/v2.5/src/components/Settings/SettingsScrapingPanel.tsx
index 51e81cd7c..3739c5d9b 100644
--- a/ui/v2.5/src/components/Settings/SettingsScrapingPanel.tsx
+++ b/ui/v2.5/src/components/Settings/SettingsScrapingPanel.tsx
@@ -14,6 +14,7 @@ import { useToast } from "src/hooks";
 import { TextUtils } from "src/utils";
 import { CollapseButton, Icon, LoadingIndicator } from "src/components/Shared";
 import { ScrapeType } from "src/core/generated-graphql";
+import { ExclusionPatterns } from "./SettingsConfigurationPanel";
 
 interface IURLList {
   urls: string[];
@@ -96,6 +97,7 @@ export const SettingsScrapingPanel: React.FC = () => {
     undefined
   );
   const [scraperCertCheck, setScraperCertCheck] = useState<boolean>(true);
+  const [excludeTagPatterns, setExcludeTagPatterns] = useState<string[]>([]);
 
   const { data, error } = useConfiguration();
 
@@ -103,6 +105,7 @@ export const SettingsScrapingPanel: React.FC = () => {
     scraperUserAgent,
     scraperCDPPath,
     scraperCertCheck,
+    excludeTagPatterns,
   });
 
   useEffect(() => {
@@ -113,6 +116,7 @@ export const SettingsScrapingPanel: React.FC = () => {
       setScraperUserAgent(conf.scraping.scraperUserAgent ?? undefined);
       setScraperCDPPath(conf.scraping.scraperCDPPath ?? undefined);
       setScraperCertCheck(conf.scraping.scraperCertCheck);
+      setExcludeTagPatterns(conf.scraping.excludeTagPatterns);
     }
   }, [data, error]);
 
@@ -398,6 +402,24 @@ export const SettingsScrapingPanel: React.FC = () => {
+
+
+          {intl.formatMessage({
+            id: "config.scraping.excluded_tag_patterns_head",
+          })}
+
+
+          {intl.formatMessage({
+            id: "config.scraping.excluded_tag_patterns_desc",
+          })}
+
+
       {intl.formatMessage({ id: "config.scraping.scrapers" })}
 
diff --git a/ui/v2.5/src/locales/en-GB.json b/ui/v2.5/src/locales/en-GB.json
index 1b9f6b785..8477bb1aa 100644
--- a/ui/v2.5/src/locales/en-GB.json
+++ b/ui/v2.5/src/locales/en-GB.json
@@ -243,6 +243,8 @@
     "scraping": {
       "entity_metadata": "{entityType} Metadata",
       "entity_scrapers": "{entityType} scrapers",
+      "excluded_tag_patterns_desc": "Regexps of tag names to exclude from scraping results",
+      "excluded_tag_patterns_head": "Excluded Tag Patterns",
       "scrapers": "Scrapers",
      "search_by_name": "Search by name",
       "supported_types": "Supported types",