Scrape tag exclusions (#1617)

* Add config option for scraper tag exclusion patterns

Add a config option for excluding tags / tag patterns from the scraper
results.

* Handle tag exclusion patterns during scraping
This commit is contained in:
gitgiggety 2021-08-10 06:07:01 +02:00 committed by GitHub
parent 404eaa32d2
commit dfd55346b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 114 additions and 27 deletions

View File

@ -69,6 +69,7 @@ fragment ConfigScrapingData on ConfigScrapingResult {
scraperUserAgent
scraperCertCheck
scraperCDPPath
excludeTagPatterns
}
fragment ConfigData on ConfigResult {

View File

@ -255,6 +255,8 @@ input ConfigScrapingInput {
scraperCDPPath: String
"""Whether the scraper should check for invalid certificates"""
scraperCertCheck: Boolean!
"""Tags blacklist during scraping"""
excludeTagPatterns: [String!]
}
type ConfigScrapingResult {
@ -264,6 +266,8 @@ type ConfigScrapingResult {
scraperCDPPath: String
"""Whether the scraper should check for invalid certificates"""
scraperCertCheck: Boolean!
"""Tags blacklist during scraping"""
excludeTagPatterns: [String!]!
}
"""All configuration settings"""

View File

@ -312,6 +312,10 @@ func (r *mutationResolver) ConfigureScraping(ctx context.Context, input models.C
refreshScraperCache = true
}
if input.ExcludeTagPatterns != nil {
c.Set(config.ScraperExcludeTagPatterns, input.ExcludeTagPatterns)
}
c.Set(config.ScraperCertCheck, input.ScraperCertCheck)
if refreshScraperCache {
manager.GetInstance().RefreshScraperCache()

View File

@ -144,8 +144,9 @@ func makeConfigScrapingResult() *models.ConfigScrapingResult {
scraperCDPPath := config.GetScraperCDPPath()
return &models.ConfigScrapingResult{
ScraperUserAgent: &scraperUserAgent,
ScraperCertCheck: config.GetScraperCertCheck(),
ScraperCDPPath: &scraperCDPPath,
ScraperUserAgent: &scraperUserAgent,
ScraperCertCheck: config.GetScraperCertCheck(),
ScraperCDPPath: &scraperCDPPath,
ExcludeTagPatterns: config.GetScraperExcludeTagPatterns(),
}
}

View File

@ -95,6 +95,7 @@ const ScrapersPath = "scrapers_path"
const ScraperUserAgent = "scraper_user_agent"
const ScraperCertCheck = "scraper_cert_check"
const ScraperCDPPath = "scraper_cdp_path"
const ScraperExcludeTagPatterns = "scraper_exclude_tag_patterns"
// stash-box options
const StashBoxes = "stash_boxes"
@ -368,6 +369,15 @@ func (i *Instance) GetScraperCertCheck() bool {
return ret
}
// GetScraperExcludeTagPatterns returns the string slice stored under the
// ScraperExcludeTagPatterns configuration key, or nil when that key is not set.
func (i *Instance) GetScraperExcludeTagPatterns() []string {
	if !viper.IsSet(ScraperExcludeTagPatterns) {
		return nil
	}

	return viper.GetStringSlice(ScraperExcludeTagPatterns)
}
func (i *Instance) GetStashBoxes() []*models.StashBox {
var boxes []*models.StashBox
viper.UnmarshalKey(StashBoxes, &boxes)

View File

@ -5,10 +5,12 @@ import (
"errors"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
"github.com/stashapp/stash/pkg/logger"
stash_config "github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models"
"github.com/stashapp/stash/pkg/utils"
)
@ -239,12 +241,11 @@ func (c Cache) postScrapePerformer(ret *models.ScrapedPerformer) error {
if err := c.txnManager.WithReadTxn(context.TODO(), func(r models.ReaderRepository) error {
tqb := r.Tag()
for _, t := range ret.Tags {
err := MatchScrapedSceneTag(tqb, t)
if err != nil {
return err
}
tags, err := postProcessTags(tqb, ret.Tags)
if err != nil {
return err
}
ret.Tags = tags
return nil
}); err != nil {
@ -263,12 +264,11 @@ func (c Cache) postScrapeScenePerformer(ret *models.ScrapedScenePerformer) error
if err := c.txnManager.WithReadTxn(context.TODO(), func(r models.ReaderRepository) error {
tqb := r.Tag()
for _, t := range ret.Tags {
err := MatchScrapedSceneTag(tqb, t)
if err != nil {
return err
}
tags, err := postProcessTags(tqb, ret.Tags)
if err != nil {
return err
}
ret.Tags = tags
return nil
}); err != nil {
@ -302,12 +302,11 @@ func (c Cache) postScrapeScene(ret *models.ScrapedScene) error {
}
}
for _, t := range ret.Tags {
err := MatchScrapedSceneTag(tqb, t)
if err != nil {
return err
}
tags, err := postProcessTags(tqb, ret.Tags)
if err != nil {
return err
}
ret.Tags = tags
if ret.Studio != nil {
err := MatchScrapedSceneStudio(sqb, ret.Studio)
@ -342,12 +341,11 @@ func (c Cache) postScrapeGallery(ret *models.ScrapedGallery) error {
}
}
for _, t := range ret.Tags {
err := MatchScrapedSceneTag(tqb, t)
if err != nil {
return err
}
tags, err := postProcessTags(tqb, ret.Tags)
if err != nil {
return err
}
ret.Tags = tags
if ret.Studio != nil {
err := MatchScrapedSceneStudio(sqb, ret.Studio)
@ -509,3 +507,42 @@ func (c Cache) ScrapeMovieURL(url string) (*models.ScrapedMovie, error) {
return nil, nil
}
// postProcessTags filters scrapedTags against the configured scraper tag
// exclusion patterns and runs MatchScrapedSceneTag on every tag that is kept.
//
// Each configured pattern is compiled as a case-insensitive regular expression
// and tested against the tag name. Patterns that fail to compile are logged
// and skipped. The names of all excluded tags are reported in a single info
// log message.
//
// It returns the tags that were not excluded, in their original order, or the
// first error returned by MatchScrapedSceneTag.
func postProcessTags(tqb models.TagReader, scrapedTags []*models.ScrapedSceneTag) ([]*models.ScrapedSceneTag, error) {
	var ret []*models.ScrapedSceneTag

	excludePatterns := stash_config.GetInstance().GetScraperExcludeTagPatterns()
	excludeRegexps := make([]*regexp.Regexp, 0, len(excludePatterns))

	for _, excludePattern := range excludePatterns {
		// Use the (?i) flag rather than lower-casing the pattern text:
		// lower-casing silently changes case-sensitive regexp syntax
		// (e.g. \D becomes \d, \S becomes \s, \A becomes \a).
		reg, err := regexp.Compile("(?i)" + excludePattern)
		if err != nil {
			logger.Errorf("Invalid tag exclusion pattern %q: %v", excludePattern, err)
			continue
		}
		excludeRegexps = append(excludeRegexps, reg)
	}

	// isExcluded reports whether name matches any of the compiled patterns.
	isExcluded := func(name string) bool {
		for _, reg := range excludeRegexps {
			if reg.MatchString(name) {
				return true
			}
		}
		return false
	}

	var ignoredTags []string

	for _, t := range scrapedTags {
		if isExcluded(t.Name) {
			ignoredTags = append(ignoredTags, t.Name)
			continue
		}

		if err := MatchScrapedSceneTag(tqb, t); err != nil {
			return nil, err
		}
		ret = append(ret, t)
	}

	if len(ignoredTags) > 0 {
		logger.Infof("Scraping ignored tags: %s", strings.Join(ignoredTags, ", "))
	}

	return ret, nil
}

View File

@ -1,4 +1,5 @@
### ✨ New Features
* Support excluding tag patterns when scraping. ([#1617](https://github.com/stashapp/stash/pull/1617))
* Support setting a custom directory for default performer images. ([#1489](https://github.com/stashapp/stash/pull/1489))
* Added filtering and sorting on scene marker count for tags. ([#1603](https://github.com/stashapp/stash/pull/1603))
* Support excluding fields and editing tags when saving from scene tagger view. ([#1605](https://github.com/stashapp/stash/pull/1605))

View File

@ -17,9 +17,10 @@ import StashConfiguration from "./StashConfiguration";
/** Props accepted by the ExclusionPatterns component. */
interface IExclusionPatternsProps {
// Current list of exclusion pattern strings shown by the component.
excludes: string[];
// Callback that receives the updated pattern list (e.g. after a new entry is appended).
setExcludes: (value: string[]) => void;
// Sample pattern appended to the list when a new entry is added.
demo: string;
}
const ExclusionPatterns: React.FC<IExclusionPatternsProps> = (props) => {
export const ExclusionPatterns: React.FC<IExclusionPatternsProps> = (props) => {
function excludeRegexChanged(idx: number, value: string) {
const newExcludes = props.excludes.map((regex, i) => {
const ret = idx !== i ? regex : value;
@ -35,8 +36,7 @@ const ExclusionPatterns: React.FC<IExclusionPatternsProps> = (props) => {
}
function excludeAddRegex() {
const demo = "sample\\.mp4$";
const newExcludes = props.excludes.concat(demo);
const newExcludes = props.excludes.concat(props.demo);
props.setExcludes(newExcludes);
}
@ -490,7 +490,11 @@ export const SettingsConfigurationPanel: React.FC = () => {
id: "config.general.excluded_video_patterns_head",
})}
</h6>
<ExclusionPatterns excludes={excludes} setExcludes={setExcludes} />
<ExclusionPatterns
excludes={excludes}
setExcludes={setExcludes}
demo="sample\.mp4$"
/>
<Form.Text className="text-muted">
{intl.formatMessage({
id: "config.general.excluded_video_patterns_desc",
@ -514,6 +518,7 @@ export const SettingsConfigurationPanel: React.FC = () => {
<ExclusionPatterns
excludes={imageExcludes}
setExcludes={setImageExcludes}
demo="sample\.jpg$"
/>
<Form.Text className="text-muted">
{intl.formatMessage({

View File

@ -14,6 +14,7 @@ import { useToast } from "src/hooks";
import { TextUtils } from "src/utils";
import { CollapseButton, Icon, LoadingIndicator } from "src/components/Shared";
import { ScrapeType } from "src/core/generated-graphql";
import { ExclusionPatterns } from "./SettingsConfigurationPanel";
interface IURLList {
urls: string[];
@ -96,6 +97,7 @@ export const SettingsScrapingPanel: React.FC = () => {
undefined
);
const [scraperCertCheck, setScraperCertCheck] = useState<boolean>(true);
const [excludeTagPatterns, setExcludeTagPatterns] = useState<string[]>([]);
const { data, error } = useConfiguration();
@ -103,6 +105,7 @@ export const SettingsScrapingPanel: React.FC = () => {
scraperUserAgent,
scraperCDPPath,
scraperCertCheck,
excludeTagPatterns,
});
useEffect(() => {
@ -113,6 +116,7 @@ export const SettingsScrapingPanel: React.FC = () => {
setScraperUserAgent(conf.scraping.scraperUserAgent ?? undefined);
setScraperCDPPath(conf.scraping.scraperCDPPath ?? undefined);
setScraperCertCheck(conf.scraping.scraperCertCheck);
setExcludeTagPatterns(conf.scraping.excludeTagPatterns);
}
}, [data, error]);
@ -398,6 +402,24 @@ export const SettingsScrapingPanel: React.FC = () => {
</Form.Group>
</Form.Group>
<Form.Group>
<h6>
{intl.formatMessage({
id: "config.scraping.excluded_tag_patterns_head",
})}
</h6>
<ExclusionPatterns
excludes={excludeTagPatterns}
setExcludes={setExcludeTagPatterns}
demo="4K"
/>
<Form.Text className="text-muted">
{intl.formatMessage({
id: "config.scraping.excluded_tag_patterns_desc",
})}
</Form.Text>
</Form.Group>
<hr />
<h4>{intl.formatMessage({ id: "config.scraping.scrapers" })}</h4>

View File

@ -243,6 +243,8 @@
"scraping": {
"entity_metadata": "{entityType} Metadata",
"entity_scrapers": "{entityType} scrapers",
"excluded_tag_patterns_desc": "Regexps of tag names to exclude from scraping results",
"excluded_tag_patterns_head": "Excluded Tag Patterns",
"scrapers": "Scrapers",
"search_by_name": "Search by name",
"supported_types": "Supported types",