From e883e5fe27399627b7e6c171fa1756e0c51b7960 Mon Sep 17 00:00:00 2001 From: bnkai <48220860+bnkai@users.noreply.github.com> Date: Tue, 22 Dec 2020 00:42:31 +0200 Subject: [PATCH] Add Mouse Click support for the CDP scraper (#827) --- pkg/scraper/config.go | 6 +++ pkg/scraper/url.go | 53 ++++++++++++++++--- .../src/components/Changelog/versions/v050.md | 1 + ui/v2.5/src/docs/en/Scraping.md | 38 +++++++++++++ 4 files changed, 91 insertions(+), 7 deletions(-) diff --git a/pkg/scraper/config.go b/pkg/scraper/config.go index 26c01ba46..2401869fc 100644 --- a/pkg/scraper/config.go +++ b/pkg/scraper/config.go @@ -169,9 +169,15 @@ type cookieOptions struct { Cookies []*scraperCookies `yaml:"Cookies"` } +type clickOptions struct { + XPath string `yaml:"xpath"` + Sleep int `yaml:"sleep"` +} + type scraperDriverOptions struct { UseCDP bool `yaml:"useCDP"` Sleep int `yaml:"sleep"` + Clicks []*clickOptions `yaml:"clicks"` Cookies []*cookieOptions `yaml:"cookies"` } diff --git a/pkg/scraper/url.go b/pkg/scraper/url.go index 9fd9d19e5..159dc1ccd 100644 --- a/pkg/scraper/url.go +++ b/pkg/scraper/url.go @@ -13,6 +13,7 @@ import ( "strings" "time" + "github.com/chromedp/cdproto/cdp" "github.com/chromedp/cdproto/dom" "github.com/chromedp/cdproto/network" "github.com/chromedp/chromedp" @@ -25,7 +26,8 @@ import ( // Timeout for the scrape http request. Includes transfer time. May want to make this // configurable at some point. 
-const scrapeGetTimeout = time.Second * 30 +const scrapeGetTimeout = time.Second * 60 +const scrapeDefaultSleep = time.Second * 2 func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) { driverOptions := scraperConfig.DriverOptions @@ -89,19 +91,17 @@ func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Re // if remote is set as true in the scraperConfig it will try to use localhost:9222 // else it will look for google-chrome in path func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig GlobalConfig) (io.Reader, error) { - const defaultSleep = 2 if !driverOptions.UseCDP { return nil, fmt.Errorf("Url shouldn't be feetched through CDP") } - sleep := defaultSleep + sleepDuration := scrapeDefaultSleep - if driverOptions.Sleep != 0 { - sleep = driverOptions.Sleep + if driverOptions.Sleep > 0 { + sleepDuration = time.Duration(driverOptions.Sleep) * time.Second } - sleepDuration := time.Duration(sleep) * time.Second act := context.Background() // if scraperCDPPath is a remote address, then allocate accordingly @@ -122,7 +122,7 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo act, cancelAct = chromedp.NewRemoteAllocator(context.Background(), remote) } else { - // user a temporary user directory for chrome + // use a temporary user directory for chrome dir, err := ioutil.TempDir("", "stash-chromedp") if err != nil { return nil, err @@ -142,6 +142,10 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo ctx, cancel := chromedp.NewContext(act) defer cancel() + // add a fixed timeout for the http request + ctx, cancel = context.WithTimeout(ctx, scrapeGetTimeout) + defer cancel() + var res string err := chromedp.Run(ctx, network.Enable(), @@ -149,6 +153,7 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo printCDPCookies(driverOptions, "Cookies found"), chromedp.Navigate(url), 
chromedp.Sleep(sleepDuration), + setCDPClicks(driverOptions), chromedp.ActionFunc(func(ctx context.Context) error { node, err := dom.GetDocument().Do(ctx) if err != nil { @@ -159,6 +164,7 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo }), printCDPCookies(driverOptions, "Cookies set"), ) + if err != nil { return nil, err } @@ -166,6 +172,39 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo return strings.NewReader(res), nil } +// click all xpaths listed in the scraper config +func setCDPClicks(driverOptions scraperDriverOptions) chromedp.Tasks { + var tasks chromedp.Tasks + for _, click := range driverOptions.Clicks { // for each click element find the node from the xpath and add a click action + if click.XPath != "" { + xpath := click.XPath + waitDuration := scrapeDefaultSleep + if click.Sleep > 0 { + waitDuration = time.Duration(click.Sleep) * time.Second + } + + action := chromedp.ActionFunc(func(ctx context.Context) error { + var nodes []*cdp.Node + if err := chromedp.Nodes(xpath, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil { + logger.Debugf("Error %s looking for click xpath %s.\n", err, xpath) + return err + } + if len(nodes) == 0 { + logger.Debugf("Click xpath %s not found in page.\n", xpath) + return nil + } + logger.Debugf("Clicking %s\n", xpath) + return chromedp.MouseClickNode(nodes[0]).Do(ctx) + }) + + tasks = append(tasks, action) + tasks = append(tasks, chromedp.Sleep(waitDuration)) + } + + } + return tasks +} + // getRemoteCDPWSAddress returns the complete remote address that is required to access the cdp instance func getRemoteCDPWSAddress(address string) (string, error) { resp, err := http.Get(address) diff --git a/ui/v2.5/src/components/Changelog/versions/v050.md b/ui/v2.5/src/components/Changelog/versions/v050.md index 548f9dbb8..fdca55beb 100644 --- a/ui/v2.5/src/components/Changelog/versions/v050.md +++ b/ui/v2.5/src/components/Changelog/versions/v050.md @@ -3,6 +3,7 @@ * 
Allow configuration of visible navbar items. ### 🎨 Improvements +* Add mouse click support for CDP scrapers. * Add gallery tabs to performer and studio pages. * Add gallery scrapers to scraper page. * Add support for setting cookies in scrapers. diff --git a/ui/v2.5/src/docs/en/Scraping.md b/ui/v2.5/src/docs/en/Scraping.md index 544d25fc8..d65630eeb 100644 --- a/ui/v2.5/src/docs/en/Scraping.md +++ b/ui/v2.5/src/docs/en/Scraping.md @@ -397,6 +397,44 @@ When `useCDP` is set to true, stash will execute or connect to an instance of Ch `Chrome CDP path` can be set to a path to the chrome executable, or an http(s) address to remote chrome instance (for example: `http://localhost:9222/json/version`). As remote instance a docker container can also be used with the `chromedp/headless-shell` image being highly recommended. +### CDP Click support + +When using CDP you can use the `clicks` part of the `driver` section to do Mouse Clicks on elements you need to collapse or toggle. Each click element has an `xpath` value that holds the XPath for the button/element you need to click and an optional `sleep` value that is the time in seconds to wait after clicking. +If the `sleep` value is not set it defaults to `2` seconds. + +A demo scraper using `clicks` follows.
+ +```yaml +name: clickDemo # demo only for a single URL +sceneByURL: + - action: scrapeXPath + url: + - https://getbootstrap.com/docs/4.3/components/collapse/ + scraper: sceneScraper + +xPathScrapers: + sceneScraper: + scene: + Title: //head/title + Details: # shows the id/s of the visible div/s for the Multiple targets example of the page + selector: //div[@class="bd-example"]//div[@class="multi-collapse collapse show"]/@id + concat: "\n\n" + +driver: + useCDP: true + sleep: 1 + clicks: # demo usage toggle on off multiple times + - xpath: //a[@href="#multiCollapseExample1"] # toggle on first element + - xpath: //button[@data-target="#multiCollapseExample2"] # toggle on second element + sleep: 4 + - xpath: //a[@href="#multiCollapseExample1"] # toggle off first element + sleep: 1 + - xpath: //button[@data-target="#multiCollapseExample2"] # toggle off second element + - xpath: //button[@data-target="#multiCollapseExample2"] # toggle on second element +``` + +Note that each `click` adds an extra delay of the click's `sleep` seconds, so the above adds `2+4+1+2+2=11` seconds to the loading time of the page. + ### Cookie support In some websites the use of cookies is needed to bypass a welcoming message or some other kind of protection. Stash supports the setting of cookies for the direct xpath scraper and the CDP based one. Due to implementation issues the usage varies a bit.