Add Mouse Click support for the CDP scraper (#827)

This commit is contained in:
bnkai 2020-12-22 00:42:31 +02:00 committed by GitHub
parent dd2086a912
commit e883e5fe27
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 91 additions and 7 deletions

View File

@ -169,9 +169,15 @@ type cookieOptions struct {
Cookies []*scraperCookies `yaml:"Cookies"`
}
// clickOptions describes a single mouse click to perform on a page loaded
// through CDP.
type clickOptions struct {
// XPath selects the element to click; entries with an empty XPath are ignored.
XPath string `yaml:"xpath"`
// Sleep is the number of seconds to wait after the click; values <= 0 fall
// back to scrapeDefaultSleep.
Sleep int `yaml:"sleep"`
}
// scraperDriverOptions holds the driver settings of a scraper config.
type scraperDriverOptions struct {
// UseCDP must be true for a URL to be fetched through CDP; urlFromCDP
// returns an error otherwise.
UseCDP bool `yaml:"useCDP"`
// Sleep is the number of seconds to wait after navigating to the page;
// values <= 0 fall back to scrapeDefaultSleep.
Sleep int `yaml:"sleep"`
// Clicks lists the mouse clicks to perform, in order, after the page has
// loaded and the initial sleep has elapsed.
Clicks []*clickOptions `yaml:"clicks"`
// Cookies lists the cookies configured for the scraper.
Cookies []*cookieOptions `yaml:"cookies"`
}

View File

@ -13,6 +13,7 @@ import (
"strings"
"time"
"github.com/chromedp/cdproto/cdp"
"github.com/chromedp/cdproto/dom"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
@ -25,7 +26,8 @@ import (
// Timeout for the scrape http request. Includes transfer time. May want to make this
// configurable at some point.
const scrapeGetTimeout = time.Second * 30
const scrapeGetTimeout = time.Second * 60
const scrapeDefaultSleep = time.Second * 2
func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) {
driverOptions := scraperConfig.DriverOptions
@ -89,19 +91,17 @@ func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Re
// if remote is set as true in the scraperConfig it will try to use localhost:9222
// else it will look for google-chrome in path
func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig GlobalConfig) (io.Reader, error) {
const defaultSleep = 2
if !driverOptions.UseCDP {
return nil, fmt.Errorf("Url shouldn't be feetched through CDP")
}
sleep := defaultSleep
sleepDuration := scrapeDefaultSleep
if driverOptions.Sleep != 0 {
sleep = driverOptions.Sleep
if driverOptions.Sleep > 0 {
sleepDuration = time.Duration(driverOptions.Sleep) * time.Second
}
sleepDuration := time.Duration(sleep) * time.Second
act := context.Background()
// if scraperCDPPath is a remote address, then allocate accordingly
@ -122,7 +122,7 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
act, cancelAct = chromedp.NewRemoteAllocator(context.Background(), remote)
} else {
// user a temporary user directory for chrome
// use a temporary user directory for chrome
dir, err := ioutil.TempDir("", "stash-chromedp")
if err != nil {
return nil, err
@ -142,6 +142,10 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
ctx, cancel := chromedp.NewContext(act)
defer cancel()
// add a fixed timeout for the http request
ctx, cancel = context.WithTimeout(ctx, scrapeGetTimeout)
defer cancel()
var res string
err := chromedp.Run(ctx,
network.Enable(),
@ -149,6 +153,7 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
printCDPCookies(driverOptions, "Cookies found"),
chromedp.Navigate(url),
chromedp.Sleep(sleepDuration),
setCDPClicks(driverOptions),
chromedp.ActionFunc(func(ctx context.Context) error {
node, err := dom.GetDocument().Do(ctx)
if err != nil {
@ -159,6 +164,7 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
}),
printCDPCookies(driverOptions, "Cookies set"),
)
if err != nil {
return nil, err
}
@ -166,6 +172,39 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
return strings.NewReader(res), nil
}
// setCDPClicks builds the chromedp tasks that perform the mouse clicks listed
// in the driver options of the scraper config.
//
// For every click entry with a non-empty XPath two tasks are appended, in
// config order: an action that looks up the XPath and clicks the first
// matching node, followed by a sleep. The sleep lasts click.Sleep seconds, or
// scrapeDefaultSleep when click.Sleep is not positive. Nil entries and entries
// with an empty XPath are skipped. The result is nil when there is nothing to
// click.
//
// A lookup error aborts the task list; an XPath that matches no node is only
// logged and the remaining clicks still run.
func setCDPClicks(driverOptions scraperDriverOptions) chromedp.Tasks {
	var tasks chromedp.Tasks
	for _, click := range driverOptions.Clicks {
		// Clicks holds pointers and may contain nil entries (e.g. from an empty
		// list item in the config); those must not be dereferenced.
		if click == nil || click.XPath == "" {
			continue
		}

		// Per-iteration copy so the closure below never observes a later entry.
		xpath := click.XPath
		waitDuration := scrapeDefaultSleep
		if click.Sleep > 0 {
			waitDuration = time.Duration(click.Sleep) * time.Second
		}

		action := chromedp.ActionFunc(func(ctx context.Context) error {
			var nodes []*cdp.Node
			// AtLeast(0) allows the query to succeed with zero matches, so a
			// missing element is handled by the empty-result branch below.
			if err := chromedp.Nodes(xpath, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
				logger.Debugf("Error %s looking for click xpath %s.\n", err, xpath)
				return err
			}
			if len(nodes) == 0 {
				logger.Debugf("Click xpath %s not found in page.\n", xpath)
				return nil
			}
			logger.Debugf("Clicking %s\n", xpath)
			return chromedp.MouseClickNode(nodes[0]).Do(ctx)
		})

		// Each click is followed by its own wait so the page can react.
		tasks = append(tasks, action, chromedp.Sleep(waitDuration))
	}
	return tasks
}
// getRemoteCDPWSAddress returns the complete remote address that is required to access the cdp instance
func getRemoteCDPWSAddress(address string) (string, error) {
resp, err := http.Get(address)

View File

@ -3,6 +3,7 @@
* Allow configuration of visible navbar items.
### 🎨 Improvements
* Add mouse click support for CDP scrapers.
* Add gallery tabs to performer and studio pages.
* Add gallery scrapers to scraper page.
* Add support for setting cookies in scrapers.

View File

@ -397,6 +397,44 @@ When `useCDP` is set to true, stash will execute or connect to an instance of Ch
`Chrome CDP path` can be set to a path to the chrome executable, or an http(s) address of a remote chrome instance (for example: `http://localhost:9222/json/version`). A docker container can also be used as the remote instance; the `chromedp/headless-shell` image is highly recommended.
### CDP Click support
When using CDP you can use the `clicks` part of the `driver` section to perform mouse clicks on elements you need to collapse or toggle. Each click element has an `xpath` value that holds the XPath of the button/element you need to click, and an optional `sleep` value that is the time in seconds to wait after clicking.
If the `sleep` value is not set it defaults to `2` seconds.
A demo scraper using `clicks` follows.
```yaml
name: clickDemo # demo only for a single URL
sceneByURL:
- action: scrapeXPath
url:
- https://getbootstrap.com/docs/4.3/components/collapse/
scraper: sceneScraper
xPathScrapers:
sceneScraper:
scene:
Title: //head/title
Details: # shows the id/s of the visible div/s for the Multiple targets example of the page
selector: //div[@class="bd-example"]//div[@class="multi-collapse collapse show"]/@id
concat: "\n\n"
driver:
useCDP: true
sleep: 1
clicks: # demo usage toggle on off multiple times
- xpath: //a[@href="#multiCollapseExample1"] # toggle on first element
- xpath: //button[@data-target="#multiCollapseExample2"] # toggle on second element
sleep: 4
- xpath: //a[@href="#multiCollapseExample1"] # toggle off first element
sleep: 1
- xpath: //button[@data-target="#multiCollapseExample2"] # toggle off second element
- xpath: //button[@data-target="#multiCollapseExample2"] # toggle on second element
```
Note that each click adds an extra delay equal to its `sleep` value (or the default `2` seconds when `sleep` is not set), so the above adds `2+4+1+2+2=11` seconds to the loading time of the page.
### Cookie support
In some websites the use of cookies is needed to bypass a welcoming message or some other kind of protection. Stash supports the setting of cookies for the direct xpath scraper and the CDP based one. Due to implementation issues the usage varies a bit.