mirror of https://github.com/stashapp/stash.git
Add Mouse Click support for the CDP scraper (#827)
This commit is contained in:
parent
dd2086a912
commit
e883e5fe27
|
@ -169,9 +169,15 @@ type cookieOptions struct {
|
|||
Cookies []*scraperCookies `yaml:"Cookies"`
|
||||
}
|
||||
|
||||
type clickOptions struct {
|
||||
XPath string `yaml:"xpath"`
|
||||
Sleep int `yaml:"sleep"`
|
||||
}
|
||||
|
||||
type scraperDriverOptions struct {
|
||||
UseCDP bool `yaml:"useCDP"`
|
||||
Sleep int `yaml:"sleep"`
|
||||
Clicks []*clickOptions `yaml:"clicks"`
|
||||
Cookies []*cookieOptions `yaml:"cookies"`
|
||||
}
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@ import (
|
|||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/chromedp/cdproto/cdp"
|
||||
"github.com/chromedp/cdproto/dom"
|
||||
"github.com/chromedp/cdproto/network"
|
||||
"github.com/chromedp/chromedp"
|
||||
|
@ -25,7 +26,8 @@ import (
|
|||
|
||||
// Timeout for the scrape http request. Includes transfer time. May want to make this
|
||||
// configurable at some point.
|
||||
const scrapeGetTimeout = time.Second * 30
|
||||
const scrapeGetTimeout = time.Second * 60
|
||||
const scrapeDefaultSleep = time.Second * 2
|
||||
|
||||
func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) {
|
||||
driverOptions := scraperConfig.DriverOptions
|
||||
|
@ -89,19 +91,17 @@ func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Re
|
|||
// if remote is set as true in the scraperConfig it will try to use localhost:9222
|
||||
// else it will look for google-chrome in path
|
||||
func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig GlobalConfig) (io.Reader, error) {
|
||||
const defaultSleep = 2
|
||||
|
||||
if !driverOptions.UseCDP {
|
||||
return nil, fmt.Errorf("Url shouldn't be feetched through CDP")
|
||||
}
|
||||
|
||||
sleep := defaultSleep
|
||||
sleepDuration := scrapeDefaultSleep
|
||||
|
||||
if driverOptions.Sleep != 0 {
|
||||
sleep = driverOptions.Sleep
|
||||
if driverOptions.Sleep > 0 {
|
||||
sleepDuration = time.Duration(driverOptions.Sleep) * time.Second
|
||||
}
|
||||
|
||||
sleepDuration := time.Duration(sleep) * time.Second
|
||||
act := context.Background()
|
||||
|
||||
// if scraperCDPPath is a remote address, then allocate accordingly
|
||||
|
@ -122,7 +122,7 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
|
|||
|
||||
act, cancelAct = chromedp.NewRemoteAllocator(context.Background(), remote)
|
||||
} else {
|
||||
// user a temporary user directory for chrome
|
||||
// use a temporary user directory for chrome
|
||||
dir, err := ioutil.TempDir("", "stash-chromedp")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -142,6 +142,10 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
|
|||
ctx, cancel := chromedp.NewContext(act)
|
||||
defer cancel()
|
||||
|
||||
// add a fixed timeout for the http request
|
||||
ctx, cancel = context.WithTimeout(ctx, scrapeGetTimeout)
|
||||
defer cancel()
|
||||
|
||||
var res string
|
||||
err := chromedp.Run(ctx,
|
||||
network.Enable(),
|
||||
|
@ -149,6 +153,7 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
|
|||
printCDPCookies(driverOptions, "Cookies found"),
|
||||
chromedp.Navigate(url),
|
||||
chromedp.Sleep(sleepDuration),
|
||||
setCDPClicks(driverOptions),
|
||||
chromedp.ActionFunc(func(ctx context.Context) error {
|
||||
node, err := dom.GetDocument().Do(ctx)
|
||||
if err != nil {
|
||||
|
@ -159,6 +164,7 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
|
|||
}),
|
||||
printCDPCookies(driverOptions, "Cookies set"),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -166,6 +172,39 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
|
|||
return strings.NewReader(res), nil
|
||||
}
|
||||
|
||||
// click all xpaths listed in the scraper config
|
||||
func setCDPClicks(driverOptions scraperDriverOptions) chromedp.Tasks {
|
||||
var tasks chromedp.Tasks
|
||||
for _, click := range driverOptions.Clicks { // for each click element find the node from the xpath and add a click action
|
||||
if click.XPath != "" {
|
||||
xpath := click.XPath
|
||||
waitDuration := scrapeDefaultSleep
|
||||
if click.Sleep > 0 {
|
||||
waitDuration = time.Duration(click.Sleep) * time.Second
|
||||
}
|
||||
|
||||
action := chromedp.ActionFunc(func(ctx context.Context) error {
|
||||
var nodes []*cdp.Node
|
||||
if err := chromedp.Nodes(xpath, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
|
||||
logger.Debugf("Error %s looking for click xpath %s.\n", err, xpath)
|
||||
return err
|
||||
}
|
||||
if len(nodes) == 0 {
|
||||
logger.Debugf("Click xpath %s not found in page.\n", xpath)
|
||||
return nil
|
||||
}
|
||||
logger.Debugf("Clicking %s\n", xpath)
|
||||
return chromedp.MouseClickNode(nodes[0]).Do(ctx)
|
||||
})
|
||||
|
||||
tasks = append(tasks, action)
|
||||
tasks = append(tasks, chromedp.Sleep(waitDuration))
|
||||
}
|
||||
|
||||
}
|
||||
return tasks
|
||||
}
|
||||
|
||||
// getRemoteCDPWSAddress returns the complete remote address that is required to access the cdp instance
|
||||
func getRemoteCDPWSAddress(address string) (string, error) {
|
||||
resp, err := http.Get(address)
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
* Allow configuration of visible navbar items.
|
||||
|
||||
### 🎨 Improvements
|
||||
* Add mouse click support for CDP scrapers.
|
||||
* Add gallery tabs to performer and studio pages.
|
||||
* Add gallery scrapers to scraper page.
|
||||
* Add support for setting cookies in scrapers.
|
||||
|
|
|
@ -397,6 +397,44 @@ When `useCDP` is set to true, stash will execute or connect to an instance of Ch
|
|||
|
||||
`Chrome CDP path` can be set to a path to the chrome executable, or an http(s) address to remote chrome instance (for example: `http://localhost:9222/json/version`). As remote instance a docker container can also be used with the `chromedp/headless-shell` image being highly recommended.
|
||||
|
||||
### CDP Click support
|
||||
|
||||
When using CDP you can use the `clicks` part of the `driver` section to perform mouse clicks on elements you need to collapse or toggle. Each click element has an `xpath` value that holds the XPath for the button/element you need to click and an optional `sleep` value that is the time in seconds to wait after clicking.
|
||||
If the `sleep` value is not set it defaults to `2` seconds.
|
||||
|
||||
A demo scraper using `clicks` follows.
|
||||
|
||||
```yaml
|
||||
name: clickDemo # demo only for a single URL
|
||||
sceneByURL:
|
||||
- action: scrapeXPath
|
||||
url:
|
||||
- https://getbootstrap.com/docs/4.3/components/collapse/
|
||||
scraper: sceneScraper
|
||||
|
||||
xPathScrapers:
|
||||
sceneScraper:
|
||||
scene:
|
||||
Title: //head/title
|
||||
Details: # shows the id/s of the visible div/s for the Multiple targets example of the page
|
||||
selector: //div[@class="bd-example"]//div[@class="multi-collapse collapse show"]/@id
|
||||
concat: "\n\n"
|
||||
|
||||
driver:
|
||||
useCDP: true
|
||||
sleep: 1
|
||||
clicks: # demo usage toggle on off multiple times
|
||||
- xpath: //a[@href="#multiCollapseExample1"] # toggle on first element
|
||||
- xpath: //button[@data-target="#multiCollapseExample2"] # toggle on second element
|
||||
sleep: 4
|
||||
- xpath: //a[@href="#multiCollapseExample1"] # toggle off first element
|
||||
sleep: 1
|
||||
- xpath: //button[@data-target="#multiCollapseExample2"] # toggle off second element
|
||||
- xpath: //button[@data-target="#multiCollapseExample2"] # toggle on second element
|
||||
```
|
||||
|
||||
Note that each `click` adds an extra delay equal to its `sleep` value (`2` seconds when unset), so the above adds `2+4+1+2+2=11` seconds to the loading time of the page.
|
||||
|
||||
### Cookie support
|
||||
|
||||
In some websites the use of cookies is needed to bypass a welcoming message or some other kind of protection. Stash supports the setting of cookies for the direct xpath scraper and the CDP based one. Due to implementation issues the usage varies a bit.
|
||||
|
|
Loading…
Reference in New Issue