2020-12-01 05:34:09 +00:00
|
|
|
package scraper
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
2022-03-17 00:33:59 +00:00
|
|
|
"math/rand"
|
2020-12-01 05:34:09 +00:00
|
|
|
"net/http"
|
|
|
|
"net/http/cookiejar"
|
|
|
|
"net/url"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/chromedp/cdproto/cdp"
|
|
|
|
"github.com/chromedp/cdproto/network"
|
|
|
|
"github.com/chromedp/chromedp"
|
Cache and reuse the scraper HTTP client (#1855)
* Add Cookies directly to the request
Rather than maintaining a cookie jar on a one-shot HTTP client, maintain
the jar ourselves: make a new jar, then use it to select the right
cookies.
The cookies are set on the request rather than on the client. This will
retain the current behavior as we are always throwing the client away
after each use.
This patch enables the lifting of the http client as well over time.
* Introduce a cached scraper HTTP client
The scraper cache is augmented with an *http.Client. These are safe for
concurrent use, so the pointer can safely be passed around. Push this
into scraper configurations where applicable, next to the txnManagers.
When we issue a loadUrl request, do so on the cached *http.Client,
which will reuse existing idle connections in the client if any are
present.
* Set MaxIdleConnsPerHost. Closes #1850
We allow for up to 8 idle connections to a single host. This should
make concurrent operation toward the same host reuse connections, even
for sizeable concurrency.
The number isn't bumped excessively high. We should probably limit
concurrency toward a single site anyway, since we'll be able to overrun
a site with queries quite easily if we have many concurrent goroutines
issuing requests at the same time.
* Reinstate driverOptions / useCDP check
Use DeMorgan's laws to invert the logic and exit early. Fixes tests
breaking.
* Documentation fixup.
* Use the scraper http.Client when fetching images
Fold image fetchers onto the cached scraper http.Client as well. This
makes the scraper have a single http.Client cache for all its
operations.
Thread the client upwards to the relevant attachment points: either the
cache, or a stash_box instance, which is extended to include a pointer
to the client.
Style roughly follows that of txnManagers.
* Use the same http Client as the GraphQL client use
Rather than using http.DefaultClient, use the same client as the
GraphQL client use in the stash_box subsystem. This localizes the
client used in the subsystem into the constructing New.. call.
* Hoist HTTP client construction
Create a function for initializaing the HTTP Client we use. While here
hoist magic numbers into constants. Introduce a proper static redirect
error and use it in the client code as well.
* Reinstate printCookies
This is a debugging function, and it might still come in handy in the
future at some point.
* Nitpick comment.
* Minor tidy
Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
2021-10-20 05:12:24 +00:00
|
|
|
"golang.org/x/net/publicsuffix"
|
2020-12-01 05:34:09 +00:00
|
|
|
|
|
|
|
"github.com/stashapp/stash/pkg/logger"
|
|
|
|
)
|
|
|
|
|
Cache and reuse the scraper HTTP client (#1855)
* Add Cookies directly to the request
Rather than maintaining a cookie jar on a one-shot HTTP client, maintain
the jar ourselves: make a new jar, then use it to select the right
cookies.
The cookies are set on the request rather than on the client. This will
retain the current behavior as we are always throwing the client away
after each use.
This patch enables the lifting of the http client as well over time.
* Introduce a cached scraper HTTP client
The scraper cache is augmented with an *http.Client. These are safe for
concurrent use, so the pointer can safely be passed around. Push this
into scraper configurations where applicable, next to the txnManagers.
When we issue a loadUrl request, do so on the cached *http.Client,
which will reuse existing idle connections in the client if any are
present.
* Set MaxIdleConnsPerHost. Closes #1850
We allow for up to 8 idle connections to a single host. This should
make concurrent operation toward the same host reuse connections, even
for sizeable concurrency.
The number isn't bumped excessively high. We should probably limit
concurrency toward a single site anyway, since we'll be able to overrun
a site with queries quite easily if we have many concurrent goroutines
issuing requests at the same time.
* Reinstate driverOptions / useCDP check
Use DeMorgan's laws to invert the logic and exit early. Fixes tests
breaking.
* Documentation fixup.
* Use the scraper http.Client when fetching images
Fold image fetchers onto the cached scraper http.Client as well. This
makes the scraper have a single http.Client cache for all its
operations.
Thread the client upwards to the relevant attachment points: either the
cache, or a stash_box instance, which is extended to include a pointer
to the client.
Style roughly follows that of txnManagers.
* Use the same http Client as the GraphQL client use
Rather than using http.DefaultClient, use the same client as the
GraphQL client use in the stash_box subsystem. This localizes the
client used in the subsystem into the constructing New.. call.
* Hoist HTTP client construction
Create a function for initializaing the HTTP Client we use. While here
hoist magic numbers into constants. Introduce a proper static redirect
error and use it in the client code as well.
* Reinstate printCookies
This is a debugging function, and it might still come in handy in the
future at some point.
* Nitpick comment.
* Minor tidy
Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
2021-10-20 05:12:24 +00:00
|
|
|
// jar constructs a cookie jar from a configuration
|
|
|
|
func (c config) jar() (*cookiejar.Jar, error) {
|
|
|
|
opts := c.DriverOptions
|
|
|
|
jar, err := cookiejar.New(&cookiejar.Options{
|
|
|
|
PublicSuffixList: publicsuffix.List,
|
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2020-12-01 05:34:09 +00:00
|
|
|
|
Cache and reuse the scraper HTTP client (#1855)
* Add Cookies directly to the request
Rather than maintaining a cookie jar on a one-shot HTTP client, maintain
the jar ourselves: make a new jar, then use it to select the right
cookies.
The cookies are set on the request rather than on the client. This will
retain the current behavior as we are always throwing the client away
after each use.
This patch enables the lifting of the http client as well over time.
* Introduce a cached scraper HTTP client
The scraper cache is augmented with an *http.Client. These are safe for
concurrent use, so the pointer can safely be passed around. Push this
into scraper configurations where applicable, next to the txnManagers.
When we issue a loadUrl request, do so on the cached *http.Client,
which will reuse existing idle connections in the client if any are
present.
* Set MaxIdleConnsPerHost. Closes #1850
We allow for up to 8 idle connections to a single host. This should
make concurrent operation toward the same host reuse connections, even
for sizeable concurrency.
The number isn't bumped excessively high. We should probably limit
concurrency toward a single site anyway, since we'll be able to overrun
a site with queries quite easily if we have many concurrent goroutines
issuing requests at the same time.
* Reinstate driverOptions / useCDP check
Use DeMorgan's laws to invert the logic and exit early. Fixes tests
breaking.
* Documentation fixup.
* Use the scraper http.Client when fetching images
Fold image fetchers onto the cached scraper http.Client as well. This
makes the scraper have a single http.Client cache for all its
operations.
Thread the client upwards to the relevant attachment points: either the
cache, or a stash_box instance, which is extended to include a pointer
to the client.
Style roughly follows that of txnManagers.
* Use the same http Client as the GraphQL client use
Rather than using http.DefaultClient, use the same client as the
GraphQL client use in the stash_box subsystem. This localizes the
client used in the subsystem into the constructing New.. call.
* Hoist HTTP client construction
Create a function for initializaing the HTTP Client we use. While here
hoist magic numbers into constants. Introduce a proper static redirect
error and use it in the client code as well.
* Reinstate printCookies
This is a debugging function, and it might still come in handy in the
future at some point.
* Nitpick comment.
* Minor tidy
Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
2021-10-20 05:12:24 +00:00
|
|
|
if opts == nil || opts.UseCDP {
|
|
|
|
return jar, nil
|
|
|
|
}
|
2020-12-01 05:34:09 +00:00
|
|
|
|
Cache and reuse the scraper HTTP client (#1855)
* Add Cookies directly to the request
Rather than maintaining a cookie jar on a one-shot HTTP client, maintain
the jar ourselves: make a new jar, then use it to select the right
cookies.
The cookies are set on the request rather than on the client. This will
retain the current behavior as we are always throwing the client away
after each use.
This patch enables the lifting of the http client as well over time.
* Introduce a cached scraper HTTP client
The scraper cache is augmented with an *http.Client. These are safe for
concurrent use, so the pointer can safely be passed around. Push this
into scraper configurations where applicable, next to the txnManagers.
When we issue a loadUrl request, do so on the cached *http.Client,
which will reuse existing idle connections in the client if any are
present.
* Set MaxIdleConnsPerHost. Closes #1850
We allow for up to 8 idle connections to a single host. This should
make concurrent operation toward the same host reuse connections, even
for sizeable concurrency.
The number isn't bumped excessively high. We should probably limit
concurrency toward a single site anyway, since we'll be able to overrun
a site with queries quite easily if we have many concurrent goroutines
issuing requests at the same time.
* Reinstate driverOptions / useCDP check
Use DeMorgan's laws to invert the logic and exit early. Fixes tests
breaking.
* Documentation fixup.
* Use the scraper http.Client when fetching images
Fold image fetchers onto the cached scraper http.Client as well. This
makes the scraper have a single http.Client cache for all its
operations.
Thread the client upwards to the relevant attachment points: either the
cache, or a stash_box instance, which is extended to include a pointer
to the client.
Style roughly follows that of txnManagers.
* Use the same http Client as the GraphQL client use
Rather than using http.DefaultClient, use the same client as the
GraphQL client use in the stash_box subsystem. This localizes the
client used in the subsystem into the constructing New.. call.
* Hoist HTTP client construction
Create a function for initializaing the HTTP Client we use. While here
hoist magic numbers into constants. Introduce a proper static redirect
error and use it in the client code as well.
* Reinstate printCookies
This is a debugging function, and it might still come in handy in the
future at some point.
* Nitpick comment.
* Minor tidy
Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
2021-10-20 05:12:24 +00:00
|
|
|
for i, ckURL := range opts.Cookies {
|
|
|
|
url, err := url.Parse(ckURL.CookieURL) // CookieURL must be valid, include schema
|
|
|
|
if err != nil {
|
|
|
|
logger.Warnf("skipping cookie [%d] for cookieURL %s: %v", i, ckURL.CookieURL, err)
|
|
|
|
continue
|
|
|
|
}
|
2020-12-01 05:34:09 +00:00
|
|
|
|
Cache and reuse the scraper HTTP client (#1855)
* Add Cookies directly to the request
Rather than maintaining a cookie jar on a one-shot HTTP client, maintain
the jar ourselves: make a new jar, then use it to select the right
cookies.
The cookies are set on the request rather than on the client. This will
retain the current behavior as we are always throwing the client away
after each use.
This patch enables the lifting of the http client as well over time.
* Introduce a cached scraper HTTP client
The scraper cache is augmented with an *http.Client. These are safe for
concurrent use, so the pointer can safely be passed around. Push this
into scraper configurations where applicable, next to the txnManagers.
When we issue a loadUrl request, do so on the cached *http.Client,
which will reuse existing idle connections in the client if any are
present.
* Set MaxIdleConnsPerHost. Closes #1850
We allow for up to 8 idle connections to a single host. This should
make concurrent operation toward the same host reuse connections, even
for sizeable concurrency.
The number isn't bumped excessively high. We should probably limit
concurrency toward a single site anyway, since we'll be able to overrun
a site with queries quite easily if we have many concurrent goroutines
issuing requests at the same time.
* Reinstate driverOptions / useCDP check
Use DeMorgan's laws to invert the logic and exit early. Fixes tests
breaking.
* Documentation fixup.
* Use the scraper http.Client when fetching images
Fold image fetchers onto the cached scraper http.Client as well. This
makes the scraper have a single http.Client cache for all its
operations.
Thread the client upwards to the relevant attachment points: either the
cache, or a stash_box instance, which is extended to include a pointer
to the client.
Style roughly follows that of txnManagers.
* Use the same http Client as the GraphQL client use
Rather than using http.DefaultClient, use the same client as the
GraphQL client use in the stash_box subsystem. This localizes the
client used in the subsystem into the constructing New.. call.
* Hoist HTTP client construction
Create a function for initializaing the HTTP Client we use. While here
hoist magic numbers into constants. Introduce a proper static redirect
error and use it in the client code as well.
* Reinstate printCookies
This is a debugging function, and it might still come in handy in the
future at some point.
* Nitpick comment.
* Minor tidy
Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
2021-10-20 05:12:24 +00:00
|
|
|
var httpCookies []*http.Cookie
|
|
|
|
for _, cookie := range ckURL.Cookies {
|
|
|
|
c := &http.Cookie{
|
|
|
|
Name: cookie.Name,
|
|
|
|
Value: getCookieValue(cookie),
|
|
|
|
Path: cookie.Path,
|
|
|
|
Domain: cookie.Domain,
|
2020-12-01 05:34:09 +00:00
|
|
|
}
|
Cache and reuse the scraper HTTP client (#1855)
* Add Cookies directly to the request
Rather than maintaining a cookie jar on a one-shot HTTP client, maintain
the jar ourselves: make a new jar, then use it to select the right
cookies.
The cookies are set on the request rather than on the client. This will
retain the current behavior as we are always throwing the client away
after each use.
This patch enables the lifting of the http client as well over time.
* Introduce a cached scraper HTTP client
The scraper cache is augmented with an *http.Client. These are safe for
concurrent use, so the pointer can safely be passed around. Push this
into scraper configurations where applicable, next to the txnManagers.
When we issue a loadUrl request, do so on the cached *http.Client,
which will reuse existing idle connections in the client if any are
present.
* Set MaxIdleConnsPerHost. Closes #1850
We allow for up to 8 idle connections to a single host. This should
make concurrent operation toward the same host reuse connections, even
for sizeable concurrency.
The number isn't bumped excessively high. We should probably limit
concurrency toward a single site anyway, since we'll be able to overrun
a site with queries quite easily if we have many concurrent goroutines
issuing requests at the same time.
* Reinstate driverOptions / useCDP check
Use DeMorgan's laws to invert the logic and exit early. Fixes tests
breaking.
* Documentation fixup.
* Use the scraper http.Client when fetching images
Fold image fetchers onto the cached scraper http.Client as well. This
makes the scraper have a single http.Client cache for all its
operations.
Thread the client upwards to the relevant attachment points: either the
cache, or a stash_box instance, which is extended to include a pointer
to the client.
Style roughly follows that of txnManagers.
* Use the same http Client as the GraphQL client use
Rather than using http.DefaultClient, use the same client as the
GraphQL client use in the stash_box subsystem. This localizes the
client used in the subsystem into the constructing New.. call.
* Hoist HTTP client construction
Create a function for initializaing the HTTP Client we use. While here
hoist magic numbers into constants. Introduce a proper static redirect
error and use it in the client code as well.
* Reinstate printCookies
This is a debugging function, and it might still come in handy in the
future at some point.
* Nitpick comment.
* Minor tidy
Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
2021-10-20 05:12:24 +00:00
|
|
|
httpCookies = append(httpCookies, c)
|
|
|
|
}
|
|
|
|
|
|
|
|
jar.SetCookies(url, httpCookies)
|
|
|
|
if jar.Cookies(url) == nil {
|
|
|
|
logger.Warnf("setting jar cookies for %s failed", url.String())
|
2020-12-01 05:34:09 +00:00
|
|
|
}
|
|
|
|
}
|
Cache and reuse the scraper HTTP client (#1855)
* Add Cookies directly to the request
Rather than maintaining a cookie jar on a one-shot HTTP client, maintain
the jar ourselves: make a new jar, then use it to select the right
cookies.
The cookies are set on the request rather than on the client. This will
retain the current behavior as we are always throwing the client away
after each use.
This patch enables the lifting of the http client as well over time.
* Introduce a cached scraper HTTP client
The scraper cache is augmented with an *http.Client. These are safe for
concurrent use, so the pointer can safely be passed around. Push this
into scraper configurations where applicable, next to the txnManagers.
When we issue a loadUrl request, do so on the cached *http.Client,
which will reuse existing idle connections in the client if any are
present.
* Set MaxIdleConnsPerHost. Closes #1850
We allow for up to 8 idle connections to a single host. This should
make concurrent operation toward the same host reuse connections, even
for sizeable concurrency.
The number isn't bumped excessively high. We should probably limit
concurrency toward a single site anyway, since we'll be able to overrun
a site with queries quite easily if we have many concurrent goroutines
issuing requests at the same time.
* Reinstate driverOptions / useCDP check
Use DeMorgan's laws to invert the logic and exit early. Fixes tests
breaking.
* Documentation fixup.
* Use the scraper http.Client when fetching images
Fold image fetchers onto the cached scraper http.Client as well. This
makes the scraper have a single http.Client cache for all its
operations.
Thread the client upwards to the relevant attachment points: either the
cache, or a stash_box instance, which is extended to include a pointer
to the client.
Style roughly follows that of txnManagers.
* Use the same http Client as the GraphQL client use
Rather than using http.DefaultClient, use the same client as the
GraphQL client use in the stash_box subsystem. This localizes the
client used in the subsystem into the constructing New.. call.
* Hoist HTTP client construction
Create a function for initializaing the HTTP Client we use. While here
hoist magic numbers into constants. Introduce a proper static redirect
error and use it in the client code as well.
* Reinstate printCookies
This is a debugging function, and it might still come in handy in the
future at some point.
* Nitpick comment.
* Minor tidy
Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
2021-10-20 05:12:24 +00:00
|
|
|
|
|
|
|
return jar, nil
|
2020-12-01 05:34:09 +00:00
|
|
|
}
|
|
|
|
|
2021-02-23 02:40:43 +00:00
|
|
|
func getCookieValue(cookie *scraperCookies) string {
|
|
|
|
if cookie.ValueRandom > 0 {
|
2022-03-17 00:33:59 +00:00
|
|
|
return randomSequence(cookie.ValueRandom)
|
2021-02-23 02:40:43 +00:00
|
|
|
}
|
|
|
|
return cookie.Value
|
|
|
|
}
|
|
|
|
|
2022-03-17 00:33:59 +00:00
|
|
|
var characters = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890")
|
|
|
|
|
|
|
|
func randomSequence(n int) string {
|
|
|
|
b := make([]rune, n)
|
2023-02-15 23:15:58 +00:00
|
|
|
rand := rand.New(rand.NewSource(time.Now().UnixNano()))
|
2022-03-17 00:33:59 +00:00
|
|
|
for i := range b {
|
|
|
|
b[i] = characters[rand.Intn(len(characters))]
|
|
|
|
}
|
|
|
|
return string(b)
|
|
|
|
}
|
|
|
|
|
Cache and reuse the scraper HTTP client (#1855)
* Add Cookies directly to the request
Rather than maintaining a cookie jar on a one-shot HTTP client, maintain
the jar ourselves: make a new jar, then use it to select the right
cookies.
The cookies are set on the request rather than on the client. This will
retain the current behavior as we are always throwing the client away
after each use.
This patch enables the lifting of the http client as well over time.
* Introduce a cached scraper HTTP client
The scraper cache is augmented with an *http.Client. These are safe for
concurrent use, so the pointer can safely be passed around. Push this
into scraper configurations where applicable, next to the txnManagers.
When we issue a loadUrl request, do so on the cached *http.Client,
which will reuse existing idle connections in the client if any are
present.
* Set MaxIdleConnsPerHost. Closes #1850
We allow for up to 8 idle connections to a single host. This should
make concurrent operation toward the same host reuse connections, even
for sizeable concurrency.
The number isn't bumped excessively high. We should probably limit
concurrency toward a single site anyway, since we'll be able to overrun
a site with queries quite easily if we have many concurrent goroutines
issuing requests at the same time.
* Reinstate driverOptions / useCDP check
Use DeMorgan's laws to invert the logic and exit early. Fixes tests
breaking.
* Documentation fixup.
* Use the scraper http.Client when fetching images
Fold image fetchers onto the cached scraper http.Client as well. This
makes the scraper have a single http.Client cache for all its
operations.
Thread the client upwards to the relevant attachment points: either the
cache, or a stash_box instance, which is extended to include a pointer
to the client.
Style roughly follows that of txnManagers.
* Use the same http Client as the GraphQL client use
Rather than using http.DefaultClient, use the same client as the
GraphQL client use in the stash_box subsystem. This localizes the
client used in the subsystem into the constructing New.. call.
* Hoist HTTP client construction
Create a function for initializaing the HTTP Client we use. While here
hoist magic numbers into constants. Introduce a proper static redirect
error and use it in the client code as well.
* Reinstate printCookies
This is a debugging function, and it might still come in handy in the
future at some point.
* Nitpick comment.
* Minor tidy
Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
2021-10-20 05:12:24 +00:00
|
|
|
// printCookies prints all cookies from the given cookie jar
|
2020-12-01 05:34:09 +00:00
|
|
|
func printCookies(jar *cookiejar.Jar, scraperConfig config, msg string) {
|
|
|
|
driverOptions := scraperConfig.DriverOptions
|
|
|
|
if driverOptions != nil && !driverOptions.UseCDP {
|
|
|
|
var foundURLs []*url.URL
|
|
|
|
|
|
|
|
for _, ckURL := range driverOptions.Cookies { // go through all cookies
|
|
|
|
url, err := url.Parse(ckURL.CookieURL) // CookieURL must be valid, include schema
|
|
|
|
if err == nil {
|
|
|
|
foundURLs = append(foundURLs, url)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if len(foundURLs) > 0 {
|
|
|
|
logger.Debugf("%s\n", msg)
|
|
|
|
printJarCookies(jar, foundURLs)
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// print all cookies from the jar of the native http client for given urls
|
|
|
|
func printJarCookies(jar *cookiejar.Jar, urls []*url.URL) {
|
|
|
|
for _, url := range urls {
|
|
|
|
logger.Debugf("Jar cookies for %s", url.String())
|
|
|
|
for i, cookie := range jar.Cookies(url) {
|
|
|
|
logger.Debugf("[%d]: Name: \"%s\" Value: \"%s\"", i, cookie.Name, cookie.Value)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// set all cookies listed in the scraper config
|
|
|
|
func setCDPCookies(driverOptions scraperDriverOptions) chromedp.Tasks {
|
|
|
|
return chromedp.Tasks{
|
|
|
|
chromedp.ActionFunc(func(ctx context.Context) error {
|
|
|
|
// create cookie expiration
|
|
|
|
expr := cdp.TimeSinceEpoch(time.Now().Add(180 * 24 * time.Hour))
|
|
|
|
|
|
|
|
for _, ckURL := range driverOptions.Cookies {
|
|
|
|
for _, cookie := range ckURL.Cookies {
|
2021-06-22 22:05:58 +00:00
|
|
|
err := network.SetCookie(cookie.Name, getCookieValue(cookie)).
|
2020-12-01 05:34:09 +00:00
|
|
|
WithExpires(&expr).
|
|
|
|
WithDomain(cookie.Domain).
|
|
|
|
WithPath(cookie.Path).
|
|
|
|
WithHTTPOnly(false).
|
|
|
|
WithSecure(false).
|
|
|
|
Do(ctx)
|
|
|
|
if err != nil {
|
2021-06-22 22:05:58 +00:00
|
|
|
return fmt.Errorf("could not set chrome cookie %s: %s", cookie.Name, err)
|
2020-12-01 05:34:09 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// print cookies whose domain is included in the scraper config
|
|
|
|
func printCDPCookies(driverOptions scraperDriverOptions, msg string) chromedp.Action {
|
|
|
|
return chromedp.ActionFunc(func(ctx context.Context) error {
|
|
|
|
chromeCookies, err := network.GetAllCookies().Do(ctx)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
scraperDomains := make(map[string]struct{})
|
|
|
|
for _, ckURL := range driverOptions.Cookies {
|
|
|
|
for _, cookie := range ckURL.Cookies {
|
|
|
|
scraperDomains[cookie.Domain] = struct{}{}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(scraperDomains) > 0 { // only print the cookies if they are listed in the scraper
|
|
|
|
logger.Debugf("%s\n", msg)
|
|
|
|
for i, cookie := range chromeCookies {
|
|
|
|
_, ok := scraperDomains[cookie.Domain]
|
|
|
|
if ok {
|
|
|
|
logger.Debugf("[%d]: Name: \"%s\" Value: \"%s\" Domain: \"%s\"", i, cookie.Name, cookie.Value, cookie.Domain)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
})
|
|
|
|
}
|