package scraper

import (
	"bytes"
	"context"
	"crypto/tls"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"net/http/cookiejar"
	"os"
	"strings"
	"time"

	"github.com/chromedp/cdproto/cdp"
	"github.com/chromedp/cdproto/network"
	"github.com/chromedp/chromedp"
	jsoniter "github.com/json-iterator/go"
	"golang.org/x/net/html/charset"
	"golang.org/x/net/publicsuffix"

	"github.com/stashapp/stash/pkg/logger"
)

// Timeout for the scrape http request. Includes transfer time. May want to make this
// configurable at some point.
const scrapeGetTimeout = time.Second * 60
const scrapeDefaultSleep = time.Second * 2

func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) {
	driverOptions := scraperConfig.DriverOptions
	if driverOptions != nil && driverOptions.UseCDP {
		// get the page using chromedp
		return urlFromCDP(url, *driverOptions, globalConfig)
	}

	// get the page using http.Client
	options := cookiejar.Options{
		PublicSuffixList: publicsuffix.List,
	}
	jar, err := cookiejar.New(&options)
	if err != nil {
		return nil, err
	}

	setCookies(jar, scraperConfig)
	printCookies(jar, scraperConfig, "Jar cookies set from scraper")

	client := &http.Client{
		Transport: &http.Transport{ // ignore insecure certificates
			TLSClientConfig: &tls.Config{InsecureSkipVerify: !globalConfig.GetScraperCertCheck()},
		},
		Timeout: scrapeGetTimeout,
		// defaultCheckRedirect code with max changed from 10 to 20
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			if len(via) >= 20 {
				return errors.New("stopped after 20 redirects")
			}
			return nil
		},
		Jar: jar,
	}

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}

	userAgent := globalConfig.GetScraperUserAgent()
	if userAgent != "" {
		req.Header.Set("User-Agent", userAgent)
	}

	if driverOptions != nil {
		// setting the headers after the UA allows us to override it from inside the scraper
		for _, h := range driverOptions.Headers {
			if h.Key != "" {
				req.Header.Set(h.Key, h.Value)
				logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
			}
		}
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	// defer the close before the status check so the body is also
	// released on error responses
	defer resp.Body.Close()

	if resp.StatusCode >= 400 {
		return nil, fmt.Errorf("http error %d: %s", resp.StatusCode, http.StatusText(resp.StatusCode))
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	bodyReader := bytes.NewReader(body)

	printCookies(jar, scraperConfig, "Jar cookies found for scraper urls")

	return charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
}
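// A minimal caller sketch (hypothetical: cfg and globalCfg stand in for real
// scraper and global configs, and parsing is just one possible use of the
// returned reader):
//
//	reader, err := loadURL("https://example.org/scene/1", cfg, globalCfg)
//	if err != nil {
//		return err
//	}
//	node, err := html.Parse(reader) // e.g. golang.org/x/net/html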
// urlFromCDP loads and processes the url using chromedp and the DOM.
// If the scraper CDP path in the global config is an http(s) or ws(s)
// address, it connects to that remote instance; otherwise the path is used
// as the chrome executable. If no path is set, chromedp's defaults apply
// (it looks for a chrome binary in the PATH).
func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig GlobalConfig) (io.Reader, error) {
	if !driverOptions.UseCDP {
		return nil, fmt.Errorf("URL shouldn't be fetched through CDP")
	}

	sleepDuration := scrapeDefaultSleep
	if driverOptions.Sleep > 0 {
		sleepDuration = time.Duration(driverOptions.Sleep) * time.Second
	}

	act := context.Background()

	// if scraperCDPPath is a remote address, then allocate accordingly
	cdpPath := globalConfig.GetScraperCDPPath()
	if cdpPath != "" {
		var cancelAct context.CancelFunc

		if isCDPPathHTTP(globalConfig) || isCDPPathWS(globalConfig) {
			remote := cdpPath

			// if CDPPath is http(s) then we need to resolve the websocket URL first
			if isCDPPathHTTP(globalConfig) {
				var err error
				remote, err = getRemoteCDPWSAddress(remote)
				if err != nil {
					return nil, err
				}
			}

			act, cancelAct = chromedp.NewRemoteAllocator(context.Background(), remote)
		} else {
			// use a temporary user directory for chrome
			dir, err := ioutil.TempDir("", "stash-chromedp")
			if err != nil {
				return nil, err
			}
			defer os.RemoveAll(dir)

			opts := append(chromedp.DefaultExecAllocatorOptions[:],
				chromedp.UserDataDir(dir),
				chromedp.ExecPath(cdpPath),
			)
			act, cancelAct = chromedp.NewExecAllocator(act, opts...)
		}

		defer cancelAct()
	}

	ctx, cancel := chromedp.NewContext(act)
	defer cancel()

	// add a fixed timeout for the scrape
	ctx, cancel = context.WithTimeout(ctx, scrapeGetTimeout)
	defer cancel()

	var res string
	headers := cdpHeaders(driverOptions)

	err := chromedp.Run(ctx,
		network.Enable(),
		setCDPCookies(driverOptions),
		printCDPCookies(driverOptions, "Cookies found"),
		network.SetExtraHTTPHeaders(network.Headers(headers)),
		chromedp.Navigate(url),
		chromedp.Sleep(sleepDuration),
		setCDPClicks(driverOptions),
		chromedp.OuterHTML("html", &res, chromedp.ByQuery),
		printCDPCookies(driverOptions, "Cookies set"),
	)
	if err != nil {
		return nil, err
	}

	return strings.NewReader(res), nil
}

// setCDPClicks clicks all xpaths listed in the scraper config
func setCDPClicks(driverOptions scraperDriverOptions) chromedp.Tasks {
	var tasks chromedp.Tasks

	// for each click element find the node from the xpath and add a click action
	for _, click := range driverOptions.Clicks {
		if click.XPath != "" {
			xpath := click.XPath
			waitDuration := scrapeDefaultSleep
			if click.Sleep > 0 {
				waitDuration = time.Duration(click.Sleep) * time.Second
			}

			action := chromedp.ActionFunc(func(ctx context.Context) error {
				var nodes []*cdp.Node
				if err := chromedp.Nodes(xpath, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
					logger.Debugf("Error %s looking for click xpath %s.\n", err, xpath)
					return err
				}
				if len(nodes) == 0 {
					logger.Debugf("Click xpath %s not found in page.\n", xpath)
					return nil
				}
				logger.Debugf("Clicking %s\n", xpath)
				return chromedp.MouseClickNode(nodes[0]).Do(ctx)
			})

			tasks = append(tasks, action)
			tasks = append(tasks, chromedp.Sleep(waitDuration))
		}
	}
	return tasks
}

// getRemoteCDPWSAddress returns the complete remote address that is required to access the cdp instance
func getRemoteCDPWSAddress(address string) (string, error) {
	resp, err := http.Get(address)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	var result map[string]interface{}
	var json = jsoniter.ConfigCompatibleWithStandardLibrary
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return "", err
	}

	// guard the type assertion so a malformed response returns an error
	// instead of panicking
	remote, ok := result["webSocketDebuggerUrl"].(string)
	if !ok {
		return "", errors.New("invalid response from remote cdp instance: missing webSocketDebuggerUrl")
	}
	logger.Debugf("Remote cdp instance found %s", remote)

	return remote, nil
}

func cdpHeaders(driverOptions scraperDriverOptions) map[string]interface{} {
	headers := map[string]interface{}{}
	if driverOptions.Headers != nil {
		for _, h := range driverOptions.Headers {
			if h.Key != "" {
				headers[h.Key] = h.Value
				logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
			}
		}
	}
	return headers
}
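// For reference, a driver block in a scraper YAML definition exercising the
// options handled above might look like this (a sketch assuming the YAML
// field names mirror scraperDriverOptions; check the scraper docs for the
// authoritative schema):
//
//	driver:
//	  useCDP: true
//	  sleep: 2
//	  clicks:
//	    - xpath: //button[@id="accept-cookies"]
//	      sleep: 4
//	  headers:
//	    - Key: Referer
//	      Value: https://example.org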