diff --git a/go.mod b/go.mod index e4e407dd8..9f7bd1e62 100644 --- a/go.mod +++ b/go.mod @@ -25,7 +25,7 @@ require ( github.com/vektah/gqlparser v1.1.2 golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4 golang.org/x/image v0.0.0-20190118043309-183bebdce1b2 - golang.org/x/net v0.0.0-20200421231249-e086a090c8fd + golang.org/x/net v0.0.0-20200602114024-627f9648deb9 gopkg.in/yaml.v2 v2.2.2 ) diff --git a/go.sum b/go.sum index d9ef0d5a2..8d6fce3c4 100644 --- a/go.sum +++ b/go.sum @@ -676,6 +676,8 @@ golang.org/x/net v0.0.0-20190522155817-f3200d17e092 h1:4QSRKanuywn15aTZvI/mIDEgP golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20200421231249-e086a090c8fd h1:QPwSajcTUrFriMF1nJ3XzgoqakqQEsnZf9LdXdi2nkI= golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200602114024-627f9648deb9 h1:pNX+40auqi2JqRfOP1akLGtYcn15TUbkhwuCO3foqqM= +golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20181017192945-9dcd33a902f4/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20181106182150-f42d05182288/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= diff --git a/pkg/scraper/xpath.go b/pkg/scraper/xpath.go index f108dde1c..3bf1e7b7b 100644 --- a/pkg/scraper/xpath.go +++ b/pkg/scraper/xpath.go @@ -4,6 +4,7 @@ import ( "bytes" "errors" "net/http" + "net/http/cookiejar" "net/url" "reflect" "regexp" @@ -13,6 +14,7 @@ import ( "github.com/antchfx/htmlquery" "golang.org/x/net/html" "golang.org/x/net/html/charset" + "golang.org/x/net/publicsuffix" "github.com/stashapp/stash/pkg/logger" "github.com/stashapp/stash/pkg/manager/config" @@ -587,8 +589,24 @@ func (r xPathResults) setKey(index int, key string, value string) xPathResults { } func loadURL(url string, c *scraperConfig) (*html.Node, error) { + options := cookiejar.Options{ + PublicSuffixList: publicsuffix.List, + } + jar, er := cookiejar.New(&options) + if er != nil { + return nil, er + } + client := &http.Client{ Timeout: scrapeGetTimeout, + // defaultCheckRedirect code with max changed from 10 to 20 + CheckRedirect: func(req *http.Request, via []*http.Request) error { + if len(via) >= 20 { + return errors.New("stopped after 20 redirects") + } + return nil + }, + Jar: jar, } req, err := http.NewRequest("GET", url, nil) if err != nil { diff --git a/vendor/golang.org/x/net/publicsuffix/list.go b/vendor/golang.org/x/net/publicsuffix/list.go new file mode 100644 index 000000000..200617ea8 --- /dev/null +++ b/vendor/golang.org/x/net/publicsuffix/list.go @@ -0,0 +1,181 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:generate go run gen.go + +// Package publicsuffix provides a public suffix list based on data from +// https://publicsuffix.org/ +// +// A public suffix is one under which Internet users can directly register +// names. It is related to, but different from, a TLD (top level domain). +// +// "com" is a TLD (top level domain). Top level means it has no dots. +// +// "com" is also a public suffix. Amazon and Google have registered different +// siblings under that domain: "amazon.com" and "google.com". +// +// "au" is another TLD, again because it has no dots. But it's not "amazon.au". +// Instead, it's "amazon.com.au". +// +// "com.au" isn't an actual TLD, because it's not at the top level (it has +// dots). But it is an eTLD (effective TLD), because that's the branching point +// for domain name registrars. +// +// Another name for "an eTLD" is "a public suffix". Often, what's more of +// interest is the eTLD+1, or one more label than the public suffix. For +// example, browsers partition read/write access to HTTP cookies according to +// the eTLD+1. Web pages served from "amazon.com.au" can't read cookies from +// "google.com.au", but web pages served from "maps.google.com" can share +// cookies from "www.google.com", so you don't have to sign into Google Maps +// separately from signing into Google Web Search. Note that all four of those +// domains have 3 labels and 2 dots. The first two domains are each an eTLD+1, +// the last two are not (but share the same eTLD+1: "google.com"). +// +// All of these domains have the same eTLD+1: +// - "www.books.amazon.co.uk" +// - "books.amazon.co.uk" +// - "amazon.co.uk" +// Specifically, the eTLD+1 is "amazon.co.uk", because the eTLD is "co.uk". +// +// There is no closed form algorithm to calculate the eTLD of a domain. +// Instead, the calculation is data driven. This package provides a +// pre-compiled snapshot of Mozilla's PSL (Public Suffix List) data at +// https://publicsuffix.org/ +package publicsuffix // import "golang.org/x/net/publicsuffix" + +// TODO: specify case sensitivity and leading/trailing dot behavior for +// func PublicSuffix and func EffectiveTLDPlusOne. + +import ( + "fmt" + "net/http/cookiejar" + "strings" +) + +// List implements the cookiejar.PublicSuffixList interface by calling the +// PublicSuffix function. +var List cookiejar.PublicSuffixList = list{} + +type list struct{} + +func (list) PublicSuffix(domain string) string { + ps, _ := PublicSuffix(domain) + return ps +} + +func (list) String() string { + return version +} + +// PublicSuffix returns the public suffix of the domain using a copy of the +// publicsuffix.org database compiled into the library. +// +// icann is whether the public suffix is managed by the Internet Corporation +// for Assigned Names and Numbers. If not, the public suffix is either a +// privately managed domain (and in practice, not a top level domain) or an +// unmanaged top level domain (and not explicitly mentioned in the +// publicsuffix.org list). For example, "foo.org" and "foo.co.uk" are ICANN +// domains, "foo.dyndns.org" and "foo.blogspot.co.uk" are private domains and +// "cromulent" is an unmanaged top level domain. +// +// Use cases for distinguishing ICANN domains like "foo.com" from private +// domains like "foo.appspot.com" can be found at +// https://wiki.mozilla.org/Public_Suffix_List/Use_Cases +func PublicSuffix(domain string) (publicSuffix string, icann bool) { + lo, hi := uint32(0), uint32(numTLD) + s, suffix, icannNode, wildcard := domain, len(domain), false, false +loop: + for { + dot := strings.LastIndex(s, ".") + if wildcard { + icann = icannNode + suffix = 1 + dot + } + if lo == hi { + break + } + f := find(s[1+dot:], lo, hi) + if f == notFound { + break + } + + u := nodes[f] >> (nodesBitsTextOffset + nodesBitsTextLength) + icannNode = u&(1<>= nodesBitsICANN + u = children[u&(1<>= childrenBitsLo + hi = u & (1<>= childrenBitsHi + switch u & (1<>= childrenBitsNodeType + wildcard = u&(1<>= nodesBitsTextLength + offset := x & (1<