diff --git a/pkg/scraper/config.go b/pkg/scraper/config.go index ee672949b..bfeb1e1bc 100644 --- a/pkg/scraper/config.go +++ b/pkg/scraper/config.go @@ -41,6 +41,9 @@ type scraperTypeConfig struct { Script []string `yaml:"script,flow"` Scraper string `yaml:"scraper"` + // for xpath name scraper only + QueryURL string `yaml:"queryURL"` + scraperConfig *scraperConfig } @@ -56,6 +59,8 @@ func (c *performerByNameConfig) resolveFn() { c.performScrape = scrapePerformerNamesScript } else if c.Action == scraperActionStash { c.performScrape = scrapePerformerNamesStash + } else if c.Action == scraperActionXPath { + c.performScrape = scrapePerformerNamesXPath } } @@ -266,6 +271,11 @@ func (c scraperConfig) ScrapePerformer(scrapedPerformer models.ScrapedPerformerI return c.PerformerByFragment.performScrape(c.PerformerByFragment.scraperTypeConfig, scrapedPerformer) } + // try to match against URL if present + if scrapedPerformer.URL != nil && *scrapedPerformer.URL != "" { + return c.ScrapePerformerURL(*scrapedPerformer.URL) + } + return nil, nil } diff --git a/pkg/scraper/xpath.go b/pkg/scraper/xpath.go index 9b69026de..745e71437 100644 --- a/pkg/scraper/xpath.go +++ b/pkg/scraper/xpath.go @@ -2,9 +2,11 @@ package scraper import ( "errors" + "net/url" "reflect" "regexp" "strings" + "time" "github.com/antchfx/htmlquery" "golang.org/x/net/html" @@ -43,35 +45,209 @@ func createXPathScraperConfig(src map[interface{}]interface{}) xpathScraperConfi return ret } -func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) []xPathResult { - var ret []xPathResult +type xpathRegexConfig map[interface{}]interface{} +type xpathRegexConfigs []xpathRegexConfig - for k, v := range s { - asStr, isStr := v.(string) +func (c xpathRegexConfig) apply(value string) string { + regex := "" + with := "" - if isStr { - // apply common - if common != nil { - asStr = common.applyCommon(asStr) - } + if regexI, _ := c["regex"]; regexI != nil { + regex, _ = regexI.(string) + } + if withI, _ := c["with"]; withI != nil { + with, _ = withI.(string) + } - found, err := htmlquery.QueryAll(doc, asStr) - if err != nil { - logger.Warnf("Error parsing xpath expression '%s': %s", asStr, err.Error()) - continue - } + if regex != "" { + re, err := regexp.Compile(regex) + if err != nil { + logger.Warnf("Error compiling regex '%s': %s", regex, err.Error()) + return value + } + + return re.ReplaceAllString(value, with) + } + + return value +} + +func (c xpathRegexConfigs) apply(value string) string { + // apply regex in order + for _, config := range c { + value = config.apply(value) + } + + // remove whitespace again + value = commonPostProcess(value) + + return value +} + +type xpathScraperAttrConfig map[interface{}]interface{} + +func (c xpathScraperAttrConfig) getString(key string) string { + ret, _ := c[key] + + if ret == nil { + return "" + } + + asStr, _ := ret.(string) + return asStr +} + +func (c xpathScraperAttrConfig) getSelector() string { + const selectorKey = "selector" + return c.getString(selectorKey) +} + +func (c xpathScraperAttrConfig) getConcat() string { + const concatKey = "concat" + return c.getString(concatKey) +} + +func (c xpathScraperAttrConfig) hasConcat() bool { + return c.getConcat() != "" +} + +func (c xpathScraperAttrConfig) getParseDate() string { + const parseDateKey = "parseDate" + return c.getString(parseDateKey) +} + +func (c xpathScraperAttrConfig) getReplace() xpathRegexConfigs { + const replaceKey = "replace" + val, _ := c[replaceKey] + + var ret xpathRegexConfigs + if val == nil { + return ret + } + + asSlice, _ := val.([]interface{}) + + for _, v := range asSlice { + asMap, _ := v.(map[interface{}]interface{}) + ret = append(ret, xpathRegexConfig(asMap)) + } + + return ret +} + +func (c xpathScraperAttrConfig) concatenateResults(nodes []*html.Node) string { + separator := c.getConcat() + result := []string{} + + for _, elem := range nodes { + text := htmlquery.InnerText(elem) + text = commonPostProcess(text) + + result = append(result, text) + } + + return strings.Join(result, separator) +} + +func (c xpathScraperAttrConfig) parseDate(value string) string { + parseDate := c.getParseDate() + + if parseDate == "" { + return value + } + + // try to parse the date using the pattern + // if it fails, then just fall back to the original value + parsedValue, err := time.Parse(parseDate, value) + if err != nil { + logger.Warnf("Error parsing date string '%s' using format '%s': %s", value, parseDate, err.Error()) + return value + } + + // convert it into our date format + const internalDateFormat = "2006-01-02" + return parsedValue.Format(internalDateFormat) +} + +func (c xpathScraperAttrConfig) replaceRegex(value string) string { + replace := c.getReplace() + return replace.apply(value) +} + +func (c xpathScraperAttrConfig) postProcess(value string) string { + // perform regex replacements first + value = c.replaceRegex(value) + value = c.parseDate(value) + + return value +} + +func commonPostProcess(value string) string { + value = strings.TrimSpace(value) + + // remove multiple whitespace and end lines + re := regexp.MustCompile("\n") + value = re.ReplaceAllString(value, "") + re = regexp.MustCompile(" +") + value = re.ReplaceAllString(value, " ") + + return value +} + +func runXPathQuery(doc *html.Node, xpath string, common commonXPathConfig) []*html.Node { + // apply common + if common != nil { + xpath = common.applyCommon(xpath) + } + + found, err := htmlquery.QueryAll(doc, xpath) + if err != nil { + logger.Warnf("Error parsing xpath expression '%s': %s", xpath, err.Error()) + return nil + } + + return found +} + +func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) xPathResults { + var ret xPathResults + + for k, value := range s { + switch v := value.(type) { + case string: + found := runXPathQuery(doc, v, common) if len(found) > 0 { for i, elem := range found { - if i >= len(ret) { - ret = append(ret, make(xPathResult)) - } + text := htmlquery.InnerText(elem) + text = commonPostProcess(text) - ret[i][k] = elem + ret = ret.setKey(i, k, text) + } + } + case map[interface{}]interface{}: + attrConfig := xpathScraperAttrConfig(v) + + found := runXPathQuery(doc, attrConfig.getSelector(), common) + + if len(found) > 0 { + // check if we're concatenating the results into a single result + if attrConfig.hasConcat() { + result := attrConfig.concatenateResults(found) + result = attrConfig.postProcess(result) + const i = 0 + ret = ret.setKey(i, k, result) + } else { + for i, elem := range found { + text := htmlquery.InnerText(elem) + text = commonPostProcess(text) + text = attrConfig.postProcess(text) + + ret = ret.setKey(i, k, text) + } } } } - // TODO - handle map type } return ret @@ -153,6 +329,24 @@ func (s xpathScraper) scrapePerformer(doc *html.Node) (*models.ScrapedPerformer, return &ret, nil } +func (s xpathScraper) scrapePerformers(doc *html.Node) ([]*models.ScrapedPerformer, error) { + var ret []*models.ScrapedPerformer + + performerMap := s.Performer + if performerMap == nil { + return nil, nil + } + + results := performerMap.process(doc, s.Common) + for _, r := range results { + var p models.ScrapedPerformer + r.apply(&p) + ret = append(ret, &p) + } + + return ret, nil +} + func (s xpathScraper) scrapeScene(doc *html.Node) (*models.ScrapedScene, error) { var ret models.ScrapedScene @@ -204,7 +398,8 @@ func (s xpathScraper) scrapeScene(doc *html.Node) (*models.ScrapedScene, error) return &ret, nil } -type xPathResult map[string]*html.Node +type xPathResult map[string]string +type xPathResults []xPathResult func (r xPathResult) apply(dest interface{}) { destVal := reflect.ValueOf(dest) @@ -212,22 +407,16 @@ func (r xPathResult) apply(dest interface{}) { // dest should be a pointer destVal = destVal.Elem() - for key, v := range r { + for key, value := range r { field := destVal.FieldByName(key) if field.IsValid() { - value := htmlquery.InnerText(v) - value = strings.TrimSpace(value) - - // remove multiple whitespace and end lines - re := regexp.MustCompile("\n") - value = re.ReplaceAllString(value, "") - re = regexp.MustCompile(" +") - value = re.ReplaceAllString(value, " ") - var reflectValue reflect.Value if field.Kind() == reflect.Ptr { - reflectValue = reflect.ValueOf(&value) + // need to copy the value, otherwise everything is set to the + // same pointer + localValue := value + reflectValue = reflect.ValueOf(&localValue) } else { reflectValue = reflect.ValueOf(value) } @@ -239,6 +428,15 @@ func (r xPathResult) apply(dest interface{}) { } } +func (r xPathResults) setKey(index int, key string, value string) xPathResults { + if index >= len(r) { + r = append(r, make(xPathResult)) + } + + r[index][key] = value + return r +} + func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) { scraper := c.scraperConfig.XPathScrapers[c.Scraper] @@ -270,3 +468,27 @@ func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene, return scraper.scrapeScene(doc) } + +func scrapePerformerNamesXPath(c scraperTypeConfig, name string) ([]*models.ScrapedPerformer, error) { + scraper := c.scraperConfig.XPathScrapers[c.Scraper] + + if scraper == nil { + return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config") + } + + const placeholder = "{}" + + // replace the placeholder string with the URL-escaped name + escapedName := url.QueryEscape(name) + + u := c.QueryURL + u = strings.Replace(u, placeholder, escapedName, -1) + + doc, err := htmlquery.LoadURL(u) + + if err != nil { + return nil, err + } + + return scraper.scrapePerformers(doc) +} diff --git a/pkg/scraper/xpath_test.go b/pkg/scraper/xpath_test.go index 7268bb6dc..8fad5f513 100644 --- a/pkg/scraper/xpath_test.go +++ b/pkg/scraper/xpath_test.go @@ -183,23 +183,50 @@ func makeCommonXPath(attr string) string { return `//table[@id="biographyTable"]//tr/td[@class="paramname"]//b[text() = '` + attr + `']/ancestor::tr/td[@class="paramvalue"]` } +func makeReplaceRegex(regex string, with string) map[interface{}]interface{} { + ret := make(map[interface{}]interface{}) + + ret["regex"] = regex + ret["with"] = with + return ret +} + func makeXPathConfig() xpathScraperConfig { config := make(xpathScraperConfig) config["Name"] = makeCommonXPath("Babe Name:") + `/a` config["Ethnicity"] = makeCommonXPath("Ethnicity:") config["Country"] = makeCommonXPath("Country of Origin:") - config["Birthdate"] = makeCommonXPath("Date of Birth:") config["Aliases"] = makeCommonXPath("Aliases:") config["EyeColor"] = makeCommonXPath("Eye Color:") config["Measurements"] = makeCommonXPath("Measurements:") config["FakeTits"] = makeCommonXPath("Fake boobs:") config["Height"] = makeCommonXPath("Height:") - // no colon in attribute header - config["CareerLength"] = makeCommonXPath("Career Start And End") config["Tattoos"] = makeCommonXPath("Tattoos:") config["Piercings"] = makeCommonXPath("Piercings:") + // special handling for birthdate + birthdateAttrConfig := make(map[interface{}]interface{}) + birthdateAttrConfig["selector"] = makeCommonXPath("Date of Birth:") + + var birthdateReplace []interface{} + birthdateReplace = append(birthdateReplace, makeReplaceRegex(` \(.* years old\)`, "")) + + birthdateAttrConfig["replace"] = birthdateReplace + birthdateAttrConfig["parseDate"] = "January 2, 2006" // "July 1, 1992 (27 years old) " + config["Birthdate"] = birthdateAttrConfig + + // special handling for career length + careerLengthAttrConfig := make(map[interface{}]interface{}) + // no colon in attribute header + careerLengthAttrConfig["selector"] = makeCommonXPath("Career Start And End") + + var careerLengthReplace []interface{} + careerLengthReplace = append(careerLengthReplace, makeReplaceRegex(`\s+\(.*\)`, "")) + careerLengthAttrConfig["replace"] = careerLengthReplace + + config["CareerLength"] = careerLengthAttrConfig + return config } @@ -240,7 +267,7 @@ func TestScrapePerformerXPath(t *testing.T) { const performerName = "Mia Malkova" const ethnicity = "Caucasian" const country = "United States" - const birthdate = "July 1, 1992 (27 years old)" + const birthdate = "1992-07-01" const aliases = "Mia Bliss, Madison Clover, Madison Swan, Mia Mountain, Jessica" const eyeColor = "Hazel" const measurements = "34C-26-36" @@ -251,19 +278,65 @@ func TestScrapePerformerXPath(t *testing.T) { verifyField(t, performerName, performer.Name, "Name") verifyField(t, ethnicity, performer.Ethnicity, "Ethnicity") verifyField(t, country, performer.Country, "Country") + verifyField(t, birthdate, performer.Birthdate, "Birthdate") + verifyField(t, aliases, performer.Aliases, "Aliases") verifyField(t, eyeColor, performer.EyeColor, "EyeColor") verifyField(t, measurements, performer.Measurements, "Measurements") verifyField(t, fakeTits, performer.FakeTits, "FakeTits") - // TODO - this needs post-processing - //verifyField(t, careerLength, performer.CareerLength, "CareerLength") + verifyField(t, careerLength, performer.CareerLength, "CareerLength") verifyField(t, tattoosPiercings, performer.Tattoos, "Tattoos") verifyField(t, tattoosPiercings, performer.Piercings, "Piercings") } +func TestConcatXPath(t *testing.T) { + const firstName = "FirstName" + const lastName = "LastName" + const eyeColor = "EyeColor" + const separator = " " + const testDoc = ` + +