package scraper

import (
	"strings"
	"testing"

	"github.com/antchfx/htmlquery"
	"github.com/stashapp/stash/pkg/models"
	"gopkg.in/yaml.v2"
)

// htmlDoc1 is the performer biography fixture used by the performer scraping
// tests.
//
// adapted from https://www.freeones.com/html/m_links/bio_Mia_Malkova.php
//
// NOTE(review): as it appears here the fixture holds only text and no HTML
// elements, while the selectors built by makeCommonXPath look for a table with
// id "biographyTable". Confirm the markup was not lost from this file.
const htmlDoc1 = ` Freeones: Mia Malkova Biography
Babe Name:
Mia Malkova 
Profession:
Porn Star
Ethnicity: Caucasian 
Country of Origin: United States
Date of Birth: July 1, 1992 (27 years old) 
Aliases: Mia Bliss, Madison Clover, Madison Swan, Mia Mountain, Jessica 
Eye Color: Hazel 
Hair Color: Blonde 
Height:  
Measurements: 34C-26-36
Fake boobs: No 
Career Start And End 2012 - 2019 (7 Years In The Business)
Tattoos: None 
Piercings: None 
Social Network Links:
`

// makeCommonXPath returns the XPath selecting the "paramvalue" cell of the
// biography table row whose "paramname" cell contains a <b> element with
// exactly the text attr.
func makeCommonXPath(attr string) string {
	return `//table[@id="biographyTable"]//tr/td[@class="paramname"]//b[text() = '` + attr + `']/ancestor::tr/td[@class="paramvalue"]`
}

// makeReplaceRegex builds one entry of a "replace" list: a map holding the
// pattern under "regex" and its replacement under "with".
func makeReplaceRegex(regex string, with string) map[interface{}]interface{} {
	return map[interface{}]interface{}{
		"regex": regex,
		"with":  with,
	}
}

// makeXPathConfig builds the performer scraper configuration used against
// htmlDoc1. Most fields are plain selectors; Birthdate and CareerLength use
// the map form so that post-processing can be attached.
func makeXPathConfig() xpathScraperConfig {
	config := make(xpathScraperConfig)

	config["Name"] = makeCommonXPath("Babe Name:") + `/a`
	config["Ethnicity"] = makeCommonXPath("Ethnicity:")
	config["Country"] = makeCommonXPath("Country of Origin:")
	config["Aliases"] = makeCommonXPath("Aliases:")
	config["EyeColor"] = makeCommonXPath("Eye Color:")
	config["Measurements"] = makeCommonXPath("Measurements:")
	config["FakeTits"] = makeCommonXPath("Fake boobs:")
	config["Height"] = makeCommonXPath("Height:")
	config["Tattoos"] = makeCommonXPath("Tattoos:")
	config["Piercings"] = makeCommonXPath("Piercings:")

	// special handling for birthdate
	birthdateAttrConfig := make(map[interface{}]interface{})
	birthdateAttrConfig["selector"] = makeCommonXPath("Date of Birth:")
	// strip the age suffix before the date is parsed
	birthdateAttrConfig["replace"] = []interface{}{
		makeReplaceRegex(` \(.* years old\)`, ""),
	}
	birthdateAttrConfig["parseDate"] = "January 2, 2006" // "July 1, 1992 (27 years old) "
	config["Birthdate"] = birthdateAttrConfig

	// special handling for career length
	careerLengthAttrConfig := make(map[interface{}]interface{})
	// no colon in attribute header
	careerLengthAttrConfig["selector"] = makeCommonXPath("Career Start And End")
	// drop the trailing parenthesised summary
	careerLengthAttrConfig["replace"] = []interface{}{
		makeReplaceRegex(`\s+\(.*\)`, ""),
	}
	config["CareerLength"] = careerLengthAttrConfig

	return config
}

// verifyField reports a test error when actual is nil or does not point to a
// string equal to expected. field is only used to label the error message.
func verifyField(t *testing.T, expected string, actual *string, field string) {
	t.Helper()

	switch {
	case actual == nil:
		t.Errorf("Expected %s to be set to %s, instead got nil", field, expected)
	case *actual != expected:
		t.Errorf("Expected %s to be set to %s, instead got %s", field, expected, *actual)
	}
}

// TestScrapePerformerXPath scrapes htmlDoc1 with the configuration from
// makeXPathConfig and checks each scraped performer field.
func TestScrapePerformerXPath(t *testing.T) {
	doc, err := htmlquery.Parse(strings.NewReader(htmlDoc1))
	if err != nil {
		t.Fatalf("Error loading document: %s", err.Error())
	}

	scraper := xpathScraper{
		Performer: makeXPathConfig(),
	}

	performer, err := scraper.scrapePerformer(doc)
	if err != nil {
		t.Fatalf("Error scraping performer: %s", err.Error())
	}

	const performerName = "Mia Malkova"
	const ethnicity = "Caucasian"
	const country = "United States"
	const birthdate = "1992-07-01"
	const aliases = "Mia Bliss, Madison Clover, Madison Swan, Mia Mountain, Jessica"
	const eyeColor = "Hazel"
	const measurements = "34C-26-36"
	const fakeTits = "No"
	const careerLength = "2012 - 2019"
	const tattoosPiercings = "None"

	verifyField(t, performerName, performer.Name, "Name")
	verifyField(t, ethnicity, performer.Ethnicity, "Ethnicity")
	verifyField(t, country, performer.Country, "Country")
	verifyField(t, birthdate, performer.Birthdate, "Birthdate")
	verifyField(t, aliases, performer.Aliases, "Aliases")
	verifyField(t, eyeColor, performer.EyeColor, "EyeColor")
	verifyField(t, measurements, performer.Measurements, "Measurements")
	verifyField(t, fakeTits, performer.FakeTits, "FakeTits")
	verifyField(t, careerLength, performer.CareerLength, "CareerLength")
	verifyField(t, tattoosPiercings, performer.Tattoos, "Tattoos")
	verifyField(t, tattoosPiercings, performer.Piercings, "Piercings")
}

// TestConcatXPath checks that a selector configured with "concat" yields its
// matches joined by the given separator, alongside a plain selector.
func TestConcatXPath(t *testing.T) {
	const firstName = "FirstName"
	const lastName = "LastName"
	const eyeColor = "EyeColor"
	const separator = " "

	// NOTE(review): as it appears here the document has no <div>/<span>
	// elements for the selectors below to match. Confirm the markup was not
	// lost from this file.
	const testDoc = `
` + firstName + `
` + lastName + `
` + eyeColor + ` `

	doc, err := htmlquery.Parse(strings.NewReader(testDoc))
	if err != nil {
		t.Fatalf("Error loading document: %s", err.Error())
	}

	nameAttrConfig := make(map[interface{}]interface{})
	nameAttrConfig["selector"] = "//div"
	nameAttrConfig["concat"] = separator

	xpathConfig := make(xpathScraperConfig)
	xpathConfig["Name"] = nameAttrConfig
	xpathConfig["EyeColor"] = "//span"

	scraper := xpathScraper{
		Performer: xpathConfig,
	}

	performer, err := scraper.scrapePerformer(doc)
	if err != nil {
		t.Fatalf("Error scraping performer: %s", err.Error())
	}

	const performerName = firstName + separator + lastName

	verifyField(t, performerName, performer.Name, "Name")
	verifyField(t, eyeColor, performer.EyeColor, "EyeColor")
}

// sceneHTML is the scene page fixture used by TestApplySceneXPathConfig.
//
// NOTE(review): as it appears here the fixture holds only text and no HTML
// elements, while makeSceneXPathConfig selects on meta, div, a, i and script
// elements. Confirm the markup was not lost from this file.
const sceneHTML = ` Test Video - Pornhub.com

Test Video

From:  - 87 videos  459466
Production:  professional
Added on: 2 months ago
Featured on: 1 month ago
Jump to your favorite action
`

// makeSceneXPathConfig builds the scene scraper used against sceneHTML. It
// defines two common fragments ($performerElem, $studioElem) that the
// performer and studio selectors reference by name.
func makeSceneXPathConfig() xpathScraper {
	common := make(commonXPathConfig)
	common["$performerElem"] = `//div[@class="pornstarsWrapper"]/a[@data-mxptype="Pornstar"]`
	common["$studioElem"] = `//div[@data-type="channel"]/a`

	config := make(xpathScraperConfig)
	config["Title"] = `//meta[@property="og:title"]/@content`
	// this needs post-processing
	config["Date"] = `//script[@type="application/ld+json"]`

	tagConfig := make(map[interface{}]interface{})
	tagConfig["Name"] = `//div[@class="categoriesWrapper"]//a[not(@class="add-btn-small ")]`
	config["Tags"] = tagConfig

	performerConfig := make(map[interface{}]interface{})
	performerConfig["Name"] = `$performerElem/@data-mxptext`
	performerConfig["URL"] = `$performerElem/@href`
	config["Performers"] = performerConfig

	studioConfig := make(map[interface{}]interface{})
	studioConfig["Name"] = `$studioElem`
	studioConfig["URL"] = `$studioElem/@href`
	config["Studio"] = studioConfig

	// the movie names come from a single attribute value split on spaces
	const sep = " "
	moviesNameConfig := make(map[interface{}]interface{})
	moviesNameConfig["selector"] = `//i[@class="isMe tooltipTrig"]/@data-title`
	moviesNameConfig["split"] = sep

	moviesConfig := make(map[interface{}]interface{})
	moviesConfig["Name"] = moviesNameConfig
	config["Movies"] = moviesConfig

	return xpathScraper{
		Scene:  config,
		Common: common,
	}
}

// verifyTags compares the scraped tag names with expectedTagNames position by
// position. When one list is shorter, its missing entries compare as the
// empty string, so a length mismatch is reported as well.
func verifyTags(t *testing.T, expectedTagNames []string, actualTags []*models.ScrapedSceneTag) {
	t.Helper()

	for i := 0; i < len(expectedTagNames) || i < len(actualTags); i++ {
		var expectedTag, actualTag string

		if i < len(expectedTagNames) {
			expectedTag = expectedTagNames[i]
		}
		if i < len(actualTags) {
			actualTag = actualTags[i].Name
		}

		if expectedTag != actualTag {
			t.Errorf("Expected tag %s, got %s", expectedTag, actualTag)
		}
	}
}

// verifyMovies compares the scraped movie names with expectedMovieNames
// position by position. When one list is shorter, its missing entries compare
// as the empty string, so a length mismatch is reported as well.
func verifyMovies(t *testing.T, expectedMovieNames []string, actualMovies []*models.ScrapedSceneMovie) {
	t.Helper()

	for i := 0; i < len(expectedMovieNames) || i < len(actualMovies); i++ {
		var expectedMovie, actualMovie string

		if i < len(expectedMovieNames) {
			expectedMovie = expectedMovieNames[i]
		}
		if i < len(actualMovies) {
			actualMovie = actualMovies[i].Name
		}

		if expectedMovie != actualMovie {
			t.Errorf("Expected movie %s, got %s", expectedMovie, actualMovie)
		}
	}
}

// verifyPerformers compares scraped performers with the expected names and
// URLs position by position. The loop runs to the longer of expectedNames and
// actualPerformers; any missing name or URL (including a nil URL on a scraped
// performer) compares as the empty string.
func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []string, actualPerformers []*models.ScrapedScenePerformer) {
	t.Helper()

	for i := 0; i < len(expectedNames) || i < len(actualPerformers); i++ {
		var expectedName, actualName string
		var expectedURL, actualURL string

		if i < len(expectedNames) {
			expectedName = expectedNames[i]
		}
		if i < len(expectedURLs) {
			expectedURL = expectedURLs[i]
		}
		if i < len(actualPerformers) {
			actualName = actualPerformers[i].Name
			if actualPerformers[i].URL != nil {
				actualURL = *actualPerformers[i].URL
			}
		}

		if expectedName != actualName {
			t.Errorf("Expected performer name %s, got %s", expectedName, actualName)
		}
		if expectedURL != actualURL {
			// report the URLs that were compared, not the names
			t.Errorf("Expected performer URL %s, got %s", expectedURL, actualURL)
		}
	}
}

// TestApplySceneXPathConfig scrapes sceneHTML with the scraper from
// makeSceneXPathConfig and checks the title, tags, movies, performers and
// studio.
func TestApplySceneXPathConfig(t *testing.T) {
	doc, err := htmlquery.Parse(strings.NewReader(sceneHTML))
	if err != nil {
		t.Fatalf("Error loading document: %s", err.Error())
	}

	scraper := makeSceneXPathConfig()

	scene, err := scraper.scrapeScene(doc)
	if err != nil {
		t.Fatalf("Error scraping scene: %s", err.Error())
	}

	const title = "Test Video"

	verifyField(t, title, scene.Title, "Title")

	// verify tags
	expectedTags := []string{
		"Amateur",
		"Babe",
		"Blowjob",
		"Exclusive",
		"HD Porn",
		"Pornstar",
		"Public",
		"Pussy Licking",
		"Threesome",
		"Verified Models",
	}
	verifyTags(t, expectedTags, scene.Tags)

	// verify movies
	expectedMovies := []string{
		"Video",
		"of",
		"verified",
		"member",
	}
	verifyMovies(t, expectedMovies, scene.Movies)

	// verify performers
	expectedPerformerNames := []string{
		"Alex D",
		"Mia Malkova",
		"Riley Reid",
	}
	expectedPerformerURLs := []string{
		"/pornstar/alex-d",
		"/pornstar/mia-malkova",
		"/pornstar/riley-reid",
	}
	verifyPerformers(t, expectedPerformerNames, expectedPerformerURLs, scene.Performers)

	// verify studio
	// NOTE(review): if scene.Studio is a pointer, a scrape that finds no
	// studio would panic here rather than fail. Confirm against the model.
	const expectedStudioName = "Sis Loves Me"
	const expectedStudioURL = "/channels/sis-loves-me"

	verifyField(t, expectedStudioName, &scene.Studio.Name, "Studio.Name")
	verifyField(t, expectedStudioURL, scene.Studio.URL, "Studio.URL")
}

// TestLoadXPathScraperFromYAML checks that a scraper definition containing an
// xPathScrapers section unmarshals into scraperConfig without error.
func TestLoadXPathScraperFromYAML(t *testing.T) {
	// NOTE(review): as it appears here the document is a single line, which
	// is unlikely to be the intended YAML. Confirm the original line breaks
	// and indentation were not lost from this file.
	const yamlStr = `name: Test performerByURL: - action: scrapeXPath url: - test.com scraper: performerScraper xPathScrapers: performerScraper: performer: name: //h1[@itemprop="name"] `

	var config scraperConfig
	if err := yaml.Unmarshal([]byte(yamlStr), &config); err != nil {
		t.Fatalf("Error loading yaml: %s", err.Error())
	}
}

// TestLoadInvalidXPath runs the configuration processor with a malformed
// XPath expression. The test makes no assertions; it only requires that the
// call returns.
func TestLoadInvalidXPath(t *testing.T) {
	config := make(xpathScraperConfig)
	config["Name"] = `//a[id=']/span`

	doc, err := htmlquery.Parse(strings.NewReader(htmlDoc1))
	if err != nil {
		t.Fatalf("Error loading document: %s", err.Error())
	}

	common := make(commonXPathConfig)
	config.process(doc, common)
}