package scraper

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"strings"
	"testing"

	"github.com/antchfx/htmlquery"
	"github.com/stashapp/stash/pkg/models"
	"github.com/stretchr/testify/assert"
	"gopkg.in/yaml.v2"
)

// adapted from https://www.freeones.com/html/m_links/bio_Mia_Malkova.php
const htmlDoc1 = `
<html>
<head>
	<title>Freeones: Mia Malkova Biography</title>
</head>
<body>
	<table id="biographyTable">
		<tr>
			<td class="paramname"><b>Babe Name:</b></td>
			<td class="paramvalue"><a href="/html/m_links/bio_Mia_Malkova.php">Mia Malkova </a></td>
		</tr>
		<tr>
			<td class="paramname"><b>Profession:</b></td>
			<td class="paramvalue">Porn Star</td>
		</tr>
		<tr>
			<td class="paramname"><b>Ethnicity:</b></td>
			<td class="paramvalue">Caucasian </td>
		</tr>
		<tr>
			<td class="paramname"><b>Country of Origin:</b></td>
			<td class="paramvalue">United States</td>
		</tr>
		<tr>
			<td class="paramname"><b>Date of Birth:</b></td>
			<td class="paramvalue">July 1, 1992 (27 years old) </td>
		</tr>
		<tr>
			<td class="paramname"><b>Aliases:</b></td>
			<td class="paramvalue">Mia Bliss, Madison Clover, Madison Swan, Mia Mountain, Jessica </td>
		</tr>
		<tr>
			<td class="paramname"><b>Eye Color:</b></td>
			<td class="paramvalue">Hazel </td>
		</tr>
		<tr>
			<td class="paramname"><b>Hair Color:</b></td>
			<td class="paramvalue">Blonde </td>
		</tr>
		<tr>
			<td class="paramname"><b>Height:</b></td>
			<td class="paramvalue">5ft7</td>
		</tr>
		<tr>
			<td class="paramname"><b>Measurements:</b></td>
			<td class="paramvalue">34C-26-36</td>
		</tr>
		<tr>
			<td class="paramname"><b>Fake boobs:</b></td>
			<td class="paramvalue">No </td>
		</tr>
		<tr>
			<td class="paramname"><b>Career Start And End</b></td>
			<td class="paramvalue">2012 - 2019 (7 Years In The Business)</td>
		</tr>
		<tr>
			<td class="paramname"><b>Tattoos:</b></td>
			<td class="paramvalue">None </td>
		</tr>
		<tr>
			<td class="paramname"><b>Piercings:</b></td>
			<td class="paramvalue"><!-- -->;</td>
		</tr>
		<tr>
			<td class="paramname"><b>Social Network Links:</b></td>
			<td class="paramvalue"></td>
		</tr>
	</table>
</body>
</html>
`

func makeCommonXPath(attr string) string {
	return `//table[@id="biographyTable"]//tr/td[@class="paramname"]//b[text() = '` + attr + `']/ancestor::tr/td[@class="paramvalue"]`
}

func makeSimpleAttrConfig(str string) mappedScraperAttrConfig {
	return mappedScraperAttrConfig{
		Selector: str,
	}
}

func makeReplaceRegex(regex string, with string) mappedRegexConfig {
	ret := mappedRegexConfig{
		Regex: regex,
		With:  with,
	}

	return ret
}

func makeXPathConfig() mappedPerformerScraperConfig {
	config := mappedPerformerScraperConfig{
		mappedConfig: make(mappedConfig),
	}

	config.mappedConfig["Name"] = makeSimpleAttrConfig(makeCommonXPath("Babe Name:") + `/a`)
	config.mappedConfig["Ethnicity"] = makeSimpleAttrConfig(makeCommonXPath("Ethnicity:"))
	config.mappedConfig["Aliases"] = makeSimpleAttrConfig(makeCommonXPath("Aliases:"))
	config.mappedConfig["EyeColor"] = makeSimpleAttrConfig(makeCommonXPath("Eye Color:"))
	config.mappedConfig["Measurements"] = makeSimpleAttrConfig(makeCommonXPath("Measurements:"))
	config.mappedConfig["FakeTits"] = makeSimpleAttrConfig(makeCommonXPath("Fake boobs:"))
	config.mappedConfig["Tattoos"] = makeSimpleAttrConfig(makeCommonXPath("Tattoos:"))
	config.mappedConfig["Piercings"] = makeSimpleAttrConfig(makeCommonXPath("Piercings:") + "/comment()")

	// special handling for birthdate
	birthdateAttrConfig := makeSimpleAttrConfig(makeCommonXPath("Date of Birth:"))

	var birthdateReplace mappedRegexConfigs
	// make this leave the trailing space to test existing scrapers that do so
	birthdateReplace = append(birthdateReplace, makeReplaceRegex(`\(.* years old\)`, ""))

	birthdateReplaceAction := postProcessReplace(birthdateReplace)
	birthdateParseDate := postProcessParseDate("January 2, 2006")

	// "July 1, 1992 (27 years old) "
	birthdateAttrConfig.postProcessActions = []postProcessAction{
		&birthdateReplaceAction,
		&birthdateParseDate,
	}
	config.mappedConfig["Birthdate"] = birthdateAttrConfig

	// special handling for career length
	// no colon in attribute header
	careerLengthAttrConfig := makeSimpleAttrConfig(makeCommonXPath("Career Start And End"))

	var careerLengthReplace mappedRegexConfigs
	careerLengthReplace = append(careerLengthReplace, makeReplaceRegex(`\s+\(.*\)`, ""))
	careerLengthReplaceAction := postProcessReplace(careerLengthReplace)

	careerLengthAttrConfig.postProcessActions = []postProcessAction{
		&careerLengthReplaceAction,
	}
	config.mappedConfig["CareerLength"] = careerLengthAttrConfig

	// use map post-process action for gender
	genderConfig := makeSimpleAttrConfig(makeCommonXPath("Profession:"))
	genderMapAction := make(postProcessMap)
	genderMapAction["Porn Star"] = "Female"

	genderConfig.postProcessActions = []postProcessAction{
		&genderMapAction,
	}
	config.mappedConfig["Gender"] = genderConfig

	// use fixed for country
	config.mappedConfig["Country"] = mappedScraperAttrConfig{
		Fixed: "United States",
	}

	heightConfig := makeSimpleAttrConfig(makeCommonXPath("Height:"))
	heightConvAction := postProcessFeetToCm(true)
	heightConfig.postProcessActions = []postProcessAction{
		&heightConvAction,
	}
	config.mappedConfig["Height"] = heightConfig

	return config
}

func verifyField(t *testing.T, expected string, actual *string, field string) {
	t.Helper()

	if actual == nil || *actual != expected {
		if actual == nil {
			t.Errorf("Expected %s to be set to %s, instead got nil", field, expected)
		} else {
			t.Errorf("Expected %s to be set to %s, instead got %s", field, expected, *actual)
		}
	}
}

func TestScrapePerformerXPath(t *testing.T) {
	reader := strings.NewReader(htmlDoc1)
	doc, err := htmlquery.Parse(reader)

	if err != nil {
t.Errorf("Error loading document: %s", err.Error()) return } xpathConfig := makeXPathConfig() scraper := mappedScraper{ Performer: &xpathConfig, } q := &xpathQuery{ doc: doc, } performer, err := scraper.scrapePerformer(q) if err != nil { t.Errorf("Error scraping performer: %s", err.Error()) return } const performerName = "Mia Malkova" const ethnicity = "Caucasian" const country = "United States" const birthdate = "1992-07-01" const aliases = "Mia Bliss, Madison Clover, Madison Swan, Mia Mountain, Jessica" const eyeColor = "Hazel" const measurements = "34C-26-36" const fakeTits = "No" const careerLength = "2012 - 2019" const tattoos = "None" const piercings = "" const gender = "Female" const height = "170" verifyField(t, performerName, performer.Name, "Name") verifyField(t, gender, performer.Gender, "Gender") verifyField(t, ethnicity, performer.Ethnicity, "Ethnicity") verifyField(t, country, performer.Country, "Country") verifyField(t, birthdate, performer.Birthdate, "Birthdate") verifyField(t, aliases, performer.Aliases, "Aliases") verifyField(t, eyeColor, performer.EyeColor, "EyeColor") verifyField(t, measurements, performer.Measurements, "Measurements") verifyField(t, fakeTits, performer.FakeTits, "FakeTits") verifyField(t, careerLength, performer.CareerLength, "CareerLength") verifyField(t, tattoos, performer.Tattoos, "Tattoos") verifyField(t, piercings, performer.Piercings, "Piercings") verifyField(t, height, performer.Height, "Height") } func TestConcatXPath(t *testing.T) { const firstName = "FirstName" const lastName = "LastName" const eyeColor = "EyeColor" const separator = " " const testDoc = `
<html>
<div>` + firstName + `</div>
<div>` + lastName + `</div>
<span>` + eyeColor + `</span>
</html>
`

	reader := strings.NewReader(testDoc)
	doc, err := htmlquery.Parse(reader)

	if err != nil {
		t.Errorf("Error loading document: %s", err.Error())
		return
	}

	xpathConfig := make(mappedConfig)
	nameAttrConfig := mappedScraperAttrConfig{
		Selector: "//div",
		Concat:   separator,
	}
	xpathConfig["Name"] = nameAttrConfig
	xpathConfig["EyeColor"] = makeSimpleAttrConfig("//span")

	scraper := mappedScraper{
		Performer: &mappedPerformerScraperConfig{
			mappedConfig: xpathConfig,
		},
	}

	q := &xpathQuery{
		doc: doc,
	}

	performer, err := scraper.scrapePerformer(q)
	if err != nil {
		t.Errorf("Error scraping performer: %s", err.Error())
		return
	}

	const performerName = firstName + separator + lastName

	verifyField(t, performerName, performer.Name, "Name")
	verifyField(t, eyeColor, performer.EyeColor, "EyeColor")
}

const sceneHTML = `
<html>
<head>
	<title>Test Video - Pornhub.com</title>
	<meta property="og:title" content="Test Video" />
	<script type="application/ld+json">
	{
		"@type": "VideoObject",
		"name": "Test Video",
		"uploadDate": "2019-06-01"
	}
	</script>
</head>
<body>
	<div class="pornstarsWrapper">
		<a data-mxptype="Pornstar" data-mxptext="Alex D" href="/pornstar/alex-d">Alex D</a>
		<a data-mxptype="Pornstar" data-mxptext="Mia Malkova" href="/pornstar/mia-malkova">Mia Malkova</a>
		<a data-mxptype="Pornstar" data-mxptext="Riley Reid" href="/pornstar/riley-reid">Riley Reid</a>
	</div>
	<div class="categoriesWrapper">
		<a>Amateur</a>
		<a>Babe</a>
		<a>Blowjob</a>
		<a>Exclusive</a>
		<a>HD Porn</a>
		<a>Pornstar</a>
		<a>Public</a>
		<a>Pussy Licking</a>
		<a>Threesome</a>
		<a>Verified Models</a>
		<a class="add-btn-small ">+ Suggest</a>
	</div>
	<div data-type="channel">
		<a href="/channels/sis-loves-me">Sis Loves Me</a>
	</div>
	<i class="isMe tooltipTrig" data-title="Video of verified member"></i>
</body>
</html>
`

func makeSceneXPathConfig() mappedScraper {
	common := make(commonMappedConfig)

	common["$performerElem"] = `//div[@class="pornstarsWrapper"]/a[@data-mxptype="Pornstar"]`
	common["$studioElem"] = `//div[@data-type="channel"]/a`

	config := mappedSceneScraperConfig{
		mappedConfig: make(mappedConfig),
	}

	config.mappedConfig["Title"] = makeSimpleAttrConfig(`//meta[@property="og:title"]/@content`)

	// this needs post-processing
	config.mappedConfig["Date"] = makeSimpleAttrConfig(`//script[@type="application/ld+json"]`)

	tagConfig := make(mappedConfig)
	tagConfig["Name"] = makeSimpleAttrConfig(`//div[@class="categoriesWrapper"]//a[not(@class="add-btn-small ")]`)
	config.Tags = tagConfig

	performerConfig := make(mappedConfig)
	performerConfig["Name"] = makeSimpleAttrConfig(`$performerElem/@data-mxptext`)
	performerConfig["URL"] = makeSimpleAttrConfig(`$performerElem/@href`)
	config.Performers = performerConfig

	studioConfig := make(mappedConfig)
	studioConfig["Name"] = makeSimpleAttrConfig(`$studioElem`)
	studioConfig["URL"] = makeSimpleAttrConfig(`$studioElem/@href`)
	config.Studio = studioConfig

	const sep = " "
	moviesNameConfig := mappedScraperAttrConfig{
		Selector: `//i[@class="isMe tooltipTrig"]/@data-title`,
		Split:    sep,
	}
	moviesConfig := make(mappedConfig)
	moviesConfig["Name"] = moviesNameConfig
	config.Movies = moviesConfig

	scraper := mappedScraper{
		Scene:  &config,
		Common: common,
	}

	return scraper
}

func verifyTags(t *testing.T, expectedTagNames []string, actualTags []*models.ScrapedSceneTag) {
	t.Helper()

	i := 0
	for i < len(expectedTagNames) || i < len(actualTags) {
		expectedTag := ""
		actualTag := ""
		if i < len(expectedTagNames) {
			expectedTag = expectedTagNames[i]
		}
		if i < len(actualTags) {
			actualTag = actualTags[i].Name
		}

		if expectedTag != actualTag {
			t.Errorf("Expected tag %s, got %s", expectedTag, actualTag)
		}

		i++
	}
}

func verifyMovies(t *testing.T, expectedMovieNames []string, actualMovies []*models.ScrapedSceneMovie) {
	t.Helper()

	i := 0
	for i < len(expectedMovieNames) || i < len(actualMovies) {
		expectedMovie := ""
		actualMovie := ""
		if i < len(expectedMovieNames) {
			expectedMovie = expectedMovieNames[i]
		}
		if i < len(actualMovies) {
			actualMovie = actualMovies[i].Name
		}

		if expectedMovie != actualMovie {
			t.Errorf("Expected movie %s, got %s", expectedMovie, actualMovie)
		}

		i++
	}
}

func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []string, actualPerformers []*models.ScrapedScenePerformer) {
	t.Helper()

	i := 0
	for i < len(expectedNames) || i < len(actualPerformers) {
		expectedName := ""
		actualName := ""
		expectedURL := ""
		actualURL := ""
		if i < len(expectedNames) {
			expectedName = expectedNames[i]
		}
		if i < len(expectedURLs) {
			expectedURL = expectedURLs[i]
		}
		if i < len(actualPerformers) {
			actualName = actualPerformers[i].Name
			if actualPerformers[i].URL != nil {
				actualURL = *actualPerformers[i].URL
			}
		}

		if expectedName != actualName {
			t.Errorf("Expected performer name %s, got %s", expectedName, actualName)
		}

		if expectedURL != actualURL {
			t.Errorf("Expected performer URL %s, got %s", expectedURL, actualURL)
		}

		i++
	}
}

func TestApplySceneXPathConfig(t *testing.T) {
	reader := strings.NewReader(sceneHTML)
	doc, err := htmlquery.Parse(reader)

	if err != nil {
		t.Errorf("Error loading document: %s", err.Error())
		return
	}

	scraper := makeSceneXPathConfig()

	q := &xpathQuery{
		doc: doc,
	}

	scene, err := scraper.scrapeScene(q)
	if err != nil {
		t.Errorf("Error scraping scene: %s", err.Error())
		return
	}

	const title = "Test Video"
	verifyField(t, title, scene.Title, "Title")

	// verify tags
	expectedTags := []string{
"Amateur", "Babe", "Blowjob", "Exclusive", "HD Porn", "Pornstar", "Public", "Pussy Licking", "Threesome", "Verified Models", } verifyTags(t, expectedTags, scene.Tags) // verify movies expectedMovies := []string{ "Video", "of", "verified", "member", } verifyMovies(t, expectedMovies, scene.Movies) expectedPerformerNames := []string{ "Alex D", "Mia Malkova", "Riley Reid", } expectedPerformerURLs := []string{ "/pornstar/alex-d", "/pornstar/mia-malkova", "/pornstar/riley-reid", } verifyPerformers(t, expectedPerformerNames, expectedPerformerURLs, scene.Performers) const expectedStudioName = "Sis Loves Me" const expectedStudioURL = "/channels/sis-loves-me" verifyField(t, expectedStudioName, &scene.Studio.Name, "Studio.Name") verifyField(t, expectedStudioURL, scene.Studio.URL, "Studio.URL") } func TestLoadXPathScraperFromYAML(t *testing.T) { const yamlStr = `name: Test performerByURL: - action: scrapeXPath url: - test.com scraper: performerScraper xPathScrapers: performerScraper: performer: name: //h1[@itemprop="name"] sceneScraper: scene: Title: selector: //title postProcess: - parseDate: January 2, 2006 Tags: Name: //tags Movies: Name: //movies Performers: Name: //performers Studio: Name: //studio ` c := &config{} err := yaml.Unmarshal([]byte(yamlStr), &c) if err != nil { t.Errorf("Error loading yaml: %s", err.Error()) return } // ensure fields are filled in correctly sceneScraper := c.XPathScrapers["sceneScraper"] sceneConfig := sceneScraper.Scene assert.Equal(t, "//title", sceneConfig.mappedConfig["Title"].Selector) assert.Equal(t, "//tags", sceneConfig.Tags["Name"].Selector) assert.Equal(t, "//movies", sceneConfig.Movies["Name"].Selector) assert.Equal(t, "//performers", sceneConfig.Performers["Name"].Selector) assert.Equal(t, "//studio", sceneConfig.Studio["Name"].Selector) postProcess := sceneConfig.mappedConfig["Title"].postProcessActions parseDate := postProcess[0].(*postProcessParseDate) assert.Equal(t, "January 2, 2006", string(*parseDate)) } func TestLoadInvalidXPath(t *testing.T) { config := make(mappedConfig) config["Name"] = makeSimpleAttrConfig(`//a[id=']/span`) reader := strings.NewReader(htmlDoc1) doc, err := htmlquery.Parse(reader) if err != nil { t.Errorf("Error loading document: %s", err.Error()) return } q := &xpathQuery{ doc: doc, } config.process(q, nil) } func TestSubScrape(t *testing.T) { retHTML := `
<div>
	<a href="/getName">A link</a>
</div>
`

	ssHTML := `<span>The name</span>`

	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/getName" {
			fmt.Fprint(w, ssHTML)
		} else {
			fmt.Fprint(w, retHTML)
		}
	}))
	defer ts.Close()

	yamlStr := `name: Test
performerByURL:
  - action: scrapeXPath
    url:
      - ` + ts.URL + `
    scraper: performerScraper
xPathScrapers:
  performerScraper:
    performer:
      Name:
        selector: //div/a/@href
        postProcess:
          - replace:
              - regex: ^
                with: ` + ts.URL + `
          - subScraper:
              selector: //span
`

	c := &config{}
	err := yaml.Unmarshal([]byte(yamlStr), &c)

	if err != nil {
		t.Errorf("Error loading yaml: %s", err.Error())
		return
	}

	globalConfig := GlobalConfig{}
	performer, err := c.ScrapePerformerURL(ts.URL, globalConfig)

	if err != nil {
		t.Errorf("Error scraping performer: %s", err.Error())
		return
	}

	verifyField(t, "The name", performer.Name, "Name")
}