# hydrus/include/ClientParsers.py
#
# 319 lines
# 9.2 KiB
# Python
# Raw Normal View History
#
# 2013-02-19 00:11:43 +00:00
import bs4
import lxml
import traceback
import urlparse
def Parse4chanPostScreen( html ):
    """Classify the html that comes back after trying to post to 4chan.

    Returns a ( status, message ) pair, where status is one of:
        'success'   - the page title says the post went through
        'big error' - the page title says the user is banned
        'captcha'   - the error message mentions CAPTCHA
        'too quick' - the error message mentions seconds
        'error'     - anything else; message describes the problem

    message is None for 'success', 'captcha' and 'too quick'.
    """

    soup = bs4.BeautifulSoup( html )

    title_tag = soup.find( 'title' )

    # find returns None when the page has no title element; treat that like
    # any other unrecognised title and fall through to the error handling
    if title_tag is None: title = None
    else: title = title_tag.string

    if title == 'Post successful!': return ( 'success', None )

    if title == '4chan - Banned':

        # dumping the page is best-effort; a failed print must not hide the ban
        try: print( soup )
        except Exception: pass

        return ( 'big error', 'you are banned from this board! html written to log' )

    try:

        problem_tag = soup.find( id = 'errmsg' )

        if problem_tag is None:

            try: print( soup )
            except Exception: pass

            return ( 'error', 'unknown problem, writing 4chan html to log' )

        problem = str( problem_tag )

        if 'CAPTCHA' in problem: return ( 'captcha', None )
        elif 'seconds' in problem: return ( 'too quick', None )
        elif 'Duplicate' in problem: return ( 'error', 'duplicate file detected' )
        else: return ( 'error', problem )

    except Exception: return ( 'error', 'unknown error' )
def ParseBooruGallery( html, url_base, thumb_classname ):
    """Collect the post page urls linked from a booru gallery page.

    Every element carrying the css class thumb_classname is searched for
    anchors (the element itself counts when it is an anchor). Each href is
    resolved against url_base. Returns the distinct urls in the order they
    were first seen.
    """

    soup = bs4.BeautifulSoup( html )

    seen = set()
    ordered_urls = []

    for thumb in soup.find_all( class_ = thumb_classname ):

        anchors = thumb.find_all( 'a' )

        if thumb.name == 'a': anchors.append( thumb )

        for anchor in anchors:

            # rule 34 @ paheal fix
            if anchor.string == 'Image Only': continue

            absolute_url = urlparse.urljoin( url_base, anchor[ 'href' ] )

            if absolute_url in seen: continue

            seen.add( absolute_url )

            ordered_urls.append( absolute_url )

    return ordered_urls
def ParseBooruPage( html, url_base, tag_classnames_to_namespaces, image_id = None, image_data = None ):
    """Extract the image url and the tags from a booru post page.

    html is the page source and url_base is the url used to resolve relative
    links. tag_classnames_to_namespaces maps the css class of a tag list
    entry to the namespace its tags are given ( '' for no namespace ).

    The image is located by the id of an element whose src is the image
    ( image_id ) and/or by the exact text of an anchor whose href is the image
    ( image_data ). When both locate something, the anchor wins.

    Returns ( image_url, tags ). Raises Exception if no image url is found.
    """

    soup = bs4.BeautifulSoup( html )

    image_url = None

    if image_id is not None:

        image = soup.find( id = image_id )

        if image is not None: image_url = image.get( 'src' )

    if image_data is not None:

        for link in soup.find_all( 'a' ):

            # no break here: as before, the last matching link wins
            if link.string == image_data:

                href = link.get( 'href' )

                if href is not None: image_url = href

    # previously this case surfaced as an unbound local or a TypeError on None
    if image_url is None: raise Exception( 'Could not find the image url!' )

    image_url = urlparse.urljoin( url_base, image_url )

    image_url = image_url.replace( 'sample/sample-', '' ) # fix for danbooru resizing

    tags = []

    for ( tag_classname, namespace ) in tag_classnames_to_namespaces.items():

        for tag_list_entry in soup.find_all( class_ = tag_classname ):

            links = tag_list_entry.find_all( 'a' )

            if tag_list_entry.name == 'a': links.append( tag_list_entry )

            for link in links:

                tag = link.string

                # .string is None for anchors that do not hold a single piece of text;
                # '?', '-' and '+' are skipped as before
                if tag is None or tag in ( '?', '-', '+' ): continue

                if namespace == '': tags.append( tag )
                else: tags.append( namespace + ':' + tag )

    return ( image_url, tags )
def ParseDeviantArtGallery( html ):
    """Parse a deviantart gallery page into ( image_url, tags ) pairs.

    Looks inside the element with class 'stream stream-fh' for anchors whose
    class starts with 'thumb'. The download url is built from the last
    '-'-separated piece of the anchor's href, and the tags (title, creator,
    categories) are cut out of the anchor's title attribute.
    """

    soup = bs4.BeautifulSoup( html )

    container = soup.find( class_ = 'stream stream-fh' )

    def is_thumb_class( classname ):

        if classname is None: return False

        return classname.startswith( 'thumb' )

    parsed = []

    for anchor in container.find_all( 'a', class_ = is_thumb_class ):

        # something in the form of blah.da.com/art/blah-123456
        page_url = anchor[ 'href' ]

        file_id = page_url.rsplit( '-', 1 )[ -1 ]

        # trailing slash is important
        image_url = 'http://www.deviantart.com/download/' + file_id + '/'

        # something in the form sweet dolls by ~AngeniaC, Feb 29, 2012 in Artisan Crafts > Miniatures > Jewelry
        remainder = anchor[ 'title' ]

        ( title, remainder ) = remainder.split( ' by ~', 1 )

        ( creator, remainder ) = remainder.split( ', ', 1 )

        ( date_gumpf, raw_categories ) = remainder.split( ' in ', 1 )

        tags = [ 'title:' + title, 'creator:' + creator ] + raw_categories.split( ' > ' )

        parsed.append( ( image_url, tags ) )

    return parsed
def ParseHentaiFoundryGallery( html ):
    """Collect the picture page urls linked from a hentai foundry gallery page.

    Returns the distinct absolute urls, in the order they first appear.
    """

    soup = bs4.BeautifulSoup( html )

    def correct_url( href ):

        # the filter is handed None for anchors that have no href at all
        if href is None: return False

        # a good url is in the form "/pictures/user/artist_name/file_id/title"
        if href.count( '/' ) != 5 or not href.startswith( '/pictures/user/' ): return False

        ( nothing, pictures, user, artist_name, file_id, title ) = href.split( '/' )

        # /pictures/user/artist_name/page/3 has the right shape but is not a picture page
        return file_id != 'page'

    links = soup.find_all( 'a', href = correct_url )

    urls_set = set()

    result_urls = []

    for link in links:

        url = 'http://www.hentai-foundry.com' + link[ 'href' ]

        if url not in urls_set:

            urls_set.add( url )

            result_urls.append( url )

    return result_urls
def ParseHentaiFoundryPage( html ):
    """Extract the image url and the tags from a hentai foundry picture page.

    Returns ( image_url, tags ). tags holds 'creator:' and 'title:' entries
    when the page title has the form '<title> by <artist> - Hentai Foundry',
    followed by the text of every anchor with rel 'tag'.

    Raises Exception( 'Could not parse image url!' ) when the picture url
    cannot be found in the page.
    """

    # can't parse this easily normally because HF is a pain with the preview->click to see full size business.
    # find http://pictures.hentai-foundry.com//
    # then extend it to http://pictures.hentai-foundry.com//k/KABOS/172144.jpg
    # the .jpg bit is what we really need, but whatever

    url_prefix = 'http://pictures.hentai-foundry.com//'

    try: index = html.index( url_prefix )
    except ValueError: raise Exception( 'Could not parse image url!' )

    stuff = html[ index : index + 100 ]

    image_url = None

    # the url runs up to the next quote character; double quotes are tried first
    for quote in ( '"', "'" ):

        if quote in stuff:

            image_url = stuff.split( quote, 1 )[0]

            break

    if image_url is None: raise Exception( 'Could not parse image url!' )

    soup = bs4.BeautifulSoup( html )

    tags = []

    title_tag = soup.find( 'title' )

    if title_tag is None: title_string = None
    else: title_string = title_tag.string

    if title_string is not None:

        ( data, suffix, nothing ) = title_string.rpartition( ' - Hentai Foundry' )

        # cut at the last ' by ', because the title itself might have ' by ' in it
        ( title, separator, artist ) = data.rpartition( ' by ' )

        # rpartition leaves the separator empty when it was not found
        if suffix != '' and separator != '':

            tags.append( 'creator:' + artist )
            tags.append( 'title:' + title )

    for tag_link in soup.find_all( 'a', rel = 'tag' ):

        # .string is None for anchors that do not hold a single piece of text
        if tag_link.string is not None: tags.append( tag_link.string )

    return ( image_url, tags )
# 2013-03-15 02:38:12 +00:00
def ParsePage( html, starting_url ):
    """Return the targets of every link on the page that wraps an image.

    Each href is resolved against starting_url. Urls are returned in document
    order and are not de-duplicated.
    """

    soup = bs4.BeautifulSoup( html )

    # href = True keeps only anchors that actually have an href, so an anchor
    # with no href wrapped around an image no longer raises KeyError below
    all_links = soup.find_all( 'a', href = True )

    links_with_images = [ link for link in all_links if link.find( 'img' ) is not None ]

    urls = [ urlparse.urljoin( starting_url, link[ 'href' ] ) for link in links_with_images ]

    # the old version also included the src of images that have no link wrapped around them

    return urls
def ParsePixivGallery( html, starting_url ):
    """Parse a pixiv gallery page into one tuple per thumbnail.

    Each result is ( url, image_url_reference_url, image_url ):
        url                     - the work's page, resolved against starting_url
        image_url_reference_url - url with 'medium' swapped for 'big'
        image_url               - the thumbnail's src with the '_s' marker
                                  removed from its filename
    """

    results = []

    soup = bs4.BeautifulSoup( html )

    for thumbnail_link in soup.find_all( class_ = 'work' ):

        url = urlparse.urljoin( starting_url, thumbnail_link[ 'href' ] ) # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=33500690

        image_url_reference_url = url.replace( 'medium', 'big' ) # http://www.pixiv.net/member_illust.php?mode=big&illust_id=33500690

        thumbnail_img = thumbnail_link.find( class_ = '_thumbnail' )

        thumbnail_image_url = thumbnail_img[ 'src' ] # http://i2.pixiv.net/img02/img/dnosuke/462657_s.jpg

        # only touch the filename: the directory before it is the artist's name,
        # which may itself contain '_s' and must survive intact
        ( directory, slash, filename ) = thumbnail_image_url.rpartition( '/' )

        image_url = directory + slash + filename.replace( '_s', '' ) # http://i2.pixiv.net/img02/img/dnosuke/462657.jpg

        results.append( ( url, image_url_reference_url, image_url ) )

    return results
def ParsePixivPage( image_url, html ):
    """Build the tag list for a pixiv work page.

    The result holds, in order: the text of every anchor with class 'text'
    inside the 'tags' list, a 'creator:' tag from the 'user' heading, a
    'title:' tag from the 'title' heading inside the 'work-info' section, and,
    when image_url can be split that far, a second 'creator:' tag taken from
    the url's second-to-last path piece.
    """

    soup = bs4.BeautifulSoup( html )

    tag_list = soup.find( 'ul', class_ = 'tags' )

    tags = []

    for anchor in tag_list.find_all( 'a', class_ = 'text' ):

        tags.append( anchor.string )

    user_heading = soup.find( 'h1', class_ = 'user' )

    tags.append( 'creator:' + user_heading.string )

    work_info = soup.find( 'section', class_ = 'work-info' )

    title_heading = work_info.find( 'h1', class_ = 'title' )

    tags.append( 'title:' + title_heading.string )

    # http://i2.pixiv.net/img02/img/dnosuke/462657.jpg -> dnosuke
    try:

        url_pieces = image_url.split( '/' )

        tags.append( 'creator:' + url_pieces[ -2 ] )

    except: pass

    return tags
# 2013-02-19 00:11:43 +00:00