import bs4
import lxml
import traceback
import urlparse

def Parse4chanPostScreen( html ):
    
    soup = bs4.BeautifulSoup( html, 'lxml' )
    
    title_tag = soup.find( 'title' )
    
    if title_tag.string == 'Post successful!': return ( 'success', None )
    elif title_tag.string == '4chan - Banned':
        
        print( soup )
        
        return ( 'big error', 'you are banned from this board! html written to log' )
        
    else:
        
        try:
            
            problem_tag = soup.find( id = 'errmsg' )
            
            if problem_tag is None:
                
                try: print( soup )
                except: pass
                
                return ( 'error', 'unknown problem, writing 4chan html to log' )
                
            problem = str( problem_tag )
            
            if 'CAPTCHA' in problem: return ( 'captcha', None )
            elif 'seconds' in problem: return ( 'too quick', None )
            elif 'Duplicate' in problem: return ( 'error', 'duplicate file detected' )
            else: return ( 'error', problem )
            
        except: return ( 'error', 'unknown error' )
        
    
def ParseBooruGallery( html, url_base, thumb_classname ):
    
    urls_set = set()
    urls = []
    
    soup = bs4.BeautifulSoup( html, 'lxml' )
    
    thumbnails = soup.find_all( class_ = thumb_classname )
    
    for thumbnail in thumbnails:
        
        links = thumbnail.find_all( 'a' )
        
        if thumbnail.name == 'a': links.append( thumbnail )
        
        for link in links:
            
            if link.string is not None and link.string == 'Image Only': continue # rule 34 @ paheal fix
            
            url = link[ 'href' ]
            
            url = urlparse.urljoin( url_base, url )
            
            if url not in urls_set:
                
                urls_set.add( url )
                urls.append( url )
                
            
        
    return urls
    
def ParseBooruPage( html, url_base, tag_classnames_to_namespaces, image_id = None, image_data = None ):
    
    soup = bs4.BeautifulSoup( html, 'lxml' )
    
    # callers supply one of image_id or image_data so the full image can be located
    
    if image_id is not None:
        
        image = soup.find( id = image_id )
        
        image_url = image[ 'src' ]
        
    if image_data is not None:
        
        links = soup.find_all( 'a' )
        
        for link in links:
            
            if link.string == image_data: image_url = link[ 'href' ]
            
        
    image_url = urlparse.urljoin( url_base, image_url )
    
    image_url = image_url.replace( 'sample/sample-', '' ) # fix for danbooru resizing
    
    tags = []
    
    for ( tag_classname, namespace ) in tag_classnames_to_namespaces.items():
        
        tag_list_entries = soup.find_all( class_ = tag_classname )
        
        for tag_list_entry in tag_list_entries:
            
            links = tag_list_entry.find_all( 'a' )
            
            if tag_list_entry.name == 'a': links.append( tag_list_entry )
            
            for link in links:
                
                if link.string not in ( '?', '-', '+' ):
                    
                    if namespace == '': tags.append( link.string )
                    else: tags.append( namespace + ':' + link.string )
                    
                
            
        
    return ( image_url, tags )
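# A minimal usage sketch, not part of the original module: it shows how the two booru
# parsers above chain together. The gallery url, thumb_classname, image_id and the
# tag-class -> namespace mapping are illustrative assumptions for a gelbooru-like
# site, not tested values.
def _ExampleBooruUsage():
    
    import urllib2
    
    gallery_url = 'http://gelbooru.com/index.php?page=post&s=list&tags=example' # hypothetical gallery url
    
    gallery_html = urllib2.urlopen( gallery_url ).read()
    
    post_urls = ParseBooruGallery( gallery_html, gallery_url, thumb_classname = 'thumb' )
    
    for post_url in post_urls:
        
        post_html = urllib2.urlopen( post_url ).read()
        
        # gelbooru-style posts keep the full image in an img with id 'image' and list
        # tags under 'tag-type-*' classes; both are assumptions here
        ( image_url, tags ) = ParseBooruPage( post_html, post_url, { 'tag-type-general' : '', 'tag-type-artist' : 'creator' }, image_id = 'image' )
        
        print( ( image_url, tags ) )
        
    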
def ParseDeviantArtGallery( html ):
    
    results = []
    
    soup = bs4.BeautifulSoup( html, 'lxml' )
    
    thumbs_container = soup.find( class_ = 'stream stream-fh' )
    
    def starts_with_thumb( classname ): return classname is not None and classname.startswith( 'thumb' )
    
    links = thumbs_container.find_all( 'a', class_ = starts_with_thumb )
    
    for link in links:
        
        page_url = link[ 'href' ] # something in the form of blah.da.com/art/blah-123456
        
        page_url_split = page_url.split( '-' )
        
        deviant_art_file_id = page_url_split[ -1 ]
        
        image_url = 'http://www.deviantart.com/download/' + deviant_art_file_id + '/' # trailing slash is important
        
        raw_title = link[ 'title' ] # something in the form 'sweet dolls by ~AngeniaC, Feb 29, 2012 in Artisan Crafts > Miniatures > Jewelry'
        
        ( title, raw_title ) = raw_title.split( ' by ~', 1 )
        ( creator, raw_title ) = raw_title.split( ', ', 1 )
        ( date_gumpf, raw_category_tags ) = raw_title.split( ' in ', 1 )
        
        category_tags = raw_category_tags.split( ' > ' )
        
        tags = []
        
        tags.append( 'title:' + title )
        tags.append( 'creator:' + creator )
        tags.extend( category_tags )
        
        results.append( ( image_url, tags ) )
        
    return results
    
def ParseHentaiFoundryGallery( html ):
    
    urls_set = set()
    
    soup = bs4.BeautifulSoup( html, 'lxml' )
    
    def correct_url( href ):
        
        # a good url is in the form "/pictures/user/artist_name/file_id/title"
        
        if href is None: return False
        
        if href.count( '/' ) == 5 and href.startswith( '/pictures/user/' ):
            
            ( nothing, pictures, user, artist_name, file_id, title ) = href.split( '/' )
            
            # /pictures/user/artist_name/page/3 is a pagination link, not a picture
            if file_id != 'page': return True
            
        return False
        
    links = soup.find_all( 'a', href = correct_url )
    
    urls = [ 'http://www.hentai-foundry.com' + link[ 'href' ] for link in links ]
    
    result_urls = []
    
    for url in urls:
        
        if url not in urls_set:
            
            urls_set.add( url )
            
            result_urls.append( url )
            
        
    return result_urls
    
def ParseHentaiFoundryPage( html ):
    
    # can't parse this easily normally because HF is a pain with the preview->click to see full size business.
    # find http://pictures.hentai-foundry.com//
    # then extend it to http://pictures.hentai-foundry.com//k/KABOS/172144.jpg
    # the .jpg bit is what we really need, but whatever
    
    try:
        
        index = html.index( 'http://pictures.hentai-foundry.com//' )
        
        stuff = html[ index : index + 100 ]
        
        try: ( image_url, gumpf ) = stuff.split( '"', 1 )
        except: ( image_url, gumpf ) = stuff.split( "'", 1 )
        
    except: raise Exception( 'Could not parse image url!' )
    
    soup = bs4.BeautifulSoup( html, 'lxml' )
    
    tags = []
    
    try:
        
        title = soup.find( 'title' )
        
        ( data, nothing ) = unicode( title.string ).split( ' - Hentai Foundry' )
        
        data_reversed = data[ ::-1 ] # want to do it right-side first, because title might have ' by ' in it
        
        ( artist_reversed, title_reversed ) = data_reversed.split( ' yb ' )
        
        artist = artist_reversed[ ::-1 ]
        
        title = title_reversed[ ::-1 ]
        
        tags.append( 'creator:' + artist )
        tags.append( 'title:' + title )
        
    except: pass
    
    tag_links = soup.find_all( 'a', rel = 'tag' )
    
    for tag_link in tag_links: tags.append( tag_link.string )
    
    return ( image_url, tags )
    
def ParsePage( html, starting_url ):
    
    soup = bs4.BeautifulSoup( html, 'lxml' )
    
    all_links = soup.find_all( 'a' )
    
    links_with_images = [ link for link in all_links if len( link.find_all( 'img' ) ) > 0 ]
    
    urls = [ urlparse.urljoin( starting_url, link[ 'href' ] ) for link in links_with_images ]
    
    # old version included (images that don't have a link wrapped around them)'s src
    
    return urls
    
def ParsePixivGallery( html, starting_url ):
    
    results = []
    
    soup = bs4.BeautifulSoup( html, 'lxml' )
    
    thumbnail_links = soup.find_all( class_ = 'work' )
    
    for thumbnail_link in thumbnail_links:
        
        url = urlparse.urljoin( starting_url, thumbnail_link[ 'href' ] ) # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=33500690
        
        image_url_reference_url = url.replace( 'medium', 'big' ) # http://www.pixiv.net/member_illust.php?mode=big&illust_id=33500690
        
        thumbnail_img = thumbnail_link.find( class_ = '_thumbnail' )
        
        thumbnail_image_url = thumbnail_img[ 'src' ] # http://i2.pixiv.net/img02/img/dnosuke/462657_s.jpg
        
        image_url = thumbnail_image_url.replace( '_s', '' ) # http://i2.pixiv.net/img02/img/dnosuke/462657.jpg
        
        results.append( ( url, image_url_reference_url, image_url ) )
        
    return results
    
def ParsePixivPage( image_url, html ):
    
    soup = bs4.BeautifulSoup( html, 'lxml' )
    
    tags = soup.find( 'ul', class_ = 'tags' )
    
    tags = [ a_item.string for a_item in tags.find_all( 'a', class_ = 'text' ) ]
    
    user = soup.find( 'h1', class_ = 'user' )
    
    tags.append( 'creator:' + user.string )
    
    title_parent = soup.find( 'section', class_ = 'work-info' )
    
    title = title_parent.find( 'h1', class_ = 'title' )
    
    tags.append( 'title:' + title.string )
    
    try: tags.append( 'creator:' + image_url.split( '/' )[ -2 ] ) # http://i2.pixiv.net/img02/img/dnosuke/462657.jpg -> dnosuke
    except: pass
    
    return tags
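# A minimal usage sketch, not part of the original module: it chains ParsePixivGallery
# and ParsePixivPage. The member gallery url is a hypothetical example, and real pixiv
# pages are only served in full to a logged-in session, which this plain urlopen call
# does not provide.
def _ExamplePixivUsage():
    
    import urllib2
    
    gallery_url = 'http://www.pixiv.net/member_illust.php?id=123456' # hypothetical artist gallery url
    
    gallery_html = urllib2.urlopen( gallery_url ).read()
    
    for ( page_url, image_url_reference_url, image_url ) in ParsePixivGallery( gallery_html, gallery_url ):
        
        page_html = urllib2.urlopen( page_url ).read()
        
        tags = ParsePixivPage( image_url, page_html )
        
        print( ( image_url, tags ) )
        
    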