2013-02-19 00:11:43 +00:00
|
|
|
import bs4
|
|
|
|
import lxml
|
|
|
|
import traceback
|
|
|
|
import urlparse
|
|
|
|
|
|
|
|
def Parse4chanPostScreen( html ):
    """Classify the HTML page 4chan returns after a post attempt.

    Returns a ( status, detail ) tuple:

    ( 'success', None )    -- the page title is 'Post successful!'
    ( 'big error', text )  -- the page title is '4chan - Banned'
    ( 'captcha', None )    -- the 'errmsg' element mentions 'CAPTCHA'
    ( 'too quick', None )  -- the 'errmsg' element mentions 'seconds'
    ( 'error', text )      -- duplicate file, any other 'errmsg' content
                              (returned as the element's markup), a page
                              with no 'errmsg' element, or an unexpected
                              failure while inspecting the page

    In the banned and no-'errmsg' cases the parsed page is printed for
    diagnosis; the returned text refers to this as writing to the log.
    """

    soup = bs4.BeautifulSoup( html )

    title_tag = soup.find( 'title' )

    # A page without a <title> used to raise AttributeError here, outside
    # any error handling. Treat it like an unrecognised title so it falls
    # through to the errmsg inspection below.
    if title_tag is None: title = None
    else: title = title_tag.string

    if title == 'Post successful!': return ( 'success', None )

    if title == '4chan - Banned':

        # Printing is diagnostic only; a failure to print must not hide
        # the ban result from the caller.
        try: print( soup )
        except Exception: pass

        return ( 'big error', 'you are banned from this board! html written to log' )

    try:

        problem_tag = soup.find( id = 'errmsg' )

        if problem_tag is None:

            # Best-effort dump of the unrecognised page.
            try: print( soup )
            except Exception: pass

            return ( 'error', 'unknown problem, writing 4chan html to log' )

        problem = str( problem_tag )

        # The checks are order-sensitive: the first matching keyword wins.
        if 'CAPTCHA' in problem: return ( 'captcha', None )
        elif 'seconds' in problem: return ( 'too quick', None )
        elif 'Duplicate' in problem: return ( 'error', 'duplicate file detected' )
        else: return ( 'error', problem )

    except Exception: return ( 'error', 'unknown error' )
|
|
|
|
|
|
|
|
|
2013-03-15 02:38:12 +00:00
|
|
|
def ParsePage( html, starting_url ):
    """Return the link targets of every anchor that wraps an image.

    html         -- markup of the page to scan
    starting_url -- URL the page was fetched from; each href is resolved
                    against it with urlparse.urljoin

    Returns a list of URLs in document order. Anchors that contain no
    <img>, or that have no href attribute, are skipped.
    """

    soup = bs4.BeautifulSoup( html )

    urls = []

    for link in soup.find_all( 'a' ):

        # Only links wrapped around at least one image are of interest.
        if link.find( 'img' ) is None: continue

        href = link.get( 'href' )

        # An <a> without an href (e.g. a named anchor) has no target;
        # indexing link[ 'href' ] would raise KeyError and abort the parse.
        if href is None: continue

        urls.append( urlparse.urljoin( starting_url, href ) )

    # An older version also collected the src of images that had no link
    # wrapped around them; that is intentionally no longer done.

    return urls
|
2013-02-19 00:11:43 +00:00
|
|
|
|