Migrate html2lrf, web2lrf to python logging framework

Kovid Goyal 2007-08-12 21:59:36 +00:00
parent 1f24807b87
commit 0b68623f86
4 changed files with 98 additions and 99 deletions

Changed file 1 of 4

@@ -28,6 +28,9 @@ if iswindows:
     except:
         pass
 
+class CommandLineError(Exception):
+    pass
+
 def setup_cli_handlers(logger, level):
     logger.setLevel(level)
     if level == logging.WARNING:
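The two helpers above carry the whole migration: CommandLineError lets library-style entry points raise instead of printing usage errors, and setup_cli_handlers attaches console handlers to a named logger (both are importable straight from the libprs500 package, as the later hunks do). The hunk only shows the first lines of setup_cli_handlers, so the following is a minimal sketch of what such a helper plausibly does; the handler targets and formats are assumptions, not the commit's actual body:

    import logging, sys

    def setup_cli_handlers(logger, level):
        # Send this logger's records to the console at the requested verbosity.
        logger.setLevel(level)
        if level == logging.WARNING:
            # Terse output for the default/quiet case (assumed format).
            handler = logging.StreamHandler(sys.stderr)
            handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
        elif level == logging.INFO:
            handler = logging.StreamHandler(sys.stdout)
            handler.setFormatter(logging.Formatter('%(message)s'))
        else:
            # DEBUG: include where the record came from (assumed format).
            handler = logging.StreamHandler(sys.stderr)
            handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)d: %(message)s'))
        handler.setLevel(level)
        logger.addHandler(handler)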

Changed file 2 of 4

@@ -20,7 +20,7 @@ Code to convert HTML ebooks into LRF ebooks.
 I am indebted to esperanc for the initial CSS->Xylog Style conversion code
 and to Falstaff for pylrs.
 """
-import os, re, sys, shutil, traceback, copy, glob
+import os, re, sys, shutil, copy, glob, logging
 from htmlentitydefs import name2codepoint
 from urllib import unquote
 from urlparse import urlparse
@@ -43,7 +43,7 @@ from libprs500.ebooks.lrf import Book
 from libprs500.ebooks.lrf import option_parser as lrf_option_parser
 from libprs500.ebooks import ConversionError
 from libprs500.ebooks.lrf.html.table import Table
-from libprs500 import extract, filename_to_utf8
+from libprs500 import extract, filename_to_utf8, setup_cli_handlers
 from libprs500.ptempfile import PersistentTemporaryFile
 
 class Span(_Span):
@@ -84,7 +84,7 @@ class Span(_Span):
         return result
 
     @staticmethod
-    def translate_attrs(d, dpi, fonts, font_delta=0, memory=None):
+    def translate_attrs(d, dpi, fonts, logger, font_delta=0, memory=None):
         """
         Receives a dictionary of html attributes and styles and returns
         approximate Xylog equivalents in a new dictionary
@@ -211,20 +211,20 @@ class Span(_Span):
                 else:
                     memory.append(key)
                     if report:
-                        print >>sys.stderr, 'Unhandled/malformed CSS key:', key, d[key]
+                        logger.info('Unhandled/malformed CSS key: %s: %s', key, d[key])
         t['fontfacename'] = (family, font_key(family, style, weight))
         if t.has_key('fontsize') and int(t['fontsize']) > 120:
             t['wordspace'] = 50
         return t
 
-    def __init__(self, ns, css, memory, dpi, fonts, font_delta=0, normal_font_size=100):
+    def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta=0, normal_font_size=100):
         src = ns.string if hasattr(ns, 'string') else ns
         src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
         for pat, repl in Span.rules:
             src = pat.sub(repl, src)
         if not src:
             raise ConversionError('No point in adding an empty string to a Span')
-        attrs = Span.translate_attrs(css, dpi, fonts, font_delta=font_delta, memory=memory)
+        attrs = Span.translate_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory)
         if 'fontsize' in attrs.keys():
             normal_font_size = int(attrs['fontsize'])
         variant = attrs.pop('fontvariant', None)
@@ -323,7 +323,7 @@ class HTMLConverter(object):
         else:
             object.__setattr__(self, attr, val)
 
-    def __init__(self, book, fonts, path, options, link_level=0, is_root=True):
+    def __init__(self, book, fonts, path, options, logger, link_level=0, is_root=True):
         '''
         Convert HTML file at C{path} and add it to C{book}. After creating
         the object, you must call L{self.process_links} on it to create the links and
@@ -357,6 +357,7 @@ class HTMLConverter(object):
                         big = {'font-size' : 'large', 'font-weight':'bold'},
                         )
         self.css['.libprs500_dropcaps'] = {'font-size': 'xx-large'}
+        self.logger = logger
         self.fonts = fonts #: dict specifting font families to use
         self.scaled_images = {} #: Temporary files with scaled version of images
         self.rotated_images = {} #: Temporary files with rotated version of images
@@ -385,8 +386,7 @@ class HTMLConverter(object):
         path = os.path.abspath(path)
         os.chdir(os.path.dirname(path))
         self.file_name = os.path.basename(path)
-        print "Processing", self.file_name
-        print '\tParsing HTML...',
+        self.logger.info('Processing %s\n\tParsing HTML...', self.file_name)
         sys.stdout.flush()
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
@@ -400,7 +400,7 @@ class HTMLConverter(object):
         self.soup = BeautifulSoup(raw,
                      convertEntities=BeautifulSoup.HTML_ENTITIES,
                      markupMassage=nmassage)
-        print 'done\n\tConverting to BBeB...',
+        logger.info('\tConverting to BBeB...')
         sys.stdout.flush()
         self.current_page = None
         self.current_para = None
@@ -411,7 +411,6 @@ class HTMLConverter(object):
         self.page_break_found = True
         self.parse_file()
         HTMLConverter.processed_files[path] = self
-        print 'done'
 
     def parse_css(self, style):
         """
@@ -554,8 +553,8 @@ class HTMLConverter(object):
             if target.parent != None and \
                hasattr(target.parent, 'objId'):
                 self.book.addTocEntry(ascii_text, tb)
-            elif self.verbose:
-                print "Cannot add link", ascii_text, "to TOC"
+            else:
+                self.logger.debug("Cannot add link %s to TOC", ascii_text)
 
         def get_target_block(fragment, targets):
@@ -624,21 +623,21 @@ class HTMLConverter(object):
                     if not os.access(path.encode('utf8', 'replace'), os.R_OK):
                         continue
                 except Exception:
-                    if self.verbose:
-                        print "Skipping", link
+                    self.logger.exception('Skipping %s', link)
                     continue
                 path = os.path.abspath(path)
                 if not path in HTMLConverter.processed_files.keys():
                     try:
                         self.files[path] = HTMLConverter(
                             self.book, self.fonts, path, self.options,
+                            self.logger,
                             link_level = self.link_level+1,
                             is_root = False,)
                         HTMLConverter.processed_files[path] = self.files[path]
                     except Exception:
-                        print >>sys.stderr, 'Unable to process', path
+                        self.logger.warning('Unable to process %s', path)
                         if self.verbose:
-                            traceback.print_exc()
+                            self.logger.exception('')
                         continue
                     finally:
                         os.chdir(cwd)
@@ -759,12 +758,12 @@ class HTMLConverter(object):
         else:
             self.process_alignment(css)
         try:
-            self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\
-                                          self.profile.dpi, self.fonts, font_delta=self.font_delta))
+            self.current_para.append(Span(src, self.sanctify_css(css), self.memory,
+                                          self.profile.dpi, self.fonts, self.logger,
+                                          font_delta=self.font_delta))
             self.current_para.normalize_spaces()
-        except ConversionError, err:
-            if self.verbose:
-                print >>sys.stderr, err
+        except ConversionError:
+            self.logger.exception('Bad text')
 
     def sanctify_css(self, css):
         """ Return a copy of C{css} that is safe for use in a SPAM Xylog tag """
@@ -809,7 +808,7 @@ class HTMLConverter(object):
         try:
             im = PILImage.open(path)
         except IOError, err:
-            print >>sys.stderr, 'Unable to process:', path, err
+            self.logger.warning('Unable to process image: %s\n%s', path, err)
             return
 
@@ -826,7 +825,7 @@ class HTMLConverter(object):
                 self.scaled_images[path] = pt
                 return pt.name
             except IOError: # PIL chokes on interlaced PNG images
-                print >>sys.stderr, 'Unable to process interlaced PNG', path
+                self.logger.warning('Unable to process interlaced PNG %s', path)
                 return None
 
         pheight = int(self.current_page.pageStyle.attrs['textheight'])
@@ -863,10 +862,8 @@ class HTMLConverter(object):
                     path = pt.name
                     self.rotated_images[path] = pt
                     width, height = im.size
-                except IOError, err: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
-                    if self.verbose:
-                        print >>sys.stderr, 'Unable to autorotate interlaced PNG', path
-                        print >>sys.stderr, err
+                except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
+                    self.logger.debug('Unable to process interlaced PNG %s', path)
                 finally:
                     pt.close()
@@ -945,8 +942,7 @@ class HTMLConverter(object):
         if not self.page_break_found and self.page_break.match(tagname):
             if len(self.current_page.contents) > 3:
                 self.end_page()
-                if self.verbose:
-                    print 'Forcing page break at', tagname
+                self.logger.debug('Forcing page break at %s', tagname)
         return end_page
 
     def parse_tag(self, tag, parent_css):
@@ -1048,8 +1044,7 @@ class HTMLConverter(object):
                 dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
                 self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
             else:
-                if self.verbose:
-                    print >>sys.stderr, "Failed to process:", tag
+                self.logger.debug("Failed to process: %s", str(tag))
         elif tagname in ['style', 'link']:
             def update_css(ncss):
                 for key in ncss.keys():
@@ -1083,7 +1078,8 @@ class HTMLConverter(object):
                 c.replaceWith(self.get_text(c))
             self.end_current_para()
             self.current_block.append_to(self.current_page)
-            attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts, self.font_delta, self.memory)
+            attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts,
+                                         self.logger, self.font_delta, self.memory)
             attrs['fontfacename'] = self.fonts['mono']['normal'][1]
             ts = self.book.create_text_style(**self.unindented_style.attrs)
             ts.attrs.update(attrs)
@@ -1185,8 +1181,7 @@ class HTMLConverter(object):
             src = self.get_text(tag, limit=1000)
             if self.chapter_detection and tagname.startswith('h'):
                 if self.chapter_regex.search(src):
-                    if self.verbose:
-                        print 'Detected chapter', src
+                    self.logger.debug('Detected chapter %s', src)
                     self.end_page()
                     self.page_break_found = True
                     self.end_current_para()
@@ -1241,9 +1236,8 @@ class HTMLConverter(object):
                 try:
                     self.process_table(tag, tag_css)
                 except Exception, err:
-                    print 'WARNING: An error occurred while processing a table:', err
-                    print 'Ignoring table markup for table:'
-                    print str(tag)[:300]
+                    self.logger.warning('An error occurred while processing a table: %s', str(err))
+                    self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
                     self.in_table = False
                     self.process_children(tag, tag_css)
             else:
@@ -1275,16 +1269,20 @@ class HTMLConverter(object):
         for _file in self.scaled_images.values() + self.rotated_images.values():
             _file.__del__()
 
-def process_file(path, options):
+def process_file(path, options, logger=None):
     if re.match('http://|https://', path):
         raise ConversionError, 'You have to save the website %s as an html file first and then run html2lrf on it.'%(path,)
+    if logger is None:
+        level = logging.DEBUG if options.verbose else logging.INFO
+        logger = logging.getLogger('html2lrf')
+        setup_cli_handlers(logger, level)
     cwd = os.getcwd()
     dirpath = None
     default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
     try:
         dirpath, path = get_path(path)
         cpath, tpath = '', ''
-        try_opf(path, options)
+        try_opf(path, options, logger)
         if options.cover:
             options.cover = os.path.abspath(os.path.expanduser(options.cover))
             cpath = options.cover
@@ -1347,7 +1345,7 @@ def process_file(path, options):
            fpba = ['$', '', '$']
        options.force_page_break_attr = [re.compile(fpba[0], re.IGNORECASE), fpba[1],
                                         re.compile(fpba[2], re.IGNORECASE)]
-        conv = HTMLConverter(book, fonts, path, options)
+        conv = HTMLConverter(book, fonts, path, options, logger)
        conv.process_links()
        oname = options.output
        if not oname:
@@ -1356,7 +1354,7 @@ def process_file(path, options):
            oname = os.path.join(cwd,name)
        oname = os.path.abspath(os.path.expanduser(oname))
        conv.writeto(oname, lrs=options.lrs)
-        print 'Output written to', oname
+        logger.info('Output written to %s', oname)
        conv.cleanup()
        return oname
    finally:
@@ -1364,7 +1362,7 @@ def process_file(path, options):
        if dirpath:
            shutil.rmtree(dirpath, True)
 
-def try_opf(path, options):
+def try_opf(path, options, logger):
    try:
        opf = glob.glob(os.path.join(os.path.dirname(path),'*.opf'))[0]
    except IndexError:
@@ -1419,12 +1417,9 @@ def try_opf(path, options):
            if not os.access(options.cover, os.R_OK):
                options.cover = None
        except:
-            if options.verbose:
-                traceback.print_exc()
-    except Exception, err:
-        if options.verbose:
-            print >>sys.stderr, 'Failed to process opf file', err
-        pass
+            logger.exception('Could not load cover')
+    except Exception:
+        logger.exception('Failed to process opf file')
 
 def option_parser():
     return lrf_option_parser('''Usage: %prog [options] mybook.[html|rar|zip]\n\n'''
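With the new signature, process_file(path, options, logger=None) works both from the command line and from other code: when no logger is passed it builds its own 'html2lrf' logger via setup_cli_handlers, and a caller can instead hand in a pre-configured logger so all messages flow through one set of handlers. A rough usage sketch, assuming the lrf option parser defines --verbose and its usual defaults:

    import logging
    from libprs500 import setup_cli_handlers
    from libprs500.ebooks.lrf.html.convert_from import option_parser, process_file

    options, args = option_parser().parse_args(['mybook.html'])

    # Standalone: let process_file create and configure the 'html2lrf' logger.
    process_file(args[0], options)

    # Embedded: share the caller's logger so html2lrf output joins its stream.
    logger = logging.getLogger('mytool')   # 'mytool' is just an example name
    setup_cli_handlers(logger, logging.DEBUG)
    process_file(args[0], options, logger)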

Changed file 3 of 4

@@ -14,14 +14,13 @@
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''Convert known websites into LRF files.'''
-import sys, time, tempfile, shutil, os
+import sys, time, tempfile, shutil, os, logging
 from urlparse import urlsplit
 
-from libprs500 import __appname__
+from libprs500 import __appname__, setup_cli_handlers, CommandLineError
 from libprs500.ebooks.lrf import option_parser as lrf_option_parser
 from libprs500.ebooks.lrf.html.convert_from import process_file
 from libprs500.ebooks.lrf.web.profiles import profiles
-from libprs500.web.fetch.simple import setup_logger as web2disk_setup_logger
 from libprs500.web.fetch.simple import create_fetcher
 
 available_profiles = profiles.keys()
@@ -57,14 +56,14 @@ def option_parser():
                       help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
     return parser
 
-def fetch_website(options):
+def fetch_website(options, logger):
     tdir = tempfile.mkdtemp(prefix=__appname__+'_' )
     options.dir = tdir
-    fetcher = create_fetcher(options)
+    fetcher = create_fetcher(options, logger)
     fetcher.preprocess_regexps = options.preprocess_regexps
     return fetcher.start_fetch(options.url), tdir
 
-def create_lrf(htmlfile, options):
+def create_lrf(htmlfile, options, logger):
     if not options.author:
         options.author = __appname__
     options.header = True
@@ -73,20 +72,16 @@ def create_lrf(htmlfile, options):
     else:
         options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf')))
-    process_file(htmlfile, options)
+    process_file(htmlfile, options, logger)
 
-def main(args=sys.argv):
-    parser = option_parser()
-    options, args = parser.parse_args(args)
-    web2disk_setup_logger(options)
-    if len(args) > 2:
-        parser.print_help()
-        return 1
+def process_profile(args, options, logger=None):
+    if logger is None:
+        level = logging.DEBUG if options.verbose else logging.INFO
+        logger = logging.getLogger('web2lrf')
+        setup_cli_handlers(logger, level)
     if len(args) == 2:
         if not profiles.has_key(args[1]):
-            print >>sys.stderr, 'Unknown profile', args[1]
-            print >>sys.stderr, 'Valid profiles:', profiles.keys()
-            return 1
+            raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
 
     profile = profiles[args[1]] if len(args) == 2 else profiles['default']
     if profile.has_key('initialize'):
@@ -98,11 +93,7 @@ def main(args=sys.argv):
             setattr(options, opt, profile[opt])
 
     if not options.url:
-        parser.print_help()
-        print >>sys.stderr
-        print >>sys.stderr, 'You must specify the --url option or a profile from one of:',
-        print >>sys.stderr, available_profiles
-        return 1
+        raise CommandLineError('You must specify the --url option or a profile from one of: %s', available_profiles)
 
     if not options.title:
         title = profile['title']
@@ -114,12 +105,24 @@ def main(args=sys.argv):
         options.preprocess_regexps = profile['preprocess_regexps']
         options.filter_regexps += profile['filter_regexps']
 
-    htmlfile, tdir = fetch_website(options)
-    create_lrf(htmlfile, options)
+    htmlfile, tdir = fetch_website(options, logger)
+    create_lrf(htmlfile, options, logger)
     if profile.has_key('finalize'):
         profile['finalize'](profile)
     shutil.rmtree(tdir)
 
+def main(args=sys.argv):
+    parser = option_parser()
+    options, args = parser.parse_args(args)
+    if len(args) > 2:
+        parser.print_help()
+        return 1
+    try:
+        process_profile(args, options)
+    except CommandLineError, err:
+        print >>sys.stderr, err
+
     return 0
 
 if __name__ == '__main__':
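The old monolithic main() is now split in two: main() keeps the CLI concerns (argument parsing, --help, exit status) while process_profile() does the work and raises CommandLineError instead of printing to stderr and returning 1. That makes the conversion callable from other front ends, which can handle the error their own way. A sketch, where the web2lrf import path is assumed by analogy with the html converter and handle_error stands in for whatever the embedding application does:

    from libprs500 import CommandLineError
    from libprs500.ebooks.lrf.web.convert_from import option_parser, process_profile

    options, args = option_parser().parse_args(['web2lrf', 'default'])
    try:
        # A GUI or another script can call this directly and pass its own logger.
        process_profile(args, options)
    except CommandLineError, err:
        handle_error(str(err))   # hypothetical error reporting in the embedding app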

Changed file 4 of 4

@@ -23,8 +23,6 @@ from optparse import OptionParser
 from libprs500 import __version__, __appname__, __author__, setup_cli_handlers
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
 
-logger = logging.getLogger('libprs500.web.fetch.simple')
-
 class FetchError(Exception):
     pass
@@ -52,7 +50,8 @@ class RecursiveFetcher(object):
     #                )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
 
-    def __init__(self, options):
+    def __init__(self, options, logger):
+        self.logger = logger
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
         if not os.path.exists(self.base_dir):
             os.makedirs(self.base_dir)
@@ -80,7 +79,7 @@ class RecursiveFetcher(object):
     def fetch_url(self, url):
         f = None
-        logger.info('Fetching %s', url)
+        self.logger.debug('Fetching %s', url)
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(delta)
@@ -138,8 +137,8 @@ class RecursiveFetcher(object):
                 try:
                     f = self.fetch_url(iurl)
                 except Exception, err:
-                    logger.warning('Could not fetch stylesheet %s', iurl)
-                    logger.debug('Error: %s', str(err), exc_info=True)
+                    self.logger.warning('Could not fetch stylesheet %s', iurl)
+                    self.logger.debug('Error: %s', str(err), exc_info=True)
                     continue
                 c += 1
                 stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@@ -160,8 +159,8 @@ class RecursiveFetcher(object):
                     try:
                         f = self.fetch_url(iurl)
                     except Exception, err:
-                        logger.warning('Could not fetch stylesheet %s', iurl)
-                        logger.debug('Error: %s', str(err), exc_info=True)
+                        self.logger.warning('Could not fetch stylesheet %s', iurl)
+                        self.logger.debug('Error: %s', str(err), exc_info=True)
                         continue
                     c += 1
                     stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@@ -179,7 +178,7 @@ class RecursiveFetcher(object):
         for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
             iurl, ext = tag['src'], os.path.splitext(tag['src'])[1]
             if not ext:
-                logger.info('Skipping extensionless image %s', iurl)
+                self.logger.debug('Skipping extensionless image %s', iurl)
                 continue
             if not urlparse.urlsplit(iurl).scheme:
                 iurl = urlparse.urljoin(baseurl, iurl, False)
@@ -189,8 +188,8 @@ class RecursiveFetcher(object):
             try:
                 f = self.fetch_url(iurl)
             except Exception, err:
-                logger.warning('Could not fetch image %s', iurl)
-                logger.debug('Error: %s', str(err), exc_info=True)
+                self.logger.warning('Could not fetch image %s', iurl)
+                self.logger.debug('Error: %s', str(err), exc_info=True)
                 continue
             c += 1
             imgpath = os.path.join(diskpath, 'img'+str(c)+ext)
@@ -206,7 +205,7 @@ class RecursiveFetcher(object):
         if not parts.scheme:
             iurl = urlparse.urljoin(baseurl, iurl, False)
         if not self.is_link_ok(iurl):
-            logger.info('Skipping invalid link: %s', iurl)
+            self.logger.debug('Skipping invalid link: %s', iurl)
             return None
         return iurl
@@ -258,7 +257,7 @@ class RecursiveFetcher(object):
                     self.current_dir = linkdiskpath
                     f = self.fetch_url(iurl)
                     soup = self.get_soup(f.read())
-                    logger.info('Processing images...')
+                    self.logger.debug('Processing images...')
                     self.process_images(soup, f.geturl())
                     if self.download_stylesheets:
                         self.process_stylesheets(soup, f.geturl())
@@ -266,17 +265,17 @@ class RecursiveFetcher(object):
                     res = os.path.join(linkdiskpath, basename(iurl))
                     self.filemap[nurl] = res
                     if recursion_level < self.max_recursions:
-                        logger.info('Processing links...')
+                        self.logger.debug('Processing links...')
                         self.process_links(soup, iurl, recursion_level+1)
                     else:
                         self.process_return_links(soup, iurl)
-                        logger.info('Recursion limit reached. Skipping %s', iurl)
+                        self.logger.debug('Recursion limit reached. Skipping %s', iurl)
                     save_soup(soup, res)
                     self.localize_link(tag, 'href', res)
                 except Exception, err:
-                    logger.warning('Could not fetch link %s', iurl)
-                    logger.debug('Error: %s', str(err), exc_info=True)
+                    self.logger.warning('Could not fetch link %s', iurl)
+                    self.logger.debug('Error: %s', str(err), exc_info=True)
                 finally:
                     self.current_dir = diskpath
                     self.files += 1
@@ -313,12 +312,12 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
     return parser
 
-def create_fetcher(options):
-    return RecursiveFetcher(options)
-
-def setup_logger(options):
-    level = logging.DEBUG if options.verbose else logging.WARNING
-    setup_cli_handlers(logger, level)
+def create_fetcher(options, logger=None):
+    if logger is None:
+        level = logging.DEBUG if options.verbose else logging.INFO
+        logger = logging.getLogger('web2disk')
+        setup_cli_handlers(logger, level)
+    return RecursiveFetcher(options, logger)
 
 def main(args=sys.argv):
     parser = option_parser()
@@ -327,7 +326,6 @@ def main(args=sys.argv):
         parser.print_help()
         return 1
 
-    setup_logger(options)
     fetcher = create_fetcher(options)
     fetcher.start_fetch(args[1])
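Dropping the module-level logger means each RecursiveFetcher instance now logs through whatever logger it was given, so two fetchers can be configured independently. create_fetcher(options) with no logger keeps the standalone web2disk behaviour (it builds a 'web2disk' logger itself), while an embedding tool such as web2lrf passes its own. A sketch of quieting an embedded fetcher, assuming the parser's usual defaults for options such as the output directory:

    import logging
    from libprs500.web.fetch.simple import option_parser, create_fetcher

    options, args = option_parser().parse_args(['http://example.com'])

    # Give the fetcher a logger that only reports real problems; because the
    # logger is per instance, this does not affect any other fetcher.
    quiet = logging.getLogger('myapp.fetch')   # example name
    quiet.addHandler(logging.StreamHandler())
    quiet.setLevel(logging.WARNING)

    fetcher = create_fetcher(options, quiet)
    fetcher.start_fetch(args[0])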