Migrate html2lrf, web2lrf to python logging framework

Kovid Goyal 2007-08-12 21:59:36 +00:00
parent 1f24807b87
commit 0b68623f86
4 changed files with 98 additions and 99 deletions

Changed file 1 of 4

@@ -28,6 +28,9 @@ if iswindows:
     except:
         pass
 
+class CommandLineError(Exception):
+    pass
+
 def setup_cli_handlers(logger, level):
     logger.setLevel(level)
     if level == logging.WARNING:
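The two helpers above carry the whole migration: CommandLineError lets library-style entry points raise instead of printing usage errors, and setup_cli_handlers attaches console handlers to a named logger (both are importable straight from the libprs500 package, as the later hunks do). The hunk only shows the first lines of setup_cli_handlers, so the following is a minimal sketch of what such a helper plausibly does; the handler targets and formats are assumptions, not the commit's actual body:

    import logging, sys

    def setup_cli_handlers(logger, level):
        # Send this logger's records to the console at the requested verbosity.
        logger.setLevel(level)
        if level == logging.WARNING:
            # Terse output for the default/quiet case (assumed format).
            handler = logging.StreamHandler(sys.stderr)
            handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
        elif level == logging.INFO:
            handler = logging.StreamHandler(sys.stdout)
            handler.setFormatter(logging.Formatter('%(message)s'))
        else:
            # DEBUG: include where the record came from (assumed format).
            handler = logging.StreamHandler(sys.stderr)
            handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)d: %(message)s'))
        handler.setLevel(level)
        logger.addHandler(handler)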

Changed file 2 of 4

@@ -20,7 +20,7 @@ Code to convert HTML ebooks into LRF ebooks.
 I am indebted to esperanc for the initial CSS->Xylog Style conversion code
 and to Falstaff for pylrs.
 """
-import os, re, sys, shutil, traceback, copy, glob
+import os, re, sys, shutil, copy, glob, logging
 from htmlentitydefs import name2codepoint
 from urllib import unquote
 from urlparse import urlparse
@@ -43,7 +43,7 @@ from libprs500.ebooks.lrf import Book
 from libprs500.ebooks.lrf import option_parser as lrf_option_parser
 from libprs500.ebooks import ConversionError
 from libprs500.ebooks.lrf.html.table import Table
-from libprs500 import extract, filename_to_utf8
+from libprs500 import extract, filename_to_utf8, setup_cli_handlers
 from libprs500.ptempfile import PersistentTemporaryFile
 
 class Span(_Span):
@@ -84,7 +84,7 @@ class Span(_Span):
         return result
 
     @staticmethod
-    def translate_attrs(d, dpi, fonts, font_delta=0, memory=None):
+    def translate_attrs(d, dpi, fonts, logger, font_delta=0, memory=None):
         """
         Receives a dictionary of html attributes and styles and returns
         approximate Xylog equivalents in a new dictionary
@@ -211,20 +211,20 @@ class Span(_Span):
                 else:
                     memory.append(key)
                     if report:
-                        print >>sys.stderr, 'Unhandled/malformed CSS key:', key, d[key]
+                        logger.info('Unhandled/malformed CSS key: %s: %s', key, d[key])
         t['fontfacename'] = (family, font_key(family, style, weight))
         if t.has_key('fontsize') and int(t['fontsize']) > 120:
             t['wordspace'] = 50
         return t
 
-    def __init__(self, ns, css, memory, dpi, fonts, font_delta=0, normal_font_size=100):
+    def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta=0, normal_font_size=100):
         src = ns.string if hasattr(ns, 'string') else ns
         src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
         for pat, repl in Span.rules:
             src = pat.sub(repl, src)
         if not src:
             raise ConversionError('No point in adding an empty string to a Span')
-        attrs = Span.translate_attrs(css, dpi, fonts, font_delta=font_delta, memory=memory)
+        attrs = Span.translate_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory)
         if 'fontsize' in attrs.keys():
             normal_font_size = int(attrs['fontsize'])
         variant = attrs.pop('fontvariant', None)
@@ -323,7 +323,7 @@ class HTMLConverter(object):
         else:
             object.__setattr__(self, attr, val)
 
-    def __init__(self, book, fonts, path, options, link_level=0, is_root=True):
+    def __init__(self, book, fonts, path, options, logger, link_level=0, is_root=True):
         '''
         Convert HTML file at C{path} and add it to C{book}. After creating
         the object, you must call L{self.process_links} on it to create the links and
@@ -357,6 +357,7 @@ class HTMLConverter(object):
                         big = {'font-size' : 'large', 'font-weight':'bold'},
                         )
         self.css['.libprs500_dropcaps'] = {'font-size': 'xx-large'}
+        self.logger = logger
         self.fonts = fonts #: dict specifting font families to use
         self.scaled_images = {} #: Temporary files with scaled version of images
         self.rotated_images = {} #: Temporary files with rotated version of images
@@ -385,8 +386,7 @@ class HTMLConverter(object):
         path = os.path.abspath(path)
         os.chdir(os.path.dirname(path))
         self.file_name = os.path.basename(path)
-        print "Processing", self.file_name
-        print '\tParsing HTML...',
+        self.logger.info('Processing %s\n\tParsing HTML...', self.file_name)
         sys.stdout.flush()
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
@@ -400,7 +400,7 @@ class HTMLConverter(object):
         self.soup = BeautifulSoup(raw,
                      convertEntities=BeautifulSoup.HTML_ENTITIES,
                      markupMassage=nmassage)
-        print 'done\n\tConverting to BBeB...',
+        logger.info('\tConverting to BBeB...')
         sys.stdout.flush()
         self.current_page = None
         self.current_para = None
@@ -411,7 +411,6 @@ class HTMLConverter(object):
         self.page_break_found = True
         self.parse_file()
         HTMLConverter.processed_files[path] = self
-        print 'done'
 
     def parse_css(self, style):
         """
@@ -554,8 +553,8 @@ class HTMLConverter(object):
             if target.parent != None and \
                hasattr(target.parent, 'objId'):
                 self.book.addTocEntry(ascii_text, tb)
-            elif self.verbose:
-                print "Cannot add link", ascii_text, "to TOC"
+            else:
+                self.logger.debug("Cannot add link %s to TOC", ascii_text)
 
         def get_target_block(fragment, targets):
@@ -624,21 +623,21 @@ class HTMLConverter(object):
                     if not os.access(path.encode('utf8', 'replace'), os.R_OK):
                         continue
                 except Exception:
-                    if self.verbose:
-                        print "Skipping", link
+                    self.logger.exception('Skipping %s', link)
                     continue
                 path = os.path.abspath(path)
                 if not path in HTMLConverter.processed_files.keys():
                     try:
                         self.files[path] = HTMLConverter(
                             self.book, self.fonts, path, self.options,
+                            self.logger,
                             link_level = self.link_level+1,
                             is_root = False,)
                         HTMLConverter.processed_files[path] = self.files[path]
                     except Exception:
-                        print >>sys.stderr, 'Unable to process', path
+                        self.logger.warning('Unable to process %s', path)
                         if self.verbose:
-                            traceback.print_exc()
+                            self.logger.exception('')
                         continue
                     finally:
                         os.chdir(cwd)
@@ -759,12 +758,12 @@ class HTMLConverter(object):
         else:
             self.process_alignment(css)
         try:
-            self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\
-                                          self.profile.dpi, self.fonts, font_delta=self.font_delta))
+            self.current_para.append(Span(src, self.sanctify_css(css), self.memory,
+                                          self.profile.dpi, self.fonts, self.logger,
+                                          font_delta=self.font_delta))
             self.current_para.normalize_spaces()
-        except ConversionError, err:
-            if self.verbose:
-                print >>sys.stderr, err
+        except ConversionError:
+            self.logger.exception('Bad text')
 
     def sanctify_css(self, css):
         """ Return a copy of C{css} that is safe for use in a SPAM Xylog tag """
@@ -809,7 +808,7 @@ class HTMLConverter(object):
         try:
             im = PILImage.open(path)
         except IOError, err:
-            print >>sys.stderr, 'Unable to process:', path, err
+            self.logger.warning('Unable to process image: %s\n%s', path, err)
             return
 
@@ -826,7 +825,7 @@ class HTMLConverter(object):
                 self.scaled_images[path] = pt
                 return pt.name
             except IOError: # PIL chokes on interlaced PNG images
-                print >>sys.stderr, 'Unable to process interlaced PNG', path
+                self.logger.warning('Unable to process interlaced PNG %s', path)
                 return None
 
         pheight = int(self.current_page.pageStyle.attrs['textheight'])
@@ -863,10 +862,8 @@ class HTMLConverter(object):
                     path = pt.name
                     self.rotated_images[path] = pt
                     width, height = im.size
-                except IOError, err: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
-                    if self.verbose:
-                        print >>sys.stderr, 'Unable to autorotate interlaced PNG', path
-                        print >>sys.stderr, err
+                except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
+                    self.logger.debug('Unable to process interlaced PNG %s', path)
                 finally:
                     pt.close()
@@ -945,8 +942,7 @@ class HTMLConverter(object):
         if not self.page_break_found and self.page_break.match(tagname):
             if len(self.current_page.contents) > 3:
                 self.end_page()
-                if self.verbose:
-                    print 'Forcing page break at', tagname
+                self.logger.debug('Forcing page break at %s', tagname)
         return end_page
 
     def parse_tag(self, tag, parent_css):
@@ -1048,8 +1044,7 @@ class HTMLConverter(object):
                 dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
                 self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
             else:
-                if self.verbose:
-                    print >>sys.stderr, "Failed to process:", tag
+                self.logger.debug("Failed to process: %s", str(tag))
         elif tagname in ['style', 'link']:
             def update_css(ncss):
                 for key in ncss.keys():
@@ -1083,7 +1078,8 @@ class HTMLConverter(object):
                 c.replaceWith(self.get_text(c))
             self.end_current_para()
             self.current_block.append_to(self.current_page)
-            attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts, self.font_delta, self.memory)
+            attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts,
+                                         self.logger, self.font_delta, self.memory)
             attrs['fontfacename'] = self.fonts['mono']['normal'][1]
             ts = self.book.create_text_style(**self.unindented_style.attrs)
             ts.attrs.update(attrs)
@@ -1185,8 +1181,7 @@ class HTMLConverter(object):
             src = self.get_text(tag, limit=1000)
             if self.chapter_detection and tagname.startswith('h'):
                 if self.chapter_regex.search(src):
-                    if self.verbose:
-                        print 'Detected chapter', src
+                    self.logger.debug('Detected chapter %s', src)
                     self.end_page()
                     self.page_break_found = True
                     self.end_current_para()
@@ -1241,9 +1236,8 @@ class HTMLConverter(object):
                 try:
                     self.process_table(tag, tag_css)
                 except Exception, err:
-                    print 'WARNING: An error occurred while processing a table:', err
-                    print 'Ignoring table markup for table:'
-                    print str(tag)[:300]
+                    self.logger.warning('An error occurred while processing a table: %s', str(err))
+                    self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
                     self.in_table = False
                     self.process_children(tag, tag_css)
             else:
@@ -1275,16 +1269,20 @@ class HTMLConverter(object):
         for _file in self.scaled_images.values() + self.rotated_images.values():
             _file.__del__()
 
-def process_file(path, options):
+def process_file(path, options, logger=None):
     if re.match('http://|https://', path):
         raise ConversionError, 'You have to save the website %s as an html file first and then run html2lrf on it.'%(path,)
+    if logger is None:
+        level = logging.DEBUG if options.verbose else logging.INFO
+        logger = logging.getLogger('html2lrf')
+        setup_cli_handlers(logger, level)
     cwd = os.getcwd()
     dirpath = None
     default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
     try:
         dirpath, path = get_path(path)
         cpath, tpath = '', ''
-        try_opf(path, options)
+        try_opf(path, options, logger)
         if options.cover:
             options.cover = os.path.abspath(os.path.expanduser(options.cover))
             cpath = options.cover
@@ -1347,7 +1345,7 @@ def process_file(path, options):
            fpba = ['$', '', '$']
        options.force_page_break_attr = [re.compile(fpba[0], re.IGNORECASE), fpba[1],
                                         re.compile(fpba[2], re.IGNORECASE)]
-        conv = HTMLConverter(book, fonts, path, options)
+        conv = HTMLConverter(book, fonts, path, options, logger)
        conv.process_links()
        oname = options.output
        if not oname:
@@ -1356,7 +1354,7 @@ def process_file(path, options):
            oname = os.path.join(cwd,name)
        oname = os.path.abspath(os.path.expanduser(oname))
        conv.writeto(oname, lrs=options.lrs)
-        print 'Output written to', oname
+        logger.info('Output written to %s', oname)
        conv.cleanup()
        return oname
    finally:
@@ -1364,7 +1362,7 @@ def process_file(path, options):
        if dirpath:
            shutil.rmtree(dirpath, True)
 
-def try_opf(path, options):
+def try_opf(path, options, logger):
    try:
        opf = glob.glob(os.path.join(os.path.dirname(path),'*.opf'))[0]
    except IndexError:
@@ -1419,12 +1417,9 @@ def try_opf(path, options):
            if not os.access(options.cover, os.R_OK):
                options.cover = None
        except:
-            if options.verbose:
-                traceback.print_exc()
-    except Exception, err:
-        if options.verbose:
-            print >>sys.stderr, 'Failed to process opf file', err
-        pass
+            logger.exception('Could not load cover')
+    except Exception:
+        logger.exception('Failed to process opf file')
 
 def option_parser():
     return lrf_option_parser('''Usage: %prog [options] mybook.[html|rar|zip]\n\n'''
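With the new signature, process_file(path, options, logger=None) works both from the command line and from other code: when no logger is passed it builds its own 'html2lrf' logger via setup_cli_handlers, and a caller can instead hand in a pre-configured logger so all messages flow through one set of handlers. A rough usage sketch, assuming the lrf option parser defines --verbose and its usual defaults:

    import logging
    from libprs500 import setup_cli_handlers
    from libprs500.ebooks.lrf.html.convert_from import option_parser, process_file

    options, args = option_parser().parse_args(['mybook.html'])

    # Standalone: let process_file create and configure the 'html2lrf' logger.
    process_file(args[0], options)

    # Embedded: share the caller's logger so html2lrf output joins its stream.
    logger = logging.getLogger('mytool')   # 'mytool' is just an example name
    setup_cli_handlers(logger, logging.DEBUG)
    process_file(args[0], options, logger)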

Changed file 3 of 4

@@ -14,14 +14,13 @@
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''Convert known websites into LRF files.'''
-import sys, time, tempfile, shutil, os
+import sys, time, tempfile, shutil, os, logging
 from urlparse import urlsplit
 
-from libprs500 import __appname__
+from libprs500 import __appname__, setup_cli_handlers, CommandLineError
 from libprs500.ebooks.lrf import option_parser as lrf_option_parser
 from libprs500.ebooks.lrf.html.convert_from import process_file
 from libprs500.ebooks.lrf.web.profiles import profiles
-from libprs500.web.fetch.simple import setup_logger as web2disk_setup_logger
 from libprs500.web.fetch.simple import create_fetcher
 
 available_profiles = profiles.keys()
@@ -57,14 +56,14 @@ def option_parser():
                       help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
     return parser
 
-def fetch_website(options):
+def fetch_website(options, logger):
     tdir = tempfile.mkdtemp(prefix=__appname__+'_' )
     options.dir = tdir
-    fetcher = create_fetcher(options)
+    fetcher = create_fetcher(options, logger)
     fetcher.preprocess_regexps = options.preprocess_regexps
     return fetcher.start_fetch(options.url), tdir
 
-def create_lrf(htmlfile, options):
+def create_lrf(htmlfile, options, logger):
     if not options.author:
         options.author = __appname__
     options.header = True
@@ -73,20 +72,16 @@ def create_lrf(htmlfile, options):
     else:
         options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf')))
-    process_file(htmlfile, options)
+    process_file(htmlfile, options, logger)
 
-def main(args=sys.argv):
-    parser = option_parser()
-    options, args = parser.parse_args(args)
-    web2disk_setup_logger(options)
-    if len(args) > 2:
-        parser.print_help()
-        return 1
+def process_profile(args, options, logger=None):
+    if logger is None:
+        level = logging.DEBUG if options.verbose else logging.INFO
+        logger = logging.getLogger('web2lrf')
+        setup_cli_handlers(logger, level)
     if len(args) == 2:
         if not profiles.has_key(args[1]):
-            print >>sys.stderr, 'Unknown profile', args[1]
-            print >>sys.stderr, 'Valid profiles:', profiles.keys()
-            return 1
+            raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
 
     profile = profiles[args[1]] if len(args) == 2 else profiles['default']
     if profile.has_key('initialize'):
@@ -98,11 +93,7 @@ def main(args=sys.argv):
             setattr(options, opt, profile[opt])
 
     if not options.url:
-        parser.print_help()
-        print >>sys.stderr
-        print >>sys.stderr, 'You must specify the --url option or a profile from one of:',
-        print >>sys.stderr, available_profiles
-        return 1
+        raise CommandLineError('You must specify the --url option or a profile from one of: %s', available_profiles)
 
     if not options.title:
         title = profile['title']
@@ -114,12 +105,24 @@ def main(args=sys.argv):
         options.preprocess_regexps = profile['preprocess_regexps']
         options.filter_regexps += profile['filter_regexps']
 
-    htmlfile, tdir = fetch_website(options)
-    create_lrf(htmlfile, options)
+    htmlfile, tdir = fetch_website(options, logger)
+    create_lrf(htmlfile, options, logger)
     if profile.has_key('finalize'):
         profile['finalize'](profile)
     shutil.rmtree(tdir)
 
+def main(args=sys.argv):
+    parser = option_parser()
+    options, args = parser.parse_args(args)
+    if len(args) > 2:
+        parser.print_help()
+        return 1
+    try:
+        process_profile(args, options)
+    except CommandLineError, err:
+        print >>sys.stderr, err
+
     return 0
 
 if __name__ == '__main__':
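The old monolithic main() is now split in two: main() keeps the CLI concerns (argument parsing, --help, exit status) while process_profile() does the work and raises CommandLineError instead of printing to stderr and returning 1. That makes the conversion callable from other front ends, which can handle the error their own way. A sketch, where the web2lrf import path is assumed by analogy with the html converter and handle_error stands in for whatever the embedding application does:

    from libprs500 import CommandLineError
    from libprs500.ebooks.lrf.web.convert_from import option_parser, process_profile

    options, args = option_parser().parse_args(['web2lrf', 'default'])
    try:
        # A GUI or another script can call this directly and pass its own logger.
        process_profile(args, options)
    except CommandLineError, err:
        handle_error(str(err))   # hypothetical error reporting in the embedding app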

Changed file 4 of 4

@@ -23,8 +23,6 @@ from optparse import OptionParser
 from libprs500 import __version__, __appname__, __author__, setup_cli_handlers
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
 
-logger = logging.getLogger('libprs500.web.fetch.simple')
-
 class FetchError(Exception):
     pass
@@ -52,7 +50,8 @@ class RecursiveFetcher(object):
     #                )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
 
-    def __init__(self, options):
+    def __init__(self, options, logger):
+        self.logger = logger
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
         if not os.path.exists(self.base_dir):
             os.makedirs(self.base_dir)
@@ -80,7 +79,7 @@ class RecursiveFetcher(object):
     def fetch_url(self, url):
         f = None
-        logger.info('Fetching %s', url)
+        self.logger.debug('Fetching %s', url)
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(delta)
@@ -138,8 +137,8 @@ class RecursiveFetcher(object):
                 try:
                     f = self.fetch_url(iurl)
                 except Exception, err:
-                    logger.warning('Could not fetch stylesheet %s', iurl)
-                    logger.debug('Error: %s', str(err), exc_info=True)
+                    self.logger.warning('Could not fetch stylesheet %s', iurl)
+                    self.logger.debug('Error: %s', str(err), exc_info=True)
                     continue
                 c += 1
                 stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@@ -160,8 +159,8 @@ class RecursiveFetcher(object):
                     try:
                         f = self.fetch_url(iurl)
                     except Exception, err:
-                        logger.warning('Could not fetch stylesheet %s', iurl)
-                        logger.debug('Error: %s', str(err), exc_info=True)
+                        self.logger.warning('Could not fetch stylesheet %s', iurl)
+                        self.logger.debug('Error: %s', str(err), exc_info=True)
                         continue
                     c += 1
                     stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@@ -179,7 +178,7 @@ class RecursiveFetcher(object):
         for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
             iurl, ext = tag['src'], os.path.splitext(tag['src'])[1]
             if not ext:
-                logger.info('Skipping extensionless image %s', iurl)
+                self.logger.debug('Skipping extensionless image %s', iurl)
                 continue
             if not urlparse.urlsplit(iurl).scheme:
                 iurl = urlparse.urljoin(baseurl, iurl, False)
@@ -189,8 +188,8 @@ class RecursiveFetcher(object):
             try:
                 f = self.fetch_url(iurl)
             except Exception, err:
-                logger.warning('Could not fetch image %s', iurl)
-                logger.debug('Error: %s', str(err), exc_info=True)
+                self.logger.warning('Could not fetch image %s', iurl)
+                self.logger.debug('Error: %s', str(err), exc_info=True)
                 continue
             c += 1
             imgpath = os.path.join(diskpath, 'img'+str(c)+ext)
@@ -206,7 +205,7 @@ class RecursiveFetcher(object):
         if not parts.scheme:
             iurl = urlparse.urljoin(baseurl, iurl, False)
         if not self.is_link_ok(iurl):
-            logger.info('Skipping invalid link: %s', iurl)
+            self.logger.debug('Skipping invalid link: %s', iurl)
             return None
         return iurl
@@ -258,7 +257,7 @@ class RecursiveFetcher(object):
                     self.current_dir = linkdiskpath
                     f = self.fetch_url(iurl)
                     soup = self.get_soup(f.read())
-                    logger.info('Processing images...')
+                    self.logger.debug('Processing images...')
                     self.process_images(soup, f.geturl())
                     if self.download_stylesheets:
                         self.process_stylesheets(soup, f.geturl())
@@ -266,17 +265,17 @@ class RecursiveFetcher(object):
                     res = os.path.join(linkdiskpath, basename(iurl))
                     self.filemap[nurl] = res
                     if recursion_level < self.max_recursions:
-                        logger.info('Processing links...')
+                        self.logger.debug('Processing links...')
                         self.process_links(soup, iurl, recursion_level+1)
                     else:
                         self.process_return_links(soup, iurl)
-                        logger.info('Recursion limit reached. Skipping %s', iurl)
+                        self.logger.debug('Recursion limit reached. Skipping %s', iurl)
                     save_soup(soup, res)
                     self.localize_link(tag, 'href', res)
                 except Exception, err:
-                    logger.warning('Could not fetch link %s', iurl)
-                    logger.debug('Error: %s', str(err), exc_info=True)
+                    self.logger.warning('Could not fetch link %s', iurl)
+                    self.logger.debug('Error: %s', str(err), exc_info=True)
                 finally:
                     self.current_dir = diskpath
                     self.files += 1
@@ -313,12 +312,12 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
     return parser
 
-def create_fetcher(options):
-    return RecursiveFetcher(options)
-
-def setup_logger(options):
-    level = logging.DEBUG if options.verbose else logging.WARNING
-    setup_cli_handlers(logger, level)
+def create_fetcher(options, logger=None):
+    if logger is None:
+        level = logging.DEBUG if options.verbose else logging.INFO
+        logger = logging.getLogger('web2disk')
+        setup_cli_handlers(logger, level)
+    return RecursiveFetcher(options, logger)
 
 def main(args=sys.argv):
     parser = option_parser()
@@ -327,7 +326,6 @@ def main(args=sys.argv):
         parser.print_help()
         return 1
 
-    setup_logger(options)
     fetcher = create_fetcher(options)
     fetcher.start_fetch(args[1])
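Dropping the module-level logger means each RecursiveFetcher instance now logs through whatever logger it was given, so two fetchers can be configured independently. create_fetcher(options) with no logger keeps the standalone web2disk behaviour (it builds a 'web2disk' logger itself), while an embedding tool such as web2lrf passes its own. A sketch of quieting an embedded fetcher, assuming the parser's usual defaults for options such as the output directory:

    import logging
    from libprs500.web.fetch.simple import option_parser, create_fetcher

    options, args = option_parser().parse_args(['http://example.com'])

    # Give the fetcher a logger that only reports real problems; because the
    # logger is per instance, this does not affect any other fetcher.
    quiet = logging.getLogger('myapp.fetch')   # example name
    quiet.addHandler(logging.StreamHandler())
    quiet.setLevel(logging.WARNING)

    fetcher = create_fetcher(options, quiet)
    fetcher.start_fetch(args[0])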