Beginnings of the new conversion framework. Input plugins for MOBI and EPUB.

2025-07-07 18:24:30 -04:00 · 2009-03-06 21:38:35 -08:00 · 2009-03-06 21:38:35 -08:00 · 925a86fb0c
commit 925a86fb0c
parent 30bd23ee38
13 changed files with 525 additions and 235 deletions
--- a/src/calibre/init.py
+++ b/src/calibre/init.py
@ -90,28 +90,11 @@ def prints(*args, **kwargs):
        if i != len(args)-1:
            file.write(sep)
    file.write(end)
    file.flush()
 class CommandLineError(Exception):
    # Plain Exception subclass; it adds no attributes or behaviour of its own.
    # NOTE(review): presumably raised for invalid command line usage -- no
    # raise site is visible here, confirm against the callers.
    pass
 class ColoredFormatter(Formatter):
    '''
    Formatter that wraps the level name of a record in terminal colour codes
    before delegating to the standard :class:`Formatter`.
    '''
    def format(self, record):
        '''
        Prefix ``record.levelname`` with the colour for its level, append the
        ``NORMAL`` reset code and return the formatted record. The record is
        modified in place. Level names without a colour get no prefix.
        '''
        fields = record.__dict__
        # Name of the terminal_controller attribute for each level. Looked up
        # by name so that only the colour actually needed is ever read.
        colour_name = {
            'CRITICAL': 'YELLOW',
            'ERROR': 'RED',
            'WARN': 'BLUE',
            'WARNING': 'BLUE',
            'INFO': 'GREEN',
            'DEBUG': 'CYAN',
        }.get(fields['levelname'])
        prefix = ''
        if colour_name is not None:
            prefix = getattr(terminal_controller, colour_name)
        fields['levelname'] = prefix + fields['levelname'] + terminal_controller.NORMAL
        return Formatter.format(self, record)
 def setup_cli_handlers(logger, level):
@ -335,66 +318,23 @@ def english_sort(x, y):
    '''
    return cmp(_spat.sub('', x), _spat.sub('', y))
-class LoggingInterface:
+class ColoredFormatter(Formatter):
-    def __init__(self, logger):
+    def format(self, record):
-        self.__logger = self.logger = logger
+        ln = record.__dict__['levelname']
-        
+        col = ''
-    def setup_cli_handler(self, verbosity):
+        if ln == 'CRITICAL':
-        for handler in self.__logger.handlers:
+            col = terminal_controller.YELLOW
-            if isinstance(handler, logging.StreamHandler):
+        elif ln == 'ERROR':
-                return
+            col = terminal_controller.RED
-        if os.environ.get('CALIBRE_WORKER', None) is not None and self.__logger.handlers:
+        elif ln in ['WARN', 'WARNING']:
-            return
+            col = terminal_controller.BLUE
-        stream    = sys.stdout
+        elif ln == 'INFO':
-        formatter = logging.Formatter()
+            col = terminal_controller.GREEN
-        level     = logging.INFO
+        elif ln == 'DEBUG':
-        if verbosity > 0:
+            col = terminal_controller.CYAN
-            formatter = ColoredFormatter('[%(levelname)s] %(message)s') if verbosity > 1 else \
+        record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL
-                        ColoredFormatter('%(levelname)s: %(message)s')
+        return Formatter.format(self, record)
            level     = logging.DEBUG
            if verbosity > 1:
                stream = sys.stderr
        handler = logging.StreamHandler(stream)
        handler.setFormatter(formatter)
        handler.setLevel(level)
        self.__logger.addHandler(handler)
        self.__logger.setLevel(level)
    def ___log(self, func, msg, args, kwargs):
        args = [msg] + list(args)
        for i in range(len(args)):
            if not isinstance(args[i], basestring):
                continue
            if sys.version_info[:2] > (2, 5):
                if not isinstance(args[i], unicode):
                    args[i] = args[i].decode(preferred_encoding, 'replace')
            elif isinstance(args[i], unicode):
                args[i] = args[i].encode(preferred_encoding, 'replace')
        func(*args, **kwargs)
    def log_debug(self, msg, *args, **kwargs):
        self.___log(self.__logger.debug, msg, args, kwargs)
    def log_info(self, msg, *args, **kwargs):
        self.___log(self.__logger.info, msg, args, kwargs)
    def log_warning(self, msg, *args, **kwargs):
        self.___log(self.__logger.warning, msg, args, kwargs)
    def log_warn(self, msg, *args, **kwargs):
        self.___log(self.__logger.warning, msg, args, kwargs)
    def log_error(self, msg, *args, **kwargs):
        self.___log(self.__logger.error, msg, args, kwargs)
    def log_critical(self, msg, *args, **kwargs):
        self.___log(self.__logger.critical, msg, args, kwargs)
    def log_exception(self, msg, *args):
        self.___log(self.__logger.exception, msg, args, {})
 def walk(dir):
    ''' A nice interface to os.walk '''
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -242,8 +242,13 @@ class MOBIMetadataWriter(MetadataWriterPlugin):
        set_metadata(stream, mi)
-plugins = [HTML2ZIP]
+from calibre.ebooks.epub.input import EPUBInput
 from calibre.ebooks.mobi.input import MOBIInput
 from calibre.customize.profiles import input_profiles
 plugins = [HTML2ZIP, EPUBInput, MOBIInput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
                                        x.__name__.endswith('MetadataReader')]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
                                        x.__name__.endswith('MetadataWriter')]
 plugins += input_profiles
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@ -1,28 +1,30 @@
 from __future__ import with_statement
 '''
 Defines the plugin system for conversions.
 '''
-import re
+import re, os, shutil
 from lxml import html
 from calibre import CurrentDir
 from calibre.customize import Plugin
 class ConversionOption(object):
    '''
    Class representing conversion options
    '''
-    def __init__(self, name=None, default=None, help=None, long_switch=None, 
+    def __init__(self, name=None, help=None, long_switch=None, 
-                 short_switch=None, choices=None, gui_label=None, 
+                 short_switch=None, choices=None):
                 category=None):
        self.name = name
        self.default = default
        self.help = help
        self.long_switch = long_switch
        self.short_switch = short_switch
        self.choices = choices
-        self.gui_label = gui_label
+        
-        self.category = category
+        if self.long_switch is None:
            self.long_switch = '--'+self.name.replace('_', '-')
        self.validate_parameters()
@ -32,41 +34,156 @@ class ConversionOption(object):
        '''
        if re.match(r'[a-zA-Z_]([a-zA-Z0-9_])*', self.name) is None:
            raise ValueError(self.name + ' is not a valid Python identifier')
-        if not (isinstance(self.default, (int, float, str, unicode)) or \
+        if not self.help:
-            self.default is None):
+            raise ValueError('You must set the help text')
 class OptionRecommendation(object):
    '''
    An option together with its recommended value and the strength
    (:attr:`LOW`, :attr:`MED` or :attr:`HIGH`) of that recommendation.
    '''
    LOW  = 1
    MED  = 2
    HIGH = 3
    def __init__(self, recommended_value=None, level=LOW, **kwargs):
        '''
        :param recommended_value: The recommended value: a number, a string
                                  or None.
        :param level: Strength of the recommendation, one of :attr:`LOW`,
                      :attr:`MED` or :attr:`HIGH`.
        :param kwargs: Either ``option``, an existing option object, or the
                       keyword arguments used to build a
                       :class:`ConversionOption`.
        :raises ValueError: If the recommended value fails validation.
        '''
        # The first parameter used to be spelled ``recommeded_value``. Keep
        # accepting that keyword so existing callers do not break.
        if 'recommeded_value' in kwargs:
            recommended_value = kwargs.pop('recommeded_value')
        self.level = level
        self.recommended_value = recommended_value
        self.option = kwargs.pop('option', None)
        if self.option is None:
            self.option = ConversionOption(**kwargs)
        self.validate_parameters()
    def validate_parameters(self):
        '''
        Check the recommended value against the choices of the option (when
        it has any) and against the allowed types.
        :raises ValueError: If the value is not one of the choices, or is
                            neither None, a number nor a string.
        '''
        value = self.recommended_value
        choices = self.option.choices
        if choices and value not in choices:
            raise ValueError('Recommended value not in choices')
        # None is always acceptable, so test it before the type check
        if value is not None and \
                not isinstance(value, (int, float, str, unicode)):
            raise ValueError(unicode(value) +
                             ' is not a string or a number')
-        if not self.help:
+         
            raise ValueError('You must set the help text')      
-class ConversionPlugin(Plugin):
+class InputFormatPlugin(Plugin):
    '''
-    The base class for all conversion related plugins.
+    InputFormatPlugins are responsible for converting a document into 
    HTML+OPF+CSS+etc.
    The results of the conversion *must* be encoded in UTF-8.
    The main action happens in :method:`convert`.
    '''
    #: List of options
    #: Each option must be a dictionary. The dictionary can contain several
    #: keys defining the option. The ones marked by a * are required, the rest
    #: are optional. The keys are::
    #:
    #:    *'name'        : A valid python identifier.
    #:    *'default'     : The default value for this option.
    #:    *'help'        : 
    #:    'short_switch' : A suggestion for a short form of the command line
    #:                     switch (for example if name is 'title', this 
    #:                     could be 't'). It is only used if no prior
    #:                     conversion plugin has claimed it. 
    options = []
-    type = _('Conversion')
+    type = _('Conversion Input')
    can_be_disabled = False
    supported_platforms = ['windows', 'osx', 'linux']
 class InputFormatPlugin(ConversionPlugin):
    #: Set of file types for which this plugin should be run
-    #: For example: ``set(['lit', 'mobi', 'prc'])``
+    #: For example: ``set(['azw', 'mobi', 'prc'])``
    file_types     = set([])
    #: Options shared by all Input format plugins. Do not override
    #: in sub-classes. Use :member:`options` instead. Every option must be an
    #: instance of :class:`OptionRecommendation`. 
    common_options = set([
        OptionRecommendation(name='debug_input',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Save the output from the input plugin to the specified '
                   'directory. Useful if you are unsure at which stage '
                   'of the conversion process a bug is occurring. '
                   'WARNING: This completely deletes the contents of '
                   'the specified directory.')
        ),
        OptionRecommendation(name='input_encoding',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the input document. If '
                   'set this option will override any encoding declared by the '
                   'document itself. Particularly useful for documents that '
                   'do not declare an encoding or that have erroneous '
                   'encoding declarations.')
        ),
    ])
    #: Options to customize the behavior of this plugin. Every option must be an
    #: instance of :class:`OptionRecommendation`.  
    options = set([])
    def convert(self, stream, options, file_ext, parse_cache, log):
        '''
        Convert the input document. Sub-classes must override this method.
        An implementation has to return the path to the OPF file it created
        and keep all of its output inside the current directory. Anything it
        writes outside the current directory must be deleted, or marked for
        deletion, before it returns.
        :param stream:      File-like object holding the input document.
        :param options:     Options customizing the conversion. Has one
                            attribute for every option declared by this
                            plugin, plus ``verbose`` (an integer from zero
                            upwards, larger meaning more verbose) and
                            ``input_profile`` (an instance of
                            :class:`calibre.customize.profiles.InputProfile`).
        :param file_ext:    Extension of the input file without the leading
                            dot. Always one of the `file_types` supported by
                            this plugin.
        :param parse_cache: Dictionary mapping absolute file paths to the
                            parsed form of their contents: the lxml root
                            element for HTML, a cssutils stylesheet for CSS.
                            Add every output file this plugin parses so that
                            later stages won't have to re-parse it. A file
                            whose parsed form is in the cache does not need
                            to be written to disk.
        :param log:         A :class:`calibre.utils.logging.Log` object. All
                            output should go through it.
        '''
        raise NotImplementedError()
    def __call__(self, stream, options, file_ext, parse_cache, log, output_dir):
        '''
        Empty ``output_dir``, run :meth:`convert` inside it and return what
        :meth:`convert` returned.
        Relative keys in ``parse_cache`` are replaced by absolute ones, with
        a warning. If ``options.debug_input`` is set, every cached object is
        written to the file it is keyed by and ``output_dir`` is then copied
        to that directory, replacing whatever was there.
        :param output_dir: Directory that receives the output. Its current
                           contents are deleted.
        The remaining parameters are passed unchanged to :meth:`convert`.
        '''
        log('InputFormatPlugin: %s running'%self.name, end=' ')
        if hasattr(stream, 'name'):
            log('on', stream.name)
        with CurrentDir(output_dir):
            # Start from an empty output directory
            for x in os.listdir('.'):
                if os.path.isdir(x):
                    shutil.rmtree(x)
                else:
                    os.remove(x)
            ret = self.convert(stream, options, file_ext, parse_cache, log)
            # Resolve relative keys while the current directory is still
            # the one the plugin worked in
            for key in list(parse_cache.keys()):
                if os.path.abspath(key) != key:
                    log.warn(('InputFormatPlugin: %s returned a '
                             'relative path: %s')%(self.name, key)
                             )
                    parse_cache[os.path.abspath(key)] = parse_cache.pop(key)
        if options.debug_input is not None:
            options.debug_input = os.path.abspath(options.debug_input)
            # shutil.copytree() needs a destination that does not exist yet
            if os.path.exists(options.debug_input):
                shutil.rmtree(options.debug_input)
            for f, obj in parse_cache.items():
                if hasattr(obj, 'cssText'):
                    raw = obj.cssText
                else:
                    raw = html.tostring(obj, encoding='utf-8', method='xml',
                         include_meta_content_type=True, pretty_print=True)
                if isinstance(raw, unicode):
                    raw = raw.encode('utf-8')
                with open(f, 'wb') as dest:
                    dest.write(raw)
            # The working directory is no longer output_dir at this point,
            # so copy output_dir itself rather than '.'
            shutil.copytree(output_dir, options.debug_input)
        return ret
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@ -0,0 +1,27 @@
 from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from calibre.customize import Plugin
 class InputProfile(Plugin):
    # The default input profile. It only carries descriptive metadata so far.
    author = 'Kovid Goyal'
    supported_platforms = set(['windows', 'osx', 'linux'])
    can_be_disabled = False
    type = _('Input profile')
 # TODO: Add some real information to this profile. All other profiles must
 #       inherit from this profile and override as needed
    name        = 'Default Input Profile'
    short_name  = 'default' # Used in the CLI, so do not put spaces etc. in it
    description = _('This profile tries to provide sane defaults and is useful '
                    'if you know nothing about the input document.')
 input_profiles = [InputProfile]
--- a/src/calibre/customize/ui.py
+++ b/src/calibre/customize/ui.py
@ -6,13 +6,14 @@ import os, shutil, traceback, functools, sys
 from calibre.customize import Plugin, FileTypePlugin, MetadataReaderPlugin, \
                              MetadataWriterPlugin
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.customize.profiles import InputProfile
 from calibre.customize.builtins import plugins as builtin_plugins
 from calibre.constants import __version__, iswindows, isosx
 from calibre.ebooks.metadata import MetaInformation
 from calibre.utils.config import make_config_dir, Config, ConfigProxy, \
                                 plugin_dir, OptionParser
 version = tuple([int(x) for x in __version__.split('.')])
 platform = 'linux'
@ -70,7 +71,10 @@ _on_import           = {}
 _on_preprocess       = {}
 _on_postprocess      = {}
-
+def input_profiles():
    for plugin in _initialized_plugins:
        if isinstance(plugin, InputProfile):
            yield plugin
 def reread_filetype_plugins():
    global _on_import
@ -234,6 +238,17 @@ def find_plugin(name):
        if plugin.name == name:
            return plugin
 def input_format_plugins():
    '''
    Yield every initialized plugin that is an :class:`InputFormatPlugin`.
    '''
    for candidate in _initialized_plugins:
        if not isinstance(candidate, InputFormatPlugin):
            continue
        yield candidate
 def plugin_for_input_format(fmt):
    '''
    Return the first input format plugin whose ``file_types`` contains
    ``fmt``, or None if no plugin handles it.
    '''
    for candidate in input_format_plugins():
        handled = candidate.file_types
        if fmt in handled:
            return candidate
    return None
 def disable_plugin(plugin_or_name):
    x = getattr(plugin_or_name, 'name', plugin_or_name)
    plugin = find_plugin(x)
--- a/src/calibre/ebooks/conversion/init.py
+++ b/src/calibre/ebooks/conversion/init.py
@ -0,0 +1,4 @@
 from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -0,0 +1,30 @@
 from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from calibre.customize.conversion import OptionRecommendation 
 from calibre.customize.ui import input_profiles
 pipeline_options = [
 OptionRecommendation(name='verbose', 
            recommended_value=0, level=OptionRecommendation.LOW,
            short_switch='v', 
            help=_('Level of verbosity. Specify multiple times for greater '
                   'verbosity.')
        ),
 OptionRecommendation(name='input_profile',
            recommended_value='default', level=OptionRecommendation.LOW,
            choices=[x.short_name for x in input_profiles()],
            help=_('Specify the input profile. The input profile gives the '
                   'conversion system information on how to interpret '
                   'various information in the input document. For '
                   'example resolution dependent lengths (i.e. lengths in '
                   'pixels).')
        ),
 ]
--- a/src/calibre/ebooks/epub/init.py
+++ b/src/calibre/ebooks/epub/init.py
@ -40,38 +40,6 @@ def rules(stylesheets):
                if r.type == r.STYLE_RULE:
                    yield r
 def decrypt_font(key, path):
    '''
    XOR the first 1024 bytes of the file at ``path`` with the repeating
    ``key`` and write the result back in place. The rest of the file is
    left unchanged.
    :param key:  Sequence of integer byte values (0-255). An empty key
                 leaves the file unchanged.
    :param path: Path of the file to rewrite.
    '''
    with open(path, 'rb') as f:
        raw = bytearray(f.read())
    # zip() stops at the shorter input, which also covers files that are
    # smaller than 1024 bytes
    for pos, key_byte in zip(range(min(1024, len(raw))), cycle(key)):
        raw[pos] ^= key_byte
    with open(path, 'wb') as f:
        f.write(bytes(raw))
 def process_encryption(encfile, opf):
    '''
    Decrypt, in place, the files listed in the encryption manifest
    ``encfile``. The key is taken from the first ``urn:uuid:`` identifier
    found in the OPF file.
    :param encfile: Path to the ``encryption.xml`` file.
    :param opf:     Path to the OPF file that contains the identifier.
    :return: True if every entry uses the
             ``http://ns.adobe.com/pdf/enc#RC`` algorithm and was processed.
             False if another algorithm is used, if a file has to be
             decrypted but no key was found, or if an error occurred (the
             traceback is printed).
    '''
    with open(opf, 'rb') as f:
        m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', f.read())
    key = None
    if m:
        key = list(map(ord, uuid.UUID(m.group(1)).bytes))
    try:
        root = etree.parse(encfile)
        for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
            algorithm = em.get('Algorithm', '')
            if algorithm != 'http://ns.adobe.com/pdf/enc#RC':
                return False
            cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
            uri = cr.get('URI')
            path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
            if os.path.exists(path):
                if key is None:
                    # Nothing to decrypt with: report failure explicitly
                    return False
                decrypt_font(key, path)
        return True
    except Exception:
        # Best effort: report the problem and let the caller see a failure.
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        import traceback
        traceback.print_exc()
    return False
 def initialize_container(path_to_container, opf_name='metadata.opf'):
    '''
    Create an empty EPUB document, with a default skeleton.
--- a/src/calibre/ebooks/epub/input.py
+++ b/src/calibre/ebooks/epub/input.py
@ -0,0 +1,76 @@
 from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import os, re, uuid
 from itertools import cycle
 from lxml import etree
 from calibre.customize.conversion import InputFormatPlugin
 class EPUBInput(InputFormatPlugin):
    '''
    Input plugin that unpacks an EPUB file into the current directory and
    returns the path of the OPF file found inside it.
    '''
    name        = 'EPUB Input'
    author      = 'Kovid Goyal'
    description = 'Convert EPUB files (.epub) to HTML'
    file_types  = set(['epub'])
    @classmethod
    def decrypt_font(cls, key, path):
        '''
        XOR the first 1024 bytes of the file at ``path`` with the repeating
        ``key`` and write the result back in place. The rest of the file is
        left unchanged.
        :param key:  Sequence of integer byte values (0-255).
        :param path: Path of the file to rewrite.
        '''
        with open(path, 'rb') as f:
            raw = bytearray(f.read())
        # zip() stops at the shorter input, which also covers files that
        # are smaller than 1024 bytes
        for pos, key_byte in zip(range(min(1024, len(raw))), cycle(key)):
            raw[pos] ^= key_byte
        with open(path, 'wb') as f:
            f.write(bytes(raw))
    @classmethod
    def process_encryption(cls, encfile, opf, log):
        '''
        Decrypt, in place, the files listed in the encryption manifest
        ``encfile``. The key is taken from the first ``urn:uuid:``
        identifier found in the OPF file.
        :param encfile: Path to the ``encryption.xml`` file.
        :param opf:     Path to the OPF file that contains the identifier.
        :param log:     Log object. Currently unused; errors are printed
                        with :mod:`traceback`.
        :return: True if every entry uses the
                 ``http://ns.adobe.com/pdf/enc#RC`` algorithm and was
                 processed. False if another algorithm is used, if a file
                 has to be decrypted but no key was found, or if an error
                 occurred.
        '''
        with open(opf, 'rb') as f:
            m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', f.read())
        key = None
        if m:
            key = list(map(ord, uuid.UUID(m.group(1)).bytes))
        try:
            root = etree.parse(encfile)
            for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
                algorithm = em.get('Algorithm', '')
                if algorithm != 'http://ns.adobe.com/pdf/enc#RC':
                    return False
                cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
                uri = cr.get('URI')
                path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
                if os.path.exists(path):
                    if key is None:
                        # Nothing to decrypt with: report failure explicitly
                        return False
                    cls.decrypt_font(key, path)
            return True
        except Exception:
            # Best effort: report the problem and signal failure.
            # KeyboardInterrupt and SystemExit are deliberately not caught.
            import traceback
            traceback.print_exc()
        return False
    # The method was first published under this misspelled name while
    # convert() called the correct spelling. Keep the old name working.
    process_ecryption = process_encryption
    def convert(self, stream, options, file_ext, parse_cache, log):
        '''
        Extract the EPUB in ``stream`` into the current directory and return
        the path of the first ``.opf`` file found there.
        :raises ValueError: If the archive contains no OPF file.
        :raises DRMError:   If ``META-INF/encryption.xml`` exists and could
                            not be processed.
        '''
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
        zf = ZipFile(stream)
        zf.extractall(os.getcwd())
        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
        opf = None
        for f in walk('.'):
            if f.lower().endswith('.opf'):
                opf = f
                break
        path = getattr(stream, 'name', 'stream')
        if opf is None:
            raise ValueError('%s is not a valid EPUB file'%path)
        if os.path.exists(encfile):
            if not self.process_encryption(encfile, opf, log):
                raise DRMError(os.path.basename(path))
        return opf
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@ -0,0 +1,29 @@
 from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import os
 from calibre.customize.conversion import InputFormatPlugin
 class MOBIInput(InputFormatPlugin):
    '''
    Input plugin that uses :class:`MobiReader` to extract a MOBI file into
    the current directory.
    '''
    name        = 'MOBI Input'
    author      = 'Kovid Goyal'
    description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
    file_types  = set(['mobi', 'prc', 'azw'])
    def convert(self, stream, options, file_ext, parse_cache, log):
        '''
        Extract the MOBI document in ``stream`` into the current directory
        and return the path of the OPF file created by the reader.
        If the reader left the raw MOBI markup in ``parse_cache``, it is
        written to ``debug-raw.html`` and removed from the cache.
        '''
        from calibre.ebooks.mobi.reader import MobiReader
        mr = MobiReader(stream, log, options.input_encoding,
                        options.debug_input)
        # Both arguments are positional: a positional argument may not
        # follow a keyword argument
        mr.extract_content(os.getcwdu(), parse_cache)
        # The raw markup is a string, not a parsed file, so take it out of
        # the cache instead of leaving it for later stages to serialize
        raw = parse_cache.pop('calibre_raw_mobi_markup', False)
        if raw:
            if isinstance(raw, unicode):
                raw = raw.encode('utf-8')
            with open('debug-raw.html', 'wb') as f:
                f.write(raw)
        return mr.created_opf_path
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 Read data from .mobi files
 '''
-import sys, struct, os, cStringIO, re, functools
+import struct, os, cStringIO, re, functools
 try:
    from PIL import Image as PILImage
@ -35,8 +35,10 @@ class EXTHHeader(object):
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        left = self.num_items
-        for i in range(self.num_items):
+        while left > 0:
            left -= 1
            id, size = struct.unpack('>LL', raw[pos:pos+8])
            content = raw[pos+8:pos+size]
            pos += size
@ -76,7 +78,8 @@ class EXTHHeader(object):
 class BookHeader(object):
-    def __init__(self, raw, ident):
+    def __init__(self, raw, ident, user_encoding, log):
        self.log = log
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
        self.encryption_type, = struct.unpack('>H', raw[12:14])
@ -92,8 +95,8 @@ class BookHeader(object):
        else:
            self.ancient = False
            self.doctype = raw[16:20]
-            self.length, self.type, self.codepage, self.unique_id, self.version = \
+            self.length, self.type, self.codepage, self.unique_id, \
-                     struct.unpack('>LLLLL', raw[20:40])
+                self.version = struct.unpack('>LLLLL', raw[20:40])
            try:
@ -102,8 +105,9 @@ class BookHeader(object):
                          65001 : 'utf-8',
                          }[self.codepage]
            except (IndexError, KeyError):
-                print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage
+                self.codec = 'cp1252' if user_encoding is None else user_encoding
-                self.codec = 'cp1252'
+                log.warn('Unknown codepage %d. Assuming %s'%(self.codepage,
                                                            self.codec))
            if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length:
                self.extra_flags = 0
@ -138,9 +142,24 @@ class MobiReader(object):
    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
-    def __init__(self, filename_or_stream, verbose=False):
+    def __init__(self, filename_or_stream, log, user_encoding=None, debug=None):
-        self.verbose = verbose
+        self.log = log
        self.debug = debug
        self.embedded_mi = None
        self.base_css_rules = '''
                blockquote { margin: 0em 0em 0em 1.25em; text-align: justify }
                p { margin: 0em; text-align: justify }
                .bold { font-weight: bold }
                .italic { font-style: italic }
                .mbp_pagebreak {
                    page-break-after: always; margin: 0; display: block
                }
                '''
        self.tag_css_rules = []
        if hasattr(filename_or_stream, 'read'):
            stream = filename_or_stream
@ -177,17 +196,21 @@ class MobiReader(object):
            self.sections.append((section(i), self.section_headers[i])) 
-        self.book_header = BookHeader(self.sections[0][0], self.ident)
+        self.book_header = BookHeader(self.sections[0][0], self.ident, 
                                      user_encoding, self.log)
        self.name = self.name.decode(self.book_header.codec, 'replace')
-    def extract_content(self, output_dir=os.getcwdu()):
+    def extract_content(self, output_dir, parse_cache):
        output_dir = os.path.abspath(output_dir)
        if self.book_header.encryption_type != 0:
            raise DRMError(self.name)
        processed_records = self.extract_text()
        if self.debug is not None:
            self.parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
        self.add_anchors()
-        self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
+        self.processed_html = self.processed_html.decode(self.book_header.codec,
                                                          'ignore')
        for pat in ENCODING_PATS:
            self.processed_html = pat.sub('', self.processed_html)
        e2u = functools.partial(entity_to_unicode, 
@ -203,16 +226,10 @@ class MobiReader(object):
        self.processed_html = \
            re.compile('<head>', re.IGNORECASE).sub(
                '\n<head>\n'
-                '<style type="text/css">\n'
+                '\t<link type="text/css" href="styles.css" />\n',
                'blockquote { margin: 0em 0em 0em 1.25em; text-align: justify; }\n'
                'p { margin: 0em; text-align: justify; }\n'
                '.bold { font-weight: bold; }\n'
                '.italic { font-style: italic; }\n'
                '</style>\n',
                self.processed_html)
-        if self.verbose:
+        self.log.debug('Parsing HTML...')
            print 'Parsing HTML...'
        root = html.fromstring(self.processed_html)
        self.upshift_markup(root)
        guides = root.xpath('//guide')
@ -230,25 +247,24 @@ class MobiReader(object):
                    ref.attrib['href'] = os.path.basename(htmlfile)+ref.attrib['href']
        except AttributeError:
            pass
-        if self.verbose:
+        parse_cache[htmlfile] = root
            print 'Serializing...'
        with open(htmlfile, 'wb') as f:
            raw = html.tostring(root, encoding='utf-8', method='xml', 
                         include_meta_content_type=True, pretty_print=True)
            raw = raw.replace('<head>', 
            '<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n')
            f.write(raw)
        self.htmlfile = htmlfile
-        if self.book_header.exth is not None or self.embedded_mi is not None:
+        self.log.debug('Creating OPF...')
-            if self.verbose:
+        ncx = cStringIO.StringIO()
-                print 'Creating OPF...'
+        opf = self.create_opf(htmlfile, guide, root)
-            ncx = cStringIO.StringIO()
+        self.created_opf_path = os.path.splitext(htmlfile)[0]+'.opf' 
-            opf = self.create_opf(htmlfile, guide, root)
+        opf.render(open(self.created_opf_path, 'wb'), ncx)
-            opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx)
+        ncx = ncx.getvalue()
-            ncx = ncx.getvalue()
+        if ncx:
-            if ncx:
+            open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
-                open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
+                
        with open('styles.css', 'wb') as s:
            s.write(self.base_css_rules+'\n\n')
            for rule in self.tag_css_rules:
                if isinstance(rule, unicode):
                    rule = rule.encode('utf-8')
                s.write(rule+'\n\n')
    def read_embedded_metadata(self, root, elem, guide):
        raw = '<package>'+html.tostring(elem, encoding='utf-8')+'</package>'
@ -277,8 +293,7 @@ class MobiReader(object):
    def cleanup_html(self):
-        if self.verbose:
+        self.log.debug('Cleaning up HTML...')
            print 'Cleaning up HTML...'
        self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
        if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
            self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>'
@ -286,8 +301,7 @@ class MobiReader(object):
        self.processed_html = self.processed_html.replace('> <', '>\n<')
    def upshift_markup(self, root):
-        if self.verbose:
+        self.log.debug('Converting style information to CSS...')
            print 'Converting style information to CSS...'
        size_map = {
                    'xx-small' : '0.5',
                    'x-small'  : '1',
@ -298,7 +312,7 @@ class MobiReader(object):
                    'xx-large' : '6',
                    }
        mobi_version = self.book_header.mobi_version
-        for tag in root.iter(etree.Element):
+        for i, tag in enumerate(root.iter(etree.Element)):
            if tag.tag in ('country-region', 'place', 'placetype', 'placename',
                           'state', 'city'):
                tag.tag = 'span'
@ -352,8 +366,7 @@ class MobiReader(object):
            elif tag.tag == 'pre':
                if not tag.text:
                    tag.tag = 'div'
-            if styles:
+            
                attrib['style'] = '; '.join(styles)
            if 'filepos-id' in attrib:
                attrib['id'] = attrib.pop('filepos-id')
            if 'filepos' in attrib:
@ -362,15 +375,24 @@ class MobiReader(object):
                    attrib['href'] = "#filepos%d" % int(filepos)
                except ValueError:
                    pass
            if styles:
                attrib['id'] = attrib.get('id', 'calibre_mr_gid%d'%i)
                self.tag_css_rules.append('#%s {%s}'%(attrib['id'], 
                                                      '; '.join(styles)))
    def create_opf(self, htmlfile, guide=None, root=None):
        mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
        if mi is None:
            mi = MetaInformation(self.title, [_('Unknown')])
        opf = OPFCreator(os.path.dirname(htmlfile), mi)
        if hasattr(self.book_header.exth, 'cover_offset'):
            opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
        elif mi.cover is not None:
            opf.cover = mi.cover
-        manifest = [(htmlfile, 'text/x-oeb1-document')]
+        manifest = [(htmlfile, 'text/x-oeb1-document'), 
                    (os.path.abspath('styles.css'), 'text/css')]
        bp = os.path.dirname(htmlfile)
        for i in getattr(self, 'image_names', []):
            manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg'))
@ -441,8 +463,7 @@ class MobiReader(object):
        return data[:len(data)-trail_size]
    def extract_text(self):
-        if self.verbose:
+        self.log.debug('Extracting text...')
            print 'Extracting text...'
        text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)]
        processed_records = list(range(0, self.book_header.records+1))
@ -472,12 +493,11 @@ class MobiReader(object):
    def replace_page_breaks(self):
        self.processed_html = self.PAGE_BREAK_PAT.sub(
-            '<div class="mbp_pagebreak" style="page-break-after: always; margin: 0; display: block" />',
+            '<div class="mbp_pagebreak" />',
            self.processed_html)
    def add_anchors(self):
-        if self.verbose:
+        self.log.debug('Adding anchors...')
            print 'Adding anchors...'
        positions = set([])
        link_pattern = re.compile(r'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
                                  re.IGNORECASE)
@ -507,8 +527,7 @@ class MobiReader(object):
    def extract_images(self, processed_records, output_dir):
-        if self.verbose:
+        self.log.debug('Extracting images...')
            print 'Extracting images...'
        output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
@ -535,14 +554,17 @@ class MobiReader(object):
            im.convert('RGB').save(open(path, 'wb'), format='JPEG')
 def get_metadata(stream):
-    mr = MobiReader(stream)
+    from calibre.utils.logging import Log
    log = Log()
    mr = MobiReader(stream, log)
    if mr.book_header.exth is None:
        mi = MetaInformation(mr.name, [_('Unknown')])
    else:
        mi = mr.create_opf('dummy.html')
        try:
            if hasattr(mr.book_header.exth, 'cover_offset'):
-                cover_index = mr.book_header.first_image_index + mr.book_header.exth.cover_offset
+                cover_index = mr.book_header.first_image_index + \
                              mr.book_header.exth.cover_offset
                data  = mr.sections[int(cover_index)][0]
            else:
                data  = mr.sections[mr.book_header.first_image_index][0]
@ -552,42 +574,7 @@ def get_metadata(stream):
            im.convert('RGBA').save(obuf, format='JPEG')
            mi.cover_data = ('jpg', obuf.getvalue())
        except:
-            import traceback
+            log.exception()
            traceback.print_exc()
    return mi
 def option_parser():
    '''
    Build the command line option parser for the MOBI extraction tool.

    The parser accepts ``-o/--output-dir`` (default ``'.'``) and the
    boolean flag ``-v/--verbose`` (default ``False``).

    :return: The configured ``OptionParser`` instance.
    '''
    from calibre.utils.config import OptionParser
    usage_text = _('%prog [options] myebook.mobi')
    output_help = _('Output directory. Defaults to current directory.')
    parser = OptionParser(usage=usage_text)
    parser.add_option('-o', '--output-dir', default='.', help=output_help)
    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='Useful for debugging.')
    return parser
 def main(args=sys.argv):
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        return 1
    mr = MobiReader(args[1], verbose=opts.verbose)
    opts.output_dir = os.path.abspath(opts.output_dir)
    mr.extract_content(opts.output_dir)
    if opts.verbose:
        oname = os.path.join(opts.output_dir, 'debug-raw.html')
        dat = mr.mobi_html
        if isinstance(dat, unicode):
            dat = dat.encode('utf-8')
        open(oname, 'wb').write(dat)
        print _('Raw MOBI HTML saved in'), oname
    print _('OEB ebook created in'), opts.output_dir
    return 0
 # When executed as a script, run main() and use its return value as the
 # process exit status.
 if __name__ == '__main__':
    sys.exit(main())
--- a/src/calibre/utils/logging.py
+++ b/src/calibre/utils/logging.py
@ -0,0 +1,92 @@
 from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 'A simplified logging system'
 # Message levels, in increasing order of severity. Log.prints() drops any
 # message whose level is lower than the Log's filter_level, and the stream
 # classes use these values as keys into their colour tables.
 DEBUG = 0
 INFO  = 1
 WARN  = 2
 ERROR = 3
 import sys, traceback
 from functools import partial
 from calibre import prints
 from calibre.utils.terminfo import TerminalController
 class ANSIStream:
    '''
    Log output that brackets every message with the colour strings
    supplied by a TerminalController created for the target stream.
    '''

    def __init__(self, stream=sys.stdout):
        self.stream = stream
        controller = TerminalController(stream)
        # Prefix written before a message of each level; INFO is uncoloured
        self.color = dict([
            (DEBUG, controller.GREEN),
            (INFO, ''),
            (WARN, controller.YELLOW),
            (ERROR, controller.RED),
        ])
        # Suffix written after every message
        self.normal = controller.NORMAL

    def prints(self, level, *args, **kwargs):
        '''
        Write the colour prefix for `level`, the message (via the module
        level prints(), directed at this stream) and then the suffix.
        '''
        out = self.stream
        out.write(self.color[level])
        kwargs['file'] = out
        prints(*args, **kwargs)
        out.write(self.normal)

    def flush(self):
        '''Flush the underlying stream.'''
        self.stream.flush()
 class HTMLStream:
    '''
    Log output that wraps every message in a ``<span>`` element whose
    inline style sets a colour according to the message level.
    '''

    def __init__(self, stream=sys.stdout):
        self.stream = stream
        # Opening tag written before a message of each level
        self.color = dict([
            (DEBUG, '<span style="color:green">'),
            (INFO, '<span>'),
            (WARN, '<span style="color:yellow">'),
            (ERROR, '<span style="color:red">'),
        ])
        # Closing tag written after every message
        self.normal = '</span>'

    def prints(self, level, *args, **kwargs):
        '''
        Write the opening tag for `level`, the message (via the module
        level prints(), directed at this stream) and the closing tag.
        '''
        out = self.stream
        out.write(self.color[level])
        kwargs['file'] = out
        prints(*args, **kwargs)
        out.write(self.normal)

    def flush(self):
        '''Flush the underlying stream.'''
        self.stream.flush()
 class Log(object):
    '''
    Minimal logger that forwards messages to every object in
    ``self.outputs`` (initially a single ANSIStream).

    Messages whose level is below ``self.filter_level`` are discarded.
    Instances provide ``debug``, ``info``, ``warn``/``warning`` and
    ``error`` shortcuts; calling the instance itself logs at INFO.
    '''

    # Expose the level constants on the class for convenience
    DEBUG = DEBUG
    INFO  = INFO
    WARN  = WARN
    ERROR = ERROR

    def __init__(self, level=INFO):
        self.filter_level = level
        self.outputs = [ANSIStream()]
        # Per-level shortcuts: each one is prints() with the level bound
        for name, lvl in (('debug', DEBUG), ('info', INFO),
                          ('warn', WARN), ('error', ERROR)):
            setattr(self, name, partial(self.prints, lvl))
        self.warning = self.warn

    def prints(self, level, *args, **kwargs):
        '''Send the message to all outputs unless `level` is filtered out.'''
        if level >= self.filter_level:
            for sink in self.outputs:
                sink.prints(level, *args, **kwargs)

    def exception(self, *args, **kwargs):
        '''
        Log the message at ERROR level, then the formatted traceback of
        the exception being handled at DEBUG level. The optional keyword
        argument `limit` is passed to traceback.format_exc().
        '''
        tb_limit = kwargs.pop('limit', None)
        self.prints(ERROR, *args, **kwargs)
        self.prints(DEBUG, traceback.format_exc(tb_limit))

    def __call__(self, *args, **kwargs):
        '''Log the message at INFO level.'''
        self.prints(INFO, *args, **kwargs)
--- a/src/calibre/utils/terminfo.py
+++ b/src/calibre/utils/terminfo.py
@ -33,7 +33,7 @@ class TerminalController:
    >>> term = TerminalController()
    >>> if term.CLEAR_SCREEN:
-    ...     print 'This terminal supports clearning the screen.'
+    ...     print 'This terminal supports clearing the screen.'
    Finally, if the width and height of the terminal are known, then
    they will be stored in the `COLS` and `LINES` attributes.