Beginnings of the new conversion framework. Input plugins for MOBI and EPUB.

2025-11-24 23:35:01 -05:00 · 2009-03-06 21:38:35 -08:00 · 2009-03-06 21:38:35 -08:00 · 925a86fb0c
commit 925a86fb0c
parent 30bd23ee38
13 changed files with 525 additions and 235 deletions
--- a/src/calibre/init.py
+++ b/src/calibre/init.py
@ -90,28 +90,11 @@ def prints(*args, **kwargs):
        if i != len(args)-1:
            file.write(sep)
    file.write(end)
-    file.flush()

 class CommandLineError(Exception):
    pass

-class ColoredFormatter(Formatter):

-    def format(self, record):
-        ln = record.__dict__['levelname']
-        col = ''
-        if ln == 'CRITICAL':
-            col = terminal_controller.YELLOW
-        elif ln == 'ERROR':
-            col = terminal_controller.RED
-        elif ln in ['WARN', 'WARNING']:
-            col = terminal_controller.BLUE
-        elif ln == 'INFO':
-            col = terminal_controller.GREEN
-        elif ln == 'DEBUG':
-            col = terminal_controller.CYAN
-        record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL
-        return Formatter.format(self, record)


 def setup_cli_handlers(logger, level):
@ -335,66 +318,23 @@ def english_sort(x, y):
    '''
    return cmp(_spat.sub('', x), _spat.sub('', y))

-class LoggingInterface:
+class ColoredFormatter(Formatter):

-    def __init__(self, logger):
-        self.__logger = self.logger = logger
-        
-    def setup_cli_handler(self, verbosity):
-        for handler in self.__logger.handlers:
-            if isinstance(handler, logging.StreamHandler):
-                return
-        if os.environ.get('CALIBRE_WORKER', None) is not None and self.__logger.handlers:
-            return
-        stream    = sys.stdout
-        formatter = logging.Formatter()
-        level     = logging.INFO
-        if verbosity > 0:
-            formatter = ColoredFormatter('[%(levelname)s] %(message)s') if verbosity > 1 else \
-                        ColoredFormatter('%(levelname)s: %(message)s')
-            level     = logging.DEBUG
-            if verbosity > 1:
-                stream = sys.stderr
-        
-        handler = logging.StreamHandler(stream)
-        handler.setFormatter(formatter)
-        handler.setLevel(level)
-        self.__logger.addHandler(handler)
-        self.__logger.setLevel(level)
-
-
-    def ___log(self, func, msg, args, kwargs):
-        args = [msg] + list(args)
-        for i in range(len(args)):
-            if not isinstance(args[i], basestring):
-                continue
-            if sys.version_info[:2] > (2, 5):
-                if not isinstance(args[i], unicode):
-                    args[i] = args[i].decode(preferred_encoding, 'replace')
-            elif isinstance(args[i], unicode):
-                args[i] = args[i].encode(preferred_encoding, 'replace')
-        func(*args, **kwargs)
-
-    def log_debug(self, msg, *args, **kwargs):
-        self.___log(self.__logger.debug, msg, args, kwargs)
-
-    def log_info(self, msg, *args, **kwargs):
-        self.___log(self.__logger.info, msg, args, kwargs)
-
-    def log_warning(self, msg, *args, **kwargs):
-        self.___log(self.__logger.warning, msg, args, kwargs)
-
-    def log_warn(self, msg, *args, **kwargs):
-        self.___log(self.__logger.warning, msg, args, kwargs)
-
-    def log_error(self, msg, *args, **kwargs):
-        self.___log(self.__logger.error, msg, args, kwargs)
-
-    def log_critical(self, msg, *args, **kwargs):
-        self.___log(self.__logger.critical, msg, args, kwargs)
-
-    def log_exception(self, msg, *args):
-        self.___log(self.__logger.exception, msg, args, {})
+    def format(self, record):
+        '''
+        Format ``record`` with its level name wrapped in terminal color codes.
+
+        The color prefix is chosen from the record's level name (no prefix is
+        used for level names not listed below) and ``terminal_controller.NORMAL``
+        is always appended. The record's ``levelname`` is modified in place
+        before the actual formatting is delegated to ``Formatter.format``.
+        '''
+        ln = record.__dict__['levelname']
+        col = ''
+        if ln == 'CRITICAL':
+            col = terminal_controller.YELLOW
+        elif ln == 'ERROR':
+            col = terminal_controller.RED
+        elif ln in ['WARN', 'WARNING']:
+            col = terminal_controller.BLUE
+        elif ln == 'INFO':
+            col = terminal_controller.GREEN
+        elif ln == 'DEBUG':
+            col = terminal_controller.CYAN
+        # The record itself is changed, not a copy of it
+        record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL
+        return Formatter.format(self, record)

 def walk(dir):
    ''' A nice interface to os.walk '''
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -242,8 +242,13 @@ class MOBIMetadataWriter(MetadataWriterPlugin):
        set_metadata(stream, mi)


-plugins = [HTML2ZIP]
+from calibre.ebooks.epub.input import EPUBInput
+from calibre.ebooks.mobi.input import MOBIInput
+from calibre.customize.profiles import input_profiles
+
+plugins = [HTML2ZIP, EPUBInput, MOBIInput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
                                        x.__name__.endswith('MetadataReader')]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
                                        x.__name__.endswith('MetadataWriter')]
+plugins += input_profiles
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@ -1,28 +1,30 @@
+from __future__ import with_statement
 '''
 Defines the plugin sytem for conversions.
 '''
-import re
+import re, os, shutil

+from lxml import html
+
+from calibre import CurrentDir
 from calibre.customize import Plugin

-
 class ConversionOption(object):
    
    '''
    Class representing conversion options
    '''
    
-    def __init__(self, name=None, default=None, help=None, long_switch=None, 
-                 short_switch=None, choices=None, gui_label=None, 
-                 category=None):
+    def __init__(self, name=None, help=None, long_switch=None, 
+                 short_switch=None, choices=None):
        self.name = name
-        self.default = default
        self.help = help
        self.long_switch = long_switch
        self.short_switch = short_switch
        self.choices = choices
-        self.gui_label = gui_label
-        self.category = category
+        
+        if self.long_switch is None:
+            self.long_switch = '--'+self.name.replace('_', '-')
        
        self.validate_parameters()
        
@ -32,41 +34,156 @@ class ConversionOption(object):
        '''
        if re.match(r'[a-zA-Z_]([a-zA-Z0-9_])*', self.name) is None:
            raise ValueError(self.name + ' is not a valid Python identifier')
-        if not (isinstance(self.default, (int, float, str, unicode)) or \
-            self.default is None):
-            raise ValueError(unicode(self.default) + 
-                             ' is not a string or a number')
        if not self.help:
            raise ValueError('You must set the help text')
        
-class ConversionPlugin(Plugin):
        
-    '''
-    The base class for all conversion related plugins.
-    '''
-    #: List of options
-    #: Each option must be a dictionary. The dictionary can contain several
-    #: keys defining the option. The ones marked by a * are required, the rest
-    #: are optional. The keys are::
-    #:
-    #:    *'name'        : A valid python identifier.
-    #:    *'default'     : The default value for this option.
-    #:    *'help'        : 
-    #:    'short_switch' : A suggestion for a short form of the command line
-    #:                     switch (for example if name is 'title', this 
-    #:                     could be 't'). It is only used if no prior
-    #:                     conversion plugin has claimed it. 
-    options = []
+class OptionRecommendation(object):
+    LOW  = 1
+    MED  = 2
+    HIGH = 3
    
-    type = _('Conversion')
+    def __init__(self, recommended_value=None, level=LOW, **kwargs):
+        '''
+        An option recommendation. That is, an option as well as its recommended
+        value and the level of the recommendation.
+
+        :param recommended_value: The value recommended for the option. Can be
+                                  passed positionally or by keyword.
+        :param level: The strength of the recommendation, one of ``LOW``,
+                      ``MED`` or ``HIGH``.
+        :param kwargs: Either ``option``, an already constructed
+                       :class:`ConversionOption`, or the keyword arguments
+                       from which a new :class:`ConversionOption` is created.
+        '''
+        # This parameter was originally misspelled ``recommeded_value`` while
+        # callers pass ``recommended_value=...``. Accept the old spelling as
+        # well so that it does not leak into the ConversionOption arguments.
+        if 'recommeded_value' in kwargs:
+            recommended_value = kwargs.pop('recommeded_value')
+        self.level = level
+        self.recommended_value = recommended_value
+        option = kwargs.pop('option', None)
+        if option is None:
+            option = ConversionOption(**kwargs)
+        self.option = option
+
+        self.validate_parameters()
+    
+    def validate_parameters(self):
+        '''
+        Check that the recommended value is usable for the option.
+
+        :raises ValueError: If the option defines ``choices`` and the
+                            recommended value is not one of them, or if the
+                            recommended value is neither ``None`` nor a
+                            string or a number.
+        '''
+        value = self.recommended_value
+        if self.option.choices and value not in self.option.choices:
+            raise ValueError('Recommended value not in choices')
+        # This check used to read ``self.default``, an attribute that is never
+        # set on this class, so it failed with AttributeError for any value
+        # that was not a string or a number (including None).
+        if not (value is None or
+                isinstance(value, (int, float, str, unicode))):
+            raise ValueError(unicode(value) +
+                             ' is not a string or a number')
+         
+
+class InputFormatPlugin(Plugin):
+    '''
+    InputFormatPlugins are responsible for converting a document into 
+    HTML+OPF+CSS+etc.
+    The results of the conversion *must* be encoded in UTF-8.
+    The main action happens in :method:`convert`.
+    '''
+    
+    type = _('Conversion Input')
    can_be_disabled = False
    supported_platforms = ['windows', 'osx', 'linux']
    
-
-class InputFormatPlugin(ConversionPlugin):
-    
    #: Set of file types for which this plugin should be run
-    #: For example: ``set(['lit', 'mobi', 'prc'])``
+    #: For example: ``set(['azw', 'mobi', 'prc'])``
    file_types     = set([])
    
+    #: Options shared by all Input format plugins. Do not override
+    #: in sub-classes. Use :member:`options` instead. Every option must be an
+    #: instance of :class:`OptionRecommendation`. 
+    common_options = set([
+        OptionRecommendation(name='debug_input',
+            recommended_value=None, level=OptionRecommendation.LOW,
+            help=_('Save the output from the input plugin to the specified '
+                   'directory. Useful if you are unsure at which stage '
+                   'of the conversion process a bug is occurring. '
+                   'WARNING: This completely deletes the contents of '
+                   'the specified directory.')
+        ),
+        
+        OptionRecommendation(name='input_encoding',
+            recommended_value=None, level=OptionRecommendation.LOW,
+            help=_('Specify the character encoding of the input document. If '
+                   'set this option will override any encoding declared by the '
+                   'document itself. Particularly useful for documents that '
+                   'do not declare an encoding or that have erroneous '
+                   'encoding declarations.')
+        ),
+        
+    ])
+    
+    #: Options to customize the behavior of this plugin. Every option must be an
+    #: instance of :class:`OptionRecommendation`.  
+    options = set([])
+    
+    def convert(self, stream, options, file_ext, parse_cache, log):
+        '''
+        This method must be implemented in sub-classes. It must return
+        the path to the created OPF file. All output should be contained in 
+        the current directory. If this plugin creates files outside the current
+        directory they must be deleted/marked for deletion before this method 
+        returns.
+        
+        :param stream:   A file like object that contains the input file.
+        
+        :param options:  Options to customize the conversion process. 
+                         Guaranteed to have attributes corresponding
+                         to all the options declared by this plugin. In 
+                         addition, it will have a verbose attribute that
+                         takes integral values from zero upwards. Higher numbers
+                         mean be more verbose. Another useful attribute is 
+                         ``input_profile`` that is an instance of 
+                         :class:`calibre.customize.profiles.InputProfile`.
+                         
+        :param file_ext: The extension (without the .) of the input file. It
+                         is guaranteed to be one of the `file_types` supported
+                         by this plugin.
+        
+        :param parse_cache:    A dictionary that maps absolute file paths to
+                               parsed representations of their contents. For
+                               HTML the representation is an lxml element of 
+                               the root of the tree. For CSS it is a cssutils
+                               stylesheet. If this plugin parses any of the
+                               output files, it should add them to the cache
+                               so that later stages of the conversion won't
+                               have to re-parse them. If a parsed representation
+                               is in the cache, there is no need to actually 
+                               write the file to disk.
+        
+        :param log: A :class:`calibre.utils.logging.Log` object. All output 
+                    should use this object.
+
+        :return: The path to the created OPF file.
+
+        :raises NotImplementedError: Always, in this base class.
+        '''
+        raise NotImplementedError
+    
+    def __call__(self, stream, options, file_ext, parse_cache, log, output_dir):
+        '''
+        Run :meth:`convert` with ``output_dir`` as the current directory.
+
+        The contents of ``output_dir`` are deleted before the conversion. Keys
+        of ``parse_cache`` that are not absolute paths are replaced by their
+        absolute form (resolved while inside ``output_dir``) and a warning is
+        logged for each of them.
+
+        If ``options.debug_input`` is set, every entry of ``parse_cache`` is
+        serialized to the path it is stored under and the contents of
+        ``output_dir`` are copied to the debug directory. Anything already in
+        the debug directory is deleted.
+
+        :param output_dir: The directory in which the plugin creates its
+                           output.
+        :return: Whatever :meth:`convert` returned, i.e. the path to the OPF
+                 file.
+
+        The remaining parameters are passed through to :meth:`convert`.
+        '''
+        log('InputFormatPlugin: %s running'%self.name, end=' ')
+        if hasattr(stream, 'name'):
+            log('on', stream.name)
+
+        with CurrentDir(output_dir):
+            # Start from an empty output directory
+            for x in os.listdir('.'):
+                if os.path.isdir(x):
+                    shutil.rmtree(x)
+                else:
+                    os.remove(x)
+
+            ret = self.convert(stream, options, file_ext, parse_cache, log)
+            for key in list(parse_cache.keys()):
+                abskey = os.path.abspath(key)
+                if abskey != key:
+                    log.warn(('InputFormatPlugin: %s returned a '
+                             'relative path: %s')%(self.name, key)
+                             )
+                    parse_cache[abskey] = parse_cache.pop(key)
+
+        if options.debug_input is not None:
+            options.debug_input = os.path.abspath(options.debug_input)
+            # copytree() creates the destination itself and fails if it
+            # already exists, so only a pre-existing directory needs removing.
+            if os.path.exists(options.debug_input):
+                shutil.rmtree(options.debug_input)
+            for f, obj in parse_cache.items():
+                if hasattr(obj, 'cssText'):
+                    raw = obj.cssText
+                else:
+                    raw = html.tostring(obj, encoding='utf-8', method='xml',
+                         include_meta_content_type=True, pretty_print=True)
+                if isinstance(raw, unicode):
+                    raw = raw.encode('utf-8')
+                with open(f, 'wb') as dest:
+                    dest.write(raw)
+            # Copy output_dir explicitly instead of '.': this code runs after
+            # the CurrentDir block, so '.' is presumably no longer output_dir
+            # (NOTE(review): confirm CurrentDir restores the old directory).
+            shutil.copytree(output_dir, options.debug_input)
+
+        return ret
 
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@ -0,0 +1,27 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.customize import Plugin
+
+class InputProfile(Plugin):
+    '''
+    The default input profile. It is registered as a plugin of type
+    'Input profile' and cannot be disabled.
+    '''
+    
+    author = 'Kovid Goyal'
+    supported_platforms = set(['windows', 'osx', 'linux'])
+    can_be_disabled = False
+    type = _('Input profile')
+
+# TODO: Add some real information to this profile. All other profiles must
+#       inherit from this profile and override as needed
+
+    name        = 'Default Input Profile'
+    short_name  = 'default' # Used in the CLI, so do not use spaces etc. in it
+    description = _('This profile tries to provide sane defaults and is useful '
+                    'if you know nothing about the input document.')
+                  
+input_profiles = [InputProfile]
+    
+
+
+    
--- a/src/calibre/customize/ui.py
+++ b/src/calibre/customize/ui.py
@ -6,13 +6,14 @@ import os, shutil, traceback, functools, sys

 from calibre.customize import Plugin, FileTypePlugin, MetadataReaderPlugin, \
                              MetadataWriterPlugin
+from calibre.customize.conversion import InputFormatPlugin
+from calibre.customize.profiles import InputProfile
 from calibre.customize.builtins import plugins as builtin_plugins
 from calibre.constants import __version__, iswindows, isosx
 from calibre.ebooks.metadata import MetaInformation
 from calibre.utils.config import make_config_dir, Config, ConfigProxy, \
                                 plugin_dir, OptionParser

-
 version = tuple([int(x) for x in __version__.split('.')])

 platform = 'linux'
@ -70,7 +71,10 @@ _on_import           = {}
 _on_preprocess       = {}
 _on_postprocess      = {}

-
+def input_profiles():
+    '''Yield each initialized plugin that is an :class:`InputProfile`.'''
+    for candidate in _initialized_plugins:
+        if not isinstance(candidate, InputProfile):
+            continue
+        yield candidate

 def reread_filetype_plugins():
    global _on_import
@ -234,6 +238,17 @@ def find_plugin(name):
        if plugin.name == name:
            return plugin

+def input_format_plugins():
+    '''Yield each initialized plugin that is an :class:`InputFormatPlugin`.'''
+    for candidate in _initialized_plugins:
+        is_input_plugin = isinstance(candidate, InputFormatPlugin)
+        if is_input_plugin:
+            yield candidate
+        
+def plugin_for_input_format(fmt):
+    '''
+    Return the first input format plugin whose ``file_types`` contains
+    ``fmt``, or ``None`` if no plugin handles that format.
+    '''
+    for candidate in input_format_plugins():
+        if fmt not in candidate.file_types:
+            continue
+        return candidate
+    return None
+    
+
 def disable_plugin(plugin_or_name):
    x = getattr(plugin_or_name, 'name', plugin_or_name)
    plugin = find_plugin(x)
--- a/src/calibre/ebooks/conversion/init.py
+++ b/src/calibre/ebooks/conversion/init.py
@ -0,0 +1,4 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -0,0 +1,30 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+from calibre.customize.conversion import OptionRecommendation 
+from calibre.customize.ui import input_profiles
+
+# Option recommendations declared by the conversion pipeline itself: the
+# verbosity level and the input profile to use. Note that the choices for
+# input_profile are computed once, when this module is imported, from the
+# plugins that input_profiles() yields at that moment.
+pipeline_options = [
+
+OptionRecommendation(name='verbose', 
+            recommended_value=0, level=OptionRecommendation.LOW,
+            short_switch='v', 
+            help=_('Level of verbosity. Specify multiple times for greater '
+                   'verbosity.')
+        ),
+
+
+OptionRecommendation(name='input_profile',
+            recommended_value='default', level=OptionRecommendation.LOW,
+            choices=[x.short_name for x in input_profiles()],
+            help=_('Specify the input profile. The input profile gives the '
+                   'conversion system information on how to interpret '
+                   'various information in the input document. For '
+                   'example resolution dependent lengths (i.e. lengths in '
+                   'pixels).')
+        ),
+
+]
--- a/src/calibre/ebooks/epub/init.py
+++ b/src/calibre/ebooks/epub/init.py
@ -40,38 +40,6 @@ def rules(stylesheets):
                if r.type == r.STYLE_RULE:
                    yield r

-def decrypt_font(key, path):
-    raw = open(path, 'rb').read()
-    crypt = raw[:1024]
-    key = cycle(iter(key))
-    decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
-    with open(path, 'wb') as f:
-        f.write(decrypt)
-        f.write(raw[1024:])
-
-def process_encryption(encfile, opf):
-    key = None
-    m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', open(opf, 'rb').read())
-    if m:
-        key = m.group(1)
-        key = list(map(ord, uuid.UUID(key).bytes))
-    try:
-        root = etree.parse(encfile)
-        for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
-            algorithm = em.get('Algorithm', '')
-            if algorithm != 'http://ns.adobe.com/pdf/enc#RC':
-                return False
-            cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
-            uri = cr.get('URI')
-            path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
-            if os.path.exists(path):
-                decrypt_font(key, path)
-        return True
-    except:
-        import traceback
-        traceback.print_exc()
-    return False
-
 def initialize_container(path_to_container, opf_name='metadata.opf'):
    '''
    Create an empty EPUB document, with a default skeleton.
--- a/src/calibre/ebooks/epub/input.py
+++ b/src/calibre/ebooks/epub/input.py
@ -0,0 +1,76 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os, re, uuid
+from itertools import cycle
+
+from lxml import etree
+
+from calibre.customize.conversion import InputFormatPlugin
+
+class EPUBInput(InputFormatPlugin):
+    '''
+    Input plugin for EPUB files. The EPUB container is extracted into the
+    current directory, obfuscated fonts are decrypted where possible and the
+    path to the OPF file found in the extracted tree is returned.
+    '''
+
+    name        = 'EPUB Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert EPUB files (.epub) to HTML'
+    file_types  = set(['epub'])
+
+    @classmethod
+    def decrypt_font(cls, key, path):
+        '''
+        Decrypt the font file at ``path`` in place.
+
+        Only the first 1024 bytes are encrypted: each of them is XORed with
+        the next value of ``key``, which is repeated as often as needed. The
+        rest of the file is written back unchanged.
+
+        :param key:  A sequence of integers.
+        :param path: Path to the font file.
+        '''
+        with open(path, 'rb') as f:
+            raw = f.read()
+        crypt = raw[:1024]
+        key = cycle(iter(key))
+        decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
+        with open(path, 'wb') as f:
+            f.write(decrypt)
+            f.write(raw[1024:])
+
+    @classmethod
+    def process_encryption(cls, encfile, opf, log):
+        '''
+        Decrypt the fonts listed in the encryption manifest ``encfile``.
+
+        The key is derived from the first ``urn:uuid:`` identifier found in
+        the OPF file.
+
+        :param encfile: Path to ``META-INF/encryption.xml``.
+        :param opf:     Path to the OPF file.
+        :param log:     The conversion log (currently unused here).
+        :return: ``True`` if all entries use the supported font encryption
+                 algorithm and could be processed, ``False`` otherwise.
+        '''
+        key = None
+        with open(opf, 'rb') as f:
+            m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', f.read())
+        if m:
+            key = m.group(1)
+            key = list(map(ord, uuid.UUID(key).bytes))
+        try:
+            root = etree.parse(encfile)
+            for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
+                algorithm = em.get('Algorithm', '')
+                if algorithm != 'http://ns.adobe.com/pdf/enc#RC':
+                    return False
+                cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
+                uri = cr.get('URI')
+                path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
+                if os.path.exists(path):
+                    if key is None:
+                        # No UUID in the OPF, so there is no key to decrypt with
+                        return False
+                    cls.decrypt_font(key, path)
+            return True
+        except Exception:
+            # Best effort: report the problem and treat the book as
+            # undecryptable instead of aborting here.
+            import traceback
+            traceback.print_exc()
+        return False
+
+    # The method was originally defined under this misspelled name (while
+    # convert() called the correctly spelled one). Keep it as an alias.
+    process_ecryption = process_encryption
+
+    def convert(self, stream, options, file_ext, parse_cache, log):
+        '''
+        Extract the EPUB in ``stream`` into the current directory.
+
+        :return: The absolute path to the first OPF file found in the
+                 extracted tree.
+        :raises ValueError: If the extracted tree contains no OPF file.
+        :raises DRMError: If the EPUB has an encryption manifest that could
+                          not be processed.
+        '''
+        from calibre.utils.zipfile import ZipFile
+        from calibre import walk
+        from calibre.ebooks import DRMError
+        zf = ZipFile(stream)
+        zf.extractall(os.getcwd())
+        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
+        opf = None
+        for f in walk('.'):
+            if f.lower().endswith('.opf'):
+                # Made absolute so that the returned path is still valid once
+                # the caller has left this directory.
+                opf = os.path.abspath(f)
+                break
+        path = getattr(stream, 'name', 'stream')
+
+        if opf is None:
+            raise ValueError('%s is not a valid EPUB file'%path)
+
+        if os.path.exists(encfile):
+            if not self.process_encryption(encfile, opf, log):
+                raise DRMError(os.path.basename(path))
+
+        return opf
+        
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@ -0,0 +1,29 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import InputFormatPlugin
+
+class MOBIInput(InputFormatPlugin):
+    '''
+    Input plugin for MOBI files. The actual work is done by
+    :class:`calibre.ebooks.mobi.reader.MobiReader`.
+    '''
+
+    name        = 'MOBI Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
+    file_types  = set(['mobi', 'prc', 'azw'])
+
+    def convert(self, stream, options, file_ext, parse_cache, log):
+        '''
+        Extract the MOBI file in ``stream`` into the current directory.
+
+        If the reader stored the raw MOBI markup in ``parse_cache``, it is
+        written to ``debug-raw.html`` in the current directory.
+
+        :return: The path to the OPF file created by the reader.
+        '''
+        from calibre.ebooks.mobi.reader import MobiReader
+        mr = MobiReader(stream, log, options.input_encoding,
+                        options.debug_input)
+        # Both arguments are positional: a positional argument may not
+        # follow a keyword argument.
+        mr.extract_content(os.getcwdu(), parse_cache)
+        # Remove the entry from the cache: it is not a parsed file and its
+        # key is not a path, unlike every other entry.
+        raw = parse_cache.pop('calibre_raw_mobi_markup', False)
+        if raw:
+            if isinstance(raw, unicode):
+                raw = raw.encode('utf-8')
+            with open('debug-raw.html', 'wb') as f:
+                f.write(raw)
+
+        return mr.created_opf_path
+        
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 Read data from .mobi files
 '''

-import sys, struct, os, cStringIO, re, functools
+import struct, os, cStringIO, re, functools

 try:
    from PIL import Image as PILImage
@ -35,8 +35,10 @@ class EXTHHeader(object):
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
+        left = self.num_items
        
-        for i in range(self.num_items):
+        while left > 0:
+            left -= 1
            id, size = struct.unpack('>LL', raw[pos:pos+8])
            content = raw[pos+8:pos+size]
            pos += size
@ -76,7 +78,8 @@ class EXTHHeader(object):

 class BookHeader(object):
    
-    def __init__(self, raw, ident):
+    def __init__(self, raw, ident, user_encoding, log):
+        self.log = log
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
        self.encryption_type, = struct.unpack('>H', raw[12:14])
@ -92,8 +95,8 @@ class BookHeader(object):
        else:
            self.ancient = False
            self.doctype = raw[16:20]
-            self.length, self.type, self.codepage, self.unique_id, self.version = \
-                     struct.unpack('>LLLLL', raw[20:40])
+            self.length, self.type, self.codepage, self.unique_id, \
+                self.version = struct.unpack('>LLLLL', raw[20:40])
                    
            
            try:
@ -102,8 +105,9 @@ class BookHeader(object):
                          65001 : 'utf-8',
                          }[self.codepage]
            except (IndexError, KeyError):
-                print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage
-                self.codec = 'cp1252'
+                self.codec = 'cp1252' if user_encoding is None else user_encoding
+                log.warn('Unknown codepage %d. Assuming %s'%(self.codepage,
+                                                            self.codec))
            
            if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length:
                self.extra_flags = 0
@ -138,9 +142,24 @@ class MobiReader(object):
    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
    
-    def __init__(self, filename_or_stream, verbose=False):
-        self.verbose = verbose
+    def __init__(self, filename_or_stream, log, user_encoding=None, debug=None):
+        self.log = log
+        self.debug = debug
        self.embedded_mi = None
+        self.base_css_rules = '''
+                blockquote { margin: 0em 0em 0em 1.25em; text-align: justify }
+                
+                p { margin: 0em; text-align: justify }
+                
+                .bold { font-weight: bold }
+                
+                .italic { font-style: italic }
+                
+                .mbp_pagebreak {
+                    page-break-after: always; margin: 0; display: block
+                }
+                '''
+        self.tag_css_rules = []
        
        if hasattr(filename_or_stream, 'read'):
            stream = filename_or_stream
@ -177,17 +196,21 @@ class MobiReader(object):
            self.sections.append((section(i), self.section_headers[i])) 
         
            
-        self.book_header = BookHeader(self.sections[0][0], self.ident)
+        self.book_header = BookHeader(self.sections[0][0], self.ident, 
+                                      user_encoding, self.log)
        self.name = self.name.decode(self.book_header.codec, 'replace')
        
-    def extract_content(self, output_dir=os.getcwdu()):
+    def extract_content(self, output_dir, parse_cache):
        output_dir = os.path.abspath(output_dir)
        if self.book_header.encryption_type != 0:
            raise DRMError(self.name)
        
        processed_records = self.extract_text()
+        if self.debug is not None:
+            self.parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
        self.add_anchors()
-        self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
+        self.processed_html = self.processed_html.decode(self.book_header.codec,
+                                                          'ignore')
        for pat in ENCODING_PATS:
            self.processed_html = pat.sub('', self.processed_html)
        e2u = functools.partial(entity_to_unicode, 
@ -203,16 +226,10 @@ class MobiReader(object):
        self.processed_html = \
            re.compile('<head>', re.IGNORECASE).sub(
                '\n<head>\n'
-                '<style type="text/css">\n'
-                'blockquote { margin: 0em 0em 0em 1.25em; text-align: justify; }\n'
-                'p { margin: 0em; text-align: justify; }\n'
-                '.bold { font-weight: bold; }\n'
-                '.italic { font-style: italic; }\n'
-                '</style>\n',
+                '\t<link type="text/css" href="styles.css" />\n',
                self.processed_html)
        
-        if self.verbose:
-            print 'Parsing HTML...'
+        self.log.debug('Parsing HTML...')
        root = html.fromstring(self.processed_html)
        self.upshift_markup(root)
        guides = root.xpath('//guide')
@ -230,26 +247,25 @@ class MobiReader(object):
                    ref.attrib['href'] = os.path.basename(htmlfile)+ref.attrib['href']
        except AttributeError:
            pass
-        if self.verbose:
-            print 'Serializing...'
-        with open(htmlfile, 'wb') as f:
-            raw = html.tostring(root, encoding='utf-8', method='xml', 
-                         include_meta_content_type=True, pretty_print=True)
-            raw = raw.replace('<head>', 
-            '<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n')
-            f.write(raw)
+        parse_cache[htmlfile] = root
        self.htmlfile = htmlfile
        
-        if self.book_header.exth is not None or self.embedded_mi is not None:
-            if self.verbose:
-                print 'Creating OPF...'
+        self.log.debug('Creating OPF...')
        ncx = cStringIO.StringIO()
        opf = self.create_opf(htmlfile, guide, root)
-            opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx)
+        self.created_opf_path = os.path.splitext(htmlfile)[0]+'.opf' 
+        opf.render(open(self.created_opf_path, 'wb'), ncx)
        ncx = ncx.getvalue()
        if ncx:
            open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
                
+        with open('styles.css', 'wb') as s:
+            s.write(self.base_css_rules+'\n\n')
+            for rule in self.tag_css_rules:
+                if isinstance(rule, unicode):
+                    rule = rule.encode('utf-8')
+                s.write(rule+'\n\n')
+    
    def read_embedded_metadata(self, root, elem, guide):
        raw = '<package>'+html.tostring(elem, encoding='utf-8')+'</package>'
        stream = cStringIO.StringIO(raw)
@ -277,8 +293,7 @@ class MobiReader(object):
        
    
    def cleanup_html(self):
-        if self.verbose:
-            print 'Cleaning up HTML...'
+        self.log.debug('Cleaning up HTML...')
        self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
        if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
            self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>'
@ -286,8 +301,7 @@ class MobiReader(object):
        self.processed_html = self.processed_html.replace('> <', '>\n<')
        
    def upshift_markup(self, root):
-        if self.verbose:
-            print 'Converting style information to CSS...'
+        self.log.debug('Converting style information to CSS...')
        size_map = {
                    'xx-small' : '0.5',
                    'x-small'  : '1',
@ -298,7 +312,7 @@ class MobiReader(object):
                    'xx-large' : '6',
                    }
        mobi_version = self.book_header.mobi_version
-        for tag in root.iter(etree.Element):
+        for i, tag in enumerate(root.iter(etree.Element)):
            if tag.tag in ('country-region', 'place', 'placetype', 'placename',
                           'state', 'city'):
                tag.tag = 'span'
@ -352,8 +366,7 @@ class MobiReader(object):
            elif tag.tag == 'pre':
                if not tag.text:
                    tag.tag = 'div'
-            if styles:
-                attrib['style'] = '; '.join(styles)
+            
            if 'filepos-id' in attrib:
                attrib['id'] = attrib.pop('filepos-id')
            if 'filepos' in attrib:
@ -363,14 +376,23 @@ class MobiReader(object):
                except ValueError:
                    pass
            
+            if styles:
+                attrib['id'] = attrib.get('id', 'calibre_mr_gid%d'%i)
+                self.tag_css_rules.append('#%s {%s}'%(attrib['id'], 
+                                                      '; '.join(styles)))
+    
+    
    def create_opf(self, htmlfile, guide=None, root=None):
        mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
+        if mi is None:
+            mi = MetaInformation(self.title, [_('Unknown')])
        opf = OPFCreator(os.path.dirname(htmlfile), mi)
        if hasattr(self.book_header.exth, 'cover_offset'):
            opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
        elif mi.cover is not None:
            opf.cover = mi.cover
-        manifest = [(htmlfile, 'text/x-oeb1-document')]
+        manifest = [(htmlfile, 'text/x-oeb1-document'), 
+                    (os.path.abspath('styles.css'), 'text/css')]
        bp = os.path.dirname(htmlfile)
        for i in getattr(self, 'image_names', []):
            manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg'))
@ -441,8 +463,7 @@ class MobiReader(object):
        return data[:len(data)-trail_size]
    
    def extract_text(self):
-        if self.verbose:
-            print 'Extracting text...'
+        self.log.debug('Extracting text...')
        text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)]
        processed_records = list(range(0, self.book_header.records+1))
        
@ -472,12 +493,11 @@ class MobiReader(object):
    
    def replace_page_breaks(self):
        self.processed_html = self.PAGE_BREAK_PAT.sub(
-            '<div class="mbp_pagebreak" style="page-break-after: always; margin: 0; display: block" />',
+            '<div class="mbp_pagebreak" />',
            self.processed_html)
    
    def add_anchors(self):
-        if self.verbose:
-            print 'Adding anchors...'
+        self.log.debug('Adding anchors...')
        positions = set([])
        link_pattern = re.compile(r'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
                                  re.IGNORECASE)
@ -507,8 +527,7 @@ class MobiReader(object):
        
    
    def extract_images(self, processed_records, output_dir):
-        if self.verbose:
-            print 'Extracting images...'
+        self.log.debug('Extracting images...')
        output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
@ -535,14 +554,17 @@ class MobiReader(object):
            im.convert('RGB').save(open(path, 'wb'), format='JPEG')

 def get_metadata(stream):
-    mr = MobiReader(stream)
+    from calibre.utils.logging import Log
+    log = Log()
+    mr = MobiReader(stream, log)
    if mr.book_header.exth is None:
        mi = MetaInformation(mr.name, [_('Unknown')])
    else:
        mi = mr.create_opf('dummy.html')
        try:
            if hasattr(mr.book_header.exth, 'cover_offset'):
-                cover_index = mr.book_header.first_image_index + mr.book_header.exth.cover_offset
+                cover_index = mr.book_header.first_image_index + \
+                              mr.book_header.exth.cover_offset
                data  = mr.sections[int(cover_index)][0]
            else:
                data  = mr.sections[mr.book_header.first_image_index][0]
@ -552,42 +574,7 @@ def get_metadata(stream):
            im.convert('RGBA').save(obuf, format='JPEG')
            mi.cover_data = ('jpg', obuf.getvalue())
        except:
-            import traceback
-            traceback.print_exc()
+            log.exception()
    return mi
        
        
-def option_parser():
-    from calibre.utils.config import OptionParser
-    parser = OptionParser(usage=_('%prog [options] myebook.mobi'))
-    parser.add_option('-o', '--output-dir', default='.', 
-                      help=_('Output directory. Defaults to current directory.'))
-    parser.add_option('-v', '--verbose', default=False, action='store_true',
-                      help='Useful for debugging.')
-    return parser
-    
-
-def main(args=sys.argv):
-    parser = option_parser()
-    opts, args = parser.parse_args(args)
-    if len(args) != 2:
-        parser.print_help()
-        return 1
-    
-    mr = MobiReader(args[1], verbose=opts.verbose)
-    opts.output_dir = os.path.abspath(opts.output_dir)
-    mr.extract_content(opts.output_dir)
-    if opts.verbose:
-        oname = os.path.join(opts.output_dir, 'debug-raw.html')
-        dat = mr.mobi_html
-        if isinstance(dat, unicode):
-            dat = dat.encode('utf-8')
-        open(oname, 'wb').write(dat)
-        print _('Raw MOBI HTML saved in'), oname
-    
-    print _('OEB ebook created in'), opts.output_dir
-    
-    return 0
-
-if __name__ == '__main__':
-    sys.exit(main())
--- a/src/calibre/utils/logging.py
+++ b/src/calibre/utils/logging.py
@ -0,0 +1,92 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+'A simplified logging system'
+
+DEBUG = 0
+INFO  = 1
+WARN  = 2
+ERROR = 3
+
+import sys, traceback
+from functools import partial
+
+from calibre import prints
+from calibre.utils.terminfo import TerminalController
+
+class ANSIStream:
+    '''
+    Log output that writes each message to a stream, preceded by the
+    :class:`TerminalController` colour attribute for the message's level
+    and followed by the controller's ``NORMAL`` attribute.
+    '''
+    
+    def __init__(self, stream=sys.stdout):
+        '''
+        :param stream: File-like object that receives the output.
+        '''
+        self.stream = stream
+        tc = TerminalController(stream)
+        # Prefix written before a message of each level; INFO messages
+        # get no prefix.
+        self.color = {
+                      DEBUG: tc.GREEN,
+                      INFO:'',
+                      WARN: tc.YELLOW,
+                      ERROR: tc.RED
+                      }
+        self.normal = tc.NORMAL
+    
+    def prints(self, level, *args, **kwargs):
+        '''
+        Write `args` to the stream using the prefix registered for `level`.
+
+        `kwargs` are forwarded to :func:`calibre.prints`; any ``file`` entry
+        is replaced by this object's stream.
+        '''
+        self.stream.write(self.color[level])
+        kwargs['file'] = self.stream
+        try:
+            prints(*args, **kwargs)
+        finally:
+            # Always write the reset sequence: if printing fails midway the
+            # stream must not be left in the level's colour.
+            self.stream.write(self.normal)
+        
+    def flush(self):
+        '''Flush the underlying stream.'''
+        self.stream.flush()
+        
+class HTMLStream:
+    '''
+    Log output that writes each message to a stream wrapped in a
+    ``<span>`` element whose colour depends on the message's level.
+    '''
+    
+    def __init__(self, stream=sys.stdout):
+        '''
+        :param stream: File-like object that receives the output.
+        '''
+        self.stream = stream
+        # Opening tag written before a message of each level.
+        self.color = {
+                      DEBUG: '<span style="color:green">',
+                      INFO:'<span>',
+                      WARN: '<span style="color:yellow">',
+                      ERROR: '<span style="color:red">'
+                      }
+        self.normal = '</span>'
+        
+    def prints(self, level, *args, **kwargs):
+        '''
+        Write `args` to the stream inside the ``<span>`` registered for
+        `level`.
+
+        `kwargs` are forwarded to :func:`calibre.prints`; any ``file`` entry
+        is replaced by this object's stream. The message text is written
+        as-is (no HTML escaping is performed here).
+        '''
+        self.stream.write(self.color[level])
+        kwargs['file'] = self.stream
+        try:
+            prints(*args, **kwargs)
+        finally:
+            # Always close the span so a failure while printing cannot
+            # leave unbalanced markup in the output.
+            self.stream.write(self.normal)
+        
+    def flush(self):
+        self.stream.flush()
+
+class Log(object):
+    '''
+    Minimal logger. A message is forwarded to every object in `outputs`
+    when its level is not below `filter_level`.
+
+    Instances expose ``debug``, ``info``, ``warn``/``warning`` and ``error``
+    shortcuts, and calling the instance itself logs at INFO level.
+    '''
+
+    DEBUG = DEBUG
+    INFO  = INFO
+    WARN  = WARN
+    ERROR = ERROR
+    
+    def __init__(self, level=INFO):
+        '''
+        :param level: Messages with a level below this one are discarded.
+        '''
+        self.filter_level = level
+        self.outputs = [ANSIStream()]
+        
+        # Per-level shortcuts: each one is `prints` with the level pre-bound.
+        for name, lvl in (('debug', DEBUG), ('info', INFO), ('error', ERROR)):
+            setattr(self, name, partial(self.prints, lvl))
+        # `warn` and `warning` are two names for the same callable.
+        self.warn = self.warning = partial(self.prints, WARN)
+        
+        
+    def prints(self, level, *args, **kwargs):
+        '''
+        Send `args` and `kwargs` to every output, unless `level` is below
+        the filter level.
+        '''
+        if level >= self.filter_level:
+            for sink in self.outputs:
+                sink.prints(level, *args, **kwargs)
+    
+    def exception(self, *args, **kwargs):
+        '''
+        Log `args` at ERROR level, then the traceback of the exception
+        currently being handled at DEBUG level (so the traceback is dropped
+        when the filter level is above DEBUG).
+
+        The optional ``limit`` keyword is passed to
+        :func:`traceback.format_exc`; remaining keywords go to `prints`.
+        '''
+        tb_limit = kwargs.pop('limit', None)
+        self.prints(ERROR, *args, **kwargs)
+        trace = traceback.format_exc(tb_limit)
+        self.prints(DEBUG, trace)
+
+    def __call__(self, *args, **kwargs):
+        '''Log `args` at INFO level.'''
+        self.prints(INFO, *args, **kwargs)
--- a/src/calibre/utils/terminfo.py
+++ b/src/calibre/utils/terminfo.py
@ -33,7 +33,7 @@ class TerminalController:
    
    >>> term = TerminalController()
    >>> if term.CLEAR_SCREEN:
-    ...     print 'This terminal supports clearning the screen.'
+    ...     print 'This terminal supports clearing the screen.'
    
    Finally, if the width and height of the terminal are known, then
    they will be stored in the `COLS` and `LINES` attributes.