From 925a86fb0c991c51a4665cac1ff7a7f191ec39a7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 6 Mar 2009 21:38:35 -0800 Subject: [PATCH] Beginnings of the new conversion framework. Input plugins for MOBI and EPUB. --- src/calibre/__init__.py | 92 ++--------- src/calibre/customize/builtins.py | 7 +- src/calibre/customize/conversion.py | 183 ++++++++++++++++++---- src/calibre/customize/profiles.py | 27 ++++ src/calibre/customize/ui.py | 19 ++- src/calibre/ebooks/conversion/__init__.py | 4 + src/calibre/ebooks/conversion/plumber.py | 30 ++++ src/calibre/ebooks/epub/__init__.py | 32 ---- src/calibre/ebooks/epub/input.py | 76 +++++++++ src/calibre/ebooks/mobi/input.py | 29 ++++ src/calibre/ebooks/mobi/reader.py | 167 +++++++++----------- src/calibre/utils/logging.py | 92 +++++++++++ src/calibre/utils/terminfo.py | 2 +- 13 files changed, 525 insertions(+), 235 deletions(-) create mode 100644 src/calibre/customize/profiles.py create mode 100644 src/calibre/ebooks/conversion/__init__.py create mode 100644 src/calibre/ebooks/conversion/plumber.py create mode 100644 src/calibre/ebooks/epub/input.py create mode 100644 src/calibre/ebooks/mobi/input.py create mode 100644 src/calibre/utils/logging.py diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index e69d42c90a..de133ddb57 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -90,28 +90,11 @@ def prints(*args, **kwargs): if i != len(args)-1: file.write(sep) file.write(end) - file.flush() class CommandLineError(Exception): pass -class ColoredFormatter(Formatter): - def format(self, record): - ln = record.__dict__['levelname'] - col = '' - if ln == 'CRITICAL': - col = terminal_controller.YELLOW - elif ln == 'ERROR': - col = terminal_controller.RED - elif ln in ['WARN', 'WARNING']: - col = terminal_controller.BLUE - elif ln == 'INFO': - col = terminal_controller.GREEN - elif ln == 'DEBUG': - col = terminal_controller.CYAN - record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL - return Formatter.format(self, record) def setup_cli_handlers(logger, level): @@ -335,66 +318,23 @@ def english_sort(x, y): ''' return cmp(_spat.sub('', x), _spat.sub('', y)) -class LoggingInterface: +class ColoredFormatter(Formatter): - def __init__(self, logger): - self.__logger = self.logger = logger - - def setup_cli_handler(self, verbosity): - for handler in self.__logger.handlers: - if isinstance(handler, logging.StreamHandler): - return - if os.environ.get('CALIBRE_WORKER', None) is not None and self.__logger.handlers: - return - stream = sys.stdout - formatter = logging.Formatter() - level = logging.INFO - if verbosity > 0: - formatter = ColoredFormatter('[%(levelname)s] %(message)s') if verbosity > 1 else \ - ColoredFormatter('%(levelname)s: %(message)s') - level = logging.DEBUG - if verbosity > 1: - stream = sys.stderr - - handler = logging.StreamHandler(stream) - handler.setFormatter(formatter) - handler.setLevel(level) - self.__logger.addHandler(handler) - self.__logger.setLevel(level) - - - def ___log(self, func, msg, args, kwargs): - args = [msg] + list(args) - for i in range(len(args)): - if not isinstance(args[i], basestring): - continue - if sys.version_info[:2] > (2, 5): - if not isinstance(args[i], unicode): - args[i] = args[i].decode(preferred_encoding, 'replace') - elif isinstance(args[i], unicode): - args[i] = args[i].encode(preferred_encoding, 'replace') - func(*args, **kwargs) - - def log_debug(self, msg, *args, **kwargs): - self.___log(self.__logger.debug, msg, args, kwargs) - - def log_info(self, msg, *args, **kwargs): - self.___log(self.__logger.info, msg, args, kwargs) - - def log_warning(self, msg, *args, **kwargs): - self.___log(self.__logger.warning, msg, args, kwargs) - - def log_warn(self, msg, *args, **kwargs): - self.___log(self.__logger.warning, msg, args, kwargs) - - def log_error(self, msg, *args, **kwargs): - self.___log(self.__logger.error, msg, args, kwargs) - - def log_critical(self, msg, *args, **kwargs): - self.___log(self.__logger.critical, msg, args, kwargs) - - def log_exception(self, msg, *args): - self.___log(self.__logger.exception, msg, args, {}) + def format(self, record): + ln = record.__dict__['levelname'] + col = '' + if ln == 'CRITICAL': + col = terminal_controller.YELLOW + elif ln == 'ERROR': + col = terminal_controller.RED + elif ln in ['WARN', 'WARNING']: + col = terminal_controller.BLUE + elif ln == 'INFO': + col = terminal_controller.GREEN + elif ln == 'DEBUG': + col = terminal_controller.CYAN + record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL + return Formatter.format(self, record) def walk(dir): ''' A nice interface to os.walk ''' diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 14d3c79062..fafe8e5afa 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -242,8 +242,13 @@ class MOBIMetadataWriter(MetadataWriterPlugin): set_metadata(stream, mi) -plugins = [HTML2ZIP] +from calibre.ebooks.epub.input import EPUBInput +from calibre.ebooks.mobi.input import MOBIInput +from calibre.customize.profiles import input_profiles + +plugins = [HTML2ZIP, EPUBInput, MOBIInput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataWriter')] +plugins += input_profiles \ No newline at end of file diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 36b2781c9d..aa7b0c1dea 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -1,28 +1,30 @@ +from __future__ import with_statement ''' Defines the plugin sytem for conversions. ''' -import re +import re, os, shutil +from lxml import html + +from calibre import CurrentDir from calibre.customize import Plugin - class ConversionOption(object): ''' Class representing conversion options ''' - def __init__(self, name=None, default=None, help=None, long_switch=None, - short_switch=None, choices=None, gui_label=None, - category=None): + def __init__(self, name=None, help=None, long_switch=None, + short_switch=None, choices=None): self.name = name - self.default = default self.help = help self.long_switch = long_switch self.short_switch = short_switch self.choices = choices - self.gui_label = gui_label - self.category = category + + if self.long_switch is None: + self.long_switch = '--'+self.name.replace('_', '-') self.validate_parameters() @@ -32,41 +34,156 @@ class ConversionOption(object): ''' if re.match(r'[a-zA-Z_]([a-zA-Z0-9_])*', self.name) is None: raise ValueError(self.name + ' is not a valid Python identifier') - if not (isinstance(self.default, (int, float, str, unicode)) or \ - self.default is None): + if not self.help: + raise ValueError('You must set the help text') + + +class OptionRecommendation(object): + LOW = 1 + MED = 2 + HIGH = 3 + + def __init__(self, recommeded_value, level=LOW, **kwargs): + ''' + An option recommendation. That is, an option as well as its recommended + value and the level of the recommendation. + ''' + self.level = level + self.recommended_value = recommeded_value + self.option = kwargs.pop('option', None) + if self.option is None: + self.option = ConversionOption(**kwargs) + + self.validate_parameters() + + def validate_parameters(self): + if self.option.choices and self.recommended_value not in \ + self.option.choices: + raise ValueError('Recommended value not in choices') + if not (isinstance(self.recommended_value, (int, float, str, unicode))\ + or self.default is None): raise ValueError(unicode(self.default) + ' is not a string or a number') - if not self.help: - raise ValueError('You must set the help text') + -class ConversionPlugin(Plugin): - +class InputFormatPlugin(Plugin): ''' - The base class for all conversion related plugins. + InputFormatPlugins are responsible for converting a document into + HTML+OPF+CSS+etc. + The results of the conversion *must* be encoded in UTF-8. + The main action happens in :method:`convert`. ''' - #: List of options - #: Each option must be a dictionary. The dictionary can contain several - #: keys defining the option. The ones marked by a * are required, the rest - #: are optional. The keys are:: - #: - #: *'name' : A valid python identifier. - #: *'default' : The default value for this option. - #: *'help' : - #: 'short_switch' : A suggestion for a short form of the command line - #: switch (for example if name is 'title', this - #: could be 't'). It is only used if no prior - #: conversion plugin has claimed it. - options = [] - type = _('Conversion') + type = _('Conversion Input') can_be_disabled = False supported_platforms = ['windows', 'osx', 'linux'] - -class InputFormatPlugin(ConversionPlugin): - #: Set of file types for which this plugin should be run - #: For example: ``set(['lit', 'mobi', 'prc'])`` + #: For example: ``set(['azw', 'mobi', 'prc'])`` file_types = set([]) + #: Options shared by all Input format plugins. Do not override + #: in sub-classes. Use :member:`options` instead. Every option must be an + #: instance of :class:`OptionRecommendation`. + common_options = set([ + OptionRecommendation(name='debug_input', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Save the output from the input plugin to the specified ' + 'directory. Useful if you are unsure at which stage ' + 'of the conversion process a bug is occurring. ' + 'WARNING: This completely deletes the contents of ' + 'the specified directory.') + ), + + OptionRecommendation(name='input_encoding', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Specify the character encoding of the input document. If ' + 'set this option will override any encoding declared by the ' + 'document itself. Particularly useful for documents that ' + 'do not declare an encoding or that have erroneous ' + 'encoding declarations.') + ), + + ]) + + #: Options to customize the behavior of this plugin. Every option must be an + #: instance of :class:`OptionRecommendation`. + options = set([]) + + def convert(self, stream, options, file_ext, parse_cache, log): + ''' + This method must be implemented in sub-classes. It must return + the path to the created OPF file. All output should be contained in + the current directory. If this plugin creates files outside the current + directory they must be deleted/marked for deletion before this method + returns. + + :param stream: A file like object that contains the input file. + + :param options: Options to customize the conversion process. + Guaranteed to have attributes corresponding + to all the options declared by this plugin. In + addition, it will have a verbose attribute that + takes integral values from zero upwards. Higher numbers + mean be more verbose. Another useful attribute is + ``input_profile`` that is an instance of + :class:`calibre.customize.profiles.InputProfile`. + + :param file_ext: The extension (without the .) of the input file. It + is guaranteed to be one of the `file_types` supported + by this plugin. + + :param parse_cache: A dictionary that maps absolute file paths to + parsed representations of their contents. For + HTML the representation is an lxml element of + the root of the tree. For CSS it is a cssutils + stylesheet. If this plugin parses any of the + output files, it should add them to the cache + so that later stages of the conversion wont + have to re-parse them. If a parsed representation + is in the cache, there is no need to actually + write the file to disk. + + :param log: A :class:`calibre.utils.logging.Log` object. All output + should use this object. + ''' + raise NotImplementedError + + def __call__(self, stream, options, file_ext, parse_cache, log, output_dir): + log('InputFormatPlugin: %s running'%self.name, end=' ') + if hasattr(stream, 'name'): + log('on', stream.name) + + with CurrentDir(output_dir): + for x in os.listdir('.'): + shutil.rmtree(x) if os.path.isdir(x) else os.remove(x) + + + ret = self.convert(stream, options, file_ext, parse_cache, log) + for key in list(parse_cache.keys()): + if os.path.abspath(key) != key: + log.warn(('InputFormatPlugin: %s returned a ' + 'relative path: %s')%(self.name, key) + ) + parse_cache[os.path.abspath(key)] = parse_cache.pop(key) + + if options.debug_input is not None: + options.debug_input = os.path.abspath(options.debug_input) + if not os.path.exists(options.debug_input): + os.makedirs(options.debug_input) + shutil.rmtree(options.debug_input) + for f, obj in parse_cache.items(): + if hasattr(obj, 'cssText'): + raw = obj.cssText + else: + raw = html.tostring(obj, encoding='utf-8', method='xml', + include_meta_content_type=True, pretty_print=True) + if isinstance(raw, unicode): + raw = raw.encode('utf-8') + open(f, 'wb').write(raw) + shutil.copytree('.', options.debug_input) + + + + return ret diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py new file mode 100644 index 0000000000..002f56879f --- /dev/null +++ b/src/calibre/customize/profiles.py @@ -0,0 +1,27 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from calibre.customize import Plugin + +class InputProfile(Plugin): + + author = 'Kovid Goyal' + supported_platforms = set(['windows', 'osx', 'linux']) + can_be_disabled = False + type = _('Input profile') + +# TODO: Add some real information to this profile. All other profiles must +# inherit from this profile and override as needed + + name = 'Default Input Profile' + short_name = 'default' # Used in the CLI so dont spaces etc. in it + description = _('This profile tries to provide sane defaults and is useful ' + 'if you know nothing about the input document.') + +input_profiles = [InputProfile] + + + + diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index 95bf01ff6d..1cdafae4f0 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -6,13 +6,14 @@ import os, shutil, traceback, functools, sys from calibre.customize import Plugin, FileTypePlugin, MetadataReaderPlugin, \ MetadataWriterPlugin +from calibre.customize.conversion import InputFormatPlugin +from calibre.customize.profiles import InputProfile from calibre.customize.builtins import plugins as builtin_plugins from calibre.constants import __version__, iswindows, isosx from calibre.ebooks.metadata import MetaInformation from calibre.utils.config import make_config_dir, Config, ConfigProxy, \ plugin_dir, OptionParser - version = tuple([int(x) for x in __version__.split('.')]) platform = 'linux' @@ -70,7 +71,10 @@ _on_import = {} _on_preprocess = {} _on_postprocess = {} - +def input_profiles(): + for plugin in _initialized_plugins: + if isinstance(plugin, InputProfile): + yield plugin def reread_filetype_plugins(): global _on_import @@ -234,6 +238,17 @@ def find_plugin(name): if plugin.name == name: return plugin +def input_format_plugins(): + for plugin in _initialized_plugins: + if isinstance(plugin, InputFormatPlugin): + yield plugin + +def plugin_for_input_format(fmt): + for plugin in input_format_plugins(): + if fmt in plugin.file_types: + return plugin + + def disable_plugin(plugin_or_name): x = getattr(plugin_or_name, 'name', plugin_or_name) plugin = find_plugin(x) diff --git a/src/calibre/ebooks/conversion/__init__.py b/src/calibre/ebooks/conversion/__init__.py new file mode 100644 index 0000000000..384ccfb79c --- /dev/null +++ b/src/calibre/ebooks/conversion/__init__.py @@ -0,0 +1,4 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py new file mode 100644 index 0000000000..ac7490bd39 --- /dev/null +++ b/src/calibre/ebooks/conversion/plumber.py @@ -0,0 +1,30 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +from calibre.customize.conversion import OptionRecommendation +from calibre.customize.ui import input_profiles + +pipeline_options = [ + +OptionRecommendation(name='verbose', + recommended_value=0, level=OptionRecommendation.LOW, + short_switch='v', + help=_('Level of verbosity. Specify multiple times for greater ' + 'verbosity.') + ), + + +OptionRecommendation(name='input_profile', + recommended_value='default', level=OptionRecommendation.LOW, + choices=[x.short_name for x in input_profiles()], + help=_('Specify the input profile. The input profile gives the ' + 'conversion system information on how to interpret ' + 'various information in the input document. For ' + 'example resolution dependent lengths (i.e. lengths in ' + 'pixels).') + ), + +] \ No newline at end of file diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index aa17024d50..989391902b 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -40,38 +40,6 @@ def rules(stylesheets): if r.type == r.STYLE_RULE: yield r -def decrypt_font(key, path): - raw = open(path, 'rb').read() - crypt = raw[:1024] - key = cycle(iter(key)) - decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt]) - with open(path, 'wb') as f: - f.write(decrypt) - f.write(raw[1024:]) - -def process_encryption(encfile, opf): - key = None - m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', open(opf, 'rb').read()) - if m: - key = m.group(1) - key = list(map(ord, uuid.UUID(key).bytes)) - try: - root = etree.parse(encfile) - for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'): - algorithm = em.get('Algorithm', '') - if algorithm != 'http://ns.adobe.com/pdf/enc#RC': - return False - cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0] - uri = cr.get('URI') - path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/'))) - if os.path.exists(path): - decrypt_font(key, path) - return True - except: - import traceback - traceback.print_exc() - return False - def initialize_container(path_to_container, opf_name='metadata.opf'): ''' Create an empty EPUB document, with a default skeleton. diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py new file mode 100644 index 0000000000..1b69424a9e --- /dev/null +++ b/src/calibre/ebooks/epub/input.py @@ -0,0 +1,76 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os, re, uuid +from itertools import cycle + +from lxml import etree + +from calibre.customize.conversion import InputFormatPlugin + +class EPUBInput(InputFormatPlugin): + + name = 'EPUB Input' + author = 'Kovid Goyal' + description = 'Convert EPUB files (.epub) to HTML' + file_types = set(['epub']) + + @classmethod + def decrypt_font(cls, key, path): + raw = open(path, 'rb').read() + crypt = raw[:1024] + key = cycle(iter(key)) + decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt]) + with open(path, 'wb') as f: + f.write(decrypt) + f.write(raw[1024:]) + + @classmethod + def process_ecryption(cls, encfile, opf, log): + key = None + m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', open(opf, 'rb').read()) + if m: + key = m.group(1) + key = list(map(ord, uuid.UUID(key).bytes)) + try: + root = etree.parse(encfile) + for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'): + algorithm = em.get('Algorithm', '') + if algorithm != 'http://ns.adobe.com/pdf/enc#RC': + return False + cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0] + uri = cr.get('URI') + path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/'))) + if os.path.exists(path): + cls.decrypt_font(key, path) + return True + except: + import traceback + traceback.print_exc() + return False + + def convert(self, stream, options, file_ext, parse_cache, log): + from calibre.utils.zipfile import ZipFile + from calibre import walk + from calibre.ebooks import DRMError + zf = ZipFile(stream) + zf.extractall(os.getcwd()) + encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml')) + opf = None + for f in walk('.'): + if f.lower().endswith('.opf'): + opf = f + break + path = getattr(stream, 'name', 'stream') + + if opf is None: + raise ValueError('%s is not a valid EPUB file'%path) + + if os.path.exists(encfile): + if not self.process_encryption(encfile, opf, log): + raise DRMError(os.path.basename(path)) + + return opf + diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py new file mode 100644 index 0000000000..1ce9950677 --- /dev/null +++ b/src/calibre/ebooks/mobi/input.py @@ -0,0 +1,29 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import InputFormatPlugin + +class MOBIInput(InputFormatPlugin): + + name = 'MOBI Input' + author = 'Kovid Goyal' + description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML' + file_types = set(['mobi', 'prc', 'azw']) + + def convert(self, stream, options, file_ext, parse_cache, log): + from calibre.ebooks.mobi.reader import MobiReader + mr = MobiReader(stream, log, options.input_encoding, + options.debug_input) + mr.extract_content(output_dir=os.getcwdu(), parse_cache) + raw = parse_cache.get('calibre_raw_mobi_markup', False) + if raw: + if isinstance(raw, unicode): + raw = raw.encode('utf-8') + open('debug-raw.html', 'wb').write(raw) + + return mr.created_opf_path + diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 2c80cc1c8c..18663660b4 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal ' Read data from .mobi files ''' -import sys, struct, os, cStringIO, re, functools +import struct, os, cStringIO, re, functools try: from PIL import Image as PILImage @@ -35,8 +35,10 @@ class EXTHHeader(object): pos = 0 self.mi = MetaInformation(_('Unknown'), [_('Unknown')]) self.has_fake_cover = True + left = self.num_items - for i in range(self.num_items): + while left > 0: + left -= 1 id, size = struct.unpack('>LL', raw[pos:pos+8]) content = raw[pos+8:pos+size] pos += size @@ -76,7 +78,8 @@ class EXTHHeader(object): class BookHeader(object): - def __init__(self, raw, ident): + def __init__(self, raw, ident, user_encoding, log): + self.log = log self.compression_type = raw[:2] self.records, self.records_size = struct.unpack('>HH', raw[8:12]) self.encryption_type, = struct.unpack('>H', raw[12:14]) @@ -92,8 +95,8 @@ class BookHeader(object): else: self.ancient = False self.doctype = raw[16:20] - self.length, self.type, self.codepage, self.unique_id, self.version = \ - struct.unpack('>LLLLL', raw[20:40]) + self.length, self.type, self.codepage, self.unique_id, \ + self.version = struct.unpack('>LLLLL', raw[20:40]) try: @@ -102,8 +105,9 @@ class BookHeader(object): 65001 : 'utf-8', }[self.codepage] except (IndexError, KeyError): - print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage - self.codec = 'cp1252' + self.codec = 'cp1252' if user_encoding is None else user_encoding + log.warn('Unknown codepage %d. Assuming %s'%(self.codepage, + self.codec)) if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length: self.extra_flags = 0 @@ -138,9 +142,24 @@ class MobiReader(object): PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') - def __init__(self, filename_or_stream, verbose=False): - self.verbose = verbose + def __init__(self, filename_or_stream, log, user_encoding=None, debug=None): + self.log = log + self.debug = debug self.embedded_mi = None + self.base_css_rules = ''' + blockquote { margin: 0em 0em 0em 1.25em; text-align: justify } + + p { margin: 0em; text-align: justify } + + .bold { font-weight: bold } + + .italic { font-style: italic } + + .mbp_pagebreak { + page-break-after: always; margin: 0; display: block + } + ''' + self.tag_css_rules = [] if hasattr(filename_or_stream, 'read'): stream = filename_or_stream @@ -177,17 +196,21 @@ class MobiReader(object): self.sections.append((section(i), self.section_headers[i])) - self.book_header = BookHeader(self.sections[0][0], self.ident) + self.book_header = BookHeader(self.sections[0][0], self.ident, + user_encoding, self.log) self.name = self.name.decode(self.book_header.codec, 'replace') - def extract_content(self, output_dir=os.getcwdu()): + def extract_content(self, output_dir, parse_cache): output_dir = os.path.abspath(output_dir) if self.book_header.encryption_type != 0: raise DRMError(self.name) processed_records = self.extract_text() + if self.debug is not None: + self.parse_cache['calibre_raw_mobi_markup'] = self.mobi_html self.add_anchors() - self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore') + self.processed_html = self.processed_html.decode(self.book_header.codec, + 'ignore') for pat in ENCODING_PATS: self.processed_html = pat.sub('', self.processed_html) e2u = functools.partial(entity_to_unicode, @@ -203,16 +226,10 @@ class MobiReader(object): self.processed_html = \ re.compile('', re.IGNORECASE).sub( '\n\n' - '\n', + '\t\n', self.processed_html) - if self.verbose: - print 'Parsing HTML...' + self.log.debug('Parsing HTML...') root = html.fromstring(self.processed_html) self.upshift_markup(root) guides = root.xpath('//guide') @@ -230,25 +247,24 @@ class MobiReader(object): ref.attrib['href'] = os.path.basename(htmlfile)+ref.attrib['href'] except AttributeError: pass - if self.verbose: - print 'Serializing...' - with open(htmlfile, 'wb') as f: - raw = html.tostring(root, encoding='utf-8', method='xml', - include_meta_content_type=True, pretty_print=True) - raw = raw.replace('', - '\n\n') - f.write(raw) + parse_cache[htmlfile] = root self.htmlfile = htmlfile - if self.book_header.exth is not None or self.embedded_mi is not None: - if self.verbose: - print 'Creating OPF...' - ncx = cStringIO.StringIO() - opf = self.create_opf(htmlfile, guide, root) - opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx) - ncx = ncx.getvalue() - if ncx: - open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) + self.log.debug('Creating OPF...') + ncx = cStringIO.StringIO() + opf = self.create_opf(htmlfile, guide, root) + self.created_opf_path = os.path.splitext(htmlfile)[0]+'.opf' + opf.render(open(self.created_opf_path, 'wb'), ncx) + ncx = ncx.getvalue() + if ncx: + open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) + + with open('styles.css', 'wb') as s: + s.write(self.base_css_rules+'\n\n') + for rule in self.tag_css_rules: + if isinstance(rule, unicode): + rule = rule.encode('utf-8') + s.write(rule+'\n\n') def read_embedded_metadata(self, root, elem, guide): raw = ''+html.tostring(elem, encoding='utf-8')+'' @@ -277,8 +293,7 @@ class MobiReader(object): def cleanup_html(self): - if self.verbose: - print 'Cleaning up HTML...' + self.log.debug('Cleaning up HTML...') self.processed_html = re.sub(r'
', '', self.processed_html) if self.book_header.ancient and '')+'' @@ -286,8 +301,7 @@ class MobiReader(object): self.processed_html = self.processed_html.replace('> <', '>\n<') def upshift_markup(self, root): - if self.verbose: - print 'Converting style information to CSS...' + self.log.debug('Converting style information to CSS...') size_map = { 'xx-small' : '0.5', 'x-small' : '1', @@ -298,7 +312,7 @@ class MobiReader(object): 'xx-large' : '6', } mobi_version = self.book_header.mobi_version - for tag in root.iter(etree.Element): + for i, tag in enumerate(root.iter(etree.Element)): if tag.tag in ('country-region', 'place', 'placetype', 'placename', 'state', 'city'): tag.tag = 'span' @@ -352,8 +366,7 @@ class MobiReader(object): elif tag.tag == 'pre': if not tag.text: tag.tag = 'div' - if styles: - attrib['style'] = '; '.join(styles) + if 'filepos-id' in attrib: attrib['id'] = attrib.pop('filepos-id') if 'filepos' in attrib: @@ -362,15 +375,24 @@ class MobiReader(object): attrib['href'] = "#filepos%d" % int(filepos) except ValueError: pass + + if styles: + attrib['id'] = attrib.get('id', 'calibre_mr_gid%d'%i) + self.tag_css_rules.append('#%s {%s}'%(attrib['id'], + '; '.join(styles))) + def create_opf(self, htmlfile, guide=None, root=None): mi = getattr(self.book_header.exth, 'mi', self.embedded_mi) + if mi is None: + mi = MetaInformation(self.title, [_('Unknown')]) opf = OPFCreator(os.path.dirname(htmlfile), mi) if hasattr(self.book_header.exth, 'cover_offset'): opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1) elif mi.cover is not None: opf.cover = mi.cover - manifest = [(htmlfile, 'text/x-oeb1-document')] + manifest = [(htmlfile, 'text/x-oeb1-document'), + (os.path.abspath('styles.css'), 'text/css')] bp = os.path.dirname(htmlfile) for i in getattr(self, 'image_names', []): manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg')) @@ -441,8 +463,7 @@ class MobiReader(object): return data[:len(data)-trail_size] def extract_text(self): - if self.verbose: - print 'Extracting text...' + self.log.debug('Extracting text...') text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)] processed_records = list(range(0, self.book_header.records+1)) @@ -472,12 +493,11 @@ class MobiReader(object): def replace_page_breaks(self): self.processed_html = self.PAGE_BREAK_PAT.sub( - '
', + '
', self.processed_html) def add_anchors(self): - if self.verbose: - print 'Adding anchors...' + self.log.debug('Adding anchors...') positions = set([]) link_pattern = re.compile(r'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE) @@ -507,8 +527,7 @@ class MobiReader(object): def extract_images(self, processed_records, output_dir): - if self.verbose: - print 'Extracting images...' + self.log.debug('Extracting images...') output_dir = os.path.abspath(os.path.join(output_dir, 'images')) if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -535,14 +554,17 @@ class MobiReader(object): im.convert('RGB').save(open(path, 'wb'), format='JPEG') def get_metadata(stream): - mr = MobiReader(stream) + from calibre.utils.logging import Log + log = Log() + mr = MobiReader(stream, log) if mr.book_header.exth is None: mi = MetaInformation(mr.name, [_('Unknown')]) else: mi = mr.create_opf('dummy.html') try: if hasattr(mr.book_header.exth, 'cover_offset'): - cover_index = mr.book_header.first_image_index + mr.book_header.exth.cover_offset + cover_index = mr.book_header.first_image_index + \ + mr.book_header.exth.cover_offset data = mr.sections[int(cover_index)][0] else: data = mr.sections[mr.book_header.first_image_index][0] @@ -552,42 +574,7 @@ def get_metadata(stream): im.convert('RGBA').save(obuf, format='JPEG') mi.cover_data = ('jpg', obuf.getvalue()) except: - import traceback - traceback.print_exc() + log.exception() return mi -def option_parser(): - from calibre.utils.config import OptionParser - parser = OptionParser(usage=_('%prog [options] myebook.mobi')) - parser.add_option('-o', '--output-dir', default='.', - help=_('Output directory. Defaults to current directory.')) - parser.add_option('-v', '--verbose', default=False, action='store_true', - help='Useful for debugging.') - return parser - - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - return 1 - - mr = MobiReader(args[1], verbose=opts.verbose) - opts.output_dir = os.path.abspath(opts.output_dir) - mr.extract_content(opts.output_dir) - if opts.verbose: - oname = os.path.join(opts.output_dir, 'debug-raw.html') - dat = mr.mobi_html - if isinstance(dat, unicode): - dat = dat.encode('utf-8') - open(oname, 'wb').write(dat) - print _('Raw MOBI HTML saved in'), oname - - print _('OEB ebook created in'), opts.output_dir - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/utils/logging.py b/src/calibre/utils/logging.py new file mode 100644 index 0000000000..ae2e1a792b --- /dev/null +++ b/src/calibre/utils/logging.py @@ -0,0 +1,92 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +'A simplified logging system' + +DEBUG = 0 +INFO = 1 +WARN = 2 +ERROR = 3 + +import sys, traceback +from functools import partial + +from calibre import prints +from calibre.utils.terminfo import TerminalController + +class ANSIStream: + + def __init__(self, stream=sys.stdout): + self.stream = stream + tc = TerminalController(stream) + self.color = { + DEBUG: tc.GREEN, + INFO:'', + WARN: tc.YELLOW, + ERROR: tc.RED + } + self.normal = tc.NORMAL + + def prints(self, level, *args, **kwargs): + self.stream.write(self.color[level]) + kwargs['file'] = self.stream + prints(*args, **kwargs) + self.stream.write(self.normal) + + def flush(self): + self.stream.flush() + +class HTMLStream: + + def __init__(self, stream=sys.stdout): + self.stream = stream + self.color = { + DEBUG: '', + INFO:'', + WARN: '', + ERROR: '' + } + self.normal = '' + + def prints(self, level, *args, **kwargs): + self.stream.write(self.color[level]) + kwargs['file'] = self.stream + prints(*args, **kwargs) + self.stream.write(self.normal) + + def flush(self): + self.stream.flush() + +class Log(object): + + DEBUG = DEBUG + INFO = INFO + WARN = WARN + ERROR = ERROR + + def __init__(self, level=INFO): + self.filter_level = level + default_output = ANSIStream() + self.outputs = [default_output] + + self.debug = partial(self.prints, DEBUG) + self.info = partial(self.prints, INFO) + self.warn = self.warning = partial(self.prints, WARN) + self.error = partial(self.prints, ERROR) + + + def prints(self, level, *args, **kwargs): + if level < self.filter_level: + return + for output in self.outputs: + output.prints(level, *args, **kwargs) + + def exception(self, *args, **kwargs): + limit = kwargs.pop('limit', None) + self.prints(ERROR, *args, **kwargs) + self.prints(DEBUG, traceback.format_exc(limit)) + + def __call__(self, *args, **kwargs): + self.prints(INFO, *args, **kwargs) \ No newline at end of file diff --git a/src/calibre/utils/terminfo.py b/src/calibre/utils/terminfo.py index 075c0e694d..fd394cbfe9 100644 --- a/src/calibre/utils/terminfo.py +++ b/src/calibre/utils/terminfo.py @@ -33,7 +33,7 @@ class TerminalController: >>> term = TerminalController() >>> if term.CLEAR_SCREEN: - ... print 'This terminal supports clearning the screen.' + ... print 'This terminal supports clearing the screen.' Finally, if the width and height of the terminal are known, then they will be stored in the `COLS` and `LINES` attributes.