Beginnings of the new conversion framework. Input plugins for MOBI and EPUB.

This commit is contained in:
Kovid Goyal 2009-03-06 21:38:35 -08:00
parent 30bd23ee38
commit 925a86fb0c
13 changed files with 525 additions and 235 deletions

View File

@ -90,28 +90,11 @@ def prints(*args, **kwargs):
if i != len(args)-1:
file.write(sep)
file.write(end)
file.flush()
class CommandLineError(Exception):
pass
class ColoredFormatter(Formatter):
def format(self, record):
ln = record.__dict__['levelname']
col = ''
if ln == 'CRITICAL':
col = terminal_controller.YELLOW
elif ln == 'ERROR':
col = terminal_controller.RED
elif ln in ['WARN', 'WARNING']:
col = terminal_controller.BLUE
elif ln == 'INFO':
col = terminal_controller.GREEN
elif ln == 'DEBUG':
col = terminal_controller.CYAN
record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL
return Formatter.format(self, record)
def setup_cli_handlers(logger, level):
@ -335,66 +318,23 @@ def english_sort(x, y):
'''
return cmp(_spat.sub('', x), _spat.sub('', y))
class LoggingInterface:
class ColoredFormatter(Formatter):
def __init__(self, logger):
self.__logger = self.logger = logger
def setup_cli_handler(self, verbosity):
for handler in self.__logger.handlers:
if isinstance(handler, logging.StreamHandler):
return
if os.environ.get('CALIBRE_WORKER', None) is not None and self.__logger.handlers:
return
stream = sys.stdout
formatter = logging.Formatter()
level = logging.INFO
if verbosity > 0:
formatter = ColoredFormatter('[%(levelname)s] %(message)s') if verbosity > 1 else \
ColoredFormatter('%(levelname)s: %(message)s')
level = logging.DEBUG
if verbosity > 1:
stream = sys.stderr
handler = logging.StreamHandler(stream)
handler.setFormatter(formatter)
handler.setLevel(level)
self.__logger.addHandler(handler)
self.__logger.setLevel(level)
def ___log(self, func, msg, args, kwargs):
args = [msg] + list(args)
for i in range(len(args)):
if not isinstance(args[i], basestring):
continue
if sys.version_info[:2] > (2, 5):
if not isinstance(args[i], unicode):
args[i] = args[i].decode(preferred_encoding, 'replace')
elif isinstance(args[i], unicode):
args[i] = args[i].encode(preferred_encoding, 'replace')
func(*args, **kwargs)
def log_debug(self, msg, *args, **kwargs):
self.___log(self.__logger.debug, msg, args, kwargs)
def log_info(self, msg, *args, **kwargs):
self.___log(self.__logger.info, msg, args, kwargs)
def log_warning(self, msg, *args, **kwargs):
self.___log(self.__logger.warning, msg, args, kwargs)
def log_warn(self, msg, *args, **kwargs):
self.___log(self.__logger.warning, msg, args, kwargs)
def log_error(self, msg, *args, **kwargs):
self.___log(self.__logger.error, msg, args, kwargs)
def log_critical(self, msg, *args, **kwargs):
self.___log(self.__logger.critical, msg, args, kwargs)
def log_exception(self, msg, *args):
self.___log(self.__logger.exception, msg, args, {})
def format(self, record):
ln = record.__dict__['levelname']
col = ''
if ln == 'CRITICAL':
col = terminal_controller.YELLOW
elif ln == 'ERROR':
col = terminal_controller.RED
elif ln in ['WARN', 'WARNING']:
col = terminal_controller.BLUE
elif ln == 'INFO':
col = terminal_controller.GREEN
elif ln == 'DEBUG':
col = terminal_controller.CYAN
record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL
return Formatter.format(self, record)
def walk(dir):
''' A nice interface to os.walk '''

View File

@ -242,8 +242,13 @@ class MOBIMetadataWriter(MetadataWriterPlugin):
set_metadata(stream, mi)
plugins = [HTML2ZIP]
from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.mobi.input import MOBIInput
from calibre.customize.profiles import input_profiles
plugins = [HTML2ZIP, EPUBInput, MOBIInput]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataWriter')]
plugins += input_profiles

View File

@ -1,28 +1,30 @@
from __future__ import with_statement
'''
Defines the plugin system for conversions.
'''
import re
import re, os, shutil
from lxml import html
from calibre import CurrentDir
from calibre.customize import Plugin
class ConversionOption(object):
'''
Class representing conversion options
'''
def __init__(self, name=None, default=None, help=None, long_switch=None,
short_switch=None, choices=None, gui_label=None,
category=None):
def __init__(self, name=None, help=None, long_switch=None,
short_switch=None, choices=None):
self.name = name
self.default = default
self.help = help
self.long_switch = long_switch
self.short_switch = short_switch
self.choices = choices
self.gui_label = gui_label
self.category = category
if self.long_switch is None:
self.long_switch = '--'+self.name.replace('_', '-')
self.validate_parameters()
@ -32,41 +34,156 @@ class ConversionOption(object):
'''
if re.match(r'[a-zA-Z_]([a-zA-Z0-9_])*', self.name) is None:
raise ValueError(self.name + ' is not a valid Python identifier')
if not (isinstance(self.default, (int, float, str, unicode)) or \
self.default is None):
raise ValueError(unicode(self.default) +
' is not a string or a number')
if not self.help:
raise ValueError('You must set the help text')
class ConversionPlugin(Plugin):
'''
The base class for all conversion related plugins.
'''
#: List of options
#: Each option must be a dictionary. The dictionary can contain several
#: keys defining the option. The ones marked by a * are required, the rest
#: are optional. The keys are::
#:
#: *'name' : A valid python identifier.
#: *'default' : The default value for this option.
#: *'help' :
#: 'short_switch' : A suggestion for a short form of the command line
#: switch (for example if name is 'title', this
#: could be 't'). It is only used if no prior
#: conversion plugin has claimed it.
options = []
class OptionRecommendation(object):
LOW = 1
MED = 2
HIGH = 3
type = _('Conversion')
def __init__(self, recommended_value=None, level=LOW, **kwargs):
    '''
    An option recommendation. That is, an option as well as its recommended
    value and the level of the recommendation.

    :param recommended_value: The value recommended for the option.
    :param level: Strength of the recommendation: ``LOW``, ``MED`` or
                  ``HIGH``.
    :param kwargs: Either ``option``, an already constructed option object,
                   or the keyword arguments needed to build a
                   :class:`ConversionOption`. When ``option`` is supplied
                   the remaining keyword arguments are not used.
    '''
    # This parameter used to be misspelled ``recommeded_value`` while every
    # caller passed ``recommended_value=...``, which raised a TypeError.
    # The old spelling is still honoured so existing keyword callers work.
    if 'recommeded_value' in kwargs:
        recommended_value = kwargs.pop('recommeded_value')

    self.level = level
    self.recommended_value = recommended_value
    self.option = kwargs.pop('option', None)
    if self.option is None:
        self.option = ConversionOption(**kwargs)

    self.validate_parameters()
def validate_parameters(self):
    '''
    Check that the recommended value is acceptable for the option.

    :raises ValueError: If the option defines a non-empty set of choices and
                        the recommended value is not one of them, or if the
                        recommended value is neither ``None`` nor a string
                        or a number.
    '''
    value = self.recommended_value
    choices = self.option.choices
    if choices and value not in choices:
        raise ValueError('Recommended value not in choices')

    # None is explicitly allowed. The previous code tested the non-existent
    # attribute ``self.default`` here, which raised AttributeError instead.
    if value is None:
        return

    if not isinstance(value, (int, float, str, unicode)):
        raise ValueError(unicode(value) + ' is not a string or a number')
class InputFormatPlugin(Plugin):
'''
InputFormatPlugins are responsible for converting a document into
HTML+OPF+CSS+etc.
The results of the conversion *must* be encoded in UTF-8.
The main action happens in :method:`convert`.
'''
type = _('Conversion Input')
can_be_disabled = False
supported_platforms = ['windows', 'osx', 'linux']
class InputFormatPlugin(ConversionPlugin):
#: Set of file types for which this plugin should be run
#: For example: ``set(['lit', 'mobi', 'prc'])``
#: For example: ``set(['azw', 'mobi', 'prc'])``
file_types = set([])
#: Options shared by all Input format plugins. Do not override
#: in sub-classes. Use :member:`options` instead. Every option must be an
#: instance of :class:`OptionRecommendation`.
common_options = set([
OptionRecommendation(name='debug_input',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Save the output from the input plugin to the specified '
'directory. Useful if you are unsure at which stage '
'of the conversion process a bug is occurring. '
'WARNING: This completely deletes the contents of '
'the specified directory.')
),
OptionRecommendation(name='input_encoding',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the input document. If '
'set this option will override any encoding declared by the '
'document itself. Particularly useful for documents that '
'do not declare an encoding or that have erroneous '
'encoding declarations.')
),
])
#: Options to customize the behavior of this plugin. Every option must be an
#: instance of :class:`OptionRecommendation`.
options = set([])
def convert(self, stream, options, file_ext, parse_cache, log):
    '''
    Convert the input document. Sub-classes must override this method.

    The implementation must return the path to the OPF file it created and
    must keep all of its output inside the current directory. Any files
    created outside the current directory have to be deleted, or marked
    for deletion, before this method returns.

    :param stream: A file like object containing the input file.

    :param options: The options that customize the conversion. It is
                    guaranteed to have an attribute for every option
                    declared by this plugin. It also has a ``verbose``
                    attribute, an integer from zero upwards where larger
                    values ask for more verbosity, and an
                    ``input_profile`` attribute that is an instance of
                    :class:`calibre.customize.profiles.InputProfile`.

    :param file_ext: The extension of the input file, without the leading
                     dot. Guaranteed to be one of the `file_types`
                     supported by this plugin.

    :param parse_cache: A dictionary mapping absolute file paths to parsed
                        representations of their contents: for HTML the
                        lxml element at the root of the tree, for CSS a
                        cssutils stylesheet. A plugin that parses any of
                        its output files should add them to this cache so
                        that later stages of the conversion do not have to
                        parse them again. A file whose parsed
                        representation is in the cache does not need to be
                        written to disk.

    :param log: A :class:`calibre.utils.logging.Log` object. All output
                should go through it.
    '''
    raise NotImplementedError()
def __call__(self, stream, options, file_ext, parse_cache, log, output_dir):
    '''
    Run :meth:`convert` inside ``output_dir`` and tidy up its results.

    Everything already present in ``output_dir`` is deleted, then
    :meth:`convert` is called with ``output_dir`` as the current directory.
    Afterwards any key of ``parse_cache`` that is not an absolute path is
    replaced by its absolute form (with a warning). If
    ``options.debug_input`` is set, every cached item is serialized to the
    path that is its key and the whole output tree is copied to that
    directory, replacing whatever it contained.

    :param output_dir: Directory in which the conversion is performed.
    :return: The value returned by :meth:`convert`.
    '''
    running = 'InputFormatPlugin: %s running'%self.name
    if hasattr(stream, 'name'):
        log(running, end=' ')
        log('on', stream.name)
    else:
        # No second message follows, so let this one end the line itself
        log(running)

    with CurrentDir(output_dir):
        for x in os.listdir('.'):
            if os.path.isdir(x):
                shutil.rmtree(x)
            else:
                os.remove(x)

        ret = self.convert(stream, options, file_ext, parse_cache, log)

        # Iterate over a copy of the keys as the cache is modified below
        for key in list(parse_cache.keys()):
            if os.path.abspath(key) != key:
                log.warn(('InputFormatPlugin: %s returned a '
                    'relative path: %s')%(self.name, key)
                )
                parse_cache[os.path.abspath(key)] = parse_cache.pop(key)

        if options.debug_input is not None:
            # NOTE(review): a relative debug_input appears to be resolved
            # after changing into output_dir -- confirm this is intended.
            options.debug_input = os.path.abspath(options.debug_input)
            if not os.path.exists(options.debug_input):
                os.makedirs(options.debug_input)
            # copytree() below needs a destination that does not exist yet
            shutil.rmtree(options.debug_input)
            for f, obj in parse_cache.items():
                if hasattr(obj, 'cssText'):
                    raw = obj.cssText
                else:
                    raw = html.tostring(obj, encoding='utf-8', method='xml',
                        include_meta_content_type=True, pretty_print=True)
                if isinstance(raw, unicode):
                    raw = raw.encode('utf-8')
                # Close the file deterministically before the tree is copied
                with open(f, 'wb') as dest:
                    dest.write(raw)
            shutil.copytree('.', options.debug_input)

    return ret

View File

@ -0,0 +1,27 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize import Plugin
class InputProfile(Plugin):
    '''
    The default input profile.
    '''

    # TODO: Add some real information to this profile. All other profiles must
    # inherit from this profile and override as needed

    type = _('Input profile')
    name = 'Default Input Profile'
    # Used in the CLI, so it must not contain spaces etc.
    short_name = 'default'
    description = _('This profile tries to provide sane defaults and is useful '
                    'if you know nothing about the input document.')

    author = 'Kovid Goyal'
    supported_platforms = set(['windows', 'osx', 'linux'])
    can_be_disabled = False
input_profiles = [InputProfile]

View File

@ -6,13 +6,14 @@ import os, shutil, traceback, functools, sys
from calibre.customize import Plugin, FileTypePlugin, MetadataReaderPlugin, \
MetadataWriterPlugin
from calibre.customize.conversion import InputFormatPlugin
from calibre.customize.profiles import InputProfile
from calibre.customize.builtins import plugins as builtin_plugins
from calibre.constants import __version__, iswindows, isosx
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.config import make_config_dir, Config, ConfigProxy, \
plugin_dir, OptionParser
version = tuple([int(x) for x in __version__.split('.')])
platform = 'linux'
@ -70,7 +71,10 @@ _on_import = {}
_on_preprocess = {}
_on_postprocess = {}
def input_profiles():
    ''' Yield every initialized plugin that is an :class:`InputProfile`. '''
    for candidate in _initialized_plugins:
        if not isinstance(candidate, InputProfile):
            continue
        yield candidate
def reread_filetype_plugins():
global _on_import
@ -234,6 +238,17 @@ def find_plugin(name):
if plugin.name == name:
return plugin
def input_format_plugins():
    ''' Yield every initialized plugin that is an :class:`InputFormatPlugin`. '''
    for candidate in _initialized_plugins:
        if not isinstance(candidate, InputFormatPlugin):
            continue
        yield candidate
def plugin_for_input_format(fmt):
    '''
    Return the first input format plugin whose ``file_types`` contains
    ``fmt``, or None if there is no such plugin.
    '''
    for candidate in input_format_plugins():
        if fmt not in candidate.file_types:
            continue
        return candidate
    return None
def disable_plugin(plugin_or_name):
x = getattr(plugin_or_name, 'name', plugin_or_name)
plugin = find_plugin(x)

View File

@ -0,0 +1,4 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@ -0,0 +1,30 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import OptionRecommendation
from calibre.customize.ui import input_profiles
# Recommendations for the ``verbose`` and ``input_profile`` options
pipeline_options = [

OptionRecommendation(name='verbose', short_switch='v',
    level=OptionRecommendation.LOW, recommended_value=0,
    help=_('Level of verbosity. Specify multiple times for greater '
           'verbosity.')
),

# The allowed values are the short names of the available input profiles
OptionRecommendation(name='input_profile',
    level=OptionRecommendation.LOW, recommended_value='default',
    choices=[x.short_name for x in input_profiles()],
    help=_('Specify the input profile. The input profile gives the '
           'conversion system information on how to interpret '
           'various information in the input document. For '
           'example resolution dependent lengths (i.e. lengths in '
           'pixels).')
),

]

View File

@ -40,38 +40,6 @@ def rules(stylesheets):
if r.type == r.STYLE_RULE:
yield r
def decrypt_font(key, path):
raw = open(path, 'rb').read()
crypt = raw[:1024]
key = cycle(iter(key))
decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
with open(path, 'wb') as f:
f.write(decrypt)
f.write(raw[1024:])
def process_encryption(encfile, opf):
key = None
m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', open(opf, 'rb').read())
if m:
key = m.group(1)
key = list(map(ord, uuid.UUID(key).bytes))
try:
root = etree.parse(encfile)
for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
algorithm = em.get('Algorithm', '')
if algorithm != 'http://ns.adobe.com/pdf/enc#RC':
return False
cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
uri = cr.get('URI')
path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
if os.path.exists(path):
decrypt_font(key, path)
return True
except:
import traceback
traceback.print_exc()
return False
def initialize_container(path_to_container, opf_name='metadata.opf'):
'''
Create an empty EPUB document, with a default skeleton.

View File

@ -0,0 +1,76 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, re, uuid
from itertools import cycle
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
class EPUBInput(InputFormatPlugin):
    '''
    Input plugin that unpacks an EPUB file into the current directory and
    returns the path to the OPF file found inside it.
    '''

    name        = 'EPUB Input'
    author      = 'Kovid Goyal'
    description = 'Convert EPUB files (.epub) to HTML'
    file_types  = set(['epub'])

    @classmethod
    def decrypt_font(cls, key, path):
        '''
        De-obfuscate the file at `path` in place.

        The first 1024 bytes are XORed with `key`, which is repeated as
        often as needed. The rest of the file is written back unchanged.

        :param key: A sequence of integers to XOR with.
        :param path: Path to the file to rewrite.
        '''
        with open(path, 'rb') as f:
            raw = f.read()
        crypt = raw[:1024]
        key = cycle(iter(key))
        decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
        with open(path, 'wb') as f:
            f.write(decrypt)
            f.write(raw[1024:])

    @classmethod
    def process_encryption(cls, encfile, opf, log):
        '''
        De-obfuscate the files listed in `encfile`.

        The key is derived from the first ``urn:uuid:`` identifier found in
        the OPF file. Referenced files that do not exist are skipped.

        :param encfile: Path to the ``encryption.xml`` file.
        :param opf: Path to the OPF file.
        :param log: Log object (currently unused).
        :return: True if every ``EncryptionMethod`` element uses the
                 ``http://ns.adobe.com/pdf/enc#RC`` algorithm and was
                 processed. False if any other algorithm is encountered or
                 an error occurs.
        '''
        key = None
        with open(opf, 'rb') as f:
            m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', f.read())
        if m:
            key = m.group(1)
            key = list(map(ord, uuid.UUID(key).bytes))
        try:
            root = etree.parse(encfile)
            for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
                algorithm = em.get('Algorithm', '')
                if algorithm != 'http://ns.adobe.com/pdf/enc#RC':
                    return False
                cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
                uri = cr.get('URI')
                path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
                if os.path.exists(path):
                    cls.decrypt_font(key, path)
            return True
        except Exception:
            # Best effort: report the problem and let the caller treat the
            # book as DRMed.
            import traceback
            traceback.print_exc()
        return False

    # The method was originally defined under this misspelled name while
    # convert() called process_encryption(). Keep the old name as an alias.
    process_ecryption = process_encryption

    def convert(self, stream, options, file_ext, parse_cache, log):
        '''
        Extract the EPUB in `stream` into the current directory.

        :return: Path to the first file ending in ``.opf`` that is found.
        :raises ValueError: If the archive contains no OPF file.
        :raises DRMError: If ``META-INF/encryption.xml`` exists and
                          :meth:`process_encryption` fails.
        '''
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError

        zf = ZipFile(stream)
        zf.extractall(os.getcwd())
        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
        opf = None
        for f in walk('.'):
            if f.lower().endswith('.opf'):
                opf = f
                break
        path = getattr(stream, 'name', 'stream')

        if opf is None:
            raise ValueError('%s is not a valid EPUB file'%path)

        if os.path.exists(encfile):
            if not self.process_encryption(encfile, opf, log):
                raise DRMError(os.path.basename(path))

        return opf

View File

@ -0,0 +1,29 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
class MOBIInput(InputFormatPlugin):
    '''
    Input plugin that uses :class:`MobiReader` to extract a MOBI file into
    the current directory.
    '''

    name        = 'MOBI Input'
    author      = 'Kovid Goyal'
    description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
    file_types  = set(['mobi', 'prc', 'azw'])

    def convert(self, stream, options, file_ext, parse_cache, log):
        '''
        Extract the MOBI file in `stream` into the current directory.

        If the reader stored the raw MOBI markup in `parse_cache`, it is
        written to ``debug-raw.html`` and removed from the cache.

        :return: The path to the OPF file created by the reader.
        '''
        from calibre.ebooks.mobi.reader import MobiReader
        mr = MobiReader(stream, log, options.input_encoding,
                        options.debug_input)
        # Both arguments are positional: a positional argument may not
        # follow a keyword argument.
        mr.extract_content(os.getcwdu(), parse_cache)

        # Remove the entry: its key is not a path and its value is not a
        # parsed tree, which is what the rest of parse_cache holds.
        raw = parse_cache.pop('calibre_raw_mobi_markup', False)
        if raw:
            if isinstance(raw, unicode):
                raw = raw.encode('utf-8')
            with open('debug-raw.html', 'wb') as f:
                f.write(raw)
        return mr.created_opf_path

View File

@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
Read data from .mobi files
'''
import sys, struct, os, cStringIO, re, functools
import struct, os, cStringIO, re, functools
try:
from PIL import Image as PILImage
@ -35,8 +35,10 @@ class EXTHHeader(object):
pos = 0
self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
self.has_fake_cover = True
left = self.num_items
for i in range(self.num_items):
while left > 0:
left -= 1
id, size = struct.unpack('>LL', raw[pos:pos+8])
content = raw[pos+8:pos+size]
pos += size
@ -76,7 +78,8 @@ class EXTHHeader(object):
class BookHeader(object):
def __init__(self, raw, ident):
def __init__(self, raw, ident, user_encoding, log):
self.log = log
self.compression_type = raw[:2]
self.records, self.records_size = struct.unpack('>HH', raw[8:12])
self.encryption_type, = struct.unpack('>H', raw[12:14])
@ -92,8 +95,8 @@ class BookHeader(object):
else:
self.ancient = False
self.doctype = raw[16:20]
self.length, self.type, self.codepage, self.unique_id, self.version = \
struct.unpack('>LLLLL', raw[20:40])
self.length, self.type, self.codepage, self.unique_id, \
self.version = struct.unpack('>LLLLL', raw[20:40])
try:
@ -102,8 +105,9 @@ class BookHeader(object):
65001 : 'utf-8',
}[self.codepage]
except (IndexError, KeyError):
print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage
self.codec = 'cp1252'
self.codec = 'cp1252' if user_encoding is None else user_encoding
log.warn('Unknown codepage %d. Assuming %s'%(self.codepage,
self.codec))
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length:
self.extra_flags = 0
@ -138,9 +142,24 @@ class MobiReader(object):
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
def __init__(self, filename_or_stream, verbose=False):
self.verbose = verbose
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None):
self.log = log
self.debug = debug
self.embedded_mi = None
self.base_css_rules = '''
blockquote { margin: 0em 0em 0em 1.25em; text-align: justify }
p { margin: 0em; text-align: justify }
.bold { font-weight: bold }
.italic { font-style: italic }
.mbp_pagebreak {
page-break-after: always; margin: 0; display: block
}
'''
self.tag_css_rules = []
if hasattr(filename_or_stream, 'read'):
stream = filename_or_stream
@ -177,17 +196,21 @@ class MobiReader(object):
self.sections.append((section(i), self.section_headers[i]))
self.book_header = BookHeader(self.sections[0][0], self.ident)
self.book_header = BookHeader(self.sections[0][0], self.ident,
user_encoding, self.log)
self.name = self.name.decode(self.book_header.codec, 'replace')
def extract_content(self, output_dir=os.getcwdu()):
def extract_content(self, output_dir, parse_cache):
output_dir = os.path.abspath(output_dir)
if self.book_header.encryption_type != 0:
raise DRMError(self.name)
processed_records = self.extract_text()
if self.debug is not None:
self.parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
self.add_anchors()
self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
self.processed_html = self.processed_html.decode(self.book_header.codec,
'ignore')
for pat in ENCODING_PATS:
self.processed_html = pat.sub('', self.processed_html)
e2u = functools.partial(entity_to_unicode,
@ -203,16 +226,10 @@ class MobiReader(object):
self.processed_html = \
re.compile('<head>', re.IGNORECASE).sub(
'\n<head>\n'
'<style type="text/css">\n'
'blockquote { margin: 0em 0em 0em 1.25em; text-align: justify; }\n'
'p { margin: 0em; text-align: justify; }\n'
'.bold { font-weight: bold; }\n'
'.italic { font-style: italic; }\n'
'</style>\n',
'\t<link type="text/css" href="styles.css" />\n',
self.processed_html)
if self.verbose:
print 'Parsing HTML...'
self.log.debug('Parsing HTML...')
root = html.fromstring(self.processed_html)
self.upshift_markup(root)
guides = root.xpath('//guide')
@ -230,25 +247,24 @@ class MobiReader(object):
ref.attrib['href'] = os.path.basename(htmlfile)+ref.attrib['href']
except AttributeError:
pass
if self.verbose:
print 'Serializing...'
with open(htmlfile, 'wb') as f:
raw = html.tostring(root, encoding='utf-8', method='xml',
include_meta_content_type=True, pretty_print=True)
raw = raw.replace('<head>',
'<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n')
f.write(raw)
parse_cache[htmlfile] = root
self.htmlfile = htmlfile
if self.book_header.exth is not None or self.embedded_mi is not None:
if self.verbose:
print 'Creating OPF...'
ncx = cStringIO.StringIO()
opf = self.create_opf(htmlfile, guide, root)
opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx)
ncx = ncx.getvalue()
if ncx:
open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
self.log.debug('Creating OPF...')
ncx = cStringIO.StringIO()
opf = self.create_opf(htmlfile, guide, root)
self.created_opf_path = os.path.splitext(htmlfile)[0]+'.opf'
opf.render(open(self.created_opf_path, 'wb'), ncx)
ncx = ncx.getvalue()
if ncx:
open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
with open('styles.css', 'wb') as s:
s.write(self.base_css_rules+'\n\n')
for rule in self.tag_css_rules:
if isinstance(rule, unicode):
rule = rule.encode('utf-8')
s.write(rule+'\n\n')
def read_embedded_metadata(self, root, elem, guide):
raw = '<package>'+html.tostring(elem, encoding='utf-8')+'</package>'
@ -277,8 +293,7 @@ class MobiReader(object):
def cleanup_html(self):
if self.verbose:
print 'Cleaning up HTML...'
self.log.debug('Cleaning up HTML...')
self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>'
@ -286,8 +301,7 @@ class MobiReader(object):
self.processed_html = self.processed_html.replace('> <', '>\n<')
def upshift_markup(self, root):
if self.verbose:
print 'Converting style information to CSS...'
self.log.debug('Converting style information to CSS...')
size_map = {
'xx-small' : '0.5',
'x-small' : '1',
@ -298,7 +312,7 @@ class MobiReader(object):
'xx-large' : '6',
}
mobi_version = self.book_header.mobi_version
for tag in root.iter(etree.Element):
for i, tag in enumerate(root.iter(etree.Element)):
if tag.tag in ('country-region', 'place', 'placetype', 'placename',
'state', 'city'):
tag.tag = 'span'
@ -352,8 +366,7 @@ class MobiReader(object):
elif tag.tag == 'pre':
if not tag.text:
tag.tag = 'div'
if styles:
attrib['style'] = '; '.join(styles)
if 'filepos-id' in attrib:
attrib['id'] = attrib.pop('filepos-id')
if 'filepos' in attrib:
@ -363,14 +376,23 @@ class MobiReader(object):
except ValueError:
pass
if styles:
attrib['id'] = attrib.get('id', 'calibre_mr_gid%d'%i)
self.tag_css_rules.append('#%s {%s}'%(attrib['id'],
'; '.join(styles)))
def create_opf(self, htmlfile, guide=None, root=None):
mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
if mi is None:
mi = MetaInformation(self.title, [_('Unknown')])
opf = OPFCreator(os.path.dirname(htmlfile), mi)
if hasattr(self.book_header.exth, 'cover_offset'):
opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
elif mi.cover is not None:
opf.cover = mi.cover
manifest = [(htmlfile, 'text/x-oeb1-document')]
manifest = [(htmlfile, 'text/x-oeb1-document'),
(os.path.abspath('styles.css'), 'text/css')]
bp = os.path.dirname(htmlfile)
for i in getattr(self, 'image_names', []):
manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg'))
@ -441,8 +463,7 @@ class MobiReader(object):
return data[:len(data)-trail_size]
def extract_text(self):
if self.verbose:
print 'Extracting text...'
self.log.debug('Extracting text...')
text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)]
processed_records = list(range(0, self.book_header.records+1))
@ -472,12 +493,11 @@ class MobiReader(object):
def replace_page_breaks(self):
self.processed_html = self.PAGE_BREAK_PAT.sub(
'<div class="mbp_pagebreak" style="page-break-after: always; margin: 0; display: block" />',
'<div class="mbp_pagebreak" />',
self.processed_html)
def add_anchors(self):
if self.verbose:
print 'Adding anchors...'
self.log.debug('Adding anchors...')
positions = set([])
link_pattern = re.compile(r'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
re.IGNORECASE)
@ -507,8 +527,7 @@ class MobiReader(object):
def extract_images(self, processed_records, output_dir):
if self.verbose:
print 'Extracting images...'
self.log.debug('Extracting images...')
output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
@ -535,14 +554,17 @@ class MobiReader(object):
im.convert('RGB').save(open(path, 'wb'), format='JPEG')
def get_metadata(stream):
mr = MobiReader(stream)
from calibre.utils.logging import Log
log = Log()
mr = MobiReader(stream, log)
if mr.book_header.exth is None:
mi = MetaInformation(mr.name, [_('Unknown')])
else:
mi = mr.create_opf('dummy.html')
try:
if hasattr(mr.book_header.exth, 'cover_offset'):
cover_index = mr.book_header.first_image_index + mr.book_header.exth.cover_offset
cover_index = mr.book_header.first_image_index + \
mr.book_header.exth.cover_offset
data = mr.sections[int(cover_index)][0]
else:
data = mr.sections[mr.book_header.first_image_index][0]
@ -552,42 +574,7 @@ def get_metadata(stream):
im.convert('RGBA').save(obuf, format='JPEG')
mi.cover_data = ('jpg', obuf.getvalue())
except:
import traceback
traceback.print_exc()
log.exception()
return mi
def option_parser():
from calibre.utils.config import OptionParser
parser = OptionParser(usage=_('%prog [options] myebook.mobi'))
parser.add_option('-o', '--output-dir', default='.',
help=_('Output directory. Defaults to current directory.'))
parser.add_option('-v', '--verbose', default=False, action='store_true',
help='Useful for debugging.')
return parser
def main(args=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
return 1
mr = MobiReader(args[1], verbose=opts.verbose)
opts.output_dir = os.path.abspath(opts.output_dir)
mr.extract_content(opts.output_dir)
if opts.verbose:
oname = os.path.join(opts.output_dir, 'debug-raw.html')
dat = mr.mobi_html
if isinstance(dat, unicode):
dat = dat.encode('utf-8')
open(oname, 'wb').write(dat)
print _('Raw MOBI HTML saved in'), oname
print _('OEB ebook created in'), opts.output_dir
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -0,0 +1,92 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'A simplified logging system'
DEBUG = 0
INFO = 1
WARN = 2
ERROR = 3
import sys, traceback
from functools import partial
from calibre import prints
from calibre.utils.terminfo import TerminalController
class ANSIStream:
    '''
    Log output that writes to a stream, wrapping every message in the
    terminal control sequences for the color of its level.
    '''

    def __init__(self, stream=sys.stdout):
        self.stream = stream
        tc = TerminalController(stream)
        self.color = {
                      DEBUG: tc.GREEN,
                      INFO:'',
                      WARN: tc.YELLOW,
                      ERROR: tc.RED
                      }
        self.normal = tc.NORMAL

    def prints(self, level, *args, **kwargs):
        '''
        Write `args` to the stream in the color for `level`.

        `kwargs` are passed on to :func:`prints`; any ``file`` entry is
        replaced by this object's stream.
        '''
        self.stream.write(self.color[level])
        kwargs['file'] = self.stream
        try:
            prints(*args, **kwargs)
        finally:
            # Always reset the color, even if printing failed
            self.stream.write(self.normal)

    def flush(self):
        ''' Flush the underlying stream. '''
        self.stream.flush()
class HTMLStream:
    '''
    Log output that writes to a stream, wrapping every message in a
    ``<span>`` element styled with the color of its level.
    '''

    def __init__(self, stream=sys.stdout):
        self.stream = stream
        self.color = {
                      DEBUG: '<span style="color:green">',
                      INFO:'<span>',
                      WARN: '<span style="color:yellow">',
                      ERROR: '<span style="color:red">'
                      }
        self.normal = '</span>'

    def prints(self, level, *args, **kwargs):
        '''
        Write `args` to the stream inside the ``<span>`` for `level`.

        `kwargs` are passed on to :func:`prints`; any ``file`` entry is
        replaced by this object's stream.
        '''
        self.stream.write(self.color[level])
        kwargs['file'] = self.stream
        try:
            prints(*args, **kwargs)
        finally:
            # Always close the span, even if printing failed
            self.stream.write(self.normal)

    def flush(self):
        ''' Flush the underlying stream. '''
        self.stream.flush()
class Log(object):
    '''
    Sends messages to a list of outputs, dropping those whose level is
    below ``filter_level``. Calling the object logs at the INFO level.
    '''

    DEBUG = DEBUG
    INFO  = INFO
    WARN  = WARN
    ERROR = ERROR

    def __init__(self, level=INFO):
        self.filter_level = level
        self.outputs = [ANSIStream()]

        # One convenience callable per level
        self.debug = partial(self.prints, DEBUG)
        self.info = partial(self.prints, INFO)
        warn = partial(self.prints, WARN)
        self.warn = warn
        self.warning = warn
        self.error = partial(self.prints, ERROR)

    def prints(self, level, *args, **kwargs):
        ''' Pass the message to every output unless `level` is filtered. '''
        if level >= self.filter_level:
            for out in self.outputs:
                out.prints(level, *args, **kwargs)

    def exception(self, *args, **kwargs):
        '''
        Log `args` at the ERROR level, followed by the formatted traceback
        of the current exception at the DEBUG level. The optional keyword
        argument ``limit`` is passed to :func:`traceback.format_exc`.
        '''
        tb_limit = kwargs.pop('limit', None)
        self.prints(ERROR, *args, **kwargs)
        self.prints(DEBUG, traceback.format_exc(tb_limit))

    def __call__(self, *args, **kwargs):
        self.prints(INFO, *args, **kwargs)

View File

@ -33,7 +33,7 @@ class TerminalController:
>>> term = TerminalController()
>>> if term.CLEAR_SCREEN:
... print 'This terminal supports clearning the screen.'
... print 'This terminal supports clearing the screen.'
Finally, if the width and height of the terminal are known, then
they will be stored in the `COLS` and `LINES` attributes.