mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
Beginnings of the new conversion framework. Input plugins for MOBI and EPUB.
This commit is contained in:
parent
30bd23ee38
commit
925a86fb0c
@ -90,28 +90,11 @@ def prints(*args, **kwargs):
|
||||
if i != len(args)-1:
|
||||
file.write(sep)
|
||||
file.write(end)
|
||||
file.flush()
|
||||
|
||||
class CommandLineError(Exception):
|
||||
pass
|
||||
|
||||
class ColoredFormatter(Formatter):
|
||||
|
||||
def format(self, record):
|
||||
ln = record.__dict__['levelname']
|
||||
col = ''
|
||||
if ln == 'CRITICAL':
|
||||
col = terminal_controller.YELLOW
|
||||
elif ln == 'ERROR':
|
||||
col = terminal_controller.RED
|
||||
elif ln in ['WARN', 'WARNING']:
|
||||
col = terminal_controller.BLUE
|
||||
elif ln == 'INFO':
|
||||
col = terminal_controller.GREEN
|
||||
elif ln == 'DEBUG':
|
||||
col = terminal_controller.CYAN
|
||||
record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL
|
||||
return Formatter.format(self, record)
|
||||
|
||||
|
||||
def setup_cli_handlers(logger, level):
|
||||
@ -335,66 +318,23 @@ def english_sort(x, y):
|
||||
'''
|
||||
return cmp(_spat.sub('', x), _spat.sub('', y))
|
||||
|
||||
class LoggingInterface:
|
||||
class ColoredFormatter(Formatter):
|
||||
|
||||
def __init__(self, logger):
|
||||
self.__logger = self.logger = logger
|
||||
|
||||
def setup_cli_handler(self, verbosity):
|
||||
for handler in self.__logger.handlers:
|
||||
if isinstance(handler, logging.StreamHandler):
|
||||
return
|
||||
if os.environ.get('CALIBRE_WORKER', None) is not None and self.__logger.handlers:
|
||||
return
|
||||
stream = sys.stdout
|
||||
formatter = logging.Formatter()
|
||||
level = logging.INFO
|
||||
if verbosity > 0:
|
||||
formatter = ColoredFormatter('[%(levelname)s] %(message)s') if verbosity > 1 else \
|
||||
ColoredFormatter('%(levelname)s: %(message)s')
|
||||
level = logging.DEBUG
|
||||
if verbosity > 1:
|
||||
stream = sys.stderr
|
||||
|
||||
handler = logging.StreamHandler(stream)
|
||||
handler.setFormatter(formatter)
|
||||
handler.setLevel(level)
|
||||
self.__logger.addHandler(handler)
|
||||
self.__logger.setLevel(level)
|
||||
|
||||
|
||||
def ___log(self, func, msg, args, kwargs):
|
||||
args = [msg] + list(args)
|
||||
for i in range(len(args)):
|
||||
if not isinstance(args[i], basestring):
|
||||
continue
|
||||
if sys.version_info[:2] > (2, 5):
|
||||
if not isinstance(args[i], unicode):
|
||||
args[i] = args[i].decode(preferred_encoding, 'replace')
|
||||
elif isinstance(args[i], unicode):
|
||||
args[i] = args[i].encode(preferred_encoding, 'replace')
|
||||
func(*args, **kwargs)
|
||||
|
||||
def log_debug(self, msg, *args, **kwargs):
|
||||
self.___log(self.__logger.debug, msg, args, kwargs)
|
||||
|
||||
def log_info(self, msg, *args, **kwargs):
|
||||
self.___log(self.__logger.info, msg, args, kwargs)
|
||||
|
||||
def log_warning(self, msg, *args, **kwargs):
|
||||
self.___log(self.__logger.warning, msg, args, kwargs)
|
||||
|
||||
def log_warn(self, msg, *args, **kwargs):
|
||||
self.___log(self.__logger.warning, msg, args, kwargs)
|
||||
|
||||
def log_error(self, msg, *args, **kwargs):
|
||||
self.___log(self.__logger.error, msg, args, kwargs)
|
||||
|
||||
def log_critical(self, msg, *args, **kwargs):
|
||||
self.___log(self.__logger.critical, msg, args, kwargs)
|
||||
|
||||
def log_exception(self, msg, *args):
|
||||
self.___log(self.__logger.exception, msg, args, {})
|
||||
def format(self, record):
|
||||
ln = record.__dict__['levelname']
|
||||
col = ''
|
||||
if ln == 'CRITICAL':
|
||||
col = terminal_controller.YELLOW
|
||||
elif ln == 'ERROR':
|
||||
col = terminal_controller.RED
|
||||
elif ln in ['WARN', 'WARNING']:
|
||||
col = terminal_controller.BLUE
|
||||
elif ln == 'INFO':
|
||||
col = terminal_controller.GREEN
|
||||
elif ln == 'DEBUG':
|
||||
col = terminal_controller.CYAN
|
||||
record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL
|
||||
return Formatter.format(self, record)
|
||||
|
||||
def walk(dir):
|
||||
''' A nice interface to os.walk '''
|
||||
|
@ -242,8 +242,13 @@ class MOBIMetadataWriter(MetadataWriterPlugin):
|
||||
set_metadata(stream, mi)
|
||||
|
||||
|
||||
plugins = [HTML2ZIP]
|
||||
from calibre.ebooks.epub.input import EPUBInput
|
||||
from calibre.ebooks.mobi.input import MOBIInput
|
||||
from calibre.customize.profiles import input_profiles
|
||||
|
||||
plugins = [HTML2ZIP, EPUBInput, MOBIInput]
|
||||
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
||||
x.__name__.endswith('MetadataReader')]
|
||||
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
||||
x.__name__.endswith('MetadataWriter')]
|
||||
plugins += input_profiles
|
@ -1,28 +1,30 @@
|
||||
from __future__ import with_statement
|
||||
'''
|
||||
Defines the plugin system for conversions.
|
||||
'''
|
||||
import re
|
||||
import re, os, shutil
|
||||
|
||||
from lxml import html
|
||||
|
||||
from calibre import CurrentDir
|
||||
from calibre.customize import Plugin
|
||||
|
||||
|
||||
class ConversionOption(object):
|
||||
|
||||
'''
|
||||
Class representing conversion options
|
||||
'''
|
||||
|
||||
def __init__(self, name=None, default=None, help=None, long_switch=None,
|
||||
short_switch=None, choices=None, gui_label=None,
|
||||
category=None):
|
||||
def __init__(self, name=None, help=None, long_switch=None,
|
||||
short_switch=None, choices=None):
|
||||
self.name = name
|
||||
self.default = default
|
||||
self.help = help
|
||||
self.long_switch = long_switch
|
||||
self.short_switch = short_switch
|
||||
self.choices = choices
|
||||
self.gui_label = gui_label
|
||||
self.category = category
|
||||
|
||||
if self.long_switch is None:
|
||||
self.long_switch = '--'+self.name.replace('_', '-')
|
||||
|
||||
self.validate_parameters()
|
||||
|
||||
@ -32,41 +34,156 @@ class ConversionOption(object):
|
||||
'''
|
||||
if re.match(r'[a-zA-Z_]([a-zA-Z0-9_])*', self.name) is None:
|
||||
raise ValueError(self.name + ' is not a valid Python identifier')
|
||||
if not (isinstance(self.default, (int, float, str, unicode)) or \
|
||||
self.default is None):
|
||||
raise ValueError(unicode(self.default) +
|
||||
' is not a string or a number')
|
||||
if not self.help:
|
||||
raise ValueError('You must set the help text')
|
||||
|
||||
class ConversionPlugin(Plugin):
|
||||
|
||||
'''
|
||||
The base class for all conversion related plugins.
|
||||
'''
|
||||
#: List of options
|
||||
#: Each option must be a dictionary. The dictionary can contain several
|
||||
#: keys defining the option. The ones marked by a * are required, the rest
|
||||
#: are optional. The keys are::
|
||||
#:
|
||||
#: *'name' : A valid python identifier.
|
||||
#: *'default' : The default value for this option.
|
||||
#: *'help' :
|
||||
#: 'short_switch' : A suggestion for a short form of the command line
|
||||
#: switch (for example if name is 'title', this
|
||||
#: could be 't'). It is only used if no prior
|
||||
#: conversion plugin has claimed it.
|
||||
options = []
|
||||
class OptionRecommendation(object):
|
||||
LOW = 1
|
||||
MED = 2
|
||||
HIGH = 3
|
||||
|
||||
type = _('Conversion')
|
||||
def __init__(self, recommended_value=None, level=LOW, **kwargs):
    '''
    An option recommendation. That is, an option as well as its recommended
    value and the level of the recommendation.

    :param recommended_value: The value recommended for the option.
    :param level: How strongly the value is recommended.
    :param kwargs: Either ``option``, an already constructed option object,
                   or the keyword arguments used to construct a
                   :class:`ConversionOption`.
    '''
    # This parameter used to be misspelled ``recommeded_value``, which made
    # every call passing ``recommended_value=...`` by keyword fail with a
    # TypeError. The old spelling is still honoured for existing callers.
    if 'recommeded_value' in kwargs:
        recommended_value = kwargs.pop('recommeded_value')

    self.level = level
    self.recommended_value = recommended_value
    self.option = kwargs.pop('option', None)
    if self.option is None:
        self.option = ConversionOption(**kwargs)

    self.validate_parameters()
|
||||
|
||||
def validate_parameters(self):
    '''
    Check that the recommended value is acceptable for the option.

    :raises ValueError: If the option restricts its values to a set of
                        choices and the recommended value is not one of
                        them, or if the recommended value is neither
                        ``None`` nor a string nor a number.
    '''
    value = self.recommended_value
    if self.option.choices and value not in self.option.choices:
        raise ValueError('Recommended value not in choices')
    # ``None`` is allowed as a recommendation. The check used to consult
    # ``self.default``, an attribute this class never sets, so it raised
    # AttributeError instead of validating.
    if value is None:
        return
    if not isinstance(value, (int, float, str, unicode)):
        raise ValueError(unicode(value) +
                ' is not a string or a number')
|
||||
|
||||
|
||||
class InputFormatPlugin(Plugin):
|
||||
'''
|
||||
InputFormatPlugins are responsible for converting a document into
|
||||
HTML+OPF+CSS+etc.
|
||||
The results of the conversion *must* be encoded in UTF-8.
|
||||
The main action happens in :method:`convert`.
|
||||
'''
|
||||
|
||||
type = _('Conversion Input')
|
||||
can_be_disabled = False
|
||||
supported_platforms = ['windows', 'osx', 'linux']
|
||||
|
||||
|
||||
class InputFormatPlugin(ConversionPlugin):
|
||||
|
||||
#: Set of file types for which this plugin should be run
|
||||
#: For example: ``set(['lit', 'mobi', 'prc'])``
|
||||
#: For example: ``set(['azw', 'mobi', 'prc'])``
|
||||
file_types = set([])
|
||||
|
||||
#: Options shared by all Input format plugins. Do not override
|
||||
#: in sub-classes. Use :member:`options` instead. Every option must be an
|
||||
#: instance of :class:`OptionRecommendation`.
|
||||
common_options = set([
|
||||
OptionRecommendation(name='debug_input',
|
||||
recommended_value=None, level=OptionRecommendation.LOW,
|
||||
help=_('Save the output from the input plugin to the specified '
|
||||
'directory. Useful if you are unsure at which stage '
|
||||
'of the conversion process a bug is occurring. '
|
||||
'WARNING: This completely deletes the contents of '
|
||||
'the specified directory.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='input_encoding',
|
||||
recommended_value=None, level=OptionRecommendation.LOW,
|
||||
help=_('Specify the character encoding of the input document. If '
|
||||
'set this option will override any encoding declared by the '
|
||||
'document itself. Particularly useful for documents that '
|
||||
'do not declare an encoding or that have erroneous '
|
||||
'encoding declarations.')
|
||||
),
|
||||
|
||||
])
|
||||
|
||||
#: Options to customize the behavior of this plugin. Every option must be an
|
||||
#: instance of :class:`OptionRecommendation`.
|
||||
options = set([])
|
||||
|
||||
def convert(self, stream, options, file_ext, parse_cache, log):
|
||||
'''
|
||||
This method must be implemented in sub-classes. It must return
|
||||
the path to the created OPF file. All output should be contained in
|
||||
the current directory. If this plugin creates files outside the current
|
||||
directory they must be deleted/marked for deletion before this method
|
||||
returns.
|
||||
|
||||
:param stream: A file like object that contains the input file.
|
||||
|
||||
:param options: Options to customize the conversion process.
|
||||
Guaranteed to have attributes corresponding
|
||||
to all the options declared by this plugin. In
|
||||
addition, it will have a verbose attribute that
|
||||
takes integral values from zero upwards. Higher numbers
|
||||
mean be more verbose. Another useful attribute is
|
||||
``input_profile`` that is an instance of
|
||||
:class:`calibre.customize.profiles.InputProfile`.
|
||||
|
||||
:param file_ext: The extension (without the .) of the input file. It
|
||||
is guaranteed to be one of the `file_types` supported
|
||||
by this plugin.
|
||||
|
||||
:param parse_cache: A dictionary that maps absolute file paths to
|
||||
parsed representations of their contents. For
|
||||
HTML the representation is an lxml element of
|
||||
the root of the tree. For CSS it is a cssutils
|
||||
stylesheet. If this plugin parses any of the
|
||||
output files, it should add them to the cache
|
||||
so that later stages of the conversion wont
|
||||
have to re-parse them. If a parsed representation
|
||||
is in the cache, there is no need to actually
|
||||
write the file to disk.
|
||||
|
||||
:param log: A :class:`calibre.utils.logging.Log` object. All output
|
||||
should use this object.
|
||||
'''
|
||||
raise NotImplementedError
|
||||
|
||||
def __call__(self, stream, options, file_ext, parse_cache, log, output_dir):
    '''
    Run :meth:`convert` with ``output_dir`` as the current directory and
    return its result.

    Everything already inside ``output_dir`` is deleted first. Once
    :meth:`convert` has returned, keys of ``parse_cache`` that are not
    absolute paths are replaced by their absolute form (with a warning).
    If ``options.debug_input`` is set, every entry of ``parse_cache`` is
    serialized to the path given by its key and the whole of
    ``output_dir`` is copied to ``options.debug_input``, replacing
    whatever was there.

    :param stream: Passed through to :meth:`convert`.
    :param options: Passed through to :meth:`convert`; ``debug_input`` is
                    read here and rewritten as an absolute path.
    :param file_ext: Passed through to :meth:`convert`.
    :param parse_cache: Passed through to :meth:`convert`, then
                        normalized as described above.
    :param log: Passed through to :meth:`convert` and used for progress
                and warning messages.
    :param output_dir: Directory in which the conversion is run.
    '''
    log('InputFormatPlugin: %s running'%self.name, end=' ')
    if hasattr(stream, 'name'):
        log('on', stream.name)

    with CurrentDir(output_dir):
        # Start from an empty output directory
        for x in os.listdir('.'):
            if os.path.isdir(x):
                shutil.rmtree(x)
            else:
                os.remove(x)

        ret = self.convert(stream, options, file_ext, parse_cache, log)

        # Iterate over a copy of the keys as the dictionary is modified
        for key in list(parse_cache.keys()):
            if os.path.abspath(key) != key:
                log.warn(('InputFormatPlugin: %s returned a '
                    'relative path: %s')%(self.name, key)
                )
                parse_cache[os.path.abspath(key)] = parse_cache.pop(key)

        if options.debug_input is not None:
            options.debug_input = os.path.abspath(options.debug_input)
            if not os.path.exists(options.debug_input):
                os.makedirs(options.debug_input)
            # copytree() must create the destination directory itself
            shutil.rmtree(options.debug_input)
            for f, obj in parse_cache.items():
                if hasattr(obj, 'cssText'):
                    raw = obj.cssText
                else:
                    raw = html.tostring(obj, encoding='utf-8', method='xml',
                            include_meta_content_type=True, pretty_print=True)
                if isinstance(raw, unicode):
                    raw = raw.encode('utf-8')
                # Close the file promptly instead of leaking the handle
                with open(f, 'wb') as dest:
                    dest.write(raw)
            shutil.copytree('.', options.debug_input)

    return ret
|
||||
|
||||
|
27
src/calibre/customize/profiles.py
Normal file
27
src/calibre/customize/profiles.py
Normal file
@ -0,0 +1,27 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize import Plugin
|
||||
|
||||
class InputProfile(Plugin):
    '''
    The default input profile.
    '''

    # TODO: Add some real information to this profile. All other profiles must
    # inherit from this profile and override as needed

    name = 'Default Input Profile'
    # Used in the CLI, so it must not contain spaces etc.
    short_name = 'default'

    author = 'Kovid Goyal'
    type = _('Input profile')
    description = _('This profile tries to provide sane defaults and is useful '
                    'if you know nothing about the input document.')

    supported_platforms = set(('windows', 'osx', 'linux'))
    can_be_disabled = False


# The input profile classes defined in this module
input_profiles = [InputProfile]
|
||||
|
||||
|
||||
|
||||
|
@ -6,13 +6,14 @@ import os, shutil, traceback, functools, sys
|
||||
|
||||
from calibre.customize import Plugin, FileTypePlugin, MetadataReaderPlugin, \
|
||||
MetadataWriterPlugin
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.customize.profiles import InputProfile
|
||||
from calibre.customize.builtins import plugins as builtin_plugins
|
||||
from calibre.constants import __version__, iswindows, isosx
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.utils.config import make_config_dir, Config, ConfigProxy, \
|
||||
plugin_dir, OptionParser
|
||||
|
||||
|
||||
version = tuple([int(x) for x in __version__.split('.')])
|
||||
|
||||
platform = 'linux'
|
||||
@ -70,7 +71,10 @@ _on_import = {}
|
||||
_on_preprocess = {}
|
||||
_on_postprocess = {}
|
||||
|
||||
|
||||
def input_profiles():
    '''
    Yield every initialized plugin that is an :class:`InputProfile`.
    '''
    for candidate in _initialized_plugins:
        if not isinstance(candidate, InputProfile):
            continue
        yield candidate
|
||||
|
||||
def reread_filetype_plugins():
|
||||
global _on_import
|
||||
@ -234,6 +238,17 @@ def find_plugin(name):
|
||||
if plugin.name == name:
|
||||
return plugin
|
||||
|
||||
def input_format_plugins():
    '''
    Yield every initialized plugin that is an :class:`InputFormatPlugin`.
    '''
    for candidate in _initialized_plugins:
        if not isinstance(candidate, InputFormatPlugin):
            continue
        yield candidate
|
||||
|
||||
def plugin_for_input_format(fmt):
    '''
    Return the first input format plugin whose ``file_types`` contains
    ``fmt``, or ``None`` if no plugin handles it.
    '''
    for candidate in input_format_plugins():
        if fmt in candidate.file_types:
            return candidate
    return None
|
||||
|
||||
|
||||
def disable_plugin(plugin_or_name):
|
||||
x = getattr(plugin_or_name, 'name', plugin_or_name)
|
||||
plugin = find_plugin(x)
|
||||
|
4
src/calibre/ebooks/conversion/__init__.py
Normal file
4
src/calibre/ebooks/conversion/__init__.py
Normal file
@ -0,0 +1,4 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
30
src/calibre/ebooks/conversion/plumber.py
Normal file
30
src/calibre/ebooks/conversion/plumber.py
Normal file
@ -0,0 +1,30 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
from calibre.customize.conversion import OptionRecommendation
|
||||
from calibre.customize.ui import input_profiles
|
||||
|
||||
# Option recommendations defined by the conversion pipeline itself
pipeline_options = [

    # Verbosity; the short switch is -v
    OptionRecommendation(
        name='verbose', short_switch='v',
        recommended_value=0, level=OptionRecommendation.LOW,
        help=_('Level of verbosity. Specify multiple times for greater '
               'verbosity.')),

    # The valid choices are the short names of the available input profiles
    OptionRecommendation(
        name='input_profile',
        recommended_value='default', level=OptionRecommendation.LOW,
        choices=[x.short_name for x in input_profiles()],
        help=_('Specify the input profile. The input profile gives the '
               'conversion system information on how to interpret '
               'various information in the input document. For '
               'example resolution dependent lengths (i.e. lengths in '
               'pixels).')),

]
|
@ -40,38 +40,6 @@ def rules(stylesheets):
|
||||
if r.type == r.STYLE_RULE:
|
||||
yield r
|
||||
|
||||
def decrypt_font(key, path):
|
||||
raw = open(path, 'rb').read()
|
||||
crypt = raw[:1024]
|
||||
key = cycle(iter(key))
|
||||
decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
|
||||
with open(path, 'wb') as f:
|
||||
f.write(decrypt)
|
||||
f.write(raw[1024:])
|
||||
|
||||
def process_encryption(encfile, opf):
|
||||
key = None
|
||||
m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', open(opf, 'rb').read())
|
||||
if m:
|
||||
key = m.group(1)
|
||||
key = list(map(ord, uuid.UUID(key).bytes))
|
||||
try:
|
||||
root = etree.parse(encfile)
|
||||
for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
|
||||
algorithm = em.get('Algorithm', '')
|
||||
if algorithm != 'http://ns.adobe.com/pdf/enc#RC':
|
||||
return False
|
||||
cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
|
||||
uri = cr.get('URI')
|
||||
path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
|
||||
if os.path.exists(path):
|
||||
decrypt_font(key, path)
|
||||
return True
|
||||
except:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
return False
|
||||
|
||||
def initialize_container(path_to_container, opf_name='metadata.opf'):
|
||||
'''
|
||||
Create an empty EPUB document, with a default skeleton.
|
||||
|
76
src/calibre/ebooks/epub/input.py
Normal file
76
src/calibre/ebooks/epub/input.py
Normal file
@ -0,0 +1,76 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re, uuid
|
||||
from itertools import cycle
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
class EPUBInput(InputFormatPlugin):
    '''
    Input plugin for EPUB files. Unpacks the EPUB into the current
    directory, de-obfuscates any encrypted resources it can handle and
    returns the OPF file found in the unpacked tree.
    '''

    name        = 'EPUB Input'
    author      = 'Kovid Goyal'
    description = 'Convert EPUB files (.epub) to HTML'
    file_types  = set(['epub'])

    @classmethod
    def decrypt_font(cls, key, path):
        '''
        De-obfuscate the file at ``path`` in place.

        The first 1024 bytes are XORed with ``key``, repeating the key as
        often as needed. The rest of the file is written back unchanged.

        :param key: Iterable of integers to XOR with.
        :param path: Path of the file to rewrite.
        '''
        with open(path, 'rb') as f:
            raw = f.read()
        crypt = raw[:1024]
        key = cycle(iter(key))
        decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
        with open(path, 'wb') as f:
            f.write(decrypt)
            f.write(raw[1024:])

    @classmethod
    def process_encryption(cls, encfile, opf, log):
        '''
        Decrypt the resources listed in an ``encryption.xml`` file.

        The key is the 16 bytes of the first ``urn:uuid:`` identifier found
        in the OPF file.

        :param encfile: Path to ``encryption.xml``.
        :param opf: Path to the OPF file of the book.
        :param log: Log object used to report problems.
        :return: ``True`` if every listed resource uses the supported
                 algorithm and was processed without error, ``False``
                 otherwise.
        '''
        key = None
        with open(opf, 'rb') as f:
            m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', f.read())
        if m:
            key = m.group(1)
            key = list(map(ord, uuid.UUID(key).bytes))
        try:
            root = etree.parse(encfile)
            for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
                algorithm = em.get('Algorithm', '')
                if algorithm != 'http://ns.adobe.com/pdf/enc#RC':
                    return False
                cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
                uri = cr.get('URI')
                path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
                if os.path.exists(path):
                    if key is None:
                        # Without the identifier there is nothing to
                        # decrypt with
                        log.warn('No urn:uuid identifier found in the OPF, '
                                 'cannot decrypt')
                        return False
                    cls.decrypt_font(key, path)
            return True
        except Exception:
            # Best effort: a failure here is reported to the caller as
            # "could not decrypt" rather than aborting with a traceback
            log.exception('Failed to process the encryption information')
        return False

    # The method was originally defined under this misspelled name while
    # convert() called the correctly spelled one. Keep the old name working.
    process_ecryption = process_encryption

    def convert(self, stream, options, file_ext, parse_cache, log):
        '''
        Unpack the EPUB in ``stream`` into the current directory and return
        the path of the first ``.opf`` file found in it.

        :raises ValueError: If the archive contains no OPF file.
        :raises DRMError: If the book has a ``META-INF/encryption.xml``
                          that cannot be processed.
        '''
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
        zf = ZipFile(stream)
        # NOTE(review): member names come from the input file; confirm that
        # extractall() cannot write outside the current directory
        zf.extractall(os.getcwd())
        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
        opf = None
        for f in walk('.'):
            if f.lower().endswith('.opf'):
                opf = f
                break
        path = getattr(stream, 'name', 'stream')

        if opf is None:
            raise ValueError('%s is not a valid EPUB file'%path)

        if os.path.exists(encfile):
            if not self.process_encryption(encfile, opf, log):
                raise DRMError(os.path.basename(path))

        return opf
|
||||
|
29
src/calibre/ebooks/mobi/input.py
Normal file
29
src/calibre/ebooks/mobi/input.py
Normal file
@ -0,0 +1,29 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
class MOBIInput(InputFormatPlugin):
    '''
    Input plugin for MOBI files. Uses :class:`MobiReader` to unpack the
    book into the current directory.
    '''

    name        = 'MOBI Input'
    author      = 'Kovid Goyal'
    description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
    file_types  = set(['mobi', 'prc', 'azw'])

    def convert(self, stream, options, file_ext, parse_cache, log):
        '''
        Extract the MOBI book in ``stream`` into the current directory and
        return the path of the OPF file created by the reader.

        If the reader stored the raw MOBI markup in ``parse_cache``, it is
        written to ``debug-raw.html`` in the current directory.
        '''
        from calibre.ebooks.mobi.reader import MobiReader
        mr = MobiReader(stream, log, options.input_encoding,
                        options.debug_input)
        # Both arguments are passed positionally: a positional argument
        # after a keyword argument is a syntax error
        mr.extract_content(os.getcwdu(), parse_cache)
        # Remove the entry from the cache: it is raw markup stored under a
        # key that is not a file path, whereas the cache maps file paths to
        # parsed documents
        raw = parse_cache.pop('calibre_raw_mobi_markup', False)
        if raw:
            if isinstance(raw, unicode):
                raw = raw.encode('utf-8')
            with open('debug-raw.html', 'wb') as f:
                f.write(raw)

        return mr.created_opf_path
|
||||
|
@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
Read data from .mobi files
|
||||
'''
|
||||
|
||||
import sys, struct, os, cStringIO, re, functools
|
||||
import struct, os, cStringIO, re, functools
|
||||
|
||||
try:
|
||||
from PIL import Image as PILImage
|
||||
@ -35,8 +35,10 @@ class EXTHHeader(object):
|
||||
pos = 0
|
||||
self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
|
||||
self.has_fake_cover = True
|
||||
left = self.num_items
|
||||
|
||||
for i in range(self.num_items):
|
||||
while left > 0:
|
||||
left -= 1
|
||||
id, size = struct.unpack('>LL', raw[pos:pos+8])
|
||||
content = raw[pos+8:pos+size]
|
||||
pos += size
|
||||
@ -76,7 +78,8 @@ class EXTHHeader(object):
|
||||
|
||||
class BookHeader(object):
|
||||
|
||||
def __init__(self, raw, ident):
|
||||
def __init__(self, raw, ident, user_encoding, log):
|
||||
self.log = log
|
||||
self.compression_type = raw[:2]
|
||||
self.records, self.records_size = struct.unpack('>HH', raw[8:12])
|
||||
self.encryption_type, = struct.unpack('>H', raw[12:14])
|
||||
@ -92,8 +95,8 @@ class BookHeader(object):
|
||||
else:
|
||||
self.ancient = False
|
||||
self.doctype = raw[16:20]
|
||||
self.length, self.type, self.codepage, self.unique_id, self.version = \
|
||||
struct.unpack('>LLLLL', raw[20:40])
|
||||
self.length, self.type, self.codepage, self.unique_id, \
|
||||
self.version = struct.unpack('>LLLLL', raw[20:40])
|
||||
|
||||
|
||||
try:
|
||||
@ -102,8 +105,9 @@ class BookHeader(object):
|
||||
65001 : 'utf-8',
|
||||
}[self.codepage]
|
||||
except (IndexError, KeyError):
|
||||
print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage
|
||||
self.codec = 'cp1252'
|
||||
self.codec = 'cp1252' if user_encoding is None else user_encoding
|
||||
log.warn('Unknown codepage %d. Assuming %s'%(self.codepage,
|
||||
self.codec))
|
||||
|
||||
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length:
|
||||
self.extra_flags = 0
|
||||
@ -138,9 +142,24 @@ class MobiReader(object):
|
||||
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
|
||||
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
|
||||
|
||||
def __init__(self, filename_or_stream, verbose=False):
|
||||
self.verbose = verbose
|
||||
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None):
|
||||
self.log = log
|
||||
self.debug = debug
|
||||
self.embedded_mi = None
|
||||
self.base_css_rules = '''
|
||||
blockquote { margin: 0em 0em 0em 1.25em; text-align: justify }
|
||||
|
||||
p { margin: 0em; text-align: justify }
|
||||
|
||||
.bold { font-weight: bold }
|
||||
|
||||
.italic { font-style: italic }
|
||||
|
||||
.mbp_pagebreak {
|
||||
page-break-after: always; margin: 0; display: block
|
||||
}
|
||||
'''
|
||||
self.tag_css_rules = []
|
||||
|
||||
if hasattr(filename_or_stream, 'read'):
|
||||
stream = filename_or_stream
|
||||
@ -177,17 +196,21 @@ class MobiReader(object):
|
||||
self.sections.append((section(i), self.section_headers[i]))
|
||||
|
||||
|
||||
self.book_header = BookHeader(self.sections[0][0], self.ident)
|
||||
self.book_header = BookHeader(self.sections[0][0], self.ident,
|
||||
user_encoding, self.log)
|
||||
self.name = self.name.decode(self.book_header.codec, 'replace')
|
||||
|
||||
def extract_content(self, output_dir=os.getcwdu()):
|
||||
def extract_content(self, output_dir, parse_cache):
|
||||
output_dir = os.path.abspath(output_dir)
|
||||
if self.book_header.encryption_type != 0:
|
||||
raise DRMError(self.name)
|
||||
|
||||
processed_records = self.extract_text()
|
||||
if self.debug is not None:
|
||||
self.parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
|
||||
self.add_anchors()
|
||||
self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
|
||||
self.processed_html = self.processed_html.decode(self.book_header.codec,
|
||||
'ignore')
|
||||
for pat in ENCODING_PATS:
|
||||
self.processed_html = pat.sub('', self.processed_html)
|
||||
e2u = functools.partial(entity_to_unicode,
|
||||
@ -203,16 +226,10 @@ class MobiReader(object):
|
||||
self.processed_html = \
|
||||
re.compile('<head>', re.IGNORECASE).sub(
|
||||
'\n<head>\n'
|
||||
'<style type="text/css">\n'
|
||||
'blockquote { margin: 0em 0em 0em 1.25em; text-align: justify; }\n'
|
||||
'p { margin: 0em; text-align: justify; }\n'
|
||||
'.bold { font-weight: bold; }\n'
|
||||
'.italic { font-style: italic; }\n'
|
||||
'</style>\n',
|
||||
'\t<link type="text/css" href="styles.css" />\n',
|
||||
self.processed_html)
|
||||
|
||||
if self.verbose:
|
||||
print 'Parsing HTML...'
|
||||
self.log.debug('Parsing HTML...')
|
||||
root = html.fromstring(self.processed_html)
|
||||
self.upshift_markup(root)
|
||||
guides = root.xpath('//guide')
|
||||
@ -230,26 +247,25 @@ class MobiReader(object):
|
||||
ref.attrib['href'] = os.path.basename(htmlfile)+ref.attrib['href']
|
||||
except AttributeError:
|
||||
pass
|
||||
if self.verbose:
|
||||
print 'Serializing...'
|
||||
with open(htmlfile, 'wb') as f:
|
||||
raw = html.tostring(root, encoding='utf-8', method='xml',
|
||||
include_meta_content_type=True, pretty_print=True)
|
||||
raw = raw.replace('<head>',
|
||||
'<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n')
|
||||
f.write(raw)
|
||||
parse_cache[htmlfile] = root
|
||||
self.htmlfile = htmlfile
|
||||
|
||||
if self.book_header.exth is not None or self.embedded_mi is not None:
|
||||
if self.verbose:
|
||||
print 'Creating OPF...'
|
||||
self.log.debug('Creating OPF...')
|
||||
ncx = cStringIO.StringIO()
|
||||
opf = self.create_opf(htmlfile, guide, root)
|
||||
opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx)
|
||||
self.created_opf_path = os.path.splitext(htmlfile)[0]+'.opf'
|
||||
opf.render(open(self.created_opf_path, 'wb'), ncx)
|
||||
ncx = ncx.getvalue()
|
||||
if ncx:
|
||||
open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
|
||||
|
||||
with open('styles.css', 'wb') as s:
|
||||
s.write(self.base_css_rules+'\n\n')
|
||||
for rule in self.tag_css_rules:
|
||||
if isinstance(rule, unicode):
|
||||
rule = rule.encode('utf-8')
|
||||
s.write(rule+'\n\n')
|
||||
|
||||
def read_embedded_metadata(self, root, elem, guide):
|
||||
raw = '<package>'+html.tostring(elem, encoding='utf-8')+'</package>'
|
||||
stream = cStringIO.StringIO(raw)
|
||||
@ -277,8 +293,7 @@ class MobiReader(object):
|
||||
|
||||
|
||||
def cleanup_html(self):
|
||||
if self.verbose:
|
||||
print 'Cleaning up HTML...'
|
||||
self.log.debug('Cleaning up HTML...')
|
||||
self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
|
||||
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
|
||||
self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>'
|
||||
@ -286,8 +301,7 @@ class MobiReader(object):
|
||||
self.processed_html = self.processed_html.replace('> <', '>\n<')
|
||||
|
||||
def upshift_markup(self, root):
|
||||
if self.verbose:
|
||||
print 'Converting style information to CSS...'
|
||||
self.log.debug('Converting style information to CSS...')
|
||||
size_map = {
|
||||
'xx-small' : '0.5',
|
||||
'x-small' : '1',
|
||||
@ -298,7 +312,7 @@ class MobiReader(object):
|
||||
'xx-large' : '6',
|
||||
}
|
||||
mobi_version = self.book_header.mobi_version
|
||||
for tag in root.iter(etree.Element):
|
||||
for i, tag in enumerate(root.iter(etree.Element)):
|
||||
if tag.tag in ('country-region', 'place', 'placetype', 'placename',
|
||||
'state', 'city'):
|
||||
tag.tag = 'span'
|
||||
@ -352,8 +366,7 @@ class MobiReader(object):
|
||||
elif tag.tag == 'pre':
|
||||
if not tag.text:
|
||||
tag.tag = 'div'
|
||||
if styles:
|
||||
attrib['style'] = '; '.join(styles)
|
||||
|
||||
if 'filepos-id' in attrib:
|
||||
attrib['id'] = attrib.pop('filepos-id')
|
||||
if 'filepos' in attrib:
|
||||
@ -363,14 +376,23 @@ class MobiReader(object):
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if styles:
|
||||
attrib['id'] = attrib.get('id', 'calibre_mr_gid%d'%i)
|
||||
self.tag_css_rules.append('#%s {%s}'%(attrib['id'],
|
||||
'; '.join(styles)))
|
||||
|
||||
|
||||
def create_opf(self, htmlfile, guide=None, root=None):
|
||||
mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
|
||||
if mi is None:
|
||||
mi = MetaInformation(self.title, [_('Unknown')])
|
||||
opf = OPFCreator(os.path.dirname(htmlfile), mi)
|
||||
if hasattr(self.book_header.exth, 'cover_offset'):
|
||||
opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
|
||||
elif mi.cover is not None:
|
||||
opf.cover = mi.cover
|
||||
manifest = [(htmlfile, 'text/x-oeb1-document')]
|
||||
manifest = [(htmlfile, 'text/x-oeb1-document'),
|
||||
(os.path.abspath('styles.css'), 'text/css')]
|
||||
bp = os.path.dirname(htmlfile)
|
||||
for i in getattr(self, 'image_names', []):
|
||||
manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg'))
|
||||
@ -441,8 +463,7 @@ class MobiReader(object):
|
||||
return data[:len(data)-trail_size]
|
||||
|
||||
def extract_text(self):
|
||||
if self.verbose:
|
||||
print 'Extracting text...'
|
||||
self.log.debug('Extracting text...')
|
||||
text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)]
|
||||
processed_records = list(range(0, self.book_header.records+1))
|
||||
|
||||
@ -472,12 +493,11 @@ class MobiReader(object):
|
||||
|
||||
def replace_page_breaks(self):
|
||||
self.processed_html = self.PAGE_BREAK_PAT.sub(
|
||||
'<div class="mbp_pagebreak" style="page-break-after: always; margin: 0; display: block" />',
|
||||
'<div class="mbp_pagebreak" />',
|
||||
self.processed_html)
|
||||
|
||||
def add_anchors(self):
|
||||
if self.verbose:
|
||||
print 'Adding anchors...'
|
||||
self.log.debug('Adding anchors...')
|
||||
positions = set([])
|
||||
link_pattern = re.compile(r'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
|
||||
re.IGNORECASE)
|
||||
@ -507,8 +527,7 @@ class MobiReader(object):
|
||||
|
||||
|
||||
def extract_images(self, processed_records, output_dir):
|
||||
if self.verbose:
|
||||
print 'Extracting images...'
|
||||
self.log.debug('Extracting images...')
|
||||
output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
@ -535,14 +554,17 @@ class MobiReader(object):
|
||||
im.convert('RGB').save(open(path, 'wb'), format='JPEG')
|
||||
|
||||
def get_metadata(stream):
|
||||
mr = MobiReader(stream)
|
||||
from calibre.utils.logging import Log
|
||||
log = Log()
|
||||
mr = MobiReader(stream, log)
|
||||
if mr.book_header.exth is None:
|
||||
mi = MetaInformation(mr.name, [_('Unknown')])
|
||||
else:
|
||||
mi = mr.create_opf('dummy.html')
|
||||
try:
|
||||
if hasattr(mr.book_header.exth, 'cover_offset'):
|
||||
cover_index = mr.book_header.first_image_index + mr.book_header.exth.cover_offset
|
||||
cover_index = mr.book_header.first_image_index + \
|
||||
mr.book_header.exth.cover_offset
|
||||
data = mr.sections[int(cover_index)][0]
|
||||
else:
|
||||
data = mr.sections[mr.book_header.first_image_index][0]
|
||||
@ -552,42 +574,7 @@ def get_metadata(stream):
|
||||
im.convert('RGBA').save(obuf, format='JPEG')
|
||||
mi.cover_data = ('jpg', obuf.getvalue())
|
||||
except:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
log.exception()
|
||||
return mi
|
||||
|
||||
|
||||
def option_parser():
    '''
    Build the command line parser for the MOBI extraction tool.

    Two options are registered: ``-o``/``--output-dir`` (defaults to ``'.'``)
    and the boolean flag ``-v``/``--verbose`` (defaults to ``False``).

    :return: The configured calibre ``OptionParser`` instance.
    '''
    # Imported lazily, exactly as the original does, so that merely importing
    # this module does not pull in the configuration machinery.
    from calibre.utils.config import OptionParser

    cli = OptionParser(usage=_('%prog [options] myebook.mobi'))
    cli.add_option(
        '-o', '--output-dir',
        default='.',
        help=_('Output directory. Defaults to current directory.'))
    cli.add_option(
        '-v', '--verbose',
        action='store_true',
        default=False,
        help='Useful for debugging.')
    return cli
|
||||
|
||||
|
||||
def main(args=sys.argv):
|
||||
parser = option_parser()
|
||||
opts, args = parser.parse_args(args)
|
||||
if len(args) != 2:
|
||||
parser.print_help()
|
||||
return 1
|
||||
|
||||
mr = MobiReader(args[1], verbose=opts.verbose)
|
||||
opts.output_dir = os.path.abspath(opts.output_dir)
|
||||
mr.extract_content(opts.output_dir)
|
||||
if opts.verbose:
|
||||
oname = os.path.join(opts.output_dir, 'debug-raw.html')
|
||||
dat = mr.mobi_html
|
||||
if isinstance(dat, unicode):
|
||||
dat = dat.encode('utf-8')
|
||||
open(oname, 'wb').write(dat)
|
||||
print _('Raw MOBI HTML saved in'), oname
|
||||
|
||||
print _('OEB ebook created in'), opts.output_dir
|
||||
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
|
92
src/calibre/utils/logging.py
Normal file
92
src/calibre/utils/logging.py
Normal file
@ -0,0 +1,92 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'A simplified logging system'
|
||||
|
||||
DEBUG = 0
|
||||
INFO = 1
|
||||
WARN = 2
|
||||
ERROR = 3
|
||||
|
||||
import sys, traceback
|
||||
from functools import partial
|
||||
|
||||
from calibre import prints
|
||||
from calibre.utils.terminfo import TerminalController
|
||||
|
||||
class ANSIStream:
    '''
    Log output that wraps every message in terminal colour sequences.

    The sequences come from a ``TerminalController`` created for the target
    stream: DEBUG is green, WARN is yellow, ERROR is red and INFO gets no
    prefix at all. NOTE(review): presumably the controller yields empty
    strings when the stream is not a colour capable terminal -- confirm
    against ``TerminalController``.
    '''

    def __init__(self, stream=sys.stdout):
        '''
        :param stream: File-like object to write to.
        '''
        self.stream = stream
        controller = TerminalController(stream)
        # level -> text written immediately before the message
        palette = {}
        palette[DEBUG] = controller.GREEN
        palette[INFO] = ''
        palette[WARN] = controller.YELLOW
        palette[ERROR] = controller.RED
        self.color = palette
        # text written immediately after the message
        self.normal = controller.NORMAL

    def prints(self, level, *args, **kwargs):
        '''
        Write one message at ``level``.

        ``args`` and ``kwargs`` are forwarded to the module level ``prints``
        helper; any ``file`` keyword supplied by the caller is replaced by
        this object's stream.
        '''
        out = self.stream
        out.write(self.color[level])
        kwargs['file'] = out
        prints(*args, **kwargs)
        out.write(self.normal)

    def flush(self):
        '''Flush the underlying stream.'''
        self.stream.flush()
|
||||
|
||||
class HTMLStream:
    '''
    Log output that wraps every message in an HTML ``<span>`` element.

    DEBUG, WARN and ERROR messages get an inline ``color`` style (green,
    yellow and red respectively); INFO messages get a plain ``<span>``.
    The message text itself is written as is, without any HTML escaping.
    '''

    def __init__(self, stream=sys.stdout):
        '''
        :param stream: File-like object to write to.
        '''
        self.stream = stream
        # level -> opening tag written immediately before the message
        opening_tags = {}
        opening_tags[DEBUG] = '<span style="color:green">'
        opening_tags[INFO] = '<span>'
        opening_tags[WARN] = '<span style="color:yellow">'
        opening_tags[ERROR] = '<span style="color:red">'
        self.color = opening_tags
        # closing tag written immediately after the message
        self.normal = '</span>'

    def prints(self, level, *args, **kwargs):
        '''
        Write one message at ``level``.

        ``args`` and ``kwargs`` are forwarded to the module level ``prints``
        helper; any ``file`` keyword supplied by the caller is replaced by
        this object's stream.
        '''
        out = self.stream
        out.write(self.color[level])
        kwargs['file'] = out
        prints(*args, **kwargs)
        out.write(self.normal)

    def flush(self):
        '''Flush the underlying stream.'''
        self.stream.flush()
|
||||
|
||||
class Log(object):
    '''
    Minimal logger that fans messages out to a list of output objects.

    Every object in ``self.outputs`` must provide
    ``prints(level, *args, **kwargs)``. Messages whose level is below
    ``self.filter_level`` are discarded. Calling the instance directly logs
    at INFO level.
    '''

    # Level constants mirrored on the class so callers can write Log.DEBUG etc.
    DEBUG = DEBUG
    INFO = INFO
    WARN = WARN
    ERROR = ERROR

    def __init__(self, level=INFO):
        '''
        :param level: Lowest level that is actually emitted.
        '''
        self.filter_level = level
        # A single ANSIStream (with its own default stream) is the only
        # output until the caller changes ``self.outputs``.
        self.outputs = [ANSIStream()]

        # Per-level convenience callables: log.debug(...), log.info(...), ...
        for name, lvl in (('debug', DEBUG), ('info', INFO),
                          ('warn', WARN), ('error', ERROR)):
            setattr(self, name, partial(self.prints, lvl))
        # ``warning`` is an alias for the very same callable as ``warn``.
        self.warning = self.warn

    def prints(self, level, *args, **kwargs):
        '''
        Send a message to every output, unless ``level`` is below
        ``self.filter_level``.
        '''
        if not level < self.filter_level:
            for sink in self.outputs:
                sink.prints(level, *args, **kwargs)

    def exception(self, *args, **kwargs):
        '''
        Log ``args`` at ERROR level, then the formatted traceback of the
        exception currently being handled at DEBUG level.

        Because the traceback goes out at DEBUG level it is dropped whenever
        ``filter_level`` is above DEBUG.

        :param limit: Optional keyword, removed from ``kwargs`` and passed to
                      ``traceback.format_exc``.
        '''
        tb_limit = kwargs.pop('limit', None)
        self.prints(ERROR, *args, **kwargs)
        self.prints(DEBUG, traceback.format_exc(tb_limit))

    def __call__(self, *args, **kwargs):
        '''Log a message at INFO level.'''
        self.prints(INFO, *args, **kwargs)
|
@ -33,7 +33,7 @@ class TerminalController:
|
||||
|
||||
>>> term = TerminalController()
|
||||
>>> if term.CLEAR_SCREEN:
|
||||
... print 'This terminal supports clearning the screen.'
|
||||
... print 'This terminal supports clearing the screen.'
|
||||
|
||||
Finally, if the width and height of the terminal are known, then
|
||||
they will be stored in the `COLS` and `LINES` attributes.
|
||||
|
Loading…
x
Reference in New Issue
Block a user