Merge pluginize

This commit is contained in:
Kovid Goyal 2009-05-31 08:49:17 -07:00
commit c3db153421
352 changed files with 55719 additions and 18591 deletions

View File

@ -13,6 +13,7 @@ src/calibre/manual/cli/
build
dist
docs
nbproject/
src/calibre/gui2/pictureflow/Makefile.Debug
src/calibre/gui2/pictureflow/Makefile.Release
src/calibre/gui2/pictureflow/debug/

View File

@ -2,9 +2,9 @@
<?eclipse-pydev version="1.0"?>
<pydev_project>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.5</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.6</pydev_property>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/calibre/src</path>
<path>/calibre-pluginize/src</path>
</pydev_pathproperty>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
</pydev_project>

View File

@ -72,6 +72,9 @@ if __name__ == '__main__':
library_dirs=[os.environ.get('PODOFO_LIB_DIR', podofo_lib)],
include_dirs=\
[os.environ.get('PODOFO_INC_DIR', podofo_inc)]))
else:
print 'WARNING: PoDoFo not found on your system. Various PDF related',
print 'functionality will not work.'
ext_modules = optional + [
@ -88,6 +91,9 @@ if __name__ == '__main__':
'src/calibre/utils/msdes/des.c'],
include_dirs=['src/calibre/utils/msdes']),
Extension('calibre.plugins.cPalmdoc',
sources=['src/calibre/ebooks/compression/palmdoc.c']),
PyQtExtension('calibre.plugins.pictureflow',
['src/calibre/gui2/pictureflow/pictureflow.cpp',
'src/calibre/gui2/pictureflow/pictureflow.h'],

View File

@ -2,11 +2,11 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os, re, logging, time, subprocess, atexit, mimetypes, warnings
import sys, os, re, logging, time, mimetypes, \
__builtin__, warnings, multiprocessing
__builtin__.__dict__['dynamic_property'] = lambda(func): func(None)
from htmlentitydefs import name2codepoint
from math import floor
from logging import Formatter
warnings.simplefilter('ignore', DeprecationWarning)
@ -45,6 +45,13 @@ def to_unicode(raw, encoding='utf-8', errors='strict'):
return raw
return raw.decode(encoding, errors)
def patheq(p1, p2):
    '''
    Return True if `p1` and `p2` name the same filesystem location.

    Both paths are normalized, resolved with ``os.path.realpath`` and
    case-normalized before being compared. If either path is empty or
    None the result is always False.
    '''
    if not (p1 and p2):
        return False

    def canonical(path):
        # Same chain as before: normpath -> realpath -> normpath -> normcase
        resolved = os.path.realpath(os.path.normpath(path))
        return os.path.normcase(os.path.normpath(resolved))

    return canonical(p1) == canonical(p2)
def unicode_path(path, abs=False):
if not isinstance(path, unicode):
path = path.decode(sys.getfilesystemencoding())
@ -83,26 +90,33 @@ def sanitize_file_name(name, substitute='_', as_unicode=False):
return one.replace('..', '_')
def prints(*args, **kwargs):
'''
Print unicode arguments safely by encoding them to preferred_encoding
Has the same signature as the print function from Python 3.

Recognised keyword arguments:

:param file: Object whose ``write`` method receives the output
(defaults to ``sys.stdout``).
:param sep: String written between consecutive arguments (defaults
to a single space).
:param end: String written after the last argument (defaults to a
newline).
'''
file = kwargs.get('file', sys.stdout)
sep = kwargs.get('sep', ' ')
end = kwargs.get('end', '\n')
# NOTE(review): preferred_encoding is not defined in this function; it is
# presumably a module-level name -- confirm it is in scope here.
enc = preferred_encoding
# When the CALIBRE_WORKER environment variable is set, output is always
# encoded as UTF-8 instead of preferred_encoding.
if 'CALIBRE_WORKER' in os.environ:
enc = 'utf-8'
for i, arg in enumerate(args):
# unicode objects are encoded to the output encoding
if isinstance(arg, unicode):
arg = arg.encode(enc)
# Anything that is not a byte string is converted with str()
if not isinstance(arg, str):
arg = str(arg)
if not isinstance(arg, unicode):
arg = arg.decode(preferred_encoding, 'replace').encode(enc)
file.write(arg)
# The separator goes between arguments only, never after the last one
if i != len(args)-1:
file.write(sep)
file.write(end)
class CommandLineError(Exception):
    '''Exception subclass that adds no behaviour of its own to Exception.'''
class ColoredFormatter(Formatter):
    '''
    Formatter that wraps the record's level name in terminal color codes
    taken from `terminal_controller` before delegating to
    :class:`logging.Formatter`.
    '''

    def format(self, record):
        fields = record.__dict__
        # Level name -> name of the terminal_controller attribute holding
        # the color. The attribute is only looked up for the matching
        # level; unknown levels get no color prefix.
        color_attr = {
            'CRITICAL': 'YELLOW',
            'ERROR': 'RED',
            'WARN': 'BLUE',
            'WARNING': 'BLUE',
            'INFO': 'GREEN',
            'DEBUG': 'CYAN',
        }.get(fields['levelname'])
        prefix = getattr(terminal_controller, color_attr) if color_attr else ''
        # The record is modified in place, exactly as before
        fields['levelname'] = prefix + fields['levelname'] + \
                terminal_controller.NORMAL
        return Formatter.format(self, record)
def setup_cli_handlers(logger, level):
@ -244,7 +258,7 @@ class CurrentDir(object):
os.chdir(self.cwd)
class FileWrapper(object):
class StreamReadWrapper(object):
'''
Used primarily with pyPdf to ensure the stream is properly closed.
'''
@ -263,40 +277,7 @@ class FileWrapper(object):
def detect_ncpus():
    """
    Detects the number of effective CPUs in the system.

    Delegates to :func:`multiprocessing.cpu_count`, which covers all the
    platforms the old hand-written probing handled (sysconf, sysctl and
    the NUMBER_OF_PROCESSORS environment variable).

    :return: The number of CPUs as an integer, always at least 1.
    """
    try:
        ncpus = multiprocessing.cpu_count()
    except NotImplementedError:
        # cpu_count() is documented to raise this when the count cannot
        # be determined; keep the old "default to one CPU" behaviour.
        return 1
    return ncpus if ncpus > 0 else 1
def launch(path_or_url):
if os.path.exists(path_or_url):
@ -343,67 +324,6 @@ def english_sort(x, y):
'''
return cmp(_spat.sub('', x), _spat.sub('', y))
class LoggingInterface:
# Mixin-style wrapper around a logger object: it can attach a console
# handler to the logger and exposes log_* convenience methods that
# normalise string arguments before forwarding them to the logger.
def __init__(self, logger):
# The logger is stored twice: under a name-mangled private attribute
# used by this class, and under the public ``logger`` attribute.
self.__logger = self.logger = logger
def setup_cli_handler(self, verbosity):
# Attach a StreamHandler to the logger, configured from ``verbosity``.
# Does nothing if the logger already has a StreamHandler.
for handler in self.__logger.handlers:
if isinstance(handler, logging.StreamHandler):
return
# Also does nothing when CALIBRE_WORKER is set in the environment and
# the logger already has at least one handler of any kind.
if os.environ.get('CALIBRE_WORKER', None) is not None and self.__logger.handlers:
return
# Defaults: plain formatter, INFO level, written to stdout
stream = sys.stdout
formatter = logging.Formatter()
level = logging.INFO
# verbosity > 0 switches to DEBUG level with a colored formatter;
# verbosity > 1 additionally uses the bracketed format and stderr.
if verbosity > 0:
formatter = ColoredFormatter('[%(levelname)s] %(message)s') if verbosity > 1 else \
ColoredFormatter('%(levelname)s: %(message)s')
level = logging.DEBUG
if verbosity > 1:
stream = sys.stderr
handler = logging.StreamHandler(stream)
handler.setFormatter(formatter)
handler.setLevel(level)
self.__logger.addHandler(handler)
self.__logger.setLevel(level)
def ___log(self, func, msg, args, kwargs):
# Common implementation behind the log_* methods: ``func`` is the bound
# logger method to call with ``msg`` followed by ``args``.
args = [msg] + list(args)
for i in range(len(args)):
# Only string arguments are converted; everything else passes through
if not isinstance(args[i], basestring):
continue
# Byte strings are decoded, or unicode strings encoded, with
# preferred_encoding depending on the running Python version.
# NOTE(review): preferred_encoding is not defined in this class;
# presumably a module-level name -- confirm.
if sys.version_info[:2] > (2, 5):
if not isinstance(args[i], unicode):
args[i] = args[i].decode(preferred_encoding, 'replace')
elif isinstance(args[i], unicode):
args[i] = args[i].encode(preferred_encoding, 'replace')
func(*args, **kwargs)
def log_debug(self, msg, *args, **kwargs):
self.___log(self.__logger.debug, msg, args, kwargs)
def log_info(self, msg, *args, **kwargs):
self.___log(self.__logger.info, msg, args, kwargs)
def log_warning(self, msg, *args, **kwargs):
self.___log(self.__logger.warning, msg, args, kwargs)
# log_warn forwards to the same logger method as log_warning
def log_warn(self, msg, *args, **kwargs):
self.___log(self.__logger.warning, msg, args, kwargs)
def log_error(self, msg, *args, **kwargs):
self.___log(self.__logger.error, msg, args, kwargs)
def log_critical(self, msg, *args, **kwargs):
self.___log(self.__logger.critical, msg, args, kwargs)
# Unlike the other log_* methods, log_exception accepts no keyword arguments
def log_exception(self, msg, *args):
self.___log(self.__logger.exception, msg, args, {})
def walk(dir):
''' A nice interface to os.walk '''
for record in os.walk(dir):

View File

@ -53,7 +53,7 @@ if plugins is None:
plugin_path = getattr(pkg_resources, 'resource_filename')('calibre', 'plugins')
sys.path.insert(0, plugin_path)
for plugin in ['pictureflow', 'lzx', 'msdes', 'podofo'] + \
for plugin in ['pictureflow', 'lzx', 'msdes', 'podofo', 'cPalmdoc'] + \
(['winutil'] if iswindows else []) + \
(['usbobserver'] if isosx else []):
try:

View File

@ -221,3 +221,4 @@ class MetadataWriterPlugin(Plugin):
'''
pass

View File

@ -1,8 +1,9 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import textwrap, os
import textwrap
import os
import glob
from calibre.customize import FileTypePlugin, MetadataReaderPlugin, MetadataWriterPlugin
from calibre.constants import __version__
@ -20,30 +21,55 @@ every time you add an HTML file to the library.\
on_import = True
def run(self, htmlfile):
of = self.temporary_file('_plugin_html2zip.zip')
from calibre.ebooks.html import gui_main as html2oeb
html2oeb(htmlfile, of)
from calibre.ptempfile import TemporaryDirectory
from calibre.gui2.convert.gui_conversion import gui_convert
from calibre.customize.conversion import OptionRecommendation
from calibre.ebooks.epub import initialize_container
with TemporaryDirectory('_plugin_html2zip') as tdir:
gui_convert(htmlfile, tdir, [('debug_input', tdir,
OptionRecommendation.HIGH)])
of = self.temporary_file('_plugin_html2zip.zip')
opf = glob.glob(os.path.join(tdir, '*.opf'))[0]
ncx = glob.glob(os.path.join(tdir, '*.ncx'))
if ncx:
os.remove(ncx[0])
epub = initialize_container(of.name, os.path.basename(opf))
epub.add_dir(tdir)
epub.close()
return of.name
class OPFMetadataReader(MetadataReaderPlugin):
name = 'Read OPF metadata'
file_types = set(['opf'])
description = _('Read metadata from %s files')%'OPF'
class ComicMetadataReader(MetadataReaderPlugin):
name = 'Read comic metadata'
file_types = set(['cbr', 'cbz'])
description = _('Extract cover from comic files')
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.opf2 import OPF
if ftype == 'cbr':
from calibre.libunrar import extract_member as extract_first
extract_first
else:
from calibre.libunzip import extract_member as extract_first
from calibre.ebooks.metadata import MetaInformation
return MetaInformation(OPF(stream, os.getcwd()))
ret = extract_first(stream)
mi = MetaInformation(None, None)
if ret is not None:
path, data = ret
ext = os.path.splitext(path)[1][1:]
mi.cover_data = (ext.lower(), data)
return mi
class RTFMetadataReader(MetadataReaderPlugin):
class EPUBMetadataReader(MetadataReaderPlugin):
name = 'Read RTF metadata'
file_types = set(['rtf'])
description = _('Read metadata from %s files')%'RTF'
name = 'Read EPUB metadata'
file_types = set(['epub'])
description = _('Read metadata from %s files')%'EPUB'
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.rtf import get_metadata
from calibre.ebooks.metadata.epub import get_metadata
return get_metadata(stream)
class FB2MetadataReader(MetadataReaderPlugin):
@ -56,35 +82,14 @@ class FB2MetadataReader(MetadataReaderPlugin):
from calibre.ebooks.metadata.fb2 import get_metadata
return get_metadata(stream)
class HTMLMetadataReader(MetadataReaderPlugin):
class LRFMetadataReader(MetadataReaderPlugin):
name = 'Read LRF metadata'
file_types = set(['lrf'])
description = _('Read metadata from %s files')%'LRF'
name = 'Read HTML metadata'
file_types = set(['html'])
description = _('Read metadata from %s files')%'HTML'
def get_metadata(self, stream, ftype):
from calibre.ebooks.lrf.meta import get_metadata
return get_metadata(stream)
class PDFMetadataReader(MetadataReaderPlugin):
name = 'Read PDF metadata'
file_types = set(['pdf'])
description = _('Read metadata from %s files')%'PDF'
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.pdf import get_metadata
return get_metadata(stream)
class LITMetadataReader(MetadataReaderPlugin):
name = 'Read LIT metadata'
file_types = set(['lit'])
description = _('Read metadata from %s files')%'LIT'
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.lit import get_metadata
from calibre.ebooks.metadata.html import get_metadata
return get_metadata(stream)
class IMPMetadataReader(MetadataReaderPlugin):
@ -98,66 +103,24 @@ class IMPMetadataReader(MetadataReaderPlugin):
from calibre.ebooks.metadata.imp import get_metadata
return get_metadata(stream)
class RBMetadataReader(MetadataReaderPlugin):
class LITMetadataReader(MetadataReaderPlugin):
name = 'Read RB metadata'
file_types = set(['rb'])
description = _('Read metadata from %s files')%'RB'
author = 'Ashish Kulkarni'
name = 'Read LIT metadata'
file_types = set(['lit'])
description = _('Read metadata from %s files')%'LIT'
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.rb import get_metadata
from calibre.ebooks.metadata.lit import get_metadata
return get_metadata(stream)
class EPUBMetadataReader(MetadataReaderPlugin):
class LRFMetadataReader(MetadataReaderPlugin):
name = 'Read EPUB metadata'
file_types = set(['epub'])
description = _('Read metadata from %s files')%'EPUB'
name = 'Read LRF metadata'
file_types = set(['lrf'])
description = _('Read metadata from %s files')%'LRF'
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.epub import get_metadata
return get_metadata(stream)
class HTMLMetadataReader(MetadataReaderPlugin):
name = 'Read HTML metadata'
file_types = set(['html'])
description = _('Read metadata from %s files')%'HTML'
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.html import get_metadata
return get_metadata(stream)
class MOBIMetadataReader(MetadataReaderPlugin):
name = 'Read MOBI metadata'
file_types = set(['mobi', 'prc', 'azw'])
description = _('Read metadata from %s files')%'MOBI'
def get_metadata(self, stream, ftype):
from calibre.ebooks.mobi.reader import get_metadata
return get_metadata(stream)
class TOPAZMetadataReader(MetadataReaderPlugin):
name = 'Read Topaz metadata'
file_types = set(['tpz', 'azw1'])
description = _('Read metadata from %s files')%'MOBI'
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.topaz import get_metadata
return get_metadata(stream)
class ODTMetadataReader(MetadataReaderPlugin):
name = 'Read ODT metadata'
file_types = set(['odt'])
description = _('Read metadata from %s files')%'ODT'
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.odt import get_metadata
from calibre.ebooks.lrf.meta import get_metadata
return get_metadata(stream)
class LRXMetadataReader(MetadataReaderPlugin):
@ -170,34 +133,56 @@ class LRXMetadataReader(MetadataReaderPlugin):
from calibre.ebooks.metadata.lrx import get_metadata
return get_metadata(stream)
class ComicMetadataReader(MetadataReaderPlugin):
class MOBIMetadataReader(MetadataReaderPlugin):
name = 'Read comic metadata'
file_types = set(['cbr', 'cbz'])
description = _('Extract cover from comic files')
name = 'Read MOBI metadata'
file_types = set(['mobi', 'prc', 'azw'])
description = _('Read metadata from %s files')%'MOBI'
def get_metadata(self, stream, ftype):
if ftype == 'cbr':
from calibre.libunrar import extract_member as extract_first
else:
from calibre.libunzip import extract_member as extract_first
from calibre.ebooks.mobi.reader import get_metadata
return get_metadata(stream)
class ODTMetadataReader(MetadataReaderPlugin):
name = 'Read ODT metadata'
file_types = set(['odt'])
description = _('Read metadata from %s files')%'ODT'
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.odt import get_metadata
return get_metadata(stream)
class OPFMetadataReader(MetadataReaderPlugin):
name = 'Read OPF metadata'
file_types = set(['opf'])
description = _('Read metadata from %s files')%'OPF'
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.metadata import MetaInformation
ret = extract_first(stream)
mi = MetaInformation(None, None)
if ret is not None:
path, data = ret
ext = os.path.splitext(path)[1][1:]
mi.cover_data = (ext.lower(), data)
return mi
return MetaInformation(OPF(stream, os.getcwd()))
class ZipMetadataReader(MetadataReaderPlugin):
class PDBMetadataReader(MetadataReaderPlugin):
name = 'Read ZIP metadata'
file_types = set(['zip', 'oebzip'])
description = _('Read metadata from ebooks in ZIP archives')
name = 'Read PDB metadata'
file_types = set(['pdb'])
description = _('Read metadata from %s files') % 'PDB'
author = 'John Schember'
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.zip import get_metadata
from calibre.ebooks.metadata.pdb import get_metadata
return get_metadata(stream)
class PDFMetadataReader(MetadataReaderPlugin):
name = 'Read PDF metadata'
file_types = set(['pdf'])
description = _('Read metadata from %s files')%'PDF'
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.pdf import get_metadata
return get_metadata(stream)
class RARMetadataReader(MetadataReaderPlugin):
@ -210,6 +195,58 @@ class RARMetadataReader(MetadataReaderPlugin):
from calibre.ebooks.metadata.rar import get_metadata
return get_metadata(stream)
class RBMetadataReader(MetadataReaderPlugin):
# Metadata reader plugin declared for the 'rb' file type.
name = 'Read RB metadata'
file_types = set(['rb'])
description = _('Read metadata from %s files')%'RB'
author = 'Ashish Kulkarni'
def get_metadata(self, stream, ftype):
# The format-specific reader is imported only when this method is called.
# ``ftype`` is accepted but not used by this implementation.
from calibre.ebooks.metadata.rb import get_metadata
return get_metadata(stream)
class RTFMetadataReader(MetadataReaderPlugin):
# Metadata reader plugin declared for the 'rtf' file type.
name = 'Read RTF metadata'
file_types = set(['rtf'])
description = _('Read metadata from %s files')%'RTF'
def get_metadata(self, stream, ftype):
# The format-specific reader is imported only when this method is called.
# ``ftype`` is accepted but not used by this implementation.
from calibre.ebooks.metadata.rtf import get_metadata
return get_metadata(stream)
class TOPAZMetadataReader(MetadataReaderPlugin):
    '''
    Metadata reader plugin declared for the 'tpz' and 'azw1' file types.
    '''

    name        = 'Read Topaz metadata'
    file_types  = set(['tpz', 'azw1'])
    # The description used to say 'MOBI' (carried over from the MOBI
    # reader); this plugin handles Topaz files, as its name states.
    description = _('Read metadata from %s files')%'Topaz'

    def get_metadata(self, stream, ftype):
        '''
        Return the metadata read from `stream`.

        :param stream: The input passed straight through to the Topaz
                       metadata reader.
        :param ftype: Accepted but not used by this implementation.
        '''
        # Imported lazily so the reader module is only loaded on first use
        from calibre.ebooks.metadata.topaz import get_metadata
        return get_metadata(stream)
class TXTMetadataReader(MetadataReaderPlugin):
# Metadata reader plugin declared for the 'txt' file type.
name = 'Read TXT metadata'
file_types = set(['txt'])
description = _('Read metadata from %s files') % 'TXT'
author = 'John Schember'
def get_metadata(self, stream, ftype):
# The format-specific reader is imported only when this method is called.
# ``ftype`` is accepted but not used by this implementation.
from calibre.ebooks.metadata.txt import get_metadata
return get_metadata(stream)
class ZipMetadataReader(MetadataReaderPlugin):
# Metadata reader plugin declared for the 'zip' and 'oebzip' file types.
name = 'Read ZIP metadata'
file_types = set(['zip', 'oebzip'])
description = _('Read metadata from ebooks in ZIP archives')
def get_metadata(self, stream, ftype):
# The archive reader is imported only when this method is called.
# ``ftype`` is accepted but not used by this implementation.
from calibre.ebooks.metadata.zip import get_metadata
return get_metadata(stream)
class EPUBMetadataWriter(MetadataWriterPlugin):
@ -231,16 +268,6 @@ class LRFMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.lrf.meta import set_metadata
set_metadata(stream, mi)
class RTFMetadataWriter(MetadataWriterPlugin):
name = 'Set RTF metadata'
file_types = set(['rtf'])
description = _('Set metadata in %s files')%'RTF'
def set_metadata(self, stream, mi, type):
from calibre.ebooks.metadata.rtf import set_metadata
set_metadata(stream, mi)
class MOBIMetadataWriter(MetadataWriterPlugin):
name = 'Set MOBI metadata'
@ -252,20 +279,128 @@ class MOBIMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.metadata.mobi import set_metadata
set_metadata(stream, mi)
class PDBMetadataWriter(MetadataWriterPlugin):
    '''
    Metadata writer plugin declared for the 'pdb' file type.
    '''

    name        = 'Set PDB metadata'
    file_types  = set(['pdb'])
    # Wording aligned with the other writer plugins ("Set metadata in"),
    # which also lets this plugin share their translated message.
    description = _('Set metadata in %s files') % 'PDB'
    author      = 'John Schember'

    def set_metadata(self, stream, mi, type):
        '''
        Write the metadata `mi` into `stream`.

        :param stream: Passed straight through to the PDB metadata writer.
        :param mi: The metadata to write.
        :param type: Accepted but not used by this implementation.
        '''
        # Imported lazily so the writer module is only loaded on first use
        from calibre.ebooks.metadata.pdb import set_metadata
        set_metadata(stream, mi)
class PDFMetadataWriter(MetadataWriterPlugin):
name = 'Set PDF metadata'
file_types = set(['pdf'])
description = _('Set metadata in %s files') % 'PDF'
author = 'John Schember'
author = 'Kovid Goyal'
def set_metadata(self, stream, mi, type):
from calibre.ebooks.metadata.pdf import set_metadata
set_metadata(stream, mi)
class RTFMetadataWriter(MetadataWriterPlugin):
plugins = [HTML2ZIP]
name = 'Set RTF metadata'
file_types = set(['rtf'])
description = _('Set metadata in %s files')%'RTF'
def set_metadata(self, stream, mi, type):
from calibre.ebooks.metadata.rtf import set_metadata
set_metadata(stream, mi)
from calibre.ebooks.comic.input import ComicInput
from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.fb2.input import FB2Input
from calibre.ebooks.html.input import HTMLInput
from calibre.ebooks.lit.input import LITInput
from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.odt.input import ODTInput
from calibre.ebooks.pdb.input import PDBInput
from calibre.ebooks.pdf.input import PDFInput
from calibre.ebooks.pml.input import PMLInput
from calibre.ebooks.rb.input import RBInput
from calibre.web.feeds.input import RecipeInput
from calibre.ebooks.rtf.input import RTFInput
from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.epub.output import EPUBOutput
from calibre.ebooks.fb2.output import FB2Output
from calibre.ebooks.lit.output import LITOutput
from calibre.ebooks.lrf.output import LRFOutput
from calibre.ebooks.mobi.output import MOBIOutput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.pdb.output import PDBOutput
from calibre.ebooks.pdf.output import PDFOutput
from calibre.ebooks.pml.output import PMLOutput
from calibre.ebooks.rb.output import RBOutput
from calibre.ebooks.rtf.output import RTFOutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.customize.profiles import input_profiles, output_profiles
from calibre.devices.bebook.driver import BEBOOK, BEBOOK_MINI
from calibre.devices.blackberry.driver import BLACKBERRY
from calibre.devices.cybookg3.driver import CYBOOKG3
from calibre.devices.eb600.driver import EB600
from calibre.devices.jetbook.driver import JETBOOK
from calibre.devices.kindle.driver import KINDLE
from calibre.devices.kindle.driver import KINDLE2
from calibre.devices.prs500.driver import PRS500
from calibre.devices.prs505.driver import PRS505
from calibre.devices.prs700.driver import PRS700
plugins = []
plugins += [
ComicInput,
EPUBInput,
FB2Input,
HTMLInput,
LITInput,
MOBIInput,
ODTInput,
PDBInput,
PDFInput,
PMLInput,
RBInput,
RecipeInput,
RTFInput,
TXTInput,
]
plugins += [
EPUBOutput,
FB2Output,
LITOutput,
LRFOutput,
MOBIOutput,
OEBOutput,
PDBOutput,
PDFOutput,
PMLOutput,
RBOutput,
RTFOutput,
TXTOutput,
]
plugins += [
BEBOOK,
BEBOOK_MINI,
BLACKBERRY,
CYBOOKG3,
EB600,
JETBOOK,
KINDLE,
KINDLE2,
PRS500,
PRS505,
PRS700,
]
# Register every class in this module's namespace whose name ends in
# 'MetadataReader' or 'MetadataWriter'. Classes are discovered by name via
# locals(), so a class must follow that naming convention to be picked up,
# and any imported class with a matching name is registered as well.
# Only new-style classes match the isinstance(x, type) test.
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataWriter')]
# The profile lists come from calibre.customize.profiles
plugins += input_profiles + output_profiles

View File

@ -0,0 +1,293 @@
from __future__ import with_statement
'''
Defines the plugin system for conversions.
'''
import re, os, shutil
from calibre import CurrentDir
from calibre.customize import Plugin
class ConversionOption(object):

    '''
    Class representing conversion options

    :param name: The option name. Must be a valid Python identifier.
    :param help: Help text for the option. Must be non-empty.
    :param long_switch: Long command line switch. Defaults to `name` with
                        underscores replaced by hyphens.
    :param short_switch: Optional short command line switch.
    :param choices: Optional collection of allowed values.
    '''

    def __init__(self, name=None, help=None, long_switch=None,
            short_switch=None, choices=None):
        self.name = name
        self.help = help
        self.long_switch = long_switch
        self.short_switch = short_switch
        self.choices = choices

        if self.long_switch is None:
            self.long_switch = self.name.replace('_', '-')

        self.validate_parameters()

    def validate_parameters(self):
        '''
        Validate the parameters passed to :method:`__init__`.

        :raises ValueError: If the name is not a valid Python identifier or
                            no help text was supplied.
        '''
        # The pattern is anchored with \Z so the *whole* name must be an
        # identifier. Without the anchor, re.match only checked a prefix
        # and accepted names such as 'bad-name'.
        if re.match(r'[a-zA-Z_][a-zA-Z0-9_]*\Z', self.name) is None:
            raise ValueError(self.name + ' is not a valid Python identifier')
        if not self.help:
            raise ValueError('You must set the help text')

    def __hash__(self):
        # Options are identified by name alone
        return hash(self.name)

    def __eq__(self, other):
        # Equality is defined as hash equality, i.e. by option name
        return hash(self) == hash(other)

    def clone(self):
        '''
        Return a new :class:`ConversionOption` with the same attributes.
        '''
        return ConversionOption(name=self.name, help=self.help,
                long_switch=self.long_switch, short_switch=self.short_switch,
                choices=self.choices)
class OptionRecommendation(object):
    '''
    Pairs a conversion option with a recommended value and the level
    (LOW, MED or HIGH) of that recommendation.
    '''

    LOW  = 1
    MED  = 2
    HIGH = 3

    def __init__(self, recommended_value=None, level=LOW, **kwargs):
        '''
        An option recommendation. That is, an option as well as its recommended
        value and the level of the recommendation.

        The option is taken from the ``option`` keyword argument if given;
        otherwise the remaining keyword arguments are used to construct a
        :class:`ConversionOption`.
        '''
        self.level = level
        self.recommended_value = recommended_value
        self.option = kwargs.pop('option', None)
        if self.option is None:
            self.option = ConversionOption(**kwargs)

        self.validate_parameters()

    @property
    def help(self):
        # Convenience accessor for the help text of the wrapped option
        return self.option.help

    def clone(self):
        '''
        Return a copy of this recommendation wrapping a cloned option.
        '''
        return OptionRecommendation(option=self.option.clone(),
                level=self.level, recommended_value=self.recommended_value)

    def validate_parameters(self):
        '''
        Check the recommended value against the option's choices and the
        permitted value types, raising ValueError on a mismatch.
        '''
        value, option = self.recommended_value, self.option
        if option.choices and value not in option.choices:
            raise ValueError('OpRec: %s: Recommended value not in choices'%
                    option.name)
        # Only strings, numbers and None are allowed as recommended values
        acceptable = isinstance(value, (int, float, str, unicode)) \
                or value is None
        if not acceptable:
            raise ValueError('OpRec: %s:'%option.name +
                    repr(value) +
                    ' is not a string or a number')
class DummyReporter(object):
    '''Progress reporter that discards every update it is given.'''

    def __call__(self, percent, msg=''):
        # Intentionally does nothing with either argument
        return None
class InputFormatPlugin(Plugin):
'''
InputFormatPlugins are responsible for converting a document into
HTML+OPF+CSS+etc.
The results of the conversion *must* be encoded in UTF-8.
The main action happens in :method:`convert`.
'''
type = _('Conversion Input')
can_be_disabled = False
supported_platforms = ['windows', 'osx', 'linux']
#: Set of file types for which this plugin should be run
#: For example: ``set(['azw', 'mobi', 'prc'])``
file_types = set([])
#: If True, this input plugin generates a collection of images,
#: one per HTML file. You can obtain access to the images via
#: convenience method, :method:`get_images`.
is_image_collection = False
#: Options shared by all Input format plugins. Do not override
#: in sub-classes. Use :member:`options` instead. Every option must be an
#: instance of :class:`OptionRecommendation`.
common_options = set([
OptionRecommendation(name='debug_input',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Save the output from the input plugin to the specified '
'directory. Useful if you are unsure at which stage '
'of the conversion process a bug is occurring. '
'WARNING: This completely deletes the contents of '
'the specified directory.')
),
OptionRecommendation(name='input_encoding',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the input document. If '
'set this option will override any encoding declared by the '
'document itself. Particularly useful for documents that '
'do not declare an encoding or that have erroneous '
'encoding declarations.')
),
])
#: Options to customize the behavior of this plugin. Every option must be an
#: instance of :class:`OptionRecommendation`.
options = set([])
#: A set of 3-tuples of the form
#: (option_name, recommended_value, recommendation_level)
recommendations = set([])
def __init__(self, *args):
Plugin.__init__(self, *args)
# Progress reports are discarded until a real reporter is assigned
self.report_progress = DummyReporter()
def get_images(self):
'''
Return a list of absolute paths to the images, if this input plugin
represents an image collection. The list of images is in the same order
as the spine and the TOC.
'''
raise NotImplementedError()
def preprocess_html(self, html):
'''
This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on
the HTML, like removing hard line breaks, etc.
:param html: A unicode string
:return: A unicode string
'''
return html
def convert(self, stream, options, file_ext, log, accelerators):
'''
This method must be implemented in sub-classes. It must return
the path to the created OPF file or an :class:`OEBBook` instance.
All output should be contained in the current directory.
If this plugin creates files outside the current
directory they must be deleted/marked for deletion before this method
returns.
:param stream: A file like object that contains the input file.
:param options: Options to customize the conversion process.
Guaranteed to have attributes corresponding
to all the options declared by this plugin. In
addition, it will have a verbose attribute that
takes integral values from zero upwards. Higher numbers
mean be more verbose. Another useful attribute is
``input_profile`` that is an instance of
:class:`calibre.customize.profiles.InputProfile`.
:param file_ext: The extension (without the .) of the input file. It
is guaranteed to be one of the `file_types` supported
by this plugin.
:param log: A :class:`calibre.utils.logging.Log` object. All output
should use this object.
:param accelerators: A dictionary of various information that the input
plugin can get easily that would speed up the
subsequent stages of the conversion.
'''
raise NotImplementedError
def __call__(self, stream, options, file_ext, log,
accelerators, output_dir):
# Runs :method:`convert` with ``output_dir`` as the working directory and
# returns whatever :method:`convert` returned. If ``options.debug_input``
# is set, a copy of the result is also saved to that directory.
log('InputFormatPlugin: %s running'%self.name, end=' ')
if hasattr(stream, 'name'):
log('on', stream.name)
# NOTE(review): CurrentDir is presumably a context manager that changes
# into output_dir and restores the previous directory on exit -- confirm.
with CurrentDir(output_dir):
# Every existing file and directory in the working directory is deleted
# before the conversion runs.
for x in os.listdir('.'):
shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
ret = self.convert(stream, options, file_ext,
log, accelerators)
if options.debug_input is not None:
options.debug_input = os.path.abspath(options.debug_input)
if not os.path.exists(options.debug_input):
os.makedirs(options.debug_input)
# A string result is a path (see :method:`convert`): the debug directory
# is removed and replaced by a full copy of output_dir.
if isinstance(ret, basestring):
shutil.rmtree(options.debug_input)
shutil.copytree(output_dir, options.debug_input)
else:
# Any other result is handed to OEBWriter to be written into the
# debug directory.
from calibre.ebooks.oeb.writer import OEBWriter
w = OEBWriter(pretty_print=options.pretty_print)
w(ret, options.debug_input)
log.info('Input debug saved to:', options.debug_input)
return ret
class OutputFormatPlugin(Plugin):
'''
OutputFormatPlugins are responsible for converting an OEB document
(OPF+HTML) into an output ebook.
The OEB document can be assumed to be encoded in UTF-8.
The main action happens in :method:`convert`.
'''
type = _('Conversion Output')
can_be_disabled = False
supported_platforms = ['windows', 'osx', 'linux']
#: The file type (extension without leading period) that this
#: plugin outputs
file_type = None
#: Options shared by all Output format plugins. Do not override
#: in sub-classes. Use :member:`options` instead. Every option must be an
#: instance of :class:`OptionRecommendation`.
common_options = set([
OptionRecommendation(name='pretty_print',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('If specified, the output plugin will try to create output '
'that is as human readable as possible. May not have any effect '
'for some output plugins.')
),
])
#: Options to customize the behavior of this plugin. Every option must be an
#: instance of :class:`OptionRecommendation`.
options = set([])
#: A set of 3-tuples of the form
#: (option_name, recommended_value, recommendation_level)
recommendations = set([])
def __init__(self, *args):
Plugin.__init__(self, *args)
# Progress reports are discarded until a real reporter is assigned
self.report_progress = DummyReporter()
def convert(self, oeb_book, output, input_plugin, opts, log):
'''
Render the contents of `oeb_book` (which is an instance of
:class:`calibre.ebooks.oeb.OEBBook`) to the file specified by output.
This method must be implemented in sub-classes.
:param output: Either a file like object or a string. If it is a string
it is the path to a directory that may or may not exist. The output
plugin should write its output into that directory. If it is a file like
object, the output plugin should write its output into the file.
:param input_plugin: The input plugin that was used at the beginning of
the conversion pipeline.
:param opts: Conversion options. Guaranteed to have attributes
corresponding to the OptionRecommendations of this plugin.
:param log: The logger. Print debug/info messages etc. using this.
'''
raise NotImplementedError

View File

@ -0,0 +1,241 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from itertools import izip
from calibre.customize import Plugin as _Plugin
# (name, number) pairs identifying the font size slots of a profile.
# Plugin.__init__ pairs these, in order, with a profile's ``fsizes`` list.
# A slot may lack a name or a number (None); such entries are left out of
# the corresponding name/number lookup tables built by Plugin.__init__.
FONT_SIZES = [('xx-small', 1),
('x-small', None),
('small', 2),
('medium', 3),
('large', 4),
('x-large', 5),
('xx-large', 6),
(None, 7)]
class Plugin(_Plugin):
    '''
    Base class for profiles. Holds the font and screen parameters of a
    profile and derives per-instance lookup tables from them.
    '''

    fbase  = 12
    fsizes = [5, 7, 9, 12, 13.5, 17, 20, 22, 24]
    screen_size = (1600, 1200)
    dpi = 100

    def __init__(self, *args, **kwargs):
        _Plugin.__init__(self, *args, **kwargs)
        self.width, self.height = self.screen_size
        # Snapshot the class-level sizes before the instance attribute of
        # the same name replaces them below
        raw_sizes = list(self.fsizes)
        self.fkey = list(self.fsizes)
        # One (name, number, size) triple per FONT_SIZES slot; surplus
        # entries on either side are dropped by izip
        self.fsizes = [(slot_name, slot_num, float(value))
                for (slot_name, slot_num), value in izip(FONT_SIZES, raw_sizes)]
        # Lookup tables by name and by number; slots lacking the
        # respective key are skipped
        self.fnames = dict((entry[0], entry[2]) for entry in self.fsizes
                if entry[0])
        self.fnums = dict((entry[1], entry[2]) for entry in self.fsizes
                if entry[1])
class InputProfile(Plugin):
author = 'Kovid Goyal'
supported_platforms = set(['windows', 'osx', 'linux'])
can_be_disabled = False
type = _('Input profile')
name = 'Default Input Profile'
short_name = 'default' # Used in the CLI so dont use spaces etc. in it
description = _('This profile tries to provide sane defaults and is useful '
'if you know nothing about the input document.')
class SonyReaderInput(InputProfile):
    '''Input profile carrying the metrics used for the SONY PRS line.'''

    name = 'Sony Reader'
    short_name = 'sony'
    description = _('This profile is intended for the SONY PRS line. '
                    'The 500/505/700 etc.')

    # Screen geometry
    screen_size = (584, 754)
    dpi = 168.451

    # Font sizes
    fbase = 12
    fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
class MSReaderInput(InputProfile):
    '''Input profile carrying the metrics used for the Microsoft Reader.'''

    name = 'Microsoft Reader'
    short_name = 'msreader'
    description = _('This profile is intended for the Microsoft Reader.')

    # Screen geometry
    screen_size = (480, 652)
    dpi = 96

    # Font sizes
    fbase = 13
    fsizes = [10, 11, 13, 16, 18, 20, 22, 26]
class MobipocketInput(InputProfile):
    '''Input profile carrying the metrics used for Mobipocket books.'''

    name = 'Mobipocket Books'
    short_name = 'mobipocket'
    description = _('This profile is intended for the Mobipocket books.')

    # Unfortunately MOBI books are not narrowly targeted, so this information
    # is quite likely to be spurious
    screen_size = (600, 800)
    dpi = 96
    fbase = 18
    fsizes = [14, 14, 16, 18, 20, 22, 24, 26]
class HanlinV3Input(InputProfile):
    '''Input profile carrying the metrics used for the Hanlin V3.'''

    name = 'Hanlin V3'
    short_name = 'hanlinv3'
    description = _('This profile is intended for the Hanlin V3 and its clones.')

    # Screen size is a best guess
    screen_size = (584, 754)
    dpi = 168.451

    # Font sizes
    fbase = 16
    fsizes = [12, 12, 14, 16, 18, 20, 22, 24]
class CybookG3Input(InputProfile):
    '''Input profile carrying the metrics used for the Cybook G3.'''

    name = 'Cybook G3'
    short_name = 'cybookg3'
    description = _('This profile is intended for the Cybook G3.')

    # Screen size is a best guess
    screen_size = (600, 800)
    dpi = 168.451

    # Font sizes
    fbase = 16
    fsizes = [12, 12, 14, 16, 18, 20, 22, 24]
class KindleInput(InputProfile):
    '''Input profile carrying the metrics used for the Amazon Kindle.'''

    name = 'Kindle'
    short_name = 'kindle'
    description = _('This profile is intended for the Amazon Kindle.')

    # Screen size is a best guess
    screen_size = (525, 640)
    dpi = 168.451

    # Font sizes
    fbase = 16
    fsizes = [12, 12, 14, 16, 18, 20, 22, 24]
# All input profile classes of this module, the generic default one first.
input_profiles = [
    InputProfile,
    SonyReaderInput,
    MSReaderInput,
    MobipocketInput,
    HanlinV3Input,
    CybookG3Input,
    KindleInput,
]
class OutputProfile(Plugin):
    '''
    The default output profile and base class of all other output profiles.

    It keeps the screen and font metrics inherited from Plugin and adds a
    comic image size and a hook to serialize tags.
    '''

    name = 'Default Output Profile'
    # Used in the CLI, so don't use spaces etc. in it
    short_name = 'default'
    type = _('Output profile')
    description = _('This profile tries to provide sane defaults and is useful '
                    'if you want to produce a document intended to be read at a '
                    'computer or on a range of devices.')

    author = 'Kovid Goyal'
    supported_platforms = set(['windows', 'osx', 'linux'])
    can_be_disabled = False

    # The image size for comics
    comic_screen_size = (584, 754)

    @classmethod
    def tags_to_string(cls, tags):
        '''Return the strings in `tags` joined by a comma and a space.'''
        separator = ', '
        return separator.join(tags)
class SonyReaderOutput(OutputProfile):
    '''Output profile carrying the metrics used for the SONY PRS line.'''

    name = 'Sony Reader'
    short_name = 'sony'
    description = _('This profile is intended for the SONY PRS line. '
                    'The 500/505/700 etc.')

    # Screen geometry
    screen_size = (600, 775)
    dpi = 168.451

    # Font sizes
    fbase = 12
    fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
class SonyReaderLandscapeOutput(SonyReaderOutput):
    '''
    Variant of SonyReaderOutput for landscape mode.

    Only the identification strings and the two screen sizes are overridden;
    dpi and font sizes are inherited from SonyReaderOutput.
    '''

    name = 'Sony Reader Landscape'
    short_name = 'sony-landscape'
    description = _('This profile is intended for the SONY PRS line. '
                    'The 500/505/700 etc, in landscape mode. Mainly useful '
                    'for comics.')

    # The page and the comic images use the same size in this profile
    screen_size = (784, 1012)
    comic_screen_size = (784, 1012)
class MSReaderOutput(OutputProfile):
    '''Output profile carrying the metrics used for the Microsoft Reader.'''

    name = 'Microsoft Reader'
    short_name = 'msreader'
    description = _('This profile is intended for the Microsoft Reader.')

    # Screen geometry
    screen_size = (480, 652)
    dpi = 96

    # Font sizes
    fbase = 13
    fsizes = [10, 11, 13, 16, 18, 20, 22, 26]
class MobipocketOutput(OutputProfile):
    '''Output profile carrying the metrics used for Mobipocket books.'''

    name = 'Mobipocket Books'
    short_name = 'mobipocket'
    description = _('This profile is intended for the Mobipocket books.')

    # Unfortunately MOBI books are not narrowly targeted, so this information
    # is quite likely to be spurious
    screen_size = (600, 800)
    dpi = 96
    fbase = 18
    fsizes = [14, 14, 16, 18, 20, 22, 24, 26]
class HanlinV3Output(OutputProfile):
    '''Output profile carrying the metrics used for the Hanlin V3.'''

    name = 'Hanlin V3'
    short_name = 'hanlinv3'
    description = _('This profile is intended for the Hanlin V3 and its clones.')

    # Screen size is a best guess
    screen_size = (584, 754)
    dpi = 168.451

    # Font sizes
    fbase = 16
    fsizes = [12, 12, 14, 16, 18, 20, 22, 24]
class CybookG3Output(OutputProfile):
    '''Output profile carrying the metrics used for the Cybook G3.'''

    name = 'Cybook G3'
    short_name = 'cybookg3'
    description = _('This profile is intended for the Cybook G3.')

    # Screen size is a best guess
    screen_size = (600, 800)
    dpi = 168.451

    # Font sizes
    fbase = 16
    fsizes = [12, 12, 14, 16, 18, 20, 22, 24]
class KindleOutput(OutputProfile):
    '''
    Output profile carrying the metrics used for the Amazon Kindle.

    Tags are serialized differently from the default output profile, see
    tags_to_string.
    '''

    name = 'Kindle'
    short_name = 'kindle'
    description = _('This profile is intended for the Amazon Kindle.')

    # Screen size is a best guess
    screen_size = (525, 640)
    dpi = 168.451

    # Font sizes
    fbase = 16
    fsizes = [12, 12, 14, 16, 18, 20, 22, 24]

    @classmethod
    def tags_to_string(cls, tags):
        '''
        Return the strings in `tags` with 'ttt ' appended to each of them.

        For an empty `tags` the result is a single 'ttt '.
        '''
        marker = 'ttt '
        return marker.join(tags) + marker
# All output profile classes of this module, the generic default one first.
output_profiles = [
    OutputProfile,
    SonyReaderOutput,
    MSReaderOutput,
    MobipocketOutput,
    HanlinV3Output,
    CybookG3Output,
    KindleOutput,
    SonyReaderLandscapeOutput,
]

View File

@ -6,13 +6,15 @@ import os, shutil, traceback, functools, sys, re
from calibre.customize import Plugin, FileTypePlugin, MetadataReaderPlugin, \
MetadataWriterPlugin
from calibre.customize.conversion import InputFormatPlugin, OutputFormatPlugin
from calibre.customize.profiles import InputProfile, OutputProfile
from calibre.customize.builtins import plugins as builtin_plugins
from calibre.constants import __version__, iswindows, isosx
from calibre.devices.interface import DevicePlugin
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.config import make_config_dir, Config, ConfigProxy, \
plugin_dir, OptionParser
version = tuple([int(x) for x in __version__.split('.')])
platform = 'linux'
@ -47,7 +49,7 @@ def load_plugin(path_to_zip_file):
:return: A :class:`Plugin` instance.
'''
print 'Loading plugin from', path_to_zip_file
#print 'Loading plugin from', path_to_zip_file
if not os.access(path_to_zip_file, os.R_OK):
raise PluginNotFound
zf = ZipFile(path_to_zip_file)
@ -77,6 +79,15 @@ _on_import = {}
_on_preprocess = {}
_on_postprocess = {}
def input_profiles():
    '''Yield every initialized plugin that is an InputProfile instance.'''
    for candidate in _initialized_plugins:
        if not isinstance(candidate, InputProfile):
            continue
        yield candidate
def output_profiles():
    '''Yield every initialized plugin that is an OutputProfile instance.'''
    for candidate in _initialized_plugins:
        if not isinstance(candidate, OutputProfile):
            continue
        yield candidate
def reread_filetype_plugins():
@ -121,7 +132,19 @@ def reread_metadata_plugins():
_metadata_writers[ft] = []
_metadata_writers[ft].append(plugin)
def metadata_readers():
    '''
    Return the set of all plugins found in the values of `_metadata_readers`.

    A plugin that appears under several keys is only included once.
    '''
    readers = set()
    for registered in _metadata_readers.values():
        readers.update(registered)
    return readers
def metadata_writers():
    '''
    Return the set of all plugins found in the values of `_metadata_writers`.

    A plugin that appears under several keys is only included once.
    '''
    writers = set()
    for registered in _metadata_writers.values():
        writers.update(registered)
    return writers
def get_file_type_metadata(stream, ftype):
mi = MetaInformation(None, None)
@ -229,6 +252,47 @@ def find_plugin(name):
if plugin.name == name:
return plugin
def input_format_plugins():
    '''Yield every initialized plugin that is an InputFormatPlugin instance.'''
    for candidate in _initialized_plugins:
        if not isinstance(candidate, InputFormatPlugin):
            continue
        yield candidate
def plugin_for_input_format(fmt):
    '''
    Return the first input format plugin whose `file_types` contains the
    lower cased `fmt`, or None if there is no such plugin.
    '''
    for candidate in input_format_plugins():
        wanted = fmt.lower()
        if wanted in candidate.file_types:
            return candidate
    return None
def available_input_formats():
    '''
    Return the set of file types of all input format plugins that are not
    disabled.
    '''
    found = set()
    for candidate in input_format_plugins():
        if is_disabled(candidate):
            continue
        found.update(candidate.file_types)
    return found
def output_format_plugins():
    '''Yield every initialized plugin that is an OutputFormatPlugin instance.'''
    for candidate in _initialized_plugins:
        if not isinstance(candidate, OutputFormatPlugin):
            continue
        yield candidate
def plugin_for_output_format(fmt):
    '''
    Return the first output format plugin whose `file_type` equals the lower
    cased `fmt`, or None if there is no such plugin.
    '''
    for candidate in output_format_plugins():
        wanted = fmt.lower()
        if wanted == candidate.file_type:
            return candidate
    return None
def available_output_formats():
    '''
    Return the set of file types of all output format plugins that are not
    disabled.
    '''
    found = set()
    for candidate in output_format_plugins():
        if is_disabled(candidate):
            continue
        found.add(candidate.file_type)
    return found
def device_plugins():
    '''
    Yield every initialized plugin that is a DevicePlugin instance and is
    not disabled.
    '''
    for candidate in _initialized_plugins:
        if isinstance(candidate, DevicePlugin) and not is_disabled(candidate):
            yield candidate
def disable_plugin(plugin_or_name):
x = getattr(plugin_or_name, 'name', plugin_or_name)
plugin = find_plugin(x)

View File

@ -5,21 +5,6 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
Device drivers.
'''
def devices():
    '''
    Return a tuple of the known device driver classes.

    The driver modules are imported when this function is called, not when
    this module is imported.
    '''
    from calibre.devices.prs500.driver import PRS500
    from calibre.devices.prs505.driver import PRS505
    from calibre.devices.prs700.driver import PRS700
    from calibre.devices.cybookg3.driver import CYBOOKG3
    from calibre.devices.kindle.driver import KINDLE, KINDLE2
    from calibre.devices.bebook.driver import BEBOOK, BEBOOKMINI
    from calibre.devices.blackberry.driver import BLACKBERRY
    from calibre.devices.eb600.driver import EB600
    from calibre.devices.jetbook.driver import JETBOOK

    drivers = (
        PRS500, PRS505, PRS700,
        CYBOOKG3,
        KINDLE, KINDLE2,
        BEBOOK, BEBOOKMINI,
        BLACKBERRY,
        EB600,
        JETBOOK,
    )
    return drivers
import time
DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)

View File

@ -7,19 +7,25 @@ Device driver for BeBook
from calibre.devices.usbms.driver import USBMS
class BEBOOK(USBMS):
name = 'BeBook driver'
description = _('Communicate with the BeBook eBook reader.')
author = _('Tijmen Ruizendaal')
supported_platforms = ['windows', 'osx', 'linux']
# Ordered list of supported formats
FORMATS = ['mobi', 'epub', 'pdf', 'txt']
VENDOR_ID = [0x0525]
PRODUCT_ID = [0x8803, 0x6803]
BCD = [0x312]
BCD = [0x312]
VENDOR_NAME = 'LINUX'
VENDOR_NAME = 'LINUX'
WINDOWS_MAIN_MEM = 'FILE-STOR_GADGET'
WINDOWS_CARD_MEM = 'FILE-STOR_GADGET'
OSX_MAIN_MEM = 'BeBook Internal Memory'
OSX_CARD_MEM = 'BeBook Storage Card'
OSX_CARD_A_MEM = 'BeBook Storage Card'
MAIN_MEMORY_VOLUME_LABEL = 'BeBook Internal Memory'
STORAGE_CARD_VOLUME_LABEL = 'BeBook Storage Card'
@ -30,20 +36,22 @@ class BEBOOK(USBMS):
def windows_sort_drives(self, drives):
main = drives.get('main', None)
card = drives.get('card', None)
card = drives.get('carda', None)
if card and main and card < main:
drives['main'] = card
drives['card'] = main
drives['carda'] = main
return drives
class BEBOOK_MINI(BEBOOK):
name = 'BeBook Mini driver'
description = _('Communicate with the BeBook Mini eBook reader.')
class BEBOOKMINI(BEBOOK):
VENDOR_ID = [0x0492]
PRODUCT_ID = [0x8813]
BCD = [0x319]
BCD = [0x319]
OSX_MAIN_MEM = 'BeBook Mini Internal Memory'
OSX_CARD_MEM = 'BeBook Mini Storage Card'

View File

@ -7,6 +7,12 @@ __docformat__ = 'restructuredtext en'
from calibre.devices.usbms.driver import USBMS
class BLACKBERRY(USBMS):
name = 'Blackberry Device Interface'
description = _('Communicate with the Blackberry smart phone.')
author = _('Kovid Goyal')
supported_platforms = ['windows', 'linux']
# Ordered list of supported formats
FORMATS = ['mobi', 'prc']
@ -16,15 +22,11 @@ class BLACKBERRY(USBMS):
VENDOR_NAME = 'RIM'
WINDOWS_MAIN_MEM = 'BLACKBERRY_SD'
#WINDOWS_CARD_MEM = 'CARD_STORAGE'
#OSX_MAIN_MEM = 'Kindle Internal Storage Media'
#OSX_CARD_MEM = 'Kindle Card Storage Media'
MAIN_MEMORY_VOLUME_LABEL = 'Blackberry Main Memory'
#STORAGE_CARD_VOLUME_LABEL = 'Kindle Storage Card'
EBOOK_DIR_MAIN = 'ebooks'
#EBOOK_DIR_CARD = "documents"
SUPPORTS_SUB_DIRS = True

View File

@ -7,11 +7,17 @@ Device driver for Bookeen's Cybook Gen 3
import os, shutil
from itertools import cycle
from calibre.devices.errors import FreeSpaceError
from calibre.devices.errors import DeviceError, FreeSpaceError
from calibre.devices.usbms.driver import USBMS
import calibre.devices.cybookg3.t2b as t2b
class CYBOOKG3(USBMS):
name = 'Cybook Gen 3 Device Interface'
description = _('Communicate with the Cybook eBook reader.')
author = _('John Schember')
supported_platforms = ['windows', 'osx', 'linux']
# Ordered list of supported formats
# Be sure these have an entry in calibre.devices.mime
FORMATS = ['mobi', 'prc', 'html', 'pdf', 'rtf', 'txt']
@ -22,60 +28,45 @@ class CYBOOKG3(USBMS):
VENDOR_NAME = 'BOOKEEN'
WINDOWS_MAIN_MEM = 'CYBOOK_GEN3__-FD'
WINDOWS_CARD_MEM = 'CYBOOK_GEN3__-SD'
WINDOWS_CARD_A_MEM = 'CYBOOK_GEN3__-SD'
OSX_MAIN_MEM = 'Bookeen Cybook Gen3 -FD Media'
OSX_CARD_MEM = 'Bookeen Cybook Gen3 -SD Media'
OSX_CARD_A_MEM = 'Bookeen Cybook Gen3 -SD Media'
MAIN_MEMORY_VOLUME_LABEL = 'Cybook Gen 3 Main Memory'
STORAGE_CARD_VOLUME_LABEL = 'Cybook Gen 3 Storage Card'
EBOOK_DIR_MAIN = "eBooks"
EBOOK_DIR_CARD = "eBooks"
EBOOK_DIR_CARD_A = "eBooks"
THUMBNAIL_HEIGHT = 144
SUPPORTS_SUB_DIRS = True
def upload_books(self, files, names, on_card=False, end_session=True,
def upload_books(self, files, names, on_card=None, end_session=True,
metadata=None):
if on_card and not self._card_prefix:
raise ValueError(_('The reader has no storage card connected.'))
if not on_card:
path = os.path.join(self._main_prefix, self.EBOOK_DIR_MAIN)
else:
path = os.path.join(self._card_prefix, self.EBOOK_DIR_CARD)
def get_size(obj):
if hasattr(obj, 'seek'):
obj.seek(0, os.SEEK_END)
size = obj.tell()
obj.seek(0)
return size
return os.path.getsize(obj)
sizes = [get_size(f) for f in files]
size = sum(sizes)
if on_card and size > self.free_space()[2] - 1024*1024:
raise FreeSpaceError(_("There is insufficient free space on the storage card"))
if not on_card and size > self.free_space()[0] - 2*1024*1024:
raise FreeSpaceError(_("There is insufficient free space in main memory"))
path = self._sanity_check(on_card, files)
paths = []
names = iter(names)
metadata = iter(metadata)
for infile in files:
for i, infile in enumerate(files):
newpath = path
mdata = metadata.next()
if self.SUPPORTS_SUB_DIRS:
if 'tags' in mdata.keys():
for tag in mdata['tags']:
if tag.startswith('/'):
newpath += tag
newpath = os.path.normpath(newpath)
break
if 'tags' in mdata.keys():
for tag in mdata['tags']:
if tag.startswith(_('News')):
newpath = os.path.join(newpath, 'news')
newpath = os.path.join(newpath, mdata.get('title', ''))
newpath = os.path.join(newpath, mdata.get('timestamp', ''))
elif tag.startswith('/'):
newpath += tag
newpath = os.path.normpath(newpath)
break
if newpath == path:
newpath = os.path.join(newpath, mdata.get('authors', _('Unknown')))
newpath = os.path.join(newpath, mdata.get('title', _('Unknown')))
if not os.path.exists(newpath):
os.makedirs(newpath)
@ -103,10 +94,15 @@ class CYBOOKG3(USBMS):
t2b.write_t2b(t2bfile, coverdata)
t2bfile.close()
self.report_progress(i / float(len(files)), _('Transferring books to device...'))
self.report_progress(1.0, _('Transferring books to device...'))
return zip(paths, cycle([on_card]))
def delete_books(self, paths, end_session=True):
for path in paths:
for i, path in enumerate(paths):
self.report_progress((i+1) / float(len(paths)), _('Removing books from device...'))
if os.path.exists(path):
os.unlink(path)
@ -115,6 +111,8 @@ class CYBOOKG3(USBMS):
# Delete the ebook auxiliary file
if os.path.exists(filepath + '.mbp'):
os.unlink(filepath + '.mbp')
if os.path.exists(filepath + '.dat'):
os.unlink(filepath + '.dat')
# Delete the thumbnails file auto generated for the ebook
if os.path.exists(filepath + '_6090.t2b'):
@ -124,4 +122,4 @@ class CYBOOKG3(USBMS):
os.removedirs(os.path.dirname(path))
except:
pass
self.report_progress(1.0, _('Removing books from device...'))

15
src/calibre/devices/eb600/driver.py Executable file → Normal file
View File

@ -14,6 +14,11 @@ Windows PNP strings:
from calibre.devices.usbms.driver import USBMS
class EB600(USBMS):
name = 'Netronix EB600 Device Interface'
description = _('Communicate with the EB600 eBook reader.')
author = _('Kovid Goyal')
supported_platforms = ['windows', 'osx', 'linux']
# Ordered list of supported formats
FORMATS = ['epub', 'prc', 'chm', 'djvu', 'html', 'rtf', 'txt', 'pdf']
DRM_FORMATS = ['prc', 'mobi', 'html', 'pdf', 'txt']
@ -24,24 +29,24 @@ class EB600(USBMS):
VENDOR_NAME = 'NETRONIX'
WINDOWS_MAIN_MEM = 'EBOOK'
WINDOWS_CARD_MEM = 'EBOOK'
WINDOWS_CARD_A_MEM = 'EBOOK'
OSX_MAIN_MEM = 'EB600 Internal Storage Media'
OSX_CARD_MEM = 'EB600 Card Storage Media'
OSX_CARD_A_MEM = 'EB600 Card Storage Media'
MAIN_MEMORY_VOLUME_LABEL = 'EB600 Main Memory'
STORAGE_CARD_VOLUME_LABEL = 'EB600 Storage Card'
EBOOK_DIR_MAIN = ''
EBOOK_DIR_CARD = ''
EBOOK_DIR_CARD_A = ''
SUPPORTS_SUB_DIRS = True
def windows_sort_drives(self, drives):
main = drives.get('main', None)
card = drives.get('card', None)
card = drives.get('carda', None)
if card and main and card < main:
drives['main'] = card
drives['card'] = main
drives['carda'] = main
return drives

View File

@ -6,8 +6,9 @@ the GUI. A device backend must subclass the L{Device} class. See prs500.py for
a backend that implements the Device interface for the SONY PRS500 Reader.
"""
from calibre.customize import Plugin
class Device(object):
class DevicePlugin(Plugin):
"""
Defines the interface that should be implemented by backends that
communicate with an ebook reader.
@ -16,6 +17,8 @@ class Device(object):
the front-end needs to call several methods one after another, in which case
the USB session should not be closed after each method call.
"""
type = _('Device Interface')
# Ordered list of supported formats
FORMATS = ["lrf", "rtf", "pdf", "txt"]
VENDOR_ID = 0x0000
@ -27,7 +30,7 @@ class Device(object):
# Whether the metadata on books can be set via the GUI.
CAN_SET_METADATA = True
def __init__(self, key='-1', log_packets=False, report_progress=None) :
def reset(self, key='-1', log_packets=False, report_progress=None) :
"""
@param key: The key to unlock the device
@param log_packets: If true the packet stream to/from the device is logged
@ -87,7 +90,13 @@ class Device(object):
def card_prefix(self, end_session=True):
'''
Return prefix to paths on the card or '' if no cards present.
Return a 2 element list of the prefix to paths on the cards.
If no card is present None is set for the card's prefix.
E.G.
('/place', '/place2')
(None, 'place2')
('place', None)
(None, None)
'''
raise NotImplementedError()
@ -95,8 +104,8 @@ class Device(object):
"""
Get total space available on the mountpoints:
1. Main memory
2. Memory Stick
3. SD Card
2. Memory Card A
3. Memory Card B
@return: A 3 element list with total space in bytes of (1, 2, 3). If a
particular device doesn't have any of these locations it should return 0.
@ -115,24 +124,25 @@ class Device(object):
"""
raise NotImplementedError()
def books(self, oncard=False, end_session=True):
def books(self, oncard=None, end_session=True):
"""
Return a list of ebooks on the device.
@param oncard: If True return a list of ebooks on the storage card,
otherwise return list of ebooks in main memory of device.
If True and no books on card return empty list.
@param oncard: If 'carda' or 'cardb' return a list of ebooks on the
specific storage card, otherwise return list of ebooks
in main memory of device. If a card is specified and no
books are on the card return empty list.
@return: A BookList.
"""
raise NotImplementedError()
def upload_books(self, files, names, on_card=False, end_session=True,
def upload_books(self, files, names, on_card=None, end_session=True,
metadata=None):
'''
Upload a list of books to the device. If a file already
exists on the device, it should be replaced.
This method should raise a L{FreeSpaceError} if there is not enough
free space on the device. The text of the FreeSpaceError must contain the
word "card" if C{on_card} is True otherwise it must contain the word "memory".
word "card" if C{on_card} is not None otherwise it must contain the word "memory".
@param files: A list of paths and/or file-like objects.
@param names: A list of file names that the books should have
once uploaded to the device. len(names) == len(files)
@ -163,7 +173,8 @@ class Device(object):
another dictionary that maps tag names to lists of book ids. The ids are
ids from the book database.
@param booklists: A tuple containing the result of calls to
(L{books}(oncard=False), L{books}(oncard=True)).
(L{books}(oncard=None), L{books}(oncard='carda'),
L{books}(oncard='cardb')).
'''
raise NotImplementedError
@ -180,7 +191,8 @@ class Device(object):
with the device.
@param paths: paths to books on the device.
@param booklists: A tuple containing the result of calls to
(L{books}(oncard=False), L{books}(oncard=True)).
(L{books}(oncard=None), L{books}(oncard='carda'),
L{books}(oncard='cardb')).
'''
raise NotImplementedError()
@ -188,7 +200,8 @@ class Device(object):
'''
Update metadata on device.
@param booklists: A tuple containing the result of calls to
(L{books}(oncard=False), L{books}(oncard=True)).
(L{books}(oncard=None), L{books}(oncard='carda'),
L{books}(oncard='cardb')).
'''
raise NotImplementedError()
@ -199,6 +212,30 @@ class Device(object):
'''
raise NotImplementedError()
@classmethod
def config_widget(cls):
    '''
    Return a QWidget that contains the settings of this device interface.

    This base implementation always raises NotImplementedError.
    '''
    raise NotImplementedError()
@classmethod
def save_settings(cls, settings_widget):
    '''
    Save all settings to disk.

    :param settings_widget: The widget that was created by `config_widget`.

    This base implementation always raises NotImplementedError.
    '''
    raise NotImplementedError()
@classmethod
def settings(cls):
    '''
    Return an opts object with one attribute, `format_map`, which is an
    ordered list of the formats for the device.

    This base implementation always raises NotImplementedError.
    '''
    raise NotImplementedError()
class BookList(list):

View File

@ -7,10 +7,16 @@ Device driver for Ectaco Jetbook firmware >= JL04_v030e
import os, re, sys, shutil
from itertools import cycle
from calibre.devices.usbms.driver import USBMS, metadata_from_formats
from calibre.devices.usbms.driver import USBMS
from calibre import sanitize_file_name as sanitize
class JETBOOK(USBMS):
name = 'Ectaco JetBook Device Interface'
description = _('Communicate with the JetBook eBook reader.')
author = _('James Ralston')
supported_platforms = ['windows', 'osx', 'linux']
# Ordered list of supported formats
# Be sure these have an entry in calibre.devices.mime
FORMATS = ['epub', 'mobi', 'prc', 'txt', 'rtf', 'pdf']
@ -46,27 +52,34 @@ class JETBOOK(USBMS):
names = iter(names)
metadata = iter(metadata)
for infile in files:
for i, infile in enumerate(files):
newpath = path
if self.SUPPORTS_SUB_DIRS:
mdata = metadata.next()
mdata = metadata.next()
if 'tags' in mdata.keys():
for tag in mdata['tags']:
if tag.startswith('/'):
newpath += tag
newpath = os.path.normpath(newpath)
break
if not os.path.exists(newpath):
os.makedirs(newpath)
if 'tags' in mdata.keys():
for tag in mdata['tags']:
if tag.startswith(_('News')):
newpath = os.path.join(newpath, 'news')
newpath = os.path.join(newpath, mdata.get('title', ''))
newpath = os.path.join(newpath, mdata.get('timestamp', ''))
break
elif tag.startswith('/'):
newpath += tag
newpath = os.path.normpath(newpath)
break
author = sanitize(mdata.get('authors','Unknown')).replace(' ', '_')
title = sanitize(mdata.get('title', 'Unknown')).replace(' ', '_')
fileext = os.path.splitext(os.path.basename(names.next()))[1]
fname = '%s#%s%s' % (author, title, fileext)
if newpath == path:
newpath = os.path.join(newpath, author, title)
if not os.path.exists(newpath):
os.makedirs(newpath)
filepath = os.path.join(newpath, fname)
paths.append(filepath)
@ -81,6 +94,10 @@ class JETBOOK(USBMS):
else:
shutil.copy2(infile, filepath)
self.report_progress((i+1) / float(len(files)), _('Transferring books to device...'))
self.report_progress(1.0, _('Transferring books to device...'))
return zip(paths, cycle([on_card]))
@classmethod
@ -93,6 +110,7 @@ class JETBOOK(USBMS):
return txt
from calibre.devices.usbms.driver import metadata_from_formats
mi = metadata_from_formats([path])
if (mi.title==_('Unknown') or mi.authors==[_('Unknown')]) \
@ -108,10 +126,10 @@ class JETBOOK(USBMS):
def windows_sort_drives(self, drives):
main = drives.get('main', None)
card = drives.get('card', None)
card = drives.get('carda', None)
if card and main and card < main:
drives['main'] = card
drives['card'] = main
drives['carda'] = main
return drives

22
src/calibre/devices/kindle/driver.py Executable file → Normal file
View File

@ -6,9 +6,14 @@ Device driver for Amazon's Kindle
import os, re, sys
from calibre.devices.usbms.driver import USBMS, metadata_from_formats
from calibre.devices.usbms.driver import USBMS
class KINDLE(USBMS):
name = 'Kindle Device Interface'
description = _('Communicate with the Kindle eBook reader.')
author = _('John Schember')
supported_platforms = ['windows', 'osx', 'linux']
# Ordered list of supported formats
FORMATS = ['azw', 'mobi', 'prc', 'azw1', 'tpz', 'txt']
@ -18,23 +23,24 @@ class KINDLE(USBMS):
VENDOR_NAME = 'KINDLE'
WINDOWS_MAIN_MEM = 'INTERNAL_STORAGE'
WINDOWS_CARD_MEM = 'CARD_STORAGE'
WINDOWS_CARD_A_MEM = 'CARD_STORAGE'
OSX_MAIN_MEM = 'Kindle Internal Storage Media'
OSX_CARD_MEM = 'Kindle Card Storage Media'
OSX_CARD_A_MEM = 'Kindle Card Storage Media'
MAIN_MEMORY_VOLUME_LABEL = 'Kindle Main Memory'
STORAGE_CARD_VOLUME_LABEL = 'Kindle Storage Card'
EBOOK_DIR_MAIN = "documents"
EBOOK_DIR_CARD = "documents"
EBOOK_DIR_CARD_A = "documents"
SUPPORTS_SUB_DIRS = True
WIRELESS_FILE_NAME_PATTERN = re.compile(
r'(?P<title>[^-]+)-asin_(?P<asin>[a-zA-Z\d]{10,})-type_(?P<type>\w{4})-v_(?P<index>\d+).*')
def delete_books(self, paths, end_session=True):
for path in paths:
for i, path in enumerate(paths):
self.report_progress((i+1) / float(len(paths)), _('Removing books from device...'))
if os.path.exists(path):
os.unlink(path)
@ -43,9 +49,11 @@ class KINDLE(USBMS):
# Delete the ebook auxiliary file
if os.path.exists(filepath + '.mbp'):
os.unlink(filepath + '.mbp')
self.report_progress(1.0, _('Removing books from device...'))
@classmethod
def metadata_from_path(cls, path):
from calibre.ebooks.metadata.meta import metadata_from_formats
mi = metadata_from_formats([path])
if mi.title == _('Unknown') or ('-asin' in mi.title and '-type' in mi.title):
match = cls.WIRELESS_FILE_NAME_PATTERN.match(os.path.basename(path))
@ -58,6 +66,10 @@ class KINDLE(USBMS):
class KINDLE2(KINDLE):
name = 'Kindle 2 Device Interface'
description = _('Communicate with the Kindle 2 eBook reader.')
author = _('John Schember')
supported_platforms = ['windows', 'osx', 'linux']
PRODUCT_ID = [0x0002]
BCD = [0x0100]

View File

@ -116,8 +116,8 @@ class Device(Structure):
raise Error("Cannot open device")
return handle.contents
@apply
def configurations():
@dynamic_property
def configurations(self):
doc = """ List of device configurations. See L{ConfigDescriptor} """
def fget(self):
ans = []
@ -127,8 +127,8 @@ class Device(Structure):
return property(doc=doc, fget=fget)
class Bus(Structure):
@apply
def device_list():
@dynamic_property
def device_list(self):
doc = \
"""
Flat list of devices on this bus.

View File

@ -55,8 +55,8 @@ class Book(object):
size = book_metadata_field("size", formatter=int)
# When setting this attribute you must use an epoch
datetime = book_metadata_field("date", formatter=strptime, setter=strftime)
@apply
def title_sorter():
@dynamic_property
def title_sorter(self):
doc = '''String to sort the title. If absent, title is returned'''
def fget(self):
src = self.elem.getAttribute('titleSorter').strip()
@ -67,8 +67,8 @@ class Book(object):
self.elem.setAttribute('titleSorter', sortable_title(unicode(val)))
return property(doc=doc, fget=fget, fset=fset)
@apply
def thumbnail():
@dynamic_property
def thumbnail(self):
doc = \
"""
The thumbnail. Should be a height 68 image.
@ -88,15 +88,15 @@ class Book(object):
return decode(rc)
return property(fget=fget, doc=doc)
@apply
def path():
@dynamic_property
def path(self):
doc = """ Absolute path to book on device. Setting not supported. """
def fget(self):
return self.root + self.rpath
return property(fget=fget, doc=doc)
@apply
def db_id():
@dynamic_property
def db_id(self):
doc = '''The database id in the application database that this file corresponds to'''
def fget(self):
match = re.search(r'_(\d+)$', self.rpath.rpartition('.')[0])
@ -116,7 +116,7 @@ class Book(object):
self.authors.encode('utf-8') + " at " + self.path.encode('utf-8')
def fix_ids(media, cache):
def fix_ids(media, cache, *args):
'''
Adjust ids in cache to correspond with media.
'''

View File

@ -13,7 +13,7 @@ from calibre import __version__, iswindows, __appname__
from calibre.devices.errors import PathError
from calibre.utils.terminfo import TerminalController
from calibre.devices.errors import ArgumentError, DeviceError, DeviceLocked
from calibre.devices import devices
from calibre.customize.ui import device_plugins
from calibre.devices.scanner import DeviceScanner
MINIMUM_COL_WIDTH = 12 #: Minimum width of columns in ls output
@ -39,8 +39,8 @@ class FileFormatter(object):
self.name = file.name
self.path = file.path
@apply
def mode_string():
@dynamic_property
def mode_string(self):
doc=""" The mode string for this file. There are only two modes read-only and read-write """
def fget(self):
mode, x = "-", "-"
@ -50,8 +50,8 @@ class FileFormatter(object):
return mode
return property(doc=doc, fget=fget)
@apply
def isdir_name():
@dynamic_property
def isdir_name(self):
doc='''Return self.name + '/' if self is a directory'''
def fget(self):
name = self.name
@ -61,8 +61,8 @@ class FileFormatter(object):
return property(doc=doc, fget=fget)
@apply
def name_in_color():
@dynamic_property
def name_in_color(self):
doc=""" The name in ANSI text. Directories are blue, ebooks are green """
def fget(self):
cname = self.name
@ -75,22 +75,22 @@ class FileFormatter(object):
return cname
return property(doc=doc, fget=fget)
@apply
def human_readable_size():
@dynamic_property
def human_readable_size(self):
doc=""" File size in human readable form """
def fget(self):
return human_readable(self.size)
return property(doc=doc, fget=fget)
@apply
def modification_time():
@dynamic_property
def modification_time(self):
doc=""" Last modified time in the Linux ls -l format """
def fget(self):
return time.strftime("%Y-%m-%d %H:%M", time.localtime(self.wtime))
return property(doc=doc, fget=fget)
@apply
def creation_time():
@dynamic_property
def creation_time(self):
doc=""" Last modified time in the Linux ls -l format """
def fget(self):
return time.strftime("%Y-%m-%d %H:%M", time.localtime(self.ctime))
@ -203,9 +203,10 @@ def main():
_wmi = wmi.WMI()
scanner = DeviceScanner(_wmi)
scanner.scan()
for d in devices():
for d in device_plugins():
if scanner.is_device_connected(d):
dev = d(log_packets=options.log_packets)
dev = d
dev.reset(log_packets=options.log_packets)
if dev is None:
print >>sys.stderr, 'Unable to find a connected ebook reader.'

13
src/calibre/devices/prs500/driver.py Executable file → Normal file
View File

@ -40,13 +40,14 @@ from array import array
from functools import wraps
from StringIO import StringIO
from calibre.devices.interface import Device
from calibre.devices.interface import DevicePlugin
from calibre.devices.libusb import Error as USBError
from calibre.devices.libusb import get_device_by_id
from calibre.devices.prs500.prstypes import *
from calibre.devices.errors import *
from calibre.devices.prs500.books import BookList, fix_ids
from calibre import __author__, __appname__
from calibre.devices.usbms.deviceconfig import DeviceConfig
# Protocol versions this driver has been tested with
KNOWN_USB_PROTOCOL_VERSIONS = [0x3030303030303130L]
@ -76,12 +77,16 @@ class File(object):
return self.name
class PRS500(Device):
class PRS500(DeviceConfig, DevicePlugin):
"""
Implements the backend for communication with the SONY Reader.
Each method decorated by C{safe} performs a task.
"""
name = 'PRS-500 Device Interface'
description = _('Communicate with the Sony PRS-500 eBook reader.')
author = _('Kovid Goyal')
supported_platforms = ['windows', 'osx', 'linux']
VENDOR_ID = 0x054c #: SONY Vendor Id
PRODUCT_ID = 0x029b #: Product Id for the PRS-500
@ -181,7 +186,7 @@ class PRS500(Device):
return run_session
def __init__(self, key='-1', log_packets=False, report_progress=None) :
def reset(self, key='-1', log_packets=False, report_progress=None) :
"""
@param key: The key to unlock the device
@param log_packets: If true the packet stream to/from the device is logged
@ -620,6 +625,8 @@ class PRS500(Device):
data_type=FreeSpaceAnswer, \
command_number=FreeSpaceQuery.NUMBER)[0]
data.append( pkt.free )
data = [x for x in data if x != 0]
data.append(0)
return data
def _exists(self, path):

View File

@ -284,8 +284,8 @@ class Command(TransferBuffer):
# Length of the data part of this packet
length = field(start=12, fmt=DWORD)
@apply
def data():
@dynamic_property
def data(self):
doc = \
"""
The data part of this command. Returned/set as/by a TransferBuffer.
@ -447,8 +447,8 @@ class LongCommand(Command):
self.length = 16
self.command = command
@apply
def command():
@dynamic_property
def command(self):
doc = \
"""
Usually carries extra information needed for the command
@ -568,8 +568,8 @@ class FileOpen(PathCommand):
PathCommand.__init__(self, path, FileOpen.NUMBER, path_len_at_byte=20)
self.mode = mode
@apply
def mode():
@dynamic_property
def mode(self):
doc = \
"""
The file open mode. Is either L{FileOpen.READ}
@ -651,8 +651,8 @@ class Response(Command):
raise PacketError("Response packets must have their number set to " \
+ hex(0x00001000))
@apply
def data():
@dynamic_property
def data(self):
doc = \
"""
The last 3 DWORDs (12 bytes) of data in this
@ -681,43 +681,43 @@ class ListResponse(Response):
PATH_NOT_FOUND = 0xffffffd7 #: Queried path is not found
PERMISSION_DENIED = 0xffffffd6 #: Permission denied
@apply
def is_file():
@dynamic_property
def is_file(self):
doc = """ True iff queried path is a file """
def fget(self):
return self.code == ListResponse.IS_FILE
return property(doc=doc, fget=fget)
@apply
def is_invalid():
@dynamic_property
def is_invalid(self):
doc = """ True iff queried path is invalid """
def fget(self):
return self.code == ListResponse.IS_INVALID
return property(doc=doc, fget=fget)
@apply
def path_not_found():
@dynamic_property
def path_not_found(self):
doc = """ True iff queried path is not found """
def fget(self):
return self.code == ListResponse.PATH_NOT_FOUND
return property(doc=doc, fget=fget)
@apply
def permission_denied():
@dynamic_property
def permission_denied(self):
doc = """ True iff permission is denied for path operations """
def fget(self):
return self.code == ListResponse.PERMISSION_DENIED
return property(doc=doc, fget=fget)
@apply
def is_unmounted():
@dynamic_property
def is_unmounted(self):
doc = """ True iff queried path is unmounted (i.e. removed storage card) """
def fget(self):
return self.code == ListResponse.IS_UNMOUNTED
return property(doc=doc, fget=fget)
@apply
def is_eol():
@dynamic_property
def is_eol(self):
doc = """ True iff there are no more items in the list """
def fget(self):
return self.code == ListResponse.IS_EOL
@ -759,8 +759,8 @@ class FileProperties(Answer):
# 0 = default permissions, 4 = read only
permissions = field(start=36, fmt=DWORD)
@apply
def is_dir():
@dynamic_property
def is_dir(self):
doc = """True if path points to a directory, False if it points to a file."""
def fget(self):
@ -776,8 +776,8 @@ class FileProperties(Answer):
return property(doc=doc, fget=fget, fset=fset)
@apply
def is_readonly():
@dynamic_property
def is_readonly(self):
doc = """ Whether this file is readonly."""
def fget(self):
@ -801,8 +801,8 @@ class IdAnswer(Answer):
""" Defines the structure of packets that contain identifiers for queries. """
@apply
def id():
@dynamic_property
def id(self):
doc = \
"""
The identifier. C{unsigned int} stored in 4 bytes
@ -841,8 +841,8 @@ class ListAnswer(Answer):
name_length = field(start=20, fmt=DWORD)
name = stringfield(name_length, start=24)
@apply
def is_dir():
@dynamic_property
def is_dir(self):
doc = \
"""
True if list item points to a directory, False if it points to a file.
@ -859,4 +859,3 @@ class ListAnswer(Answer):
return property(doc=doc, fget=fget, fset=fset)

View File

@ -64,8 +64,8 @@ class Book(object):
# When setting this attribute you must use an epoch
datetime = book_metadata_field("date", formatter=strptime, setter=strftime)
@apply
def title_sorter():
@dynamic_property
def title_sorter(self):
doc = '''String to sort the title. If absent, title is returned'''
def fget(self):
src = self.elem.getAttribute('titleSorter').strip()
@ -76,8 +76,8 @@ class Book(object):
self.elem.setAttribute('titleSorter', sortable_title(unicode(val)))
return property(doc=doc, fget=fget, fset=fset)
@apply
def thumbnail():
@dynamic_property
def thumbnail(self):
doc = \
"""
The thumbnail. Should be a height 68 image.
@ -99,15 +99,15 @@ class Book(object):
return decode(rc)
return property(fget=fget, doc=doc)
@apply
def path():
@dynamic_property
def path(self):
doc = """ Absolute path to book on device. Setting not supported. """
def fget(self):
return self.mountpath + self.rpath
return property(fget=fget, doc=doc)
@apply
def db_id():
@dynamic_property
def db_id(self):
doc = '''The database id in the application database that this file corresponds to'''
def fget(self):
match = re.search(r'_(\d+)$', self.rpath.rpartition('.')[0])
@ -129,7 +129,7 @@ class Book(object):
class BookList(_BookList):
def __init__(self, xml_file, mountpath):
def __init__(self, xml_file, mountpath, report_progress=None):
_BookList.__init__(self)
xml_file.seek(0)
self.document = dom.parse(xml_file)
@ -144,7 +144,10 @@ class BookList(_BookList):
else:
self.prefix = ''
for book in self.root_element.childNodes:
nodes = self.root_element.childNodes
for i, book in enumerate(nodes):
if report_progress:
report_progress((i+1) / float(len(nodes)), _('Getting list of books on device...'))
if hasattr(book, 'tagName') and book.tagName.endswith('text'):
tags = [i.getAttribute('title') for i in self.get_playlists(book.getAttribute('id'))]
self.append(Book(book, mountpath, tags, prefix=self.prefix))
@ -380,14 +383,16 @@ class BookList(_BookList):
item.setAttribute('id', str(map[id]))
pl.appendChild(item)
def fix_ids(main, card):
def fix_ids(main, carda, cardb):
'''
Adjust ids the XML databases.
'''
if hasattr(main, 'purge_empty_playlists'):
main.purge_empty_playlists()
if hasattr(card, 'purge_empty_playlists'):
card.purge_empty_playlists()
if hasattr(carda, 'purge_empty_playlists'):
carda.purge_empty_playlists()
if hasattr(cardb, 'purge_empty_playlists'):
cardb.purge_empty_playlists()
def regen_ids(db):
if not hasattr(db, 'root_element'):
@ -413,6 +418,7 @@ def fix_ids(main, card):
db.reorder_playlists()
regen_ids(main)
regen_ids(card)
regen_ids(carda)
regen_ids(cardb)
main.set_next_id(str(main.max_id()+1))

View File

@ -1,399 +1,120 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
'2009, John Schember <john at nachtimwald.com>'
'''
Device driver for the SONY PRS-505
'''
import sys, os, shutil, time, subprocess, re
import os, time
from itertools import cycle
from calibre.devices.interface import Device
from calibre.devices.usbms.cli import CLI
from calibre.devices.usbms.device import Device
from calibre.devices.errors import DeviceError, FreeSpaceError
from calibre.devices.prs505.books import BookList, fix_ids
from calibre import iswindows, islinux, isosx, __appname__
from calibre.devices.errors import PathError
from calibre import __appname__
class File(object):
    '''
    Snapshot of the filesystem metadata for a single path.

    Attributes set by the constructor:

    * ``is_dir``      -- result of ``os.path.isdir(path)``
    * ``is_readonly`` -- True when ``os.access(path, os.W_OK)`` fails
    * ``ctime``       -- ``st_ctime`` from ``os.stat``
    * ``wtime``       -- ``st_mtime`` from ``os.stat``
    * ``size``        -- ``st_size`` from ``os.stat``
    * ``path``        -- the path with one trailing separator removed
    * ``name``        -- ``os.path.basename`` of ``path``
    '''

    def __init__(self, path):
        '''
        :param path: Path to inspect. ``os.stat`` is called on it, so it
                     must exist; otherwise the ``OSError`` propagates.
        '''
        stats = os.stat(path)
        self.is_dir = os.path.isdir(path)
        self.is_readonly = not os.access(path, os.W_OK)
        self.ctime = stats.st_ctime
        self.wtime = stats.st_mtime
        self.size = stats.st_size
        # Drop a single trailing separator so that basename() yields the
        # directory name, but never reduce a filesystem root ('/' or
        # 'C:\\') to an empty or drive-relative path.
        tail = os.path.splitdrive(path)[1]
        if len(tail) > 1 and path.endswith(os.sep):
            path = path[:-1]
        self.path = path
        self.name = os.path.basename(path)
class PRS505(CLI, Device):
name = 'PRS-505 Device Interface'
description = _('Communicate with the Sony PRS-505 eBook reader.')
author = _('Kovid Goyal and John Schember')
supported_platforms = ['windows', 'osx', 'linux']
class PRS505(Device):
VENDOR_ID = 0x054c #: SONY Vendor Id
PRODUCT_ID = 0x031e #: Product Id for the PRS-505
BCD = [0x229] #: Needed to disambiguate 505 and 700 on linux
PRODUCT_NAME = 'PRS-505'
VENDOR_NAME = 'SONY'
FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt']
MEDIA_XML = 'database/cache/media.xml'
CACHE_XML = 'Sony Reader/database/cache.xml'
VENDOR_ID = [0x054c] #: SONY Vendor Id
PRODUCT_ID = [0x031e] #: Product Id for the PRS-505
BCD = [0x229] #: Needed to disambiguate 505 and 700 on linux
VENDOR_NAME = 'SONY'
WINDOWS_MAIN_MEM = 'PRS-505'
WINDOWS_CARD_A_MEM = 'PRS-505/UC:MS'
WINDOWS_CARD_B_MEM = 'PRS-505/UC:SD'
OSX_MAIN_MEM = 'Sony PRS-505/UC Media'
OSX_CARD_A_MEM = 'Sony PRS-505/UC:MS Media'
OSX_CARD_B_MEM = 'Sony PRS-505/UC:SD'
MAIN_MEMORY_VOLUME_LABEL = 'Sony Reader Main Memory'
STORAGE_CARD_VOLUME_LABEL = 'Sony Reader Storage Card'
OSX_NAME = 'Sony PRS-505'
MEDIA_XML = 'database/cache/media.xml'
CACHE_XML = 'Sony Reader/database/cache.xml'
CARD_PATH_PREFIX = __appname__
FDI_TEMPLATE = \
'''
<device>
<match key="info.category" string="volume">
<match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.vendor_id" int="%(vendor_id)s">
<match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.product_id" int="%(product_id)s">
<match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.device_revision_bcd" int="%(bcd)s">
<match key="volume.is_partition" bool="false">
<merge key="volume.label" type="string">%(main_memory)s</merge>
<merge key="%(app)s.mainvolume" type="string">%(deviceclass)s</merge>
</match>
</match>
</match>
</match>
</match>
</device>
<device>
<match key="info.category" string="volume">
<match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.vendor_id" int="%(vendor_id)s">
<match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.product_id" int="%(product_id)s">
<match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.device_revision_bcd" int="%(bcd)s">
<match key="volume.is_partition" bool="true">
<merge key="volume.label" type="string">%(storage_card)s</merge>
<merge key="%(app)s.cardvolume" type="string">%(deviceclass)s</merge>
</match>
</match>
</match>
</match>
</match>
</device>
'''.replace('%(app)s', __appname__)
def __init__(self, log_packets=False):
self._main_prefix = self._card_prefix = None
@classmethod
def get_fdi(cls):
return cls.FDI_TEMPLATE%dict(
deviceclass=cls.__name__,
vendor_id=hex(cls.VENDOR_ID),
product_id=hex(cls.PRODUCT_ID),
bcd=hex(cls.BCD[0]),
main_memory=cls.MAIN_MEMORY_VOLUME_LABEL,
storage_card=cls.STORAGE_CARD_VOLUME_LABEL,
)
@classmethod
def is_device(cls, device_id):
device_id = device_id.upper()
if 'VEN_'+cls.VENDOR_NAME in device_id and \
'PROD_'+cls.PRODUCT_NAME in device_id:
return True
vid, pid = hex(cls.VENDOR_ID)[2:], hex(cls.PRODUCT_ID)[2:]
if len(vid) < 4: vid = '0'+vid
if len(pid) < 4: pid = '0'+pid
if 'VID_'+vid in device_id and \
'PID_'+pid in device_id:
return True
return False
@classmethod
def get_osx_mountpoints(cls, raw=None):
if raw is None:
ioreg = '/usr/sbin/ioreg'
if not os.access(ioreg, os.X_OK):
ioreg = 'ioreg'
raw = subprocess.Popen((ioreg+' -w 0 -S -c IOMedia').split(),
stdout=subprocess.PIPE).communicate()[0]
lines = raw.splitlines()
names = {}
for i, line in enumerate(lines):
if line.strip().endswith('<class IOMedia>') and cls.OSX_NAME in line:
loc = 'stick' if ':MS' in line else 'card' if ':SD' in line else 'main'
for line in lines[i+1:]:
line = line.strip()
if line.endswith('}'):
break
match = re.search(r'"BSD Name"\s+=\s+"(.*?)"', line)
if match is not None:
names[loc] = match.group(1)
break
if len(names.keys()) == 3:
break
return names
def open_osx(self):
mount = subprocess.Popen('mount', shell=True,
stdout=subprocess.PIPE).stdout.read()
names = self.get_osx_mountpoints()
dev_pat = r'/dev/%s(\w*)\s+on\s+([^\(]+)\s+'
if 'main' not in names.keys():
raise DeviceError(_('Unable to detect the %s disk drive. Try rebooting.')%self.__class__.__name__)
main_pat = dev_pat%names['main']
self._main_prefix = re.search(main_pat, mount).group(2) + os.sep
card_pat = names['stick'] if 'stick' in names.keys() else names['card'] if 'card' in names.keys() else None
if card_pat is not None:
card_pat = dev_pat%card_pat
self._card_prefix = re.search(card_pat, mount).group(2) + os.sep
def open_windows(self):
time.sleep(6)
drives = []
wmi = __import__('wmi', globals(), locals(), [], -1)
c = wmi.WMI(find_classes=False)
for drive in c.Win32_DiskDrive():
if self.__class__.is_device(str(drive.PNPDeviceID)):
if drive.Partitions == 0:
continue
try:
partition = drive.associators("Win32_DiskDriveToDiskPartition")[0]
logical_disk = partition.associators('Win32_LogicalDiskToPartition')[0]
prefix = logical_disk.DeviceID+os.sep
drives.append((drive.Index, prefix))
except IndexError:
continue
if not drives:
raise DeviceError(_('Unable to detect the %s disk drive. Try rebooting.')%self.__class__.__name__)
drives.sort(cmp=lambda a, b: cmp(a[0], b[0]))
self._main_prefix = drives[0][1]
if len(drives) > 1:
self._card_prefix = drives[1][1]
def open_linux(self):
import dbus
bus = dbus.SystemBus()
hm = dbus.Interface(bus.get_object("org.freedesktop.Hal", "/org/freedesktop/Hal/Manager"), "org.freedesktop.Hal.Manager")
def conditional_mount(dev, main_mem=True):
mmo = bus.get_object("org.freedesktop.Hal", dev)
label = mmo.GetPropertyString('volume.label', dbus_interface='org.freedesktop.Hal.Device')
is_mounted = mmo.GetPropertyString('volume.is_mounted', dbus_interface='org.freedesktop.Hal.Device')
mount_point = mmo.GetPropertyString('volume.mount_point', dbus_interface='org.freedesktop.Hal.Device')
fstype = mmo.GetPropertyString('volume.fstype', dbus_interface='org.freedesktop.Hal.Device')
if is_mounted:
return str(mount_point)
mmo.Mount(label, fstype, ['umask=077', 'uid='+str(os.getuid()), 'sync'],
dbus_interface='org.freedesktop.Hal.Device.Volume')
return os.path.normpath('/media/'+label)+'/'
mm = hm.FindDeviceStringMatch(__appname__+'.mainvolume', self.__class__.__name__)
if not mm:
raise DeviceError(_('Unable to detect the %s disk drive. Try rebooting.')%(self.__class__.__name__,))
self._main_prefix = None
for dev in mm:
try:
self._main_prefix = conditional_mount(dev)+os.sep
break
except dbus.exceptions.DBusException:
continue
if not self._main_prefix:
raise DeviceError('Could not open device for reading. Try a reboot.')
self._card_prefix = None
cards = hm.FindDeviceStringMatch(__appname__+'.cardvolume', self.__class__.__name__)
keys = []
for card in cards:
keys.append(int('UC_SD' in bus.get_object("org.freedesktop.Hal", card).GetPropertyString('info.parent', dbus_interface='org.freedesktop.Hal.Device')))
cards = zip(cards, keys)
cards.sort(cmp=lambda x, y: cmp(x[1], y[1]))
cards = [i[0] for i in cards]
for dev in cards:
try:
self._card_prefix = conditional_mount(dev, False)+os.sep
break
except:
import traceback
print traceback
continue
def open(self):
time.sleep(5)
self._main_prefix = self._card_prefix = None
if islinux:
Device.open(self)
def write_cache(prefix):
try:
self.open_linux()
except DeviceError:
time.sleep(3)
self.open_linux()
if iswindows:
try:
self.open_windows()
except DeviceError:
time.sleep(3)
self.open_windows()
if isosx:
try:
self.open_osx()
except DeviceError:
time.sleep(3)
self.open_osx()
if self._card_prefix is not None:
try:
cachep = os.path.join(self._card_prefix, self.CACHE_XML)
cachep = os.path.join(prefix, self.CACHE_XML)
if not os.path.exists(cachep):
try:
os.makedirs(os.path.dirname(cachep), mode=0777)
except:
time.sleep(5)
os.makedirs(os.path.dirname(cachep), mode=0777)
f = open(cachep, 'wb')
f.write(u'''<?xml version="1.0" encoding="UTF-8"?>
<cache xmlns="http://www.kinoma.com/FskCache/1">
</cache>
'''.encode('utf8'))
f.close()
with open(cachep, 'wb') as f:
f.write(u'''<?xml version="1.0" encoding="UTF-8"?>
<cache xmlns="http://www.kinoma.com/FskCache/1">
</cache>
'''.encode('utf8'))
return True
except:
self._card_prefix = None
import traceback
traceback.print_exc()
return False
def set_progress_reporter(self, pr):
self.report_progress = pr
if self._card_a_prefix is not None:
if not write_cache(self._card_a_prefix):
self._card_a_prefix = None
if self._card_b_prefix is not None:
if not write_cache(self._card_b_prefix):
self._card_b_prefix = None
def get_device_information(self, end_session=True):
self.report_progress(1.0, _('Get device information...'))
return (self.__class__.__name__, '', '', '')
def card_prefix(self, end_session=True):
return self._card_prefix
@classmethod
def _windows_space(cls, prefix):
if prefix is None:
return 0, 0
win32file = __import__('win32file', globals(), locals(), [], -1)
try:
sectors_per_cluster, bytes_per_sector, free_clusters, total_clusters = \
win32file.GetDiskFreeSpace(prefix[:-1])
except Exception, err:
if getattr(err, 'args', [None])[0] == 21: # Disk not ready
time.sleep(3)
sectors_per_cluster, bytes_per_sector, free_clusters, total_clusters = \
win32file.GetDiskFreeSpace(prefix[:-1])
else: raise
mult = sectors_per_cluster * bytes_per_sector
return total_clusters * mult, free_clusters * mult
def total_space(self, end_session=True):
msz = csz = 0
if not iswindows:
if self._main_prefix is not None:
stats = os.statvfs(self._main_prefix)
msz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree)
if self._card_prefix is not None:
stats = os.statvfs(self._card_prefix)
csz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree)
else:
msz = self._windows_space(self._main_prefix)[0]
csz = self._windows_space(self._card_prefix)[0]
return (msz, 0, csz)
def free_space(self, end_session=True):
msz = csz = 0
if not iswindows:
if self._main_prefix is not None:
stats = os.statvfs(self._main_prefix)
msz = stats.f_frsize * stats.f_bavail
if self._card_prefix is not None:
stats = os.statvfs(self._card_prefix)
csz = stats.f_frsize * stats.f_bavail
else:
msz = self._windows_space(self._main_prefix)[1]
csz = self._windows_space(self._card_prefix)[1]
return (msz, 0, csz)
def books(self, oncard=False, end_session=True):
if oncard and self._card_prefix is None:
def books(self, oncard=None, end_session=True):
if oncard == 'carda' and not self._card_a_prefix:
self.report_progress(1.0, _('Getting list of books on device...'))
return []
elif oncard == 'cardb' and not self._card_b_prefix:
self.report_progress(1.0, _('Getting list of books on device...'))
return []
elif oncard and oncard != 'carda' and oncard != 'cardb':
self.report_progress(1.0, _('Getting list of books on device...'))
return []
db = self.__class__.CACHE_XML if oncard else self.__class__.MEDIA_XML
prefix = self._card_prefix if oncard else self._main_prefix
bl = BookList(open(prefix + db, 'rb'), prefix)
prefix = self._card_a_prefix if oncard == 'carda' else self._card_b_prefix if oncard == 'cardb' else self._main_prefix
bl = BookList(open(prefix + db, 'rb'), prefix, self.report_progress)
paths = bl.purge_corrupted_files()
for path in paths:
path = os.path.join(self._card_prefix if oncard else self._main_prefix, path)
path = os.path.join(prefix, path)
if os.path.exists(path):
os.unlink(path)
self.report_progress(1.0, _('Getting list of books on device...'))
return bl
def munge_path(self, path):
if path.startswith('/') and not (path.startswith(self._main_prefix) or \
(self._card_prefix and path.startswith(self._card_prefix))):
path = self._main_prefix + path[1:]
elif path.startswith('card:'):
path = path.replace('card:', self._card_prefix[:-1])
return path
def mkdir(self, path, end_session=True):
""" Make directory """
path = self.munge_path(path)
os.mkdir(path)
def list(self, path, recurse=False, end_session=True, munge=True):
if munge:
path = self.munge_path(path)
if os.path.isfile(path):
return [(os.path.dirname(path), [File(path)])]
entries = [File(os.path.join(path, f)) for f in os.listdir(path)]
dirs = [(path, entries)]
for _file in entries:
if recurse and _file.is_dir:
dirs[len(dirs):] = self.list(_file.path, recurse=True, munge=False)
return dirs
def get_file(self, path, outfile, end_session=True):
path = self.munge_path(path)
src = open(path, 'rb')
shutil.copyfileobj(src, outfile, 10*1024*1024)
def put_file(self, infile, path, replace_file=False, end_session=True):
path = self.munge_path(path)
if os.path.isdir(path):
path = os.path.join(path, infile.name)
if not replace_file and os.path.exists(path):
raise PathError('File already exists: '+path)
dest = open(path, 'wb')
shutil.copyfileobj(infile, dest, 10*1024*1024)
dest.flush()
dest.close()
def rm(self, path, end_session=True):
path = self.munge_path(path)
os.unlink(path)
def touch(self, path, end_session=True):
path = self.munge_path(path)
if not os.path.exists(path):
open(path, 'w').close()
if not os.path.isdir(path):
os.utime(path, None)
def upload_books(self, files, names, on_card=False, end_session=True,
def upload_books(self, files, names, on_card=None, end_session=True,
metadata=None):
if on_card and not self._card_prefix:
raise ValueError(_('The reader has no storage card connected.'))
path = os.path.join(self._card_prefix, self.CARD_PATH_PREFIX) if on_card \
else os.path.join(self._main_prefix, 'database', 'media', 'books')
if on_card == 'carda' and not self._card_a_prefix:
raise ValueError(_('The reader has no storage card in this slot.'))
elif on_card == 'cardb' and not self._card_b_prefix:
raise ValueError(_('The reader has no storage card in this slot.'))
elif on_card and on_card not in ('carda', 'cardb'):
raise DeviceError(_('The reader has no storage card in this slot.'))
if on_card == 'carda':
path = os.path.join(self._card_a_prefix, self.CARD_PATH_PREFIX)
elif on_card == 'cardb':
path = os.path.join(self._card_b_prefix, self.CARD_PATH_PREFIX)
else:
path = os.path.join(self._main_prefix, 'database', 'media', 'books')
def get_size(obj):
if hasattr(obj, 'seek'):
@ -403,34 +124,61 @@ class PRS505(Device):
return size
return os.path.getsize(obj)
sizes = map(get_size, files)
sizes = [get_size(f) for f in files]
size = sum(sizes)
space = self.free_space()
mspace = space[0]
cspace = space[2]
if on_card and size > cspace - 1024*1024:
raise FreeSpaceError("There is insufficient free space "+\
"on the storage card")
if not on_card and size > mspace - 2*1024*1024:
raise FreeSpaceError("There is insufficient free space " +\
"in main memory")
if not on_card and size > self.free_space()[0] - 2*1024*1024:
raise FreeSpaceError(_("There is insufficient free space in main memory"))
if on_card == 'carda' and size > self.free_space()[1] - 1024*1024:
raise FreeSpaceError(_("There is insufficient free space on the storage card"))
if on_card == 'cardb' and size > self.free_space()[2] - 1024*1024:
raise FreeSpaceError(_("There is insufficient free space on the storage card"))
paths, ctimes = [], []
names = iter(names)
for infile in files:
metadata = iter(metadata)
for i, infile in enumerate(files):
close = False
if not hasattr(infile, 'read'):
infile, close = open(infile, 'rb'), True
infile.seek(0)
name = names.next()
paths.append(os.path.join(path, name))
if not os.path.exists(os.path.dirname(paths[-1])):
os.makedirs(os.path.dirname(paths[-1]))
newpath = path
mdata = metadata.next()
if 'tags' in mdata.keys():
for tag in mdata['tags']:
if tag.startswith(_('News')):
newpath = os.path.join(newpath, 'news')
newpath = os.path.join(newpath, mdata.get('title', ''))
newpath = os.path.join(newpath, mdata.get('timestamp', ''))
elif tag.startswith('/'):
newpath = path
newpath += tag
newpath = os.path.normpath(newpath)
break
if newpath == path:
newpath = os.path.join(newpath, mdata.get('authors', _('Unknown')))
newpath = os.path.join(newpath, mdata.get('title', _('Unknown')))
if not os.path.exists(newpath):
os.makedirs(newpath)
filepath = os.path.join(newpath, names.next())
paths.append(filepath)
self.put_file(infile, paths[-1], replace_file=True)
if close:
infile.close()
ctimes.append(os.path.getctime(paths[-1]))
self.report_progress((i+1) / float(len(files)), _('Transferring books to device...'))
self.report_progress(1.0, _('Transferring books to device...'))
return zip(paths, sizes, ctimes, cycle([on_card]))
@classmethod
@ -439,17 +187,19 @@ class PRS505(Device):
for location in locations:
info = metadata.next()
path = location[0]
on_card = 1 if location[3] else 0
blist = 2 if location[3] == 'cardb' else 1 if location[3] == 'carda' else 0
name = path.rpartition(os.sep)[2]
name = (cls.CARD_PATH_PREFIX+'/' if on_card else 'database/media/books/') + name
name = (cls.CARD_PATH_PREFIX+'/' if blist else 'database/media/books/') + name
name = name.replace('//', '/')
booklists[on_card].add_book(info, name, *location[1:-1])
booklists[blist].add_book(info, name, *location[1:-1])
fix_ids(*booklists)
def delete_books(self, paths, end_session=True):
for path in paths:
for i, path in enumerate(paths):
self.report_progress((i+1) / float(len(paths)), _('Removing books from device...'))
if os.path.exists(path):
os.unlink(path)
self.report_progress(1.0, _('Removing books from device...'))
@classmethod
def remove_books_from_metadata(cls, paths, booklists):
@ -466,18 +216,15 @@ class PRS505(Device):
f = open(self._main_prefix + self.__class__.MEDIA_XML, 'wb')
booklists[0].write(f)
f.close()
if self._card_prefix is not None and hasattr(booklists[1], 'write'):
if not os.path.exists(self._card_prefix):
os.makedirs(self._card_prefix)
f = open(self._card_prefix + self.__class__.CACHE_XML, 'wb')
booklists[1].write(f)
f.close()
def write_card_prefix(prefix, listid):
if prefix is not None and hasattr(booklists[listid], 'write'):
if not os.path.exists(prefix):
os.makedirs(prefix)
f = open(prefix + self.__class__.CACHE_XML, 'wb')
booklists[listid].write(f)
f.close()
write_card_prefix(self._card_a_prefix, 1)
write_card_prefix(self._card_b_prefix, 2)
def main(args=sys.argv):
return 0
if __name__ == '__main__':
sys.exit(main())
self.report_progress(1.0, _('Sending metadata to device...'))

View File

@ -9,7 +9,18 @@ from calibre.devices.prs505.driver import PRS505
class PRS700(PRS505):
BCD = [0x31a]
PRODUCT_NAME = 'PRS-700'
OSX_NAME = 'Sony PRS-700'
name = 'PRS-700 Device Interface'
description = _('Communicate with the Sony PRS-700 eBook reader.')
author = _('Kovid Goyal and John Schember')
supported_platforms = ['windows', 'osx', 'linux']
BCD = [0x31a]
WINDOWS_MAIN_MEM = 'PRS-700'
WINDOWS_CARD_A_MEM = 'PRS-700/UC:MS'
WINDOWS_CARD_B_MEM = 'PRS-700/UC:SD'
OSX_MAIN_MEM = 'Sony PRS-700/UC Media'
OSX_CARD_A_MEM = 'Sony PRS-700/UC:MS Media'
OSX_CARD_B_MEM = 'Sony PRS-700/UC:SD'

View File

@ -21,15 +21,15 @@ class Book(object):
def __eq__(self, other):
return self.path == other.path
@apply
def title_sorter():
@dynamic_property
def title_sorter(self):
doc = '''String to sort the title. If absent, title is returned'''
def fget(self):
return re.sub('^\s*A\s+|^\s*The\s+|^\s*An\s+', '', self.title).rstrip()
return property(doc=doc, fget=fget)
@apply
def thumbnail():
@dynamic_property
def thumbnail(self):
return None
def __str__(self):
@ -44,4 +44,3 @@ class BookList(_BookList):
def set_tags(self, book, tags):
pass

View File

@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, shutil
from calibre.devices.errors import PathError
class File(object):
    '''
    Snapshot of the filesystem metadata for a single path.

    Attributes set by the constructor:

    * ``is_dir``      -- result of ``os.path.isdir(path)``
    * ``is_readonly`` -- True when ``os.access(path, os.W_OK)`` fails
    * ``ctime``       -- ``st_ctime`` from ``os.stat``
    * ``wtime``       -- ``st_mtime`` from ``os.stat``
    * ``size``        -- ``st_size`` from ``os.stat``
    * ``path``        -- the path with one trailing separator removed
    * ``name``        -- ``os.path.basename`` of ``path``
    '''

    def __init__(self, path):
        '''
        :param path: Path to inspect. ``os.stat`` is called on it, so it
                     must exist; otherwise the ``OSError`` propagates.
        '''
        stats = os.stat(path)
        self.is_dir = os.path.isdir(path)
        self.is_readonly = not os.access(path, os.W_OK)
        self.ctime = stats.st_ctime
        self.wtime = stats.st_mtime
        self.size = stats.st_size
        # Drop a single trailing separator so that basename() yields the
        # directory name, but never reduce a filesystem root ('/' or
        # 'C:\\') to an empty or drive-relative path.
        tail = os.path.splitdrive(path)[1]
        if len(tail) > 1 and path.endswith(os.sep):
            path = path[:-1]
        self.path = path
        self.name = os.path.basename(path)
class CLI(object):
    '''
    File-level operations on a device that is mounted as a filesystem.

    The methods read ``self._main_prefix``, ``self._card_a_prefix`` and
    ``self._card_b_prefix``; :meth:`mkdir` reads ``self.SUPPORTS_SUB_DIRS``
    and :meth:`rm` calls ``self.delete_books``. None of these are defined
    here, so this class has to be combined with one that provides them.

    Every method accepts an ``end_session`` argument that is not used by
    this implementation.
    '''

    def get_file(self, path, outfile, end_session=True):
        '''
        Copy the file at `path` into `outfile`.

        :param path:    Device path; passed through :meth:`munge_path`.
        :param outfile: Writable file-like object receiving the bytes.
        '''
        path = self.munge_path(path)
        with open(path, 'rb') as src:
            shutil.copyfileobj(src, outfile, 10*1024*1024)

    def put_file(self, infile, path, replace_file=False, end_session=True):
        '''
        Write the contents of `infile` to `path`.

        :param infile:       Readable file-like object. When `path` is a
                             directory, ``infile.name`` is used as the file
                             name inside it.
        :param path:         Device path; passed through :meth:`munge_path`.
        :param replace_file: When False, an existing destination raises
                             ``PathError`` instead of being overwritten.
        '''
        path = self.munge_path(path)
        if os.path.isdir(path):
            path = os.path.join(path, infile.name)
        if not replace_file and os.path.exists(path):
            raise PathError('File already exists: ' + path)
        # The with block closes (and thereby flushes) the destination even
        # when the copy fails part way through.
        with open(path, 'wb') as dest:
            shutil.copyfileobj(infile, dest, 10*1024*1024)

    def munge_path(self, path):
        '''
        Translate a device-style path into a path on the host filesystem.

        * A path starting with ``/`` that is not already below the main
          memory or one of the card prefixes is re-rooted at the main
          memory prefix.
        * ``carda:`` and ``cardb:`` prefixes are replaced by the prefix of
          the corresponding storage card (without its trailing separator).
        * Anything else is returned unchanged.

        :raises PathError: if a ``carda:``/``cardb:`` path is given while
                           the corresponding card prefix is not set.
        '''
        if path.startswith('/'):
            on_device = path.startswith(self._main_prefix) or \
                (self._card_a_prefix and path.startswith(self._card_a_prefix)) or \
                (self._card_b_prefix and path.startswith(self._card_b_prefix))
            if not on_device:
                path = self._main_prefix + path[1:]
        elif path.startswith('carda:'):
            if not self._card_a_prefix:
                raise PathError('No storage card mounted in slot A: ' + path)
            # Only the leading scheme is substituted; later occurrences of
            # the text 'carda:' inside the path are left alone.
            path = self._card_a_prefix[:-1] + path[len('carda:'):]
        elif path.startswith('cardb:'):
            if not self._card_b_prefix:
                raise PathError('No storage card mounted in slot B: ' + path)
            path = self._card_b_prefix[:-1] + path[len('cardb:'):]
        return path

    def list(self, path, recurse=False, end_session=True, munge=True):
        '''
        List the contents of `path`.

        :param path:    File or directory to list.
        :param recurse: When True, descend into sub-directories.
        :param munge:   When True, `path` is first passed through
                        :meth:`munge_path`. Recursive calls pass False
                        because the paths are already translated.
        :return: A list of ``(directory, [File, ...])`` tuples. For a
                 regular file the list holds one tuple with the file's
                 parent directory and a single ``File``.
        '''
        if munge:
            path = self.munge_path(path)
        if os.path.isfile(path):
            return [(os.path.dirname(path), [File(path)])]
        entries = [File(os.path.join(path, f)) for f in os.listdir(path)]
        dirs = [(path, entries)]
        for _file in entries:
            if recurse and _file.is_dir:
                dirs.extend(self.list(_file.path, recurse=True, munge=False))
        return dirs

    def mkdir(self, path, end_session=True):
        '''
        Create the directory `path`; does nothing when
        ``self.SUPPORTS_SUB_DIRS`` is false.
        '''
        if self.SUPPORTS_SUB_DIRS:
            path = self.munge_path(path)
            os.mkdir(path)

    def rm(self, path, end_session=True):
        '''Remove `path` by delegating to ``self.delete_books``.'''
        path = self.munge_path(path)
        self.delete_books([path])

    def touch(self, path, end_session=True):
        '''
        Create `path` as an empty file if it does not exist, then update
        its access/modification times unless it is a directory.
        '''
        path = self.munge_path(path)
        if not os.path.exists(path):
            open(path, 'w').close()
        if not os.path.isdir(path):
            os.utime(path, None)

View File

@ -8,11 +8,12 @@ device. This class handles device detection.
import os, subprocess, time, re
from calibre.devices.interface import Device as _Device
from calibre.devices.interface import DevicePlugin
from calibre.devices.errors import DeviceError
from calibre.devices.usbms.deviceconfig import DeviceConfig
from calibre import iswindows, islinux, isosx, __appname__
class Device(_Device):
class Device(DeviceConfig, DevicePlugin):
'''
This class provides logic common to all drivers for devices that export themselves
as USB Mass Storage devices. If you are writing such a driver, inherit from this
@ -25,10 +26,12 @@ class Device(_Device):
VENDOR_NAME = None
WINDOWS_MAIN_MEM = None
WINDOWS_CARD_MEM = None
WINDOWS_CARD_A_MEM = None
WINDOWS_CARD_B_MEM = None
OSX_MAIN_MEM = None
OSX_CARD_MEM = None
OSX_CARD_A_MEM = None
OSX_CARD_B_MEM = None
MAIN_MEMORY_VOLUME_LABEL = ''
STORAGE_CARD_VOLUME_LABEL = ''
@ -63,18 +66,30 @@ class Device(_Device):
</match>
</match>
</device>
<device>
<match key="info.category" string="volume">
<match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.vendor_id" int="%(vendor_id)s">
<match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.product_id" int="%(product_id)s">
%(BCD_start)s
<match key="@info.parent:storage.lun" int="%(lun2)d">
<merge key="volume.label" type="string">%(storage_card)s</merge>
<merge key="%(app)s.cardvolume" type="string">%(deviceclass)s</merge>
</match>
%(BCD_end)s
</match>
</match>
</match>
</device>
'''
FDI_BCD_TEMPLATE = '<match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.device_revision_bcd" int="%(bcd)s">'
FDI_LUNS = {'lun0':0, 'lun1':1, 'lun2':2}
FDI_BCD_TEMPLATE = '<match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.device_revision_bcd" int="%(bcd)s">'
def __init__(self, key='-1', log_packets=False, report_progress=None) :
self._main_prefix = self._card_prefix = None
def reset(self, key='-1', log_packets=False, report_progress=None) :
self._main_prefix = self._card_a_prefix = self._card_b_prefix = None
@classmethod
def get_fdi(cls):
fdi = ''
for vid in cls.VENDOR_ID:
for pid in cls.PRODUCT_ID:
fdi_base_values = dict(
@ -85,7 +100,6 @@ class Device(_Device):
main_memory=cls.MAIN_MEMORY_VOLUME_LABEL,
storage_card=cls.STORAGE_CARD_VOLUME_LABEL,
)
fdi_base_values.update(cls.FDI_LUNS)
if cls.BCD is None:
@ -105,7 +119,7 @@ class Device(_Device):
self.report_progress = report_progress
def card_prefix(self, end_session=True):
return self._card_prefix
return (self._card_a_prefix, self._card_b_prefix)
@classmethod
def _windows_space(cls, prefix):
@ -125,34 +139,41 @@ class Device(_Device):
return total_clusters * mult, free_clusters * mult
def total_space(self, end_session=True):
msz = csz = 0
msz = casz = cbsz = 0
if not iswindows:
if self._main_prefix is not None:
stats = os.statvfs(self._main_prefix)
msz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree)
if self._card_prefix is not None:
stats = os.statvfs(self._card_prefix)
csz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree)
if self._card_a_prefix is not None:
stats = os.statvfs(self._card_a_prefix)
casz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree)
if self._card_b_prefix is not None:
stats = os.statvfs(self._card_b_prefix)
cbsz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree)
else:
msz = self._windows_space(self._main_prefix)[0]
csz = self._windows_space(self._card_prefix)[0]
casz = self._windows_space(self._card_a_prefix)[0]
cbsz = self._windows_space(self._card_b_prefix)[0]
return (msz, 0, csz)
return (msz, casz, cbsz)
def free_space(self, end_session=True):
msz = csz = 0
msz = casz = cbsz = 0
if not iswindows:
if self._main_prefix is not None:
stats = os.statvfs(self._main_prefix)
msz = stats.f_frsize * stats.f_bavail
if self._card_prefix is not None:
stats = os.statvfs(self._card_prefix)
csz = stats.f_frsize * stats.f_bavail
if self._card_a_prefix is not None:
stats = os.statvfs(self._card_a_prefix)
casz = stats.f_frsize * stats.f_bavail
if self._card_b_prefix is not None:
stats = os.statvfs(self._card_b_prefix)
cbsz = stats.f_frsize * stats.f_bavail
else:
msz = self._windows_space(self._main_prefix)[1]
csz = self._windows_space(self._card_prefix)[1]
return (msz, 0, csz)
return (msz, casz, cbsz)
def windows_match_device(self, pnp_id, device_id):
pnp_id = pnp_id.upper()
@ -193,10 +214,12 @@ class Device(_Device):
for drive in c.Win32_DiskDrive():
if self.windows_match_device(str(drive.PNPDeviceID), self.WINDOWS_MAIN_MEM):
drives['main'] = self.windows_get_drive_prefix(drive)
elif self.windows_match_device(str(drive.PNPDeviceID), self.WINDOWS_CARD_MEM):
drives['card'] = self.windows_get_drive_prefix(drive)
elif self.windows_match_device(str(drive.PNPDeviceID), self.WINDOWS_CARD_A_MEM):
drives['carda'] = self.windows_get_drive_prefix(drive)
elif self.windows_match_device(str(drive.PNPDeviceID), self.WINDOWS_CARD_B_MEM):
drives['cardb'] = self.windows_get_drive_prefix(drive)
if 'main' in drives.keys() and 'card' in drives.keys():
if 'main' in drives.keys() and 'carda' in drives.keys() and 'cardb' in drives.keys():
break
if 'main' not in drives:
@ -206,7 +229,8 @@ class Device(_Device):
drives = self.windows_sort_drives(drives)
self._main_prefix = drives.get('main')
self._card_prefix = drives.get('card', None)
self._card_a_prefix = drives.get('carda', None)
self._card_b_prefix = drives.get('cardb', None)
@classmethod
def run_ioreg(cls, raw=None):
@ -237,9 +261,11 @@ class Device(_Device):
for i, line in enumerate(lines):
if self.OSX_MAIN_MEM is not None and line.strip().endswith('<class IOMedia>') and self.OSX_MAIN_MEM in line:
get_dev_node(lines[i+1:], 'main')
if self.OSX_CARD_MEM is not None and line.strip().endswith('<class IOMedia>') and self.OSX_CARD_MEM in line:
get_dev_node(lines[i+1:], 'card')
if len(names.keys()) == 2:
if self.OSX_CARD_A_MEM is not None and line.strip().endswith('<class IOMedia>') and self.OSX_CARD_A_MEM in line:
get_dev_node(lines[i+1:], 'carda')
if self.OSX_CARD_B_MEM is not None and line.strip().endswith('<class IOMedia>') and self.OSX_CARD_B_MEM in line:
get_dev_node(lines[i+1:], 'cardb')
if len(names.keys()) == 3:
break
return names
@ -251,10 +277,18 @@ class Device(_Device):
raise DeviceError(_('Unable to detect the %s disk drive. Try rebooting.')%self.__class__.__name__)
main_pat = dev_pat % names['main']
self._main_prefix = re.search(main_pat, mount).group(2) + os.sep
card_pat = names['card'] if 'card' in names.keys() else None
if card_pat is not None:
card_pat = dev_pat % card_pat
self._card_prefix = re.search(card_pat, mount).group(2) + os.sep
card_a_pat = names['carda'] if 'carda' in names.keys() else None
card_b_pat = names['cardb'] if 'cardb' in names.keys() else None
def get_card_prefix(pat):
if pat is not None:
pat = dev_pat % pat
return re.search(pat, mount).group(2) + os.sep
else:
return None
self._card_a_prefix = get_card_prefix(card_a_pat)
self._card_b_prefix = get_card_prefix(card_b_pat)
def open_linux(self):
import dbus
@ -287,21 +321,24 @@ class Device(_Device):
if not self._main_prefix:
raise DeviceError('Could not open device for reading. Try a reboot.')
self._card_prefix = None
self._card_a_prefix = self._card_b_prefix = None
cards = hm.FindDeviceStringMatch(__appname__+'.cardvolume', self.__class__.__name__)
for dev in cards:
def mount_card(dev):
try:
self._card_prefix = conditional_mount(dev)+os.sep
break
return conditional_mount(dev)+os.sep
except:
import traceback
print traceback
continue
if len(cards) >= 1:
self._card_a_prefix = mount_card(cards[0])
if len(cards) >=2:
self._card_b_prefix = mount_card(cards[1])
def open(self):
time.sleep(5)
self._main_prefix = self._card_prefix = None
self._main_prefix = self._card_a_prefix = self._card_b_prefix = None
if islinux:
try:
self.open_linux()

View File

@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.utils.config import Config, ConfigProxy
class DeviceConfig(object):
    '''
    Per-driver configuration helper.

    Stores a single option, ``format_map``, in a config object named
    ``device_drivers_<ClassName>``. Classes using these helpers are expected
    to define a ``FORMATS`` attribute, which is used as the default value of
    ``format_map``.
    '''

    HELP_MESSAGE = _('Ordered list of formats the device will accept')

    @classmethod
    def _config(cls):
        '''
        Build the config object for this driver class, with the
        ``format_map`` option registered.
        '''
        # Tolerate being handed an instance instead of a class.
        klass = cls if isinstance(cls, type) else cls.__class__
        c = Config('device_drivers_%s' % klass.__name__,
                   _('settings for device drivers'))
        c.add_opt('format_map', default=cls.FORMATS, help=cls.HELP_MESSAGE)
        return c

    @classmethod
    def _configProxy(cls):
        '''Return a ``ConfigProxy`` wrapping :meth:`_config`.'''
        return ConfigProxy(cls._config())

    @classmethod
    def config_widget(cls):
        '''
        Create the GUI widget used to edit this driver's settings.

        The GUI module is imported here rather than at module level, so
        importing this module does not import it.
        '''
        from calibre.gui2.device_drivers.configwidget import ConfigWidget
        cw = ConfigWidget(cls.settings(), cls.FORMATS)
        return cw

    @classmethod
    def save_settings(cls, config_widget):
        '''Store the format map chosen in ``config_widget``.'''
        cls._configProxy()['format_map'] = config_widget.format_map()

    @classmethod
    def settings(cls):
        '''Return the parsed settings for this driver class.'''
        return cls._config().parse()

    # This method takes ``cls`` like its siblings but was missing the
    # decorator, so it could only be called on an instance. As a classmethod
    # it works on both the class and its instances.
    @classmethod
    def customization_help(cls, gui=False):
        '''Return the help text describing the ``format_map`` option.'''
        return cls.HELP_MESSAGE

View File

@ -10,71 +10,89 @@ for a particular device.
import os, fnmatch, shutil
from itertools import cycle
from calibre.ebooks.metadata.meta import metadata_from_formats, path_to_ext
from calibre.ebooks.metadata import authors_to_string
from calibre.devices.usbms.cli import CLI
from calibre.devices.usbms.device import Device
from calibre.devices.usbms.books import BookList, Book
from calibre.devices.errors import FreeSpaceError, PathError
from calibre.devices.errors import DeviceError, FreeSpaceError
from calibre.devices.mime import mime_type_ext
class File(object):
def __init__(self, path):
stats = os.stat(path)
self.is_dir = os.path.isdir(path)
self.is_readonly = not os.access(path, os.W_OK)
self.ctime = stats.st_ctime
self.wtime = stats.st_mtime
self.size = stats.st_size
if path.endswith(os.sep):
path = path[:-1]
self.path = path
self.name = os.path.basename(path)
# CLI must come before Device as it implements the CLI functions that
# are inherited from the device interface in Device.
class USBMS(CLI, Device):
name = 'USBMS Base Device Interface'
description = _('Communicate with an eBook reader.')
author = _('John Schember')
supported_platforms = ['windows', 'osx', 'linux']
class USBMS(Device):
FORMATS = []
EBOOK_DIR_MAIN = ''
EBOOK_DIR_CARD = ''
EBOOK_DIR_CARD_A = ''
EBOOK_DIR_CARD_B = ''
SUPPORTS_SUB_DIRS = False
CAN_SET_METADATA = False
def __init__(self, key='-1', log_packets=False, report_progress=None):
Device.__init__(self, key=key, log_packets=log_packets,
def reset(self, key='-1', log_packets=False, report_progress=None):
Device.reset(self, key=key, log_packets=log_packets,
report_progress=report_progress)
def get_device_information(self, end_session=True):
self.report_progress(1.0, _('Get device information...'))
return (self.__class__.__name__, '', '', '')
def books(self, oncard=False, end_session=True):
def books(self, oncard=None, end_session=True):
from calibre.ebooks.metadata.meta import path_to_ext
bl = BookList()
if oncard and self._card_prefix is None:
if oncard == 'carda' and not self._card_a_prefix:
self.report_progress(1.0, _('Getting list of books on device...'))
return bl
elif oncard == 'cardb' and not self._card_b_prefix:
self.report_progress(1.0, _('Getting list of books on device...'))
return bl
elif oncard and oncard != 'carda' and oncard != 'cardb':
self.report_progress(1.0, _('Getting list of books on device...'))
return bl
prefix = self._card_prefix if oncard else self._main_prefix
ebook_dir = self.EBOOK_DIR_CARD if oncard else self.EBOOK_DIR_MAIN
prefix = self._card_a_prefix if oncard == 'carda' else self._card_b_prefix if oncard == 'cardb' else self._main_prefix
ebook_dir = self.EBOOK_DIR_CARD_A if oncard == 'carda' else self.EBOOK_DIR_CARD_B if oncard == 'cardb' else self.EBOOK_DIR_MAIN
# Get all books in the ebook_dir directory
if self.SUPPORTS_SUB_DIRS:
for path, dirs, files in os.walk(os.path.join(prefix, ebook_dir)):
# Filter out anything that isn't in the list of supported ebook types
for book_type in self.FORMATS:
for filename in fnmatch.filter(files, '*.%s' % (book_type)):
match = fnmatch.filter(files, '*.%s' % (book_type))
for i, filename in enumerate(match):
self.report_progress((i+1) / float(len(match)), _('Getting list of books on device...'))
bl.append(self.__class__.book_from_path(os.path.join(path, filename)))
else:
path = os.path.join(prefix, ebook_dir)
for filename in os.listdir(path):
paths = os.listdir(path)
for i, filename in enumerate(paths):
self.report_progress((i+1) / float(len(paths)), _('Getting list of books on device...'))
if path_to_ext(filename) in self.FORMATS:
bl.append(self.__class__.book_from_path(os.path.join(path, filename)))
self.report_progress(1.0, _('Getting list of books on device...'))
return bl
def _sanity_check(self, on_card, files):
if on_card and not self._card_prefix:
raise ValueError(_('The reader has no storage card connected.'))
if on_card == 'carda' and not self._card_a_prefix:
raise ValueError(_('The reader has no storage card in this slot.'))
elif on_card == 'cardb' and not self._card_b_prefix:
raise ValueError(_('The reader has no storage card in this slot.'))
elif on_card and on_card not in ('carda', 'cardb'):
raise DeviceError(_('The reader has no storage card in this slot.'))
if not on_card:
path = os.path.join(self._main_prefix, self.EBOOK_DIR_MAIN)
if on_card == 'carda':
path = os.path.join(self._card_a_prefix, self.EBOOK_DIR_CARD_A)
elif on_card == 'cardb':
path = os.path.join(self._card_b_prefix, self.EBOOK_DIR_CARD_B)
else:
path = os.path.join(self._card_prefix, self.EBOOK_DIR_CARD)
path = os.path.join(self._main_prefix, self.EBOOK_DIR_MAIN)
def get_size(obj):
if hasattr(obj, 'seek'):
@ -87,13 +105,15 @@ class USBMS(Device):
sizes = [get_size(f) for f in files]
size = sum(sizes)
if on_card and size > self.free_space()[2] - 1024*1024:
raise FreeSpaceError(_("There is insufficient free space on the storage card"))
if not on_card and size > self.free_space()[0] - 2*1024*1024:
raise FreeSpaceError(_("There is insufficient free space in main memory"))
if on_card == 'carda' and size > self.free_space()[1] - 1024*1024:
raise FreeSpaceError(_("There is insufficient free space on the storage card"))
if on_card == 'cardb' and size > self.free_space()[2] - 1024*1024:
raise FreeSpaceError(_("There is insufficient free space on the storage card"))
return path
def upload_books(self, files, names, on_card=False, end_session=True,
def upload_books(self, files, names, on_card=None, end_session=True,
metadata=None):
path = self._sanity_check(on_card, files)
@ -102,7 +122,7 @@ class USBMS(Device):
names = iter(names)
metadata = iter(metadata)
for infile in files:
for i, infile in enumerate(files):
newpath = path
if self.SUPPORTS_SUB_DIRS:
@ -110,11 +130,21 @@ class USBMS(Device):
if 'tags' in mdata.keys():
for tag in mdata['tags']:
if tag.startswith('/'):
if tag.startswith(_('News')):
newpath = os.path.join(newpath, 'news')
newpath = os.path.join(newpath, mdata.get('title', ''))
newpath = os.path.join(newpath, mdata.get('timestamp', ''))
break
elif tag.startswith('/'):
newpath += tag
newpath = os.path.normpath(newpath)
break
if newpath == path:
newpath = os.path.join(newpath,
mdata.get('authors', _('Unknown')),
mdata.get('title', _('Unknown')))
if not os.path.exists(newpath):
os.makedirs(newpath)
@ -132,22 +162,28 @@ class USBMS(Device):
else:
shutil.copy2(infile, filepath)
self.report_progress((i+1) / float(len(files)), _('Transferring books to device...'))
self.report_progress(1.0, _('Transferring books to device...'))
return zip(paths, cycle([on_card]))
@classmethod
def add_books_to_metadata(cls, locations, metadata, booklists):
for location in locations:
def add_books_to_metadata(self, locations, metadata, booklists):
for i, location in enumerate(locations):
self.report_progress((i+1) / float(len(locations)), _('Adding books to device metadata listing...'))
path = location[0]
on_card = 1 if location[1] else 0
blist = 2 if location[1] == 'cardb' else 1 if location[1] == 'carda' else 0
book = cls.book_from_path(path)
book = self.book_from_path(path)
if not book in booklists[on_card]:
booklists[on_card].append(book)
if not book in booklists[blist]:
booklists[blist].append(book)
self.report_progress(1.0, _('Adding books to device metadata listing...'))
def delete_books(self, paths, end_session=True):
for path in paths:
for i, path in enumerate(paths):
self.report_progress((i+1) / float(len(paths)), _('Removing books from device...'))
if os.path.exists(path):
# Delete the ebook
os.unlink(path)
@ -156,79 +192,31 @@ class USBMS(Device):
os.removedirs(os.path.dirname(path))
except:
pass
self.report_progress(1.0, _('Removing books from device...'))
@classmethod
def remove_books_from_metadata(cls, paths, booklists):
for path in paths:
def remove_books_from_metadata(self, paths, booklists):
for i, path in enumerate(paths):
self.report_progress((i+1) / float(len(paths)), _('Removing books from device metadata listing...'))
for bl in booklists:
for book in bl:
if path.endswith(book.path):
bl.remove(book)
self.report_progress(1.0, _('Removing books from device metadata listing...'))
def sync_booklists(self, booklists, end_session=True):
# There is no meta data on the device to update. The device is treated
# as a mass storage device and does not use a meta data xml file like
# the Sony Readers.
pass
def get_file(self, path, outfile, end_session=True):
path = self.munge_path(path)
with open(path, 'rb') as src:
shutil.copyfileobj(src, outfile, 10*1024*1024)
def put_file(self, infile, path, replace_file=False, end_session=True):
path = self.munge_path(path)
if os.path.isdir(path):
path = os.path.join(path, infile.name)
if not replace_file and os.path.exists(path):
raise PathError('File already exists: ' + path)
dest = open(path, 'wb')
shutil.copyfileobj(infile, dest, 10*1024*1024)
dest.flush()
dest.close()
def munge_path(self, path):
if path.startswith('/') and not (path.startswith(self._main_prefix) or \
(self._card_prefix and path.startswith(self._card_prefix))):
path = self._main_prefix + path[1:]
elif path.startswith('card:'):
path = path.replace('card:', self._card_prefix[:-1])
return path
def list(self, path, recurse=False, end_session=True, munge=True):
if munge:
path = self.munge_path(path)
if os.path.isfile(path):
return [(os.path.dirname(path), [File(path)])]
entries = [File(os.path.join(path, f)) for f in os.listdir(path)]
dirs = [(path, entries)]
for _file in entries:
if recurse and _file.is_dir:
dirs[len(dirs):] = self.list(_file.path, recurse=True, munge=False)
return dirs
def mkdir(self, path, end_session=True):
if self.SUPPORTS_SUB_DIRS:
path = self.munge_path(path)
os.mkdir(path)
def rm(self, path, end_session=True):
path = self.munge_path(path)
self.delete_books([path])
def touch(self, path, end_session=True):
path = self.munge_path(path)
if not os.path.exists(path):
open(path, 'w').close()
if not os.path.isdir(path):
os.utime(path, None)
self.report_progress(1.0, _('Sending metadata to device...'))
@classmethod
def metadata_from_path(cls, path):
from calibre.ebooks.metadata.meta import metadata_from_formats
return metadata_from_formats([path])
@classmethod
def book_from_path(cls, path):
from calibre.ebooks.metadata.meta import path_to_ext
fileext = path_to_ext(path)
mi = cls.metadata_from_path(path)
mime = mime_type_ext(fileext)

View File

@ -60,6 +60,8 @@ class HTMLRenderer(object):
def render_html(path_to_html, width=590, height=750):
from PyQt4.QtWebKit import QWebPage
from PyQt4.Qt import QEventLoop, QPalette, Qt, SIGNAL, QUrl, QSize
from calibre.gui2 import is_ok_to_use_qt
if not is_ok_to_use_qt(): return None
path_to_html = os.path.abspath(path_to_html)
with CurrentDir(os.path.dirname(path_to_html)):
page = QWebPage()

473
src/calibre/ebooks/comic/input.py Executable file
View File

@ -0,0 +1,473 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Based on ideas from comiclrf created by FangornUK.
'''
import os, shutil, traceback, textwrap, time
from Queue import Empty
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre import extract, CurrentDir, prints
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.ipc.server import Server
from calibre.utils.ipc.job import ParallelJob
def extract_comic(path_to_comic_file):
    '''
    Unpack the comic archive at ``path_to_comic_file`` into a newly created
    persistent temporary directory and return that directory.
    '''
    workdir = PersistentTemporaryDirectory(suffix='_comic_extract')
    extract(path_to_comic_file, workdir)
    return workdir
def find_pages(dir, sort_on_mtime=False, verbose=False):
    '''
    Find valid comic pages in a previously un-archived comic.

    :param dir: Directory in which extracted comic lives
    :param sort_on_mtime: If True sort pages based on their last modified time.
                          Otherwise, sort alphabetically by file name.
    :param verbose: If True, print the base names of the pages found.
    :return: List of paths to the image files (jpeg, jpg, gif, png), sorted.
    '''
    extensions = ('.jpeg', '.jpg', '.gif', '.png')
    pages = []
    for dirpath, dirnames, filenames in os.walk(dir):
        for name in filenames:
            path = os.path.join(dirpath, name)
            # Skip the resource-fork directories that Mac OS X adds to archives
            if '__MACOSX' in path:
                continue
            if path.lower().endswith(extensions):
                pages.append(path)

    # A key function is evaluated once per page, whereas the previous cmp
    # comparator hit the filesystem (os.stat) on every single comparison.
    if sort_on_mtime:
        sort_key = lambda x: os.stat(x).st_mtime
    else:
        sort_key = os.path.basename
    pages.sort(key=sort_key)

    if verbose:
        prints('Found comic pages...')
        prints('\t'+'\n\t'.join([os.path.basename(p) for p in pages]))
    return pages
class PageProcessor(list):
    '''
    Contains the actual image rendering logic. See :method:`render` and
    :method:`process_pages`.

    Constructing an instance renders the page immediately. Afterwards the
    instance (a list) contains the paths of the PNG files written to
    ``dest`` for this page: one entry normally, two when a landscape image
    was split into two portrait halves.
    '''

    def __init__(self, path_to_page, dest, opts, num):
        '''
        :param path_to_page: Path of the source image
        :param dest: Directory into which rendered images are written
        :param opts: Conversion options (see the attributes read below)
        :param num: Index of this page; page 0 also produces thumbnail.png
        '''
        list.__init__(self)
        self.path_to_page = path_to_page
        self.opts = opts
        self.num = num
        self.dest = dest
        self.rotate = False
        self.render()

    def render(self):
        '''
        Load the source image, create the thumbnail (first page only),
        decide whether to rotate or split landscape images and then hand
        over to :method:`process_pages`.
        '''
        import calibre.utils.PythonMagickWand as pw
        img = pw.NewMagickWand()
        if img < 0:
            raise RuntimeError('Cannot create wand.')
        if not pw.MagickReadImage(img, self.path_to_page):
            # Release the wand before bailing out. The format string used to
            # be '%' (incomplete), which raised ValueError instead of IOError.
            pw.DestroyMagickWand(img)
            raise IOError('Failed to read image from: %s'%self.path_to_page)
        width = pw.MagickGetImageWidth(img)
        height = pw.MagickGetImageHeight(img)
        if self.num == 0: # First image so create a thumbnail from it
            thumb = pw.CloneMagickWand(img)
            if thumb < 0:
                raise RuntimeError('Cannot create wand.')
            pw.MagickThumbnailImage(thumb, 60, 80)
            pw.MagickWriteImage(thumb, os.path.join(self.dest, 'thumbnail.png'))
            pw.DestroyMagickWand(thumb)
        self.pages = [img]
        if width > height:
            if self.opts.landscape:
                self.rotate = True
            else:
                # Split the landscape image into left and right halves
                split1, split2 = map(pw.CloneMagickWand, (img, img))
                pw.DestroyMagickWand(img)
                if split1 < 0 or split2 < 0:
                    raise RuntimeError('Cannot create wand.')
                pw.MagickCropImage(split1, (width/2)-1, height, 0, 0)
                pw.MagickCropImage(split2, (width/2)-1, height, width/2, 0 )
                self.pages = [split2, split1] if self.opts.right2left else [split1, split2]
        self.process_pages()

    def process_pages(self):
        '''
        Apply rotation, trimming, normalization, resizing, sharpening,
        grayscale conversion, despeckling and quantization (as selected by
        ``self.opts``) to every wand in ``self.pages``, write each result as
        ``<num>_<i>.png`` in ``self.dest`` and append its path to self.
        '''
        import calibre.utils.PythonMagickWand as p
        for i, wand in enumerate(self.pages):
            pw = p.NewPixelWand()
            try:
                if pw < 0:
                    raise RuntimeError('Cannot create wand.')
                p.PixelSetColor(pw, 'white')

                p.MagickSetImageBorderColor(wand, pw)
                if self.rotate:
                    p.MagickRotateImage(wand, pw, -90)

                # 25 percent fuzzy trim?
                if not self.opts.disable_trim:
                    p.MagickTrimImage(wand, 25*65535/100)
                p.MagickSetImagePage(wand, 0,0,0,0) #Clear page after trim, like a "+repage"
                # Do the Photoshop "Auto Levels" equivalent
                if not self.opts.dont_normalize:
                    p.MagickNormalizeImage(wand)
                sizex = p.MagickGetImageWidth(wand)
                sizey = p.MagickGetImageHeight(wand)

                SCRWIDTH, SCRHEIGHT = self.opts.output_profile.comic_screen_size

                if self.opts.keep_aspect_ratio:
                    # Preserve the aspect ratio by adding border
                    aspect = float(sizex) / float(sizey)
                    if aspect <= (float(SCRWIDTH) / float(SCRHEIGHT)):
                        newsizey = SCRHEIGHT
                        newsizex = int(newsizey * aspect)
                        deltax = (SCRWIDTH - newsizex) / 2
                        deltay = 0
                    else:
                        newsizex = SCRWIDTH
                        newsizey = int(newsizex / aspect)
                        deltax = 0
                        deltay = (SCRHEIGHT - newsizey) / 2
                    p.MagickResizeImage(wand, newsizex, newsizey, p.CatromFilter, 1.0)
                    p.MagickSetImageBorderColor(wand, pw)
                    p.MagickBorderImage(wand, pw, deltax, deltay)
                elif self.opts.wide:
                    # Keep aspect and Use device height as scaled image width so landscape mode is clean
                    aspect = float(sizex) / float(sizey)
                    screen_aspect = float(SCRWIDTH) / float(SCRHEIGHT)
                    # Get dimensions of the landscape mode screen
                    # Add 25px back to height for the battery bar.
                    wscreenx = SCRHEIGHT + 25
                    wscreeny = int(wscreenx / screen_aspect)
                    if aspect <= screen_aspect:
                        newsizey = wscreeny
                        newsizex = int(newsizey * aspect)
                        deltax = (wscreenx - newsizex) / 2
                        deltay = 0
                    else:
                        newsizex = wscreenx
                        newsizey = int(newsizex / aspect)
                        deltax = 0
                        deltay = (wscreeny - newsizey) / 2
                    p.MagickResizeImage(wand, newsizex, newsizey, p.CatromFilter, 1.0)
                    p.MagickSetImageBorderColor(wand, pw)
                    p.MagickBorderImage(wand, pw, deltax, deltay)
                else:
                    # Fill the screen, ignoring the aspect ratio
                    p.MagickResizeImage(wand, SCRWIDTH, SCRHEIGHT, p.CatromFilter, 1.0)

                if not self.opts.dont_sharpen:
                    p.MagickSharpenImage(wand, 0.0, 1.0)

                p.MagickSetImageType(wand, p.GrayscaleType)

                if self.opts.despeckle:
                    p.MagickDespeckleImage(wand)

                p.MagickQuantizeImage(wand, self.opts.colors, p.RGBColorspace, 0, 1, 0)
                dest = '%d_%d.png'%(self.num, i)
                dest = os.path.join(self.dest, dest)
                # Written under a '.png8' name first and then renamed
                # (presumably to make ImageMagick emit an 8-bit PNG).
                p.MagickWriteImage(wand, dest+'8')
                os.rename(dest+'8', dest)
                self.append(dest)
            finally:
                if pw > 0:
                    p.DestroyPixelWand(pw)
                p.DestroyMagickWand(wand)
def render_pages(tasks, dest, opts, notification=lambda x, y: x):
    '''
    Entry point for the job server.

    :param tasks: Iterable of ``(num, path)`` pairs, one per source image
    :param dest: Directory into which rendered pages are written
    :param opts: Conversion options, passed through to :class:`PageProcessor`
    :param notification: Callable invoked as ``notification(0.5, msg)``
                         after every page, whether it succeeded or failed
    :return: ``(pages, failures)`` -- paths of the rendered images and paths
             of the source images that could not be rendered
    '''
    failures, pages = [], []
    from calibre.utils.PythonMagickWand import ImageMagick
    with ImageMagick():
        for num, path in tasks:
            try:
                pages.extend(PageProcessor(path, dest, opts, num))
                msg = _('Rendered %s')%path
            # Only swallow real errors: a bare except would also trap
            # KeyboardInterrupt/SystemExit and carry on with the next page.
            except Exception:
                failures.append(path)
                msg = _('Failed %s')%path
                if opts.verbose:
                    msg += '\n' + traceback.format_exc()
            prints(msg)
            notification(0.5, msg)

    return pages, failures
class Progress(object):
    '''
    Counts completed work items and forwards the completed fraction.

    Every call counts as one finished item, the ``percent`` argument is
    ignored. ``update`` is invoked as ``update(done/total, msg)``.
    '''

    def __init__(self, total, update):
        self.total, self.update, self.done = total, update, 0

    def __call__(self, percent, msg=''):
        completed = self.done + 1
        self.done = completed
        fraction = float(completed) / self.total
        self.update(fraction, msg)
def process_pages(pages, opts, update, tdir):
    '''
    Render all identified comic pages.

    The pages are split into tasks by the job server, each task is run as a
    ``render_pages`` job and progress notifications from the jobs are
    forwarded to ``update``.

    :param pages: Paths of the source images
    :param opts: Conversion options, handed on to the jobs
    :param update: Callable invoked as ``update(fraction_done, msg)``
    :param tdir: Directory into which the rendered pages are written
    :return: ``(rendered_pages, failures)`` accumulated over all jobs
    :raises Exception: if any job failed; the message contains the job log
    '''
    # NOTE(review): the name is referenced but otherwise unused here;
    # presumably the import is kept so that a missing ImageMagick is
    # detected before any job is launched -- confirm.
    from calibre.utils.PythonMagickWand import ImageMagick
    ImageMagick

    progress = Progress(len(pages), update)
    server = Server()
    jobs = []
    tasks = server.split(pages)
    for task in tasks:
        jobs.append(ParallelJob('render_pages', '', progress,
                                args=[task, tdir, opts]))
        server.add_job(jobs[-1])

    # Poll once a second, draining pending notifications of every job,
    # until no job is left running.
    while True:
        time.sleep(1)
        running = False
        for job in jobs:
            while True:
                try:
                    x = job.notifications.get_nowait()
                    progress(*x)
                except Empty:
                    break
            job.update()
            if not job.is_finished:
                running = True
        if not running:
            break
    server.close()

    ans, failures = [], []
    for job in jobs:
        if job.failed:
            # A leftover debugging raw_input() used to sit here and would
            # block forever waiting on stdin before the error was reported.
            raise Exception(_('Failed to process comic: \n\n%s')%
                            job.log_file.read())
        pages, failures_ = job.result
        ans += pages
        failures += failures_
    return ans, failures
class ComicInput(InputFormatPlugin):
    '''
    Input plugin that turns comic archives (.cbz, .cbr) and comic
    collections (.cbc) into a set of images with one XHTML wrapper per
    page, plus an OPF file and an NCX table of contents.
    '''

    name = 'Comic Input'
    author = 'Kovid Goyal'
    description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices'
    file_types = set(['cbz', 'cbr', 'cbc'])
    is_image_collection = True

    options = set([
        OptionRecommendation(name='colors', recommended_value=64,
            help=_('Number of colors for grayscale image conversion. Default: %default')),
        OptionRecommendation(name='dont_normalize', recommended_value=False,
            help=_('Disable normalize (improve contrast) color range '
                   'for pictures. Default: False')),
        OptionRecommendation(name='keep_aspect_ratio', recommended_value=False,
            help=_('Maintain picture aspect ratio. Default is to fill the screen.')),
        OptionRecommendation(name='dont_sharpen', recommended_value=False,
            help=_('Disable sharpening.')),
        OptionRecommendation(name='disable_trim', recommended_value=False,
            help=_('Disable trimming of comic pages. For some comics, '
                   'trimming might remove content as well as borders.')),
        # Was misspelled 'landspace', while the renderer reads opts.landscape
        OptionRecommendation(name='landscape', recommended_value=False,
            help=_("Don't split landscape images into two portrait images")),
        OptionRecommendation(name='wide', recommended_value=False,
            help=_("Keep aspect ratio and scale image using screen height as "
                   "image width for viewing in landscape mode.")),
        OptionRecommendation(name='right2left', recommended_value=False,
            help=_('Used for right-to-left publications like manga. '
                   'Causes landscape pages to be split into portrait pages '
                   'from right to left.')),
        OptionRecommendation(name='despeckle', recommended_value=False,
            help=_('Enable Despeckle. Reduces speckle noise. '
                   'May greatly increase processing time.')),
        OptionRecommendation(name='no_sort', recommended_value=False,
            help=_("Don't sort the files found in the comic "
                   "alphabetically by name. Instead use the order they were "
                   "added to the comic.")),
        OptionRecommendation(name='no_process', recommended_value=False,
            help=_("Apply no processing to the image")),
        ])

    # The misspelled duplicate 'page_breaks_brefore' entry has been dropped;
    # the correctly spelled 'page_breaks_before' recommendation remains.
    recommendations = set([
        ('margin_left', 0, OptionRecommendation.HIGH),
        ('margin_top', 0, OptionRecommendation.HIGH),
        ('margin_right', 0, OptionRecommendation.HIGH),
        ('margin_bottom', 0, OptionRecommendation.HIGH),
        ('insert_blank_line', False, OptionRecommendation.HIGH),
        ('remove_paragraph_spacing', False, OptionRecommendation.HIGH),
        ('dont_justify', True, OptionRecommendation.HIGH),
        ('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
        ('chapter', None, OptionRecommendation.HIGH),
        ('use_auto_toc', False, OptionRecommendation.HIGH),
        ('page_breaks_before', None, OptionRecommendation.HIGH),
        ('disable_font_rescaling', True, OptionRecommendation.HIGH),
        ('linearize_tables', False, OptionRecommendation.HIGH),
        ])

    def get_comics_from_collection(self, stream):
        '''
        Unzip a comic collection and read its ``comics.txt`` index.

        Each line of ``comics.txt`` has the form ``path:title``; when the
        title is empty the file name without extension is used instead.
        Entries whose file is not readable are skipped.

        :return: List of ``[title, path]`` entries
        :raises ValueError: if there is no ``comics.txt`` or it yields no
                            readable comics
        '''
        from calibre.libunzip import extract as zipextract
        tdir = PersistentTemporaryDirectory('_comic_collection')
        zipextract(stream, tdir)
        comics = []
        with CurrentDir(tdir):
            if not os.path.exists('comics.txt'):
                raise ValueError('%s is not a valid comic collection'
                                 %stream.name)
            with open('comics.txt', 'rb') as f:
                raw = f.read()
            for line in raw.decode('utf-8').splitlines():
                fname, sep, title = line.partition(':')
                fname = os.path.join(tdir, *fname.split('/'))
                if not title:
                    title = os.path.basename(fname).rpartition('.')[0]
                if os.access(fname, os.R_OK):
                    comics.append([title, fname])
        if not comics:
            raise ValueError('%s has no comics'%stream.name)
        return comics

    def get_pages(self, comic, tdir2):
        '''
        Extract ``comic`` and return the paths of its pages inside ``tdir2``.

        With the ``no_process`` option the images are copied unchanged,
        otherwise they are rendered by :func:`process_pages`.

        :raises ValueError: if the comic contains no (valid) pages
        '''
        tdir = extract_comic(comic)
        new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort,
                               verbose=self.opts.verbose)
        if not new_pages:
            raise ValueError('Could not find any pages in the comic: %s'
                             %comic)
        if self.opts.no_process:
            n2 = []
            for page in new_pages:
                n2.append(os.path.join(tdir2, os.path.basename(page)))
                shutil.copyfile(page, n2[-1])
            new_pages = n2
        else:
            new_pages, failures = process_pages(new_pages, self.opts,
                                                self.report_progress, tdir2)
            if not new_pages:
                raise ValueError('Could not find any valid pages in comic: %s'
                                 % comic)
            if failures:
                self.log.warning('Could not process the following pages '
                                 '(run with --verbose to see why):')
                for f in failures:
                    self.log.warning('\t', f)

        return new_pages

    def get_images(self):
        '''Return the page image paths collected by the last :meth:`convert`.'''
        return self._images

    def convert(self, stream, opts, file_ext, log, accelerators):
        '''
        Convert the comic (or comic collection) in ``stream``.

        Pages and wrappers are written below the current working directory,
        together with ``metadata.opf`` and ``toc.ncx``.

        :return: Absolute path of the generated ``metadata.opf``
        :raises ValueError: if no comic pages were found
        '''
        from calibre.ebooks.metadata import MetaInformation
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.toc import TOC

        self.opts, self.log = opts, log
        if file_ext == 'cbc':
            comics_ = self.get_comics_from_collection(stream)
        else:
            comics_ = [['Comic', os.path.abspath(stream.name)]]
        stream.close()
        comics = []
        for i, x in enumerate(comics_):
            title, fname = x
            # A single comic lives in the current directory, the members of
            # a collection each get their own comic_<n> sub-directory.
            cdir = 'comic_%d'%(i+1) if len(comics_) > 1 else '.'
            cdir = os.path.abspath(cdir)
            if not os.path.exists(cdir):
                os.makedirs(cdir)
            pages = self.get_pages(fname, cdir)
            if not pages: continue
            wrappers = self.create_wrappers(pages)
            comics.append((title, pages, wrappers))

        if not comics:
            raise ValueError('No comic pages found in %s'%stream.name)

        mi = MetaInformation(os.path.basename(stream.name).rpartition('.')[0],
                             [_('Unknown')])
        opf = OPFCreator(os.path.abspath('.'), mi)
        entries = []

        def href(x):
            # Path relative to the OPF: bare file name for a single comic,
            # <comic dir>/<file name> for collections.
            if len(comics) == 1: return os.path.basename(x)
            return '/'.join(x.split(os.sep)[-2:])

        for comic in comics:
            pages, wrappers = comic[1:]
            entries += [(w, None) for w in map(href, wrappers)] + \
                       [(x, None) for x in map(href, pages)]
        opf.create_manifest(entries)
        spine = []
        for comic in comics:
            spine.extend(map(href, comic[2]))
        self._images = []
        for comic in comics:
            self._images.extend(comic[1])
        opf.create_spine(spine)
        toc = TOC()
        if len(comics) == 1:
            wrappers = comics[0][2]
            for i, x in enumerate(wrappers):
                toc.add_item(href(x), None, _('Page')+' %d'%(i+1),
                             play_order=i)
        else:
            po = 0
            for comic in comics:
                po += 1
                wrappers = comic[2]
                stoc = toc.add_item(href(wrappers[0]),
                                    None, comic[0], play_order=po)
                for i, x in enumerate(wrappers):
                    stoc.add_item(href(x), None,
                                  _('Page')+' %d'%(i+1), play_order=po)
                    po += 1
        opf.set_toc(toc)
        # Close both files deterministically so their contents are flushed
        # before the OPF path is handed back to the caller.
        with open('metadata.opf', 'wb') as m:
            with open('toc.ncx', 'wb') as n:
                opf.render(m, n, 'toc.ncx')
        return os.path.abspath('metadata.opf')

    def create_wrappers(self, pages):
        '''
        Write one ``page_<n>.xhtml`` wrapper per image, next to the first
        image, and return the list of wrapper paths.
        '''
        from calibre.ebooks.oeb.base import XHTML_NS
        wrappers = []
        WRAPPER = textwrap.dedent('''\
        <html xmlns="%s">
            <head>
                <title>Page #%d</title>
                <style type="text/css">
                    @page { margin:0pt; padding: 0pt}
                    body { margin: 0pt; padding: 0pt}
                    div { text-align: center }
                </style>
            </head>
            <body>
                <div>
                    <img src="%s" alt="comic page #%d" />
                </div>
            </body>
        </html>
        ''')
        base = os.path.dirname(pages[0])
        for i, page in enumerate(pages):
            wrapper = WRAPPER%(XHTML_NS, i+1, os.path.basename(page), i+1)
            page = os.path.join(base, 'page_%d.xhtml'%(i+1))
            with open(page, 'wb') as f:
                f.write(wrapper)
            wrappers.append(page)
        return wrappers

View File

@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

View File

@ -0,0 +1,204 @@
/*
:mod:`cPalmdoc` -- Palmdoc compression/decompression
=====================================================
.. module:: cPalmdoc
:platform: All
:synopsis: Compression and decompression of PalmDoc data, implemented in C for speed
.. moduleauthor:: Kovid Goyal <kovid@kovidgoyal.net> Copyright 2009
*/
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <stdio.h>
#define DELTA sizeof(Byte)*4096
#define BUFFER 6000
#define MIN(x, y) ( ((x) < (y)) ? (x) : (y) )
typedef unsigned short int Byte;
typedef struct {
Byte *data;
Py_ssize_t len;
} buffer;
#ifdef bool
#undef bool
#endif
#define bool int
#ifdef false
#undef false
#endif
#define false 0
#ifdef true
#undef true
#endif
#define true 1
#define CHAR(x) (( (x) > 127 ) ? (x)-256 : (x))
/*
 * decompress(data) -> str
 *
 * Decompress a PalmDoc compressed byte string.
 *
 * Returns a new Python string, or NULL with an exception set:
 * MemoryError when a buffer cannot be allocated and ValueError when a
 * repeat sequence refers to data before the start of the output.
 */
static PyObject *
cpalmdoc_decompress(PyObject *self, PyObject *args) {
    /* All declarations precede the first statement so the file also
       compiles with C89 compilers. */
    const char *_input = NULL; Py_ssize_t input_len = 0;
    Py_ssize_t i = 0, o = 0, j = 0, di, n, out_size;
    Byte *input = NULL;
    char *output = NULL;
    Byte c;
    PyObject *ans = NULL;

    if (!PyArg_ParseTuple(args, "t#", &_input, &input_len))
        return NULL;

    // The largest expansion is a two byte repeat sequence producing ten
    // bytes (5x), so eight times the input length always suffices. The
    // previous fixed BUFFER sized output could be overrun.
    if (input_len > PY_SSIZE_T_MAX / 8) return PyErr_NoMemory();
    out_size = (8 * input_len > BUFFER) ? 8 * input_len : BUFFER;

    input = (Byte *)PyMem_Malloc(sizeof(Byte)*input_len);
    if (input == NULL) return PyErr_NoMemory();
    // Map chars to bytes
    for (j = 0; j < input_len; j++)
        input[j] = (_input[j] < 0) ? _input[j]+256 : _input[j];

    output = (char *)PyMem_Malloc(sizeof(char)*out_size);
    if (output == NULL) {
        PyMem_Free(input); // do not leak the input copy
        return PyErr_NoMemory();
    }

    while (i < input_len) {
        c = input[i++];
        if (c >= 1 && c <= 8) { // copy 'c' bytes
            // stop at the end of the input if the data is truncated
            while (c-- && i < input_len) output[o++] = input[i++];
        }
        else if (c <= 0x7F) // 0, 09-7F = self
            output[o++] = c;
        else if (c >= 0xC0) { // space + ASCII char
            output[o++] = ' ';
            output[o++] = c ^ 0x80;
        }
        else { // 80-BF repeat sequences
            if (i >= input_len) break; // truncated sequence, nothing more to decode
            c = (c << 8) + input[i++];
            di = (c & 0x3FFF) >> 3;
            if (di > o) { // would read before the start of the output
                PyErr_SetString(PyExc_ValueError,
                        "Invalid PalmDoc data: bad back reference");
                PyMem_Free(output);
                PyMem_Free(input);
                return NULL;
            }
            for ( n = (c & 7) + 3; n--; ++o )
                output[o] = output[o - di];
        }
    }
    ans = Py_BuildValue("s#", output, o);
    PyMem_Free(output);
    PyMem_Free(input);
    return ans;
}
/* Return true when the first `len` elements of `a` and `b` are equal
 * (trivially true when `len` is not positive), false otherwise. */
static bool
cpalmdoc_memcmp( Byte *a, Byte *b, Py_ssize_t len) {
    Py_ssize_t remaining = len;
    while (remaining-- > 0) {
        if (*a++ != *b++) return false;
    }
    return true;
}
/* Search backwards for an earlier copy of the `chunk_length` elements that
 * start at `data[pos]`. Candidates begin at pos - chunk_length and move
 * towards index 0, so the nearest non-overlapping match is found first.
 * Returns the start index of the match, or `pos` itself when there is none. */
static Py_ssize_t
cpalmdoc_rfind(Byte *data, Py_ssize_t pos, Py_ssize_t chunk_length) {
    Py_ssize_t start = pos - chunk_length;
    while (start >= 0) {
        if (cpalmdoc_memcmp(data + start, data + pos, chunk_length))
            return start;
        start--;
    }
    return pos;
}
/*
 * PalmDoc-compress the bytes in `b` into `output` and return the number of
 * bytes written.
 *
 * The caller owns `output`. In the worst case every input byte is a lone
 * "binary" byte that is emitted as a length byte plus the byte itself, so
 * `output` needs room for up to 2 * b->len bytes.
 *
 * The scratch space for a binary run lives on the stack, so this function
 * allocates nothing and returns 0 only when b->len is 0.
 */
static Py_ssize_t
cpalmdoc_do_compress(buffer *b, char *output) {
    Py_ssize_t i = 0, j, chunk_len, dist, run_len;
    unsigned compound;
    Byte c, n;
    Byte run[8];            /* pending run of binary bytes (at most 8) */
    bool found;
    char *head = output;

    while (i < b->len) {
        c = b->data[i];
        //do repeats
        if ( i > 10 && (b->len - i) > 10) {
            found = false;
            /* Try the longest chunk first; a back-reference encodes
             * lengths 3..10 and distances up to 2047. */
            for (chunk_len = 10; chunk_len > 2; chunk_len--) {
                j = cpalmdoc_rfind(b->data, i, chunk_len);
                dist = i - j;
                if (j < i && dist <= 2047) {
                    found = true;
                    compound = (unsigned)((dist << 3) + chunk_len-3);
                    *(output++) = CHAR(0x80 + (compound >> 8 ));
                    *(output++) = CHAR(compound & 0xFF);
                    i += chunk_len;
                    break;
                }
            }
            if (found) continue;
        }

        //write single character
        i++;
        if (c == 32 && i < b->len) {
            /* A space followed by 0x40-0x7F collapses into one byte. */
            n = b->data[i];
            if ( n >= 0x40 && n <= 0x7F) {
                *(output++) = CHAR(n^0x80); i++; continue;
            }
        }
        if (c == 0 || (c > 8 && c < 0x80))
            *(output++) = CHAR(c);
        else { // Write binary data
            j = i;
            run[0] = c; run_len = 1;
            while (j < b->len && run_len < 8) {
                c = b->data[j];
                if (c == 0 || (c > 8 && c < 0x80)) break;
                run[run_len++] = c; j++;
            }
            i += run_len - 1;
            *(output++) = (char)run_len;
            for (j = 0; j < run_len; j++) *(output++) = CHAR(run[j]);
        }
    }
    return output - head;
}
/*
 * compress(bytestring) -> compressed bytestring
 *
 * PalmDoc-compress a byte string. An empty input yields an empty string.
 * Returns a new string object, or NULL with MemoryError set when a buffer
 * cannot be allocated.
 */
static PyObject *
cpalmdoc_compress(PyObject *self, PyObject *args) {
    const char *_input = NULL; Py_ssize_t input_len = 0;
    Py_ssize_t j = 0;
    buffer b;
    char *output = NULL;
    PyObject *ans = NULL;

    if (!PyArg_ParseTuple(args, "t#", &_input, &input_len))
        return NULL;
    /* Nothing to compress; a zero byte count from the compressor is
     * reserved below for the failure case. */
    if (input_len == 0) return Py_BuildValue("s#", "", (Py_ssize_t)0);
    if (input_len > PY_SSIZE_T_MAX / 2) return PyErr_NoMemory();

    b.data = (Byte *)PyMem_Malloc(sizeof(Byte)*input_len);
    if (b.data == NULL) return PyErr_NoMemory();
    // Map chars to bytes
    for (j = 0; j < input_len; j++)
        b.data[j] = (_input[j] < 0) ? _input[j]+256 : _input[j];
    b.len = input_len;

    /* A lone binary byte is written as a length byte plus the byte itself,
     * so the compressed form can be up to twice as long as the input. */
    output = (char *)PyMem_Malloc(sizeof(char) * 2 * b.len);
    if (output == NULL) {
        PyMem_Free(b.data);
        return PyErr_NoMemory();
    }

    j = cpalmdoc_do_compress(&b, output);
    if (j == 0)
        ans = PyErr_NoMemory();  /* non-empty input must produce output */
    else
        ans = Py_BuildValue("s#", output, j);
    PyMem_Free(output);
    PyMem_Free(b.data);
    return ans;
}
/* Method table of the module: maps the Python-visible names to the C
 * functions above. The all-NULL entry terminates the table. */
static PyMethodDef cPalmdocMethods[] = {
    {"decompress", cpalmdoc_decompress, METH_VARARGS,
    "decompress(bytestring) -> decompressed bytestring\n\n"
    "Decompress a palmdoc compressed byte string. "
    },
    {"compress", cpalmdoc_compress, METH_VARARGS,
    "compress(bytestring) -> compressed bytestring\n\n"
    "Palmdoc compress a byte string. "
    },
    {NULL, NULL, 0, NULL}
};
/* Module initialisation entry point: registers the `cPalmdoc` module with
 * its method table and docstring. */
PyMODINIT_FUNC
initcPalmdoc(void) {
    PyObject *module = Py_InitModule3("cPalmdoc", cPalmdocMethods,
        "Compress and decompress palmdoc strings."
    );
    if (module == NULL) {
        /* Creation failed; there is nothing further to set up. */
        return;
    }
}

View File

@ -2,41 +2,46 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
'and Marshall T. Vandegrift <llasram@gmail.com>'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
from cStringIO import StringIO
from struct import pack
COUNT_BITS = 3
from calibre.constants import plugins
cPalmdoc = plugins['cPalmdoc'][0]
if not cPalmdoc:
raise RuntimeError(('Failed to load required cPalmdoc module: '
'%s')%plugins['cPalmdoc'][1])
def decompress_doc(data):
    '''
    Decompress the PalmDoc compressed byte string `data`.

    The work is delegated to the `cPalmdoc` C extension, which this module
    requires at import time.
    '''
    return cPalmdoc.decompress(data)
def compress_doc(data):
    '''
    PalmDoc-compress the byte string `data` using the `cPalmdoc` C extension
    and return the compressed byte string.
    '''
    compressed = cPalmdoc.compress(data)
    return compressed
def test():
TESTS = [
'abc\x03\x04\x05\x06ms', # Test binary writing
'a b c \xfed ', # Test encoding of spaces
'0123456789axyz2bxyz2cdfgfo9iuyerh',
'0123456789asd0123456789asd|yyzzxxffhhjjkk',
('ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei '
'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ')
]
for test in TESTS:
print 'Test:', repr(test)
print '\tTesting compression...'
good = py_compress_doc(test)
x = compress_doc(test)
print '\t\tgood:', repr(good)
print '\t\tx :', repr(x)
assert x == good
print '\tTesting decompression...'
print '\t\t', repr(decompress_doc(x))
assert decompress_doc(x) == test
print
def py_compress_doc(data):
out = StringIO()
i = 0
ldata = len(data)

View File

@ -0,0 +1,4 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@ -0,0 +1,224 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Command line interface to conversion sub-system
'''
USAGE = '%prog ' + _('''\
input_file output_file [options]
Convert an ebook from one format to another.
input_file is the input and output_file is the output. Both must be \
specified as the first two arguments to the command.
The output ebook format is guessed from the file extension of \
output_file. output_file can also be of the special format .EXT where \
EXT is the output file extension. In this case, the name of the output \
file is derived the name of the input file. Note that the filenames must \
not start with a hyphen. Finally, if output_file has no extension, then \
it is treated as a directory and an "open ebook" (OEB) consisting of HTML \
files is written to that directory. These files are the files that would \
normally have been passed to the output plugin.
After specifying the input \
and output file you can customize the conversion by specifying various \
options. the available options depend on the input and output file types. \
To get help on them specify the input and output file and then use the -h \
option.
For full documentation of the conversion system see
''') + 'http://calibre.kovidgoyal.net/user_manual/conversion.html'
import sys, os
from optparse import OptionGroup, Option
from calibre.utils.config import OptionParser
from calibre.utils.logging import Log
from calibre.constants import preferred_encoding
from calibre.customize.conversion import OptionRecommendation
def print_help(parser, log):
    '''
    Send the parser's formatted help text to `log`, encoded in the
    preferred encoding with unencodable characters replaced.
    '''
    text = parser.format_help()
    log(text.encode(preferred_encoding, 'replace'))
def check_command_line_options(parser, args, log):
    '''
    Validate the two positional arguments and return them as absolute paths.

    :param parser: Option parser, used only to print help on error.
    :param args: Full argument list; ``args[1]`` is the input file and
                 ``args[2]`` the output.
    :param log: Log object used to report errors.
    :return: ``(input, output)`` as absolute paths.
    :raises SystemExit: If either argument is missing or starts with a
                        hyphen, or the input is unreadable and is not a
                        ``.recipe``.

    An output of the form ``.EXT`` is expanded to the input file's base name
    plus that extension. Only a bare extension qualifies: ``.``, ``..`` and
    anything containing a path separator (``./out.epub``, ``../x/out.epub``)
    are ordinary paths and are left alone.
    '''
    if len(args) < 3 or args[1].startswith('-') or args[2].startswith('-'):
        print_help(parser, log)
        log.error('\n\nYou must specify the input AND output files')
        raise SystemExit(1)

    input_path = os.path.abspath(args[1])
    if not input_path.endswith('.recipe') and \
            not os.access(input_path, os.R_OK):
        log.error('Cannot read from', input_path)
        raise SystemExit(1)

    output = args[2]
    is_bare_extension = (output.startswith('.') and
            output[:2] not in ('..', '.') and
            '/' not in output and '\\' not in output)
    if is_bare_extension:
        stem = os.path.splitext(os.path.basename(input_path))[0]
        output = stem + output
    output = os.path.abspath(output)

    return input_path, output
def option_recommendation_to_cli_option(add_option, rec):
    '''
    Build an :class:`optparse.Option` from the recommendation `rec` and hand
    it to the callable `add_option`.

    The recommended value becomes the option's default. A boolean
    recommendation produces a flag whose action flips that default:
    ``store_true`` when the default is False, ``store_false`` otherwise.
    '''
    opt = rec.option
    default = rec.recommended_value

    switches = []
    if opt.short_switch:
        switches.append('-' + opt.short_switch)
    switches.append('--' + opt.long_switch)

    attrs = {
        'dest': opt.name,
        'help': opt.help,
        'choices': opt.choices,
        'default': default,
    }
    if isinstance(default, bool):
        if default:
            attrs['action'] = 'store_false'
        else:
            attrs['action'] = 'store_true'

    add_option(Option(*switches, **attrs))
def add_input_output_options(parser, plumber):
    '''
    Add the plumber's input and output options to `parser`, each set in its
    own option group. A group is only created when it has options.
    '''
    def populate(add_option, recommendations):
        for rec in recommendations:
            option_recommendation_to_cli_option(add_option, rec)

    input_options = plumber.input_options
    output_options = plumber.output_options

    if input_options:
        input_group = OptionGroup(parser, _('INPUT OPTIONS'),
                _('Options to control the processing'
                  ' of the input %s file')%plumber.input_fmt)
        populate(input_group.add_option, input_options)
        parser.add_option_group(input_group)

    if output_options:
        output_group = OptionGroup(parser, _('OUTPUT OPTIONS'),
                _('Options to control the processing'
                  ' of the output %s')%plumber.output_fmt)
        populate(output_group.add_option, output_options)
        parser.add_option_group(output_group)
def add_pipeline_options(parser, plumber):
    '''
    Add the format-independent pipeline options to `parser`.

    Options are added section by section; the untitled first section goes
    directly onto the parser, every other section gets its own option
    group. An option is left out when its recommendation level has reached
    HIGH. The ``list_recipes`` option is always added to the parser itself.
    '''
    sections = [
        ('', '',
            [
                'input_profile',
                'output_profile',
            ]
        ),

        ('LOOK AND FEEL',
            _('Options to control the look and feel of the output'),
            [
                'base_font_size', 'disable_font_rescaling',
                'font_size_mapping',
                'line_height',
                'linearize_tables',
                'extra_css',
                'margin_top', 'margin_left', 'margin_right',
                'margin_bottom', 'dont_justify',
                'insert_blank_line', 'remove_paragraph_spacing',
            ]
        ),

        ('STRUCTURE DETECTION',
            _('Control auto-detection of document structure.'),
            [
                'chapter', 'chapter_mark',
                'prefer_metadata_cover', 'remove_first_image',
                'insert_metadata', 'page_breaks_before',
                'preprocess_html',
            ]
        ),

        ('TABLE OF CONTENTS',
            _('Control the automatic generation of a Table of Contents. By '
            'default, if the source file has a Table of Contents, it will '
            'be used in preference to the automatically generated one.'),
            [
                'level1_toc', 'level2_toc', 'level3_toc',
                'toc_threshold', 'max_toc_links', 'no_chapters_in_toc',
                'use_auto_toc', 'toc_filter',
            ]
        ),

        ('METADATA', _('Options to set metadata in the output'),
            plumber.metadata_option_names,
        ),

        ('DEBUG', _('Options to help with debugging the conversion'),
            [
                'verbose',
            ]
        ),
    ]

    for title, description, names in sections:
        if title:
            option_group = OptionGroup(parser, title, description)
            parser.add_option_group(option_group)
            add_option = option_group.add_option
        else:
            add_option = parser.add_option

        for name in names:
            rec = plumber.get_option_by_name(name)
            if rec.level < rec.HIGH:
                option_recommendation_to_cli_option(add_option, rec)

    option_recommendation_to_cli_option(parser.add_option,
            plumber.get_option_by_name('list_recipes'))
def option_parser():
    '''Return a new option parser whose usage text is `USAGE`.'''
    parser = OptionParser(usage=USAGE)
    return parser
class ProgressBar(object):
    '''
    Progress reporter that writes ``"<percent>% <message>"`` lines to a log
    callable. Calls without a message are ignored.
    '''

    def __init__(self, log):
        self.log = log

    def __call__(self, frac, msg=''):
        if not msg:
            return
        percent = int(frac*100)
        self.log('%d%% %s'%(percent, msg))
def create_option_parser(args, log):
    '''
    Build the option parser for the conversion described by `args`.

    Prints help and exits with status 1 when fewer than three arguments are
    given. Otherwise the input and output paths are validated, a
    :class:`Plumber` is created for them and its options are added to the
    parser.

    :return: ``(parser, plumber)``
    '''
    parser = option_parser()
    if len(args) < 3:
        print_help(parser, log)
        raise SystemExit(1)

    input, output = check_command_line_options(parser, args, log)

    from calibre.ebooks.conversion.plumber import Plumber

    plumber = Plumber(input, output, log, ProgressBar(log))
    add_input_output_options(parser, plumber)
    add_pipeline_options(parser, plumber)

    return parser, plumber
def main(args=sys.argv):
    '''
    Command line entry point: parse `args`, pass every parsed option to the
    plumber as a HIGH level recommendation and run the conversion.

    :return: 0 on completion.
    '''
    log = Log()
    parser, plumber = create_option_parser(args, log)
    opts = parser.parse_args(args)[0]

    def absolute(path):
        return os.path.abspath(os.path.expanduser(path))

    # Path valued options are made absolute before the plumber sees them
    for name in ('read_metadata_from_opf', 'cover'):
        current = getattr(opts, name, None)
        if current is not None:
            setattr(opts, name, absolute(current))

    recommendations = []
    for option in parser.options_iter():
        if option.dest:
            recommendations.append((option.dest, getattr(opts, option.dest),
                OptionRecommendation.HIGH))

    plumber.merge_ui_recommendations(recommendations)
    plumber.run()

    if plumber.opts.debug_input is None:
        log(_('Output saved to'), ' ', plumber.output)

    return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -0,0 +1,96 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.utils.config import config_dir
from calibre.utils.lock import ExclusiveFile
from calibre import sanitize_file_name
from calibre.customize.conversion import OptionRecommendation
config_dir = os.path.join(config_dir, 'conversion')
if not os.path.exists(config_dir):
os.makedirs(config_dir)
def name_to_path(name):
    '''
    Return the path of the ``.py`` file inside `config_dir` that stores the
    defaults called `name` (the name is sanitized first).
    '''
    filename = sanitize_file_name(name)+'.py'
    return os.path.join(config_dir, filename)
def save_defaults(name, recs):
    '''
    Write ``str(recs)`` to the defaults file for `name`, replacing any
    previous contents.
    '''
    path = name_to_path(name)
    raw = str(recs)

    # Create/truncate the file first, then write under an exclusive lock
    open(path, 'wb').close()

    with ExclusiveFile(path) as f:
        f.write(raw)
def load_defaults(name):
    '''
    Read the defaults stored for `name` and return them as a
    :class:`GuiRecommendations`. The file is created empty if it does not
    exist; an empty file gives empty recommendations.
    '''
    path = name_to_path(name)
    if not os.path.exists(path):
        with open(path, 'wb'):
            pass

    with ExclusiveFile(path) as f:
        raw = f.read()

    recs = GuiRecommendations()
    if raw:
        recs.from_string(raw)
    return recs
def save_specifics(db, book_id, recs):
    '''
    Store ``str(recs)`` as the ``'PIPE'`` conversion options of the book
    `book_id` in `db`.
    '''
    db.set_conversion_options(book_id, 'PIPE', str(recs))
def load_specifics(db, book_id):
    '''
    Return the ``'PIPE'`` conversion options stored in `db` for the book
    `book_id` as a :class:`GuiRecommendations` (empty if nothing is stored).
    '''
    recs = GuiRecommendations()
    raw = db.conversion_options(book_id, 'PIPE')
    if raw:
        recs.from_string(raw)
    return recs
class GuiRecommendations(dict):
    '''
    A mapping of option name to value, plus `disabled_options`: the set of
    option names that :meth:`merge_recommendations` found to be fixed at
    the HIGH recommendation level.
    '''

    def __new__(cls, *args):
        obj = super(GuiRecommendations, cls).__new__(cls, *args)
        obj.disabled_options = set()
        return obj

    def to_recommendations(self, level=OptionRecommendation.LOW):
        '''Return the contents as a list of ``(name, value, level)``.'''
        return [(key, val, level) for key, val in self.items()]

    def __str__(self):
        '''Serialize as a dict literal, one ``key : value,`` per line.'''
        lines = ['\t'+repr(key)+' : '+repr(val)+',' for key, val in
                self.items()]
        return '\n'.join(['{'] + lines + ['}'])

    def from_string(self, raw):
        '''
        Update this mapping from text produced by :meth:`__str__`. Text
        that is not valid Python syntax is ignored.
        '''
        # NOTE(review): eval() executes whatever `raw` contains. The callers
        # visible here pass text read from the config directory or the
        # database; confirm that it can never come from an untrusted source.
        try:
            parsed = eval(raw)
        except SyntaxError:
            parsed = None
        if parsed:
            self.update(parsed)

    def merge_recommendations(self, get_option, level, options,
            only_existing=False):
        '''
        Merge the recommendations returned by ``get_option(name)`` for each
        name in `options`.

        A HIGH level option always overrides the stored value and is added
        to `disabled_options`. Any other option overrides only when its
        level is above `level` or when no value is stored yet. With
        `only_existing`, names that are not already stored are skipped, as
        are names for which `get_option` returns None.
        '''
        for name in options:
            if only_existing and name not in self:
                continue
            opt = get_option(name)
            if opt is None:
                continue
            forced = opt.level == OptionRecommendation.HIGH
            if forced or opt.level > level or name not in self:
                self[name] = opt.recommended_value
            if forced:
                self.disabled_options.add(name)

View File

@ -0,0 +1,690 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, re
from calibre.customize.conversion import OptionRecommendation, DummyReporter
from calibre.customize.ui import input_profiles, output_profiles, \
plugin_for_input_format, plugin_for_output_format
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre import extract, walk
def supported_input_formats():
    '''
    Return the available input formats with the archive extensions
    ``zip``, ``rar`` and ``oebzip`` added.
    '''
    from calibre.customize.ui import available_input_formats
    formats = available_input_formats()
    for archive_ext in ('zip', 'rar', 'oebzip'):
        formats.add(archive_ext)
    return formats
INPUT_FORMAT_PREFERENCES = ['cbr', 'cbz', 'cbc', 'lit', 'mobi', 'prc', 'azw', 'fb2', 'html',
'rtf', 'pdf', 'txt', 'pdb']
OUTPUT_FORMAT_PREFERENCES = ['epub', 'mobi', 'lit', 'pdf', 'pdb', 'txt']
class OptionValues(object):
    '''Empty attribute container; option values are set on instances.'''
class CompositeProgressReporter(object):
    '''
    Adapts the progress of one stage to a slice of the overall progress:
    a stage fraction is mapped linearly onto
    ``[global_min, global_max]`` and passed to `global_reporter`.
    '''

    def __init__(self, global_min, global_max, global_reporter):
        self.global_min = global_min
        self.global_max = global_max
        self.global_reporter = global_reporter

    def __call__(self, fraction, msg=''):
        span = self.global_max - self.global_min
        self.global_reporter(self.global_min + fraction * span, msg)
class Plumber(object):
'''
The `Plumber` manages the conversion pipeline. A UI should call the methods
:meth:`merge_ui_recommendations` and then :meth:`run`. The plumber will
take care of the rest.
'''
metadata_option_names = [
'title', 'authors', 'title_sort', 'author_sort', 'cover', 'comments',
'publisher', 'series', 'series_index', 'rating', 'isbn',
'tags', 'book_producer', 'language'
]
def __init__(self, input, output, log, report_progress=DummyReporter()):
    '''
    Set up a conversion from `input` to `output`.

    Declares the format-independent pipeline options, works out the input
    and output formats from the file extensions (unpacking zip/rar/oebzip
    archives first), looks up the matching plugins and merges their
    recommendations into the option set.

    :param input: Path to input file.
    :param output: Path to output file/directory
    :param log: Log object; called directly to emit messages.
    :param report_progress: Callable stored as `self.ui_reporter`. The
        default instance is created once, when the method is defined.
    :raises ValueError: If the input file has no extension or no plugin
        handles the input or output format.
    '''
    self.input = os.path.abspath(input)
    self.output = os.path.abspath(output)
    self.log = log
    self.ui_reporter = report_progress

    # Initialize the conversion options that are independent of input and
    # output formats. The input and output plugins can still disable these
    # options via recommendations.
    self.pipeline_options = [

    OptionRecommendation(name='verbose',
        recommended_value=0, level=OptionRecommendation.LOW,
        short_switch='v',
        help=_('Level of verbosity. Specify multiple times for greater '
               'verbosity.')
    ),

    OptionRecommendation(name='input_profile',
        recommended_value='default', level=OptionRecommendation.LOW,
        choices=[x.short_name for x in input_profiles()],
        help=_('Specify the input profile. The input profile gives the '
               'conversion system information on how to interpret '
               'various information in the input document. For '
               'example resolution dependent lengths (i.e. lengths in '
               'pixels). Choices are:')+\
        ', '.join([x.short_name for x in input_profiles()])
    ),

    OptionRecommendation(name='output_profile',
        recommended_value='default', level=OptionRecommendation.LOW,
        choices=[x.short_name for x in output_profiles()],
        help=_('Specify the output profile. The output profile '
               'tells the conversion system how to optimize the '
               'created document for the specified device. In some cases, '
               'an output profile is required to produce documents that '
               'will work on a device. For example EPUB on the SONY reader. '
               'Choices are:') + \
        ', '.join([x.short_name for x in output_profiles()])
    ),

    OptionRecommendation(name='base_font_size',
        recommended_value=0, level=OptionRecommendation.LOW,
        help=_('The base font size in pts. All font sizes in the produced book '
               'will be rescaled based on this size. By choosing a larger '
               'size you can make the fonts in the output bigger and vice '
               'versa. By default, the base font size is chosen based on '
               'the output profile you chose.'
        )
    ),

    OptionRecommendation(name='font_size_mapping',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Mapping from CSS font names to font sizes in pts. '
               'An example setting is 12,12,14,16,18,20,22,24. '
               'These are the mappings for the sizes xx-small to xx-large, '
               'with the final size being for huge fonts. The font '
               'rescaling algorithm uses these sizes to intelligently '
               'rescale fonts. The default is to use a mapping based on '
               'the output profile you chose.'
        )
    ),

    OptionRecommendation(name='disable_font_rescaling',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Disable all rescaling of font sizes.'
        )
    ),

    OptionRecommendation(name='line_height',
        recommended_value=0, level=OptionRecommendation.LOW,
        help=_('The line height in pts. Controls spacing between consecutive '
               'lines of text. By default no line height manipulation is '
               'performed.'
        )
    ),

    OptionRecommendation(name='linearize_tables',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Some badly designed documents use tables to control the '
            'layout of text on the page. When converted these documents '
            'often have text that runs off the page and other artifacts. '
            'This option will extract the content from the tables and '
            'present it in a linear fashion.'
        )
    ),

    OptionRecommendation(name='level1_toc',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('XPath expression that specifies all tags that '
        'should be added to the Table of Contents at level one. If '
        'this is specified, it takes precedence over other forms '
        'of auto-detection.'
        )
    ),

    OptionRecommendation(name='level2_toc',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('XPath expression that specifies all tags that should be '
        'added to the Table of Contents at level two. Each entry is added '
        'under the previous level one entry.'
        )
    ),

    OptionRecommendation(name='level3_toc',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('XPath expression that specifies all tags that should be '
            'added to the Table of Contents at level three. Each entry '
            'is added under the previous level two entry.'
        )
    ),

    OptionRecommendation(name='use_auto_toc',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Normally, if the source file already has a Table of '
        'Contents, it is used in preference to the auto-generated one. '
        'With this option, the auto-generated one is always used.'
        )
    ),

    OptionRecommendation(name='no_chapters_in_toc',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_("Don't add auto-detected chapters to the Table of "
        'Contents.'
        )
    ),

    OptionRecommendation(name='toc_threshold',
        recommended_value=6, level=OptionRecommendation.LOW,
        help=_(
        'If fewer than this number of chapters is detected, then links '
        'are added to the Table of Contents. Default: %default')
    ),

    OptionRecommendation(name='max_toc_links',
        recommended_value=50, level=OptionRecommendation.LOW,
        help=_('Maximum number of links to insert into the TOC. Set to 0 '
           'to disable. Default is: %default. Links are only added to the '
           'TOC if less than the threshold number of chapters were detected.'
        )
    ),

    OptionRecommendation(name='toc_filter',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Remove entries from the Table of Contents whose titles '
        'match the specified regular expression. Matching entries and all '
        'their children are removed.'
        )
    ),

    OptionRecommendation(name='chapter',
        recommended_value="//*[((name()='h1' or name()='h2') and "
          r"re:test(., 'chapter|book|section|part\s+', 'i')) or @class "
          "= 'chapter']", level=OptionRecommendation.LOW,
        help=_('An XPath expression to detect chapter titles. The default '
            'is to consider <h1> or <h2> tags that contain the words '
            '"chapter","book","section" or "part" as chapter titles as '
            'well as any tags that have class="chapter". The expression '
            'used must evaluate to a list of elements. To disable chapter '
            'detection, use the expression "/". See the XPath Tutorial '
            'in the calibre User Manual for further help on using this '
            'feature.'
        )
    ),

    OptionRecommendation(name='chapter_mark',
        recommended_value='pagebreak', level=OptionRecommendation.LOW,
        choices=['pagebreak', 'rule', 'both', 'none'],
        help=_('Specify how to mark detected chapters. A value of '
                '"pagebreak" will insert page breaks before chapters. '
                'A value of "rule" will insert a line before chapters. '
                'A value of "none" will disable chapter marking and a '
                'value of "both" will use both page breaks and lines '
                'to mark chapters.')
    ),

    OptionRecommendation(name='extra_css',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Either the path to a CSS stylesheet or raw CSS. '
            'This CSS will be appended to the style rules from '
            'the source file, so it can be used to override those '
            'rules.')
    ),

    OptionRecommendation(name='page_breaks_before',
        recommended_value="//*[name()='h1' or name()='h2']",
        level=OptionRecommendation.LOW,
        help=_('An XPath expression. Page breaks are inserted '
            'before the specified elements.')
    ),

    OptionRecommendation(name='margin_top',
        recommended_value=5.0, level=OptionRecommendation.LOW,
        help=_('Set the top margin in pts. Default is %default. '
            'Note: 72 pts equals 1 inch')),

    OptionRecommendation(name='margin_bottom',
        recommended_value=5.0, level=OptionRecommendation.LOW,
        help=_('Set the bottom margin in pts. Default is %default. '
            'Note: 72 pts equals 1 inch')),

    OptionRecommendation(name='margin_left',
        recommended_value=5.0, level=OptionRecommendation.LOW,
        help=_('Set the left margin in pts. Default is %default. '
            'Note: 72 pts equals 1 inch')),

    OptionRecommendation(name='margin_right',
        recommended_value=5.0, level=OptionRecommendation.LOW,
        help=_('Set the right margin in pts. Default is %default. '
            'Note: 72 pts equals 1 inch')),

    OptionRecommendation(name='dont_justify',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Do not force text to be justified in output. Whether text '
            'is actually displayed justified or not depends on whether '
            'the ebook format and reading device support justification.')
    ),

    OptionRecommendation(name='remove_paragraph_spacing',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Remove spacing between paragraphs. Also sets an indent on '
        'paragraphs of 1.5em. Spacing removal will not work '
        'if the source file does not use paragraphs (<p> or <div> tags).')
    ),

    OptionRecommendation(name='prefer_metadata_cover',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Use the cover detected from the source file in preference '
        'to the specified cover.')
    ),

    OptionRecommendation(name='insert_blank_line',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Insert a blank line between paragraphs. Will not work '
            'if the source file does not use paragraphs (<p> or <div> tags).'
        )
    ),

    OptionRecommendation(name='remove_first_image',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Remove the first image from the input ebook. Useful if the '
        'first image in the source file is a cover and you are specifying '
        'an external cover.'
        )
    ),

    OptionRecommendation(name='insert_metadata',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Insert the book metadata at the start of '
        'the book. This is useful if your ebook reader does not support '
        'displaying/searching metadata directly.'
        )
    ),

    OptionRecommendation(name='preprocess_html',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Attempt to detect and correct hard line breaks and other '
        'problems in the source file. This may make things worse, so use '
        'with care.'
        )
    ),

    OptionRecommendation(name='read_metadata_from_opf',
        recommended_value=None, level=OptionRecommendation.LOW,
        short_switch='m',
        help=_('Read metadata from the specified OPF file. Metadata read '
               'from this file will override any metadata in the source '
               'file.')
    ),

    OptionRecommendation(name='title',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Set the title.')),

    OptionRecommendation(name='authors',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Set the authors. Multiple authors should be separated by '
        'ampersands.')),

    OptionRecommendation(name='title_sort',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('The version of the title to be used for sorting. ')),

    OptionRecommendation(name='author_sort',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('String to be used when sorting by author. ')),

    OptionRecommendation(name='cover',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Set the cover to the specified file.')),

    OptionRecommendation(name='comments',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Set the ebook description.')),

    OptionRecommendation(name='publisher',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Set the ebook publisher.')),

    OptionRecommendation(name='series',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Set the series this ebook belongs to.')),

    OptionRecommendation(name='series_index',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Set the index of the book in this series.')),

    OptionRecommendation(name='rating',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Set the rating. Should be a number between 1 and 5.')),

    OptionRecommendation(name='isbn',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Set the ISBN of the book.')),

    OptionRecommendation(name='tags',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Set the tags for the book. Should be a comma separated list.')),

    OptionRecommendation(name='book_producer',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Set the book producer.')),

    OptionRecommendation(name='language',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Set the language.')),

    OptionRecommendation(name='list_recipes',
        recommended_value=False, help=_('List available recipes.')),

    ]

    # The input format is the lower-cased extension of the input file
    input_fmt = os.path.splitext(self.input)[1]
    if not input_fmt:
        raise ValueError('Input file must have an extension')
    input_fmt = input_fmt[1:].lower()
    # Archives are unpacked and the ebook found inside becomes the input
    if input_fmt in ('zip', 'rar', 'oebzip'):
        self.log('Processing archive...')
        tdir = PersistentTemporaryDirectory('_plumber')
        self.input, input_fmt = self.unarchive(self.input, tdir)

    # An existing directory, or an output path without an extension, means
    # output in the 'oeb' format
    if os.path.exists(self.output) and os.path.isdir(self.output):
        output_fmt = 'oeb'
    else:
        output_fmt = os.path.splitext(self.output)[1]
        if not output_fmt:
            output_fmt = '.oeb'
        output_fmt = output_fmt[1:].lower()

    self.input_plugin = plugin_for_input_format(input_fmt)
    self.output_plugin = plugin_for_output_format(output_fmt)

    if self.input_plugin is None:
        raise ValueError('No plugin to handle input format: '+input_fmt)

    if self.output_plugin is None:
        raise ValueError('No plugin to handle output format: '+output_fmt)

    self.input_fmt = input_fmt
    self.output_fmt = output_fmt

    # Build set of all possible options. Two options are equal if their
    # names are the same.
    self.input_options = self.input_plugin.options.union(
                                self.input_plugin.common_options)
    self.output_options = self.output_plugin.options.union(
                                self.output_plugin.common_options)

    # Remove the options that have been disabled by recommendations from the
    # plugins.
    self.merge_plugin_recommendations()
@classmethod
def unarchive(cls, path, tdir):
    '''
    Extract the archive `path` into the directory `tdir` and locate the
    ebook inside it.

    Every available non-HTML input format is tried first; ``txt`` and
    ``rtf`` files smaller than 2048 bytes are passed over. If nothing
    matches, the most likely root HTML file is used instead.

    :return: ``(path_to_file, extension)``
    :raises ValueError: From :meth:`find_html_index` when the archive
        contains no HTML file either.
    '''
    extract(path, tdir)
    files = list(walk(tdir))
    from calibre.customize.ui import available_input_formats
    # Work on a copy so the collection returned by the plugin system is
    # never modified, and use discard() so that a missing HTML extension
    # is not an error.
    fmts = set(available_input_formats())
    for x in ('htm', 'html', 'xhtm', 'xhtml'):
        fmts.discard(x)

    for ext in fmts:
        for f in files:
            if f.lower().endswith('.'+ext):
                if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048:
                    continue
                return f, ext
    return cls.find_html_index(files)
@classmethod
def find_html_index(self, files):
'''
Given a list of files, find the most likely root HTML file in the
list.
'''
html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE)
html_files = [f for f in files if html_pat.search(f) is not None]
if not html_files:
raise ValueError(_('Could not find an ebook inside the archive'))
html_files = [(f, os.stat(f).st_size) for f in html_files]
html_files.sort(cmp = lambda x, y: cmp(x[1], y[1]))
html_files = [f[0] for f in html_files]
for q in ('toc', 'index'):
for f in html_files:
if os.path.splitext(os.path.basename(f))[0].lower() == q:
return f, os.path.splitext(f)[1].lower()[1:]
return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:]
def get_option_by_name(self, name):
for group in (self.input_options, self.pipeline_options,
self.output_options):
for rec in group:
if rec.option == name:
return rec
def get_option_help(self, name):
rec = self.get_option_by_name(name)
help = getattr(rec, 'help', None)
if help is not None:
return help.replace('%default', str(rec.recommended_value))
def merge_plugin_recommendations(self):
for source in (self.input_plugin, self.output_plugin):
for name, val, level in source.recommendations:
rec = self.get_option_by_name(name)
if rec is not None and rec.level <= level:
rec.recommended_value = val
rec.level = level
def merge_ui_recommendations(self, recommendations):
'''
Merge recommendations from the UI. As long as the UI recommendation
level is >= the baseline recommended level, the UI value is used,
*except* if the baseline has a recommendation level of `HIGH`.
'''
for name, val, level in recommendations:
rec = self.get_option_by_name(name)
if rec is not None and rec.level <= level and rec.level < rec.HIGH:
rec.recommended_value = val
rec.level = level
def read_user_metadata(self):
    '''
    Read all metadata specified by the user. Command line options override
    metadata from a specified OPF file.

    The result is stored in `self.user_metadata`. If a cover path is set,
    the file is read into ``cover_data`` and the path is cleared. Both the
    OPF file and the cover file are closed once they have been read.
    '''
    from calibre.ebooks.metadata import MetaInformation, string_to_authors
    from calibre.ebooks.metadata.opf2 import OPF
    mi = MetaInformation(None, [])
    if self.opts.read_metadata_from_opf is not None:
        self.opts.read_metadata_from_opf = os.path.abspath(
                                        self.opts.read_metadata_from_opf)
        # The stream stays open until the metadata has been copied out of
        # the OPF object, in case the OPF is parsed on demand.
        with open(self.opts.read_metadata_from_opf, 'rb') as stream:
            opf = OPF(stream,
                      os.path.dirname(self.opts.read_metadata_from_opf))
            mi = MetaInformation(opf)
    for x in self.metadata_option_names:
        val = getattr(self.opts, x, None)
        if val is not None:
            if x == 'authors':
                val = string_to_authors(val)
            elif x == 'tags':
                val = [i.strip() for i in val.split(',')]
            elif x in ('rating', 'series_index'):
                val = float(val)
            setattr(mi, x, val)
    if mi.cover:
        with open(mi.cover, 'rb') as cover_file:
            mi.cover_data = ('', cover_file.read())
        mi.cover = None
    self.user_metadata = mi
def setup_options(self):
    '''
    Create `self.opts` and set one attribute per option, holding the
    option's recommended value. The input and output profile names are
    then replaced by the matching profile objects (a name that matches no
    profile is left as it is) and the user metadata is read.
    '''
    opts = OptionValues()
    self.opts = opts
    groups = (self.input_options, self.pipeline_options,
            self.output_options)
    for group in groups:
        for rec in group:
            setattr(opts, rec.option.name, rec.recommended_value)

    for profile in input_profiles():
        if profile.short_name == opts.input_profile:
            opts.input_profile = profile
            break

    for profile in output_profiles():
        if profile.short_name == opts.output_profile:
            opts.output_profile = profile
            break

    self.read_user_metadata()
def run(self):
'''
Run the conversion pipeline: set up the options, let the input plugin
produce a book, apply the OEB transforms in a fixed order and hand the
result to the output plugin.

Raises `SystemExit` after printing the recipe list when `list_recipes`
is set and returns early, without writing output, when `debug_input`
is set.
'''
# Setup baseline option values
self.setup_options()
if self.opts.verbose:
self.log.filter_level = self.log.DEBUG
if self.opts.list_recipes:
from calibre.web.feeds.recipes import titles
self.log('Available recipes:')
for title in sorted(titles):
self.log('\t'+title)
self.log('%d recipes available'%len(titles))
raise SystemExit(0)
# Run any preprocess plugins
from calibre.customize.ui import run_plugins_on_preprocess
self.input = run_plugins_on_preprocess(self.input)
# Create an OEBBook from the input file. The input plugin does all the
# heavy lifting.
accelerators = {}
tdir = PersistentTemporaryDirectory('_plumber')
# A recipe input is passed through as is, anything else is opened as a
# binary stream
stream = self.input if self.input_fmt == 'recipe' else \
open(self.input, 'rb')
# Set the lrf flag on the options (only if such an option exists) when
# the output plugin produces LRF
if hasattr(self.opts, 'lrf') and self.output_plugin.file_type == 'lrf':
self.opts.lrf = True
self.ui_reporter(0.01, _('Converting input to HTML...'))
# Reporter handed to the input plugin, built with the bounds 0.01 and 0.34
ir = CompositeProgressReporter(0.01, 0.34, self.ui_reporter)
self.input_plugin.report_progress = ir
self.oeb = self.input_plugin(stream, self.opts,
self.input_fmt, self.log,
accelerators, tdir)
if self.opts.debug_input is not None:
self.log('Debug input called, aborting the rest of the pipeline.')
return
# If the plugin result has no manifest attribute, build the OEBBook from
# it with create_oebbook
if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
self.input_plugin)
# Reporter for the transform stage, built with the bounds 0.34 and 0.67
pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
pr(0., _('Running transforms on ebook...'))
from calibre.ebooks.oeb.transforms.guide import Clean
Clean()(self.oeb, self.opts)
pr(0.1)
# Expose the profiles under the names source and dest
self.opts.source = self.opts.input_profile
self.opts.dest = self.opts.output_profile
from calibre.ebooks.oeb.transforms.metadata import MergeMetadata
MergeMetadata()(self.oeb, self.user_metadata,
self.opts.prefer_metadata_cover)
pr(0.2)
from calibre.ebooks.oeb.transforms.structure import DetectStructure
DetectStructure()(self.oeb, self.opts)
pr(0.35)
from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
# A base font size of (nearly) zero means: take fbase from the output
# profile
fbase = self.opts.base_font_size
if fbase < 1e-4:
fbase = float(self.opts.dest.fbase)
# The font size mapping is either the fkey of the output profile or a
# comma separated list of numbers supplied as an option
fkey = self.opts.font_size_mapping
if fkey is None:
fkey = self.opts.dest.fkey
else:
fkey = map(float, fkey.split(','))
from calibre.ebooks.oeb.transforms.jacket import Jacket
Jacket()(self.oeb, self.opts, self.user_metadata)
pr(0.4)
# extra_css may name an existing file, in which case it is replaced by
# the contents of that file
if self.opts.extra_css and os.path.exists(self.opts.extra_css):
self.opts.extra_css = open(self.opts.extra_css, 'rb').read()
# Remember these two options: they are switched off for LRF output while
# the CSS is flattened and restored right afterwards
oibl = self.opts.insert_blank_line
orps = self.opts.remove_paragraph_spacing
if self.output_plugin.file_type == 'lrf':
self.opts.insert_blank_line = False
self.opts.remove_paragraph_spacing = False
# A line height of (nearly) zero is passed on as None
line_height = self.opts.line_height
if line_height < 1e-4:
line_height = None
flattener = CSSFlattener(fbase=fbase, fkey=fkey,
lineh=line_height,
untable=self.output_plugin.file_type in ('mobi','lit'),
unfloat=self.output_plugin.file_type in ('mobi', 'lit'))
flattener(self.oeb, self.opts)
self.opts.insert_blank_line = oibl
self.opts.remove_paragraph_spacing = orps
# Tables are linearized on request, except for MOBI and LRF output
if self.opts.linearize_tables and \
self.output_plugin.file_type not in ('mobi', 'lrf'):
from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
LinearizeTables()(self.oeb, self.opts)
pr(0.9)
from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer
self.log.info('Cleaning up manifest...')
trimmer = ManifestTrimmer()
trimmer(self.oeb, self.opts)
self.oeb.toc.rationalize_play_orders()
pr(1.)
self.log.info('Creating %s...'%self.output_plugin.name)
# Reporter for the output stage, built with the bounds 0.67 and 1
our = CompositeProgressReporter(0.67, 1., self.ui_reporter)
self.output_plugin.report_progress = our
our(0., _('Creating')+' %s'%self.output_plugin.name)
self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
self.opts, self.log)
self.ui_reporter(1.)
self.log(self.output_fmt.upper(), 'output written to', self.output)
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
        encoding='utf-8'):
    '''
    Create an OEBBook and fill it from `path_or_stream`.

    :param reader: class whose instances are called as
                   ``reader()(oeb, path_or_stream)``; `OEBReader` is used
                   when it is None.
    :return: the populated OEBBook
    '''
    from calibre.ebooks.oeb.base import OEBBook
    preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
                                    opts.preprocess_html)
    book = OEBBook(log, preprocessor, pretty_print=opts.pretty_print,
                   input_encoding=encoding)
    # Read OEB Book into OEBBook
    log('Parsing all content...')
    if reader is None:
        from calibre.ebooks.oeb.reader import OEBReader
        reader = OEBReader

    populate = reader()
    populate(book, path_or_stream)
    return book

View File

@ -0,0 +1,202 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, functools
from calibre import entity_to_unicode
# Matches an XML declaration (<?xml ... ?>) at the very start of a string,
# optionally preceded by whitespace
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
# Namespace URIs for SVG and XLink
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
# entity_to_unicode with the five XML special entities passed as `exceptions`
# (presumably these are left unconverted -- confirm against entity_to_unicode)
convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)

def sanitize_head(match):
    '''
    Substitution callback: rebuild a <head> element from group 1 of `match`
    with every <span>...</span> removed.
    '''
    cleaned = _span_pat.sub('', match.group(1))
    return ''.join(('<head>\n', cleaned, '\n</head>'))
def chap_head(match):
    '''
    Substitution callback: turn the named groups ``chap`` and ``title`` of
    `match` into an <h1> heading. The title line is omitted when the
    ``title`` group is empty or did not participate in the match.
    '''
    chapter = match.group('chap')
    heading = match.group('title')
    if heading:
        return '<h1>'+chapter+'<br/>\n'+heading+'</h1><br/>\n'
    return '<h1>'+chapter+'</h1><br/>\n'
def wrap_lines(match):
    '''
    Substitution callback: replace the match by a single space, preceded by
    the text of the named group ``ital`` when that group matched something.
    '''
    closing_tag = match.group('ital')
    return (closing_tag if closing_tag else '') + ' '
def line_length(raw, percent):
    '''
    Estimate the line length to use for un-wrapping `raw`.

    :param raw: the raw text; lines are the runs of text between two
                ``<br>`` tags.
    :param percent: a decimal number, 0 - 1 (values outside that range are
                clamped), which determines how far into the sorted list of
                line lengths to look.
    :return: the selected line length, or 0 if `raw` contains no non-empty
                line.
    '''
    raw = raw.replace('&nbsp;', ' ')
    linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
    lengths = sorted(len(line) for line in linere.findall(raw) if line)
    if not lengths:
        # Nothing to measure; previously this was a ZeroDivisionError
        return 0

    # Drop outliers: anything longer than twice the (integer) average
    avg = sum(lengths) // len(lengths)
    max_line = avg * 2
    lengths = [length for length in lengths if length <= max_line]

    percent = min(1, max(0, percent))
    # Never let the index become -1, which would silently select the
    # longest line for small values of percent
    index = max(0, int(len(lengths) * percent) - 1)
    return lengths[index]
class CSSPreProcessor(object):
    '''Callable that removes every ``@page ... { ... }`` rule from CSS text.'''

    PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')

    def __call__(self, data):
        '''Return `data` with all matches of `PAGE_PAT` deleted.'''
        return self.PAGE_PAT.sub('', data)
class HTMLPreProcessor(object):
PREPROCESS = [
# Some idiotic HTML generators (Frontpage I'm looking at you)
# Put all sorts of crap into <head>. This messes up lxml
(re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
sanitize_head),
# Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), convert_entities),
# Remove the <![if/endif tags inserted by everybody's darling, MS Word
(re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
lambda match: ''),
]
# Fix pdftohtml markup
PDFTOHTML = [
# Fix umlauts
(re.compile(u'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ö'),
(re.compile(u'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ö'),
(re.compile(u'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ü'),
(re.compile(u'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ü'),
(re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),
(re.compile(u'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ë'),
(re.compile(u'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ï'),
(re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'),
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
# Remove page links
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
# Remove <hr> tags
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
# Replace <br><br> with <p>
(re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
# Remove hyphenation
(re.compile(r'-<br.*?>\n\r?'), lambda match: ''),
# Remove gray background
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
# Remove non breaking spaces
(re.compile(ur'\u00a0'), lambda match : ' '),
# Detect Chapters to match default XPATH in GUI
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<i><b>|<i>|<b>)?(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?(</i></b>|</i>|</b>)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>(<i>)?\s*\w+(\s+\w+)?(</i>)?)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head),
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
# Have paragraphs show better
(re.compile(r'<br.*?>'), lambda match : '<p>'),
# Clean up spaces
(re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics
(re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
]
# Fix Book Designer markup
BOOK_DESIGNER = [
# HR
(re.compile('<hr>', re.IGNORECASE),
lambda match : '<span style="page-break-after:always"> </span>'),
# Create header tags
(re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
]
def __init__(self, input_plugin_preprocess, plugin_preprocess):
self.input_plugin_preprocess = input_plugin_preprocess
self.plugin_preprocess = plugin_preprocess
def is_baen(self, src):
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
re.IGNORECASE).search(src) is not None
def is_book_designer(self, raw):
return re.search('<H2[^><]*id=BookTitle', raw) is not None
def is_pdftohtml(self, src):
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
def __call__(self, html, remove_special_chars=None):
if remove_special_chars is not None:
html = remove_special_chars.sub('', html)
html = html.replace('\0', '')
if self.is_baen(html):
rules = []
elif self.is_book_designer(html):
rules = self.BOOK_DESIGNER
elif self.is_pdftohtml(html):
line_length_rules = [
# Un wrap using punctuation
(re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .3), re.UNICODE), wrap_lines),
]
rules = self.PDFTOHTML + line_length_rules
else:
rules = []
for rule in self.PREPROCESS + rules:
html = rule[0].sub(rule[1], html)
# Handle broken XHTML w/ SVG (ugh)
if 'svg:' in html and SVG_NS not in html:
html = html.replace(
'<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
if 'xlink:' in html and XLINK_NS not in html:
html = html.replace(
'<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
html = XMLDECL_RE.sub('', html)
if self.plugin_preprocess:
html = self.input_plugin_preprocess(html)
return html

View File

@ -6,32 +6,7 @@ __docformat__ = 'restructuredtext en'
'''
Conversion to EPUB.
'''
import sys, textwrap, re, os, uuid
from itertools import cycle
from calibre.utils.config import Config, StringConfig
from calibre.utils.zipfile import ZipFile, ZIP_STORED
from calibre.ebooks.html import config as common_config, tostring
from lxml import etree
class DefaultProfile(object):
flow_size = sys.maxint
screen_size = None
remove_special_chars = False
remove_object_tags = False
class PRS505(DefaultProfile):
flow_size = 270000
screen_size = (590, 765)
remove_special_chars = re.compile(u'[\u200b\u00ad]')
remove_object_tags = True
PROFILES = {
'PRS505' : PRS505,
'None' : DefaultProfile,
}
def rules(stylesheets):
for s in stylesheets:
@ -40,38 +15,6 @@ def rules(stylesheets):
if r.type == r.STYLE_RULE:
yield r
def decrypt_font(key, path):
    '''
    De-obfuscate the font file at `path` in place.

    The first 1024 bytes of the file are XORed with `key`, repeated as often
    as necessary; the rest of the file is written back unchanged.

    :param key: non-empty iterable of integers in the range 0 - 255
    :param path: path to the font file
    :raises ValueError: if `key` is empty
    '''
    key = list(key)
    if not key:
        raise ValueError('Cannot decrypt %r with an empty key' % (path,))
    with open(path, 'rb') as f:
        raw = f.read()
    # A bytearray holds integers, so no chr()/ord() round trip is needed
    head = bytearray(raw[:1024])
    for i, byte in enumerate(head):
        head[i] = byte ^ key[i % len(key)]
    with open(path, 'wb') as f:
        f.write(bytes(head))
        f.write(raw[1024:])
def process_encryption(encfile, opf):
    '''
    De-obfuscate the fonts listed in an EPUB ``encryption.xml`` file.

    The key is derived from the first ``urn:uuid:`` identifier found in the
    OPF file.

    :param encfile: path to the encryption.xml file
    :param opf: path to the OPF file
    :return: True if every EncryptionMethod uses the algorithm
             ``http://ns.adobe.com/pdf/enc#RC`` and all referenced files that
             exist were processed. False if another algorithm is found, if a
             file has to be processed but no key is available, or if an
             error occurs while processing (the traceback is printed).
    '''
    key = None
    with open(opf, 'rb') as f:
        m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', f.read())
    if m:
        key = list(map(ord, uuid.UUID(m.group(1)).bytes))
    try:
        root = etree.parse(encfile)
        for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
            algorithm = em.get('Algorithm', '')
            if algorithm != 'http://ns.adobe.com/pdf/enc#RC':
                return False
            cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
            uri = cr.get('URI')
            path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
            if os.path.exists(path):
                if key is None:
                    # Without a UUID in the OPF there is nothing to XOR with
                    return False
                decrypt_font(key, path)
        return True
    except Exception:
        # Best effort: report the problem and treat the book as encrypted
        import traceback
        traceback.print_exc()
    return False
def initialize_container(path_to_container, opf_name='metadata.opf'):
'''
Create an empty EPUB document, with a default skeleton.
@ -90,152 +33,4 @@ def initialize_container(path_to_container, opf_name='metadata.opf'):
zf.writestr('META-INF/container.xml', CONTAINER)
return zf
def config(defaults=None, name='epub'):
# Build and return the option set for conversion to EPUB.
#
# defaults: if None a Config named `name` is created, otherwise a
#           StringConfig is created from `defaults`
# name:     name passed to Config (only used when defaults is None)
#
# The option set starts from common_config(); its output and zip options
# are removed and an EPUB specific output option is added instead.
desc = _('Options to control the conversion to EPUB')
if defaults is None:
c = Config(name, desc)
else:
c = StringConfig(defaults, desc)
c.update(common_config())
c.remove_opt('output')
c.remove_opt('zip')
c.add_opt('output', ['-o', '--output'], default=None,
help=_('The output EPUB file. If not specified, it is '
'derived from the input file name.'))
c.add_opt('profile', ['--profile'], default='PRS505', choices=list(PROFILES.keys()),
help=_('Profile of the target device this EPUB is meant for. '
'Set to None to create a device independent EPUB. '
'The profile is used for device specific restrictions '
'on the EPUB. Choices are: ')+str(list(PROFILES.keys())))
c.add_opt('override_css', ['--override-css'], default=None,
help=_('Either the path to a CSS stylesheet or raw CSS. '
'This CSS will override any existing CSS '
'declarations in the source files.'))
# Option group: structure detection
structure = c.add_group('structure detection',
_('Control auto-detection of document structure.'))
structure('chapter', ['--chapter'],
default="//*[re:match(name(), 'h[1-2]') and "
"re:test(., 'chapter|book|section|part', 'i')] | "
"//*[@class = 'chapter']",
help=_('''\
An XPath expression to detect chapter titles. The default is to consider <h1> or
<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles as
well as any tags that have class="chapter".
The expression used must evaluate to a list of elements. To disable chapter detection,
use the expression "/". See the XPath Tutorial in the calibre User Manual for further
help on using this feature.
''').replace('\n', ' '))
structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both', 'none'],
default='pagebreak',
help=_('Specify how to mark detected chapters. A value of '
'"pagebreak" will insert page breaks before chapters. '
'A value of "rule" will insert a line before chapters. '
'A value of "none" will disable chapter marking and a '
'value of "both" will use both page breaks and lines '
'to mark chapters.'))
structure('cover', ['--cover'], default=None,
help=_('Path to the cover to be used for this book'))
structure('prefer_metadata_cover', ['--prefer-metadata-cover'], default=False,
action='store_true',
help=_('Use the cover detected from the source file in preference '
'to the specified cover.'))
structure('remove_first_image', ['--remove-first-image'], default=False,
help=_('Remove the first image from the input ebook. Useful if '
'the first image in the source file is a cover and you '
'are specifying an external cover.'))
structure('dont_split_on_page_breaks', ['--dont-split-on-page-breaks'], default=False,
help=_('Turn off splitting at page breaks. Normally, input files '
'are automatically split at every page break into '
'two files. This gives an output ebook that can be parsed '
'faster and with less resources. However, splitting is '
'slow and if your source file contains a very large '
'number of page breaks, you should turn off splitting '
'on page breaks.'))
structure('page', ['--page'], default=None,
help=_('XPath expression to detect page boundaries for building '
'a custom pagination map, as used by AdobeDE. Default is '
'not to build an explicit pagination map.'))
structure('page_names', ['--page-names'], default=None,
help=_('XPath expression to find the name of each page in the '
'pagination map relative to its boundary element. '
'Default is to number all pages staring with 1.'))
# Option group: table of contents
toc = c.add_group('toc',
_('''\
Control the automatic generation of a Table of Contents. If an OPF file is detected
and it specifies a Table of Contents, then that will be used rather than trying
to auto-generate a Table of Contents.
''').replace('\n', ' '))
toc('max_toc_links', ['--max-toc-links'], default=50,
help=_('Maximum number of links to insert into the TOC. Set to 0 '
'to disable. Default is: %default. Links are only added to the '
'TOC if less than the --toc-threshold number of chapters were detected.'))
toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False,
help=_("Don't add auto-detected chapters to the Table of Contents."))
toc('toc_threshold', ['--toc-threshold'], default=6,
help=_('If fewer than this number of chapters is detected, then links '
'are added to the Table of Contents. Default: %default'))
toc('level1_toc', ['--level1-toc'], default=None,
help=_('XPath expression that specifies all tags that should be added '
'to the Table of Contents at level one. If this is specified, '
'it takes precedence over other forms of auto-detection.'))
toc('level2_toc', ['--level2-toc'], default=None,
help=_('XPath expression that specifies all tags that should be added '
'to the Table of Contents at level two. Each entry is added '
'under the previous level one entry.'))
toc('level3_toc', ['--level3-toc'], default=None,
help=_('XPath expression that specifies all tags that should be added '
'to the Table of Contents at level three. Each entry is added '
'under the previous level two entry.'))
toc('from_ncx', ['--from-ncx'], default=None,
help=_('Path to a .ncx file that contains the table of contents to use '
'for this ebook. The NCX file should contain links relative to '
'the directory it is placed in. See '
'http://www.niso.org/workrooms/daisy/Z39-86-2005.html#NCX for '
'an overview of the NCX format.'))
toc('use_auto_toc', ['--use-auto-toc'], default=False,
help=_('Normally, if the source file already has a Table of Contents, '
'it is used in preference to the auto-generated one. '
'With this option, the auto-generated one is always used.'))
# Option group: page layout
layout = c.add_group('page layout', _('Control page layout'))
layout('margin_top', ['--margin-top'], default=5.0,
help=_('Set the top margin in pts. Default is %default'))
layout('margin_bottom', ['--margin-bottom'], default=5.0,
help=_('Set the bottom margin in pts. Default is %default'))
layout('margin_left', ['--margin-left'], default=5.0,
help=_('Set the left margin in pts. Default is %default'))
layout('margin_right', ['--margin-right'], default=5.0,
help=_('Set the right margin in pts. Default is %default'))
# Note that the option is stored as base_font_size2 although the switch is
# --base-font-size
layout('base_font_size2', ['--base-font-size'], default=12.0,
help=_('The base font size in pts. Default is %defaultpt. '
'Set to 0 to disable rescaling of fonts.'))
layout('remove_paragraph_spacing', ['--remove-paragraph-spacing'], default=False,
help=_('Remove spacing between paragraphs. '
'Also sets a indent on paragraphs of 1.5em. '
'You can override this by adding p {text-indent: 0cm} to '
'--override-css. Spacing removal will not work if the source '
'file forces inter-paragraph spacing.'))
layout('no_justification', ['--no-justification'], default=False,
help=_('Do not force text to be justified in output.'))
layout('linearize_tables', ['--linearize-tables'], default=False,
help=_('Remove table markup, converting it into paragraphs. '
'This is useful if your source file uses a table to manage layout.'))
layout('preserve_tag_structure', ['--preserve-tag-structure'], default=False,
help=_('Preserve the HTML tag structure while splitting large HTML files. '
'This is only neccessary if the HTML files contain CSS that '
'uses sibling selectors. Enabling this greatly slows down '
'processing of large HTML files.'))
# Options in the debug group
c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
help=_('Print generated OPF file to stdout'))
c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',
help=_('Print generated NCX file to stdout'))
c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug',
default=False,
help=_('Keep intermediate files during processing by html2epub'))
c.add_opt('extract_to', ['--extract-to'], group='debug', default=None,
help=_('Extract the contents of the produced EPUB file to the '
'specified directory.'))
return c

View File

@ -1,300 +0,0 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Font size rationalization. See :function:`relativize`.
'''
import logging, re, operator, functools, collections, unittest, copy, sys
from xml.dom import SyntaxErr
from lxml.cssselect import CSSSelector
from lxml import etree
from lxml.html import HtmlElement
from calibre.ebooks.html import fromstring
from calibre.ebooks.epub import rules
from cssutils import CSSParser
# A signed integer or a signed decimal number. The alternation is not
# grouped, so this must always be embedded inside a group.
num = r'[-]?\d+|[-]?\d*\.\d+'
# A CSS length: either a bare 0 or a number followed by a unit
length = r'(?P<zero>0)|(?P<num>{num})(?P<unit>%|em|ex|px|in|cm|mm|pt|pc)'.replace('{num}', num)
absolute_size = r'(?P<abs>(x?x-)?(small|large)|medium)'
relative_size = r'(?P<rel>smaller|larger)'

font_size_pat   = re.compile('|'.join((relative_size, absolute_size, length)), re.I)
# Only absolute line heights (a number with an absolute unit) are matched
line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num))

# Number of points in one unit. Every unit accepted by line_height_pat must
# have an entry here, since the matched unit is used as a key.
PTU = {
       'in' : 72.,
       'cm' : 72/2.54,
       'mm' : 72/25.4,
       'pt' : 1.0,
       'pc' : 12.,      # one pica is twelve points
       'px' : 72/96.,   # CSS reference pixel: 1/96 of an inch
       }

DEFAULT_FONT_SIZE = 12
class Rationalizer(object):
    '''
    Font size rationalization: computes the effective font size of every
    element, rescales the sizes so that the most common one equals a base
    size and re-expresses them as relative sizes in a new stylesheet.

    All methods are classmethods; the class carries no state. Elements are
    expected to provide the attributes `specified_font_size` and
    `computed_font_size` (presumably supplied by the element class used by
    `calibre.ebooks.html.fromstring` -- confirm).
    '''

    @classmethod
    def specificity(cls, s):
        '''
        Map CSS specificity tuple to a single integer. Earlier components
        are weighted by a higher power of ten, so that a more specific
        selector maps to a larger number (as long as components are < 10).
        '''
        return sum([10**(4-i) * x for i, x in enumerate(s)])

    @classmethod
    def compute_font_size(cls, elem):
        '''
        Calculate the effective font size of an element traversing its ancestors as far as
        neccessary. The result is stored in `elem.computed_font_size`;
        nothing is done if that attribute is already set.
        '''
        cfs = elem.computed_font_size
        if cfs is not None:
            return
        sfs = elem.specified_font_size
        if callable(sfs):
            # Relative size: a function of the parent's computed size
            parent = elem.getparent()
            cls.compute_font_size(parent)
            elem.computed_font_size = sfs(parent.computed_font_size)
        else:
            elem.computed_font_size = sfs

    @classmethod
    def calculate_font_size(cls, style):
        'Return font size in pts from style object. For relative units returns a callable'
        match = font_size_pat.search(style.font)
        fs = ''
        if match:
            fs = match.group()
        if style.fontSize:
            # An explicit font-size takes precedence over the font shorthand
            fs = style.fontSize

        match = font_size_pat.search(fs)
        if match is None:
            return None
        match = match.groupdict()
        unit = match.get('unit', '')
        if unit: unit = unit.lower()
        if unit in PTU.keys():
            return PTU[unit] * float(match['num'])
        if unit in ('em', 'ex'):
            return functools.partial(operator.mul, float(match['num']))
        if unit == '%':
            return functools.partial(operator.mul, float(match['num'])/100.)
        abs_size = match.get('abs', '')
        if abs_size: abs_size = abs_size.lower()
        if abs_size:
            x = (1.2)**(abs_size.count('x') * (-1 if 'small' in abs_size else 1))
            return 12 * x
        if match.get('zero', False):
            return 0.
        return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8)

    @classmethod
    def resolve_rules(cls, stylesheets):
        '''
        Attach the lists `fs_rules` and `lh_rules` to every stylesheet that
        does not have them yet. Entries are ``[CSSSelector, font size]`` and
        ``[CSSSelector, line height in pts]`` respectively.
        '''
        for sheet in stylesheets:
            if hasattr(sheet, 'fs_rules'):
                continue
            sheet.fs_rules = []
            sheet.lh_rules = []
            for r in sheet:
                if r.type == r.STYLE_RULE:
                    font_size = cls.calculate_font_size(r.style)
                    if font_size is not None:
                        for s in r.selectorList:
                            sheet.fs_rules.append([CSSSelector(s.selectorText), font_size])
                    orig = line_height_pat.search(r.style.lineHeight)
                    if orig is not None:
                        for s in r.selectorList:
                            sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]])

    @classmethod
    def apply_font_size_rules(cls, stylesheets, root):
        'Add a ``specified_font_size`` attribute to every element that has a specified font size'
        cls.resolve_rules(stylesheets)
        for sheet in stylesheets:
            for selector, font_size in sheet.fs_rules:
                elems = selector(root)
                for elem in elems:
                    elem.specified_font_size = font_size

    @classmethod
    def remove_font_size_information(cls, stylesheets):
        '''
        Remove font-size, the size part of the font shorthand and absolute
        line-height declarations from all style rules of `stylesheets`.
        '''
        for r in rules(stylesheets):
            r.style.removeProperty('font-size')
            try:
                new = font_size_pat.sub('', r.style.font).strip()
                if new:
                    r.style.font = new
                else:
                    r.style.removeProperty('font')
            except SyntaxErr:
                r.style.removeProperty('font')
            if line_height_pat.search(r.style.lineHeight) is not None:
                r.style.removeProperty('line-height')

    @classmethod
    def compute_font_sizes(cls, root, stylesheets, base=12):
        '''
        Compute the font size of every element under `root`, rescale so
        that the most common size of body text equals `base` and return a
        stylesheet with per element (id based) relative font sizes and
        rescaled line heights. Elements that need a rule get an id.

        With ``base <= 0`` nothing is rescaled, sizes are only made relative.
        '''
        stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')]
        cls.apply_font_size_rules(stylesheets, root)

        # Compute the effective font size of all tags
        root.computed_font_size = DEFAULT_FONT_SIZE
        for elem in root.iter(etree.Element):
            cls.compute_font_size(elem)

        extra_css = {}
        # Needed further down regardless of base, so define them up front
        body = root.xpath('//body')[0]
        counter = 0
        if base > 0:
            # Calculate the "base" (i.e. most common) font size
            font_sizes = collections.defaultdict(lambda : 0)
            IGNORE = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
            for elem in body.iter(etree.Element):
                if elem.tag not in IGNORE:
                    t = getattr(elem, 'text', '')
                    if t: t = t.strip()
                    if t:
                        font_sizes[elem.computed_font_size] += len(t)

                # NOTE(review): tail text is rendered with the size of the
                # parent, so it is counted for the parent whatever the tag of
                # this element is -- confirm the nesting against upstream
                t = getattr(elem, 'tail', '')
                if t: t = t.strip()
                if t:
                    parent = elem.getparent()
                    if parent.tag not in IGNORE:
                        font_sizes[parent.computed_font_size] += len(t)

            try:
                most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0]
                # float() guards against integer division of two ints
                scale = float(base)/most_common if most_common > 0 else 1.
            except ValueError:
                # No text at all
                scale = 1.

            # rescale absolute line-heights
            for sheet in stylesheets:
                for selector, lh in sheet.lh_rules:
                    for elem in selector(root):
                        elem_id = elem.get('id', 'cfs_%d'%counter)
                        elem.set('id', elem_id)
                        counter += 1
                        extra_css.setdefault(elem_id, []).append(
                                'line-height:%fpt'%(lh*scale))

            # Rescale all computed font sizes
            for elem in body.iter(etree.Element):
                if isinstance(elem, HtmlElement):
                    elem.computed_font_size *= scale

        # Remove all font size specifications from the last stylesheet
        cls.remove_font_size_information(stylesheets[-1:])

        # Create the CSS to implement the rescaled font sizes
        for elem in body.iter(etree.Element):
            cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent()))
            if abs(cfs-pcfs) > 1/12. and abs(pcfs) > 1/12.:
                elem_id = elem.get('id', 'cfs_%d'%counter)
                elem.set('id', elem_id)
                counter += 1
                extra_css.setdefault(elem_id, []).append(
                        'font-size: %f%%'%(100*(cfs/pcfs)))

        css = CSSParser(loglevel=logging.ERROR).parseString('')
        for elem_id, declarations in extra_css.items():
            css.add('#%s {%s}'%(elem_id, ';'.join(declarations)))
        return css

    @classmethod
    def rationalize(cls, stylesheets, root, opts):
        '''
        Entry point. Returns the stylesheet produced by `compute_font_sizes`
        or None if ``opts.base_font_size2`` is not positive or the
        computation failed (failures are logged, not raised).
        '''
        logger     = logging.getLogger('html2epub')
        logger.info('\t\tRationalizing fonts...')
        extra_css = None
        if opts.base_font_size2 > 0:
            try:
                extra_css = cls.compute_font_sizes(root, stylesheets, base=opts.base_font_size2)
            except Exception:
                logger.warning('Failed to rationalize font sizes.')
                if opts.verbose > 1:
                    logger.exception('')
            finally:
                root.remove_font_size_information()
        logger.debug('\t\tDone rationalizing')
        return extra_css
################################################################################
############## Testing
################################################################################
class FontTest(unittest.TestCase):
    '''Tests for Rationalizer, all run against one small HTML document.'''

    def setUp(self):
        from calibre.ebooks.epub import config
        self.opts = config(defaults='').parse()
        self.html = '''
<html>
<head>
<title>Test document</title>
</head>
<body>
<div id="div1">
<!-- A comment -->
<p id="p1">Some <b>text</b></p>
</div>
<p id="p2">Some other <span class="it">text</span>.</p>
<p id="longest">The longest piece of single font size text in this entire file. Used to test resizing.</p>
</body>
</html>
'''
        self.root = fromstring(self.html)

    def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1):
        '''
        Rationalize a copy of the document with `css`, then apply only the
        generated stylesheet to a second copy and check that both copies
        end up with the same computed font size on every body element.
        Returns the text of the generated stylesheet.
        '''
        original = copy.deepcopy(self.root)
        original.computed_font_size = DEFAULT_FONT_SIZE
        source_sheet = CSSParser(loglevel=logging.ERROR).parseString(css)
        generated = Rationalizer.compute_font_sizes(original, [source_sheet], base)

        processed = copy.deepcopy(original)
        processed.remove_font_size_information()
        processed.computed_font_size = DEFAULT_FONT_SIZE
        Rationalizer.apply_font_size_rules([generated], processed)
        for elem in processed.iter(etree.Element):
            Rationalizer.compute_font_size(elem)

        original_body = original.xpath('//body')[0]
        processed_body = processed.xpath('//body')[0]
        pairs = zip(original_body.iter(etree.Element),
                    processed_body.iter(etree.Element))
        for e1, e2 in pairs:
            self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size,
                msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\
                (original.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size))
        return generated.cssText

    def testStripping(self):
        'Test that any original entries are removed from the CSS'
        source = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }'
        sheet = CSSParser(loglevel=logging.ERROR).parseString(source)
        Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [sheet])
        compact = sheet.cssText.replace(' ', '').replace('\n', '')
        self.assertEqual(compact,
                         'p{font:bolditalic}')

    def testIdentity(self):
        'Test that no unnecessary font size changes are made'
        generated = self.do_test('div {font-size:12pt} \nspan {font-size:100%}')
        self.assertEqual(generated.strip(), '')

    def testRelativization(self):
        'Test conversion of absolute to relative sizes'
        self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}')

    def testResizing(self):
        'Test resizing of fonts'
        self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}')
def suite():
    '''Return a test suite containing all tests of FontTest.'''
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(FontTest)

def test():
    '''Run the suite with a verbose text runner.'''
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(suite())

if __name__ == '__main__':
    sys.exit(test())

View File

@ -1,207 +0,0 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Convert any ebook format to epub.
'''
import sys, os, re
from contextlib import nested
from calibre import extract, walk
from calibre.ebooks import DRMError
from calibre.ebooks.epub import config as common_config, process_encryption
from calibre.ebooks.epub.from_html import convert as html2epub, find_html_index
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
from calibre.utils.zipfile import ZipFile
from calibre.customize.ui import run_plugins_on_preprocess
def lit2opf(path, tdir, opts):
from calibre.ebooks.lit.reader import LitReader
print 'Exploding LIT file:', path
reader = LitReader(path)
reader.extract_content(tdir, False)
opf = None
for opf in walk(tdir):
if opf.lower().endswith('.opf'):
break
if not opf.endswith('.opf'):
opf = None
if opf is not None: # Check for url-quoted filenames
_opf = OPF(opf, os.path.dirname(opf))
replacements = []
for item in _opf.itermanifest():
href = item.get('href', '')
path = os.path.join(os.path.dirname(opf), *(href.split('/')))
if not os.path.exists(path) and os.path.exists(path.replace('&', '%26')):
npath = path
path = path.replace('&', '%26')
replacements.append((path, npath))
if replacements:
print 'Fixing quoted filenames...'
for path, npath in replacements:
if os.path.exists(path):
os.rename(path, npath)
for f in walk(tdir):
with open(f, 'r+b') as f:
raw = f.read()
for path, npath in replacements:
raw = raw.replace(os.path.basename(path), os.path.basename(npath))
f.seek(0)
f.truncate()
f.write(raw)
return opf
def mobi2opf(path, tdir, opts):
from calibre.ebooks.mobi.reader import MobiReader
print 'Exploding MOBI file:', path.encode('utf-8') if isinstance(path, unicode) else path
reader = MobiReader(path)
reader.extract_content(tdir)
files = list(walk(tdir))
opts.encoding = 'utf-8'
for f in files:
if f.lower().endswith('.opf'):
return f
html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}', re.IGNORECASE)
hf = [f for f in files if html_pat.match(os.path.splitext(f)[1]) is not None]
mi = MetaInformation(os.path.splitext(os.path.basename(path))[0], [_('Unknown')])
opf = OPFCreator(tdir, mi)
opf.create_manifest([(hf[0], None)])
opf.create_spine([hf[0]])
ans = os.path.join(tdir, 'metadata.opf')
opf.render(open(ans, 'wb'))
return ans
def fb22opf(path, tdir, opts):
from calibre.ebooks.lrf.fb2.convert_from import to_html
print 'Converting FB2 to HTML...'
return to_html(path, tdir)
def rtf2opf(path, tdir, opts):
    '''
    Generate HTML from the RTF file at `path` inside `tdir` and return the
    path ``<tdir>/metadata.opf``. `opts` is unused.
    '''
    from calibre.ebooks.lrf.rtf.convert_from import generate_html as rtf_to_html
    rtf_to_html(path, tdir)
    opf_path = os.path.join(tdir, 'metadata.opf')
    return opf_path
def txt2opf(path, tdir, opts):
    '''
    Generate HTML from the text file at `path` (read using
    ``opts.encoding``) inside `tdir` and return ``<tdir>/metadata.opf``.

    ``opts.encoding`` is reset to ``'utf-8'`` once the HTML is generated.
    '''
    from calibre.ebooks.lrf.txt.convert_from import generate_html as txt_to_html
    source_encoding = opts.encoding
    txt_to_html(path, source_encoding, tdir)
    opts.encoding = 'utf-8'
    opf_path = os.path.join(tdir, 'metadata.opf')
    return opf_path
def pdf2opf(path, tdir, opts):
    '''
    Generate HTML from the PDF file at `path` inside `tdir` and return
    ``<tdir>/metadata.opf``.

    Splitting on page breaks is switched off on `opts` as a side effect.
    '''
    from calibre.ebooks.lrf.pdf.convert_from import generate_html as pdf_to_html
    pdf_to_html(path, tdir)
    opts.dont_split_on_page_breaks = True
    opf_path = os.path.join(tdir, 'metadata.opf')
    return opf_path
def epub2opf(path, tdir, opts):
    '''
    Unzip the EPUB at `path` into `tdir` and return the path of its OPF.

    ``opts.chapter_mark`` is set to ``'none'``. If the archive ships a
    ``META-INF/encryption.xml`` that ``process_encryption`` rejects, a
    DRMError is raised; if no OPF is found at all, a ValueError is raised.
    '''
    archive = ZipFile(path)
    archive.extractall(tdir)
    opts.chapter_mark = 'none'
    encryption_xml = os.path.join(tdir, 'META-INF', 'encryption.xml')

    # First file whose name ends in .opf (case-insensitive), if any
    opf = None
    for candidate in walk(tdir):
        if candidate.lower().endswith('.opf'):
            opf = candidate
            break

    if opf and os.path.exists(encryption_xml) and \
            not process_encryption(encryption_xml, opf):
        raise DRMError(os.path.basename(path))
    if opf is None:
        raise ValueError('%s is not a valid EPUB file'%path)
    return opf
def odt2epub(path, tdir, opts):
    '''
    Run the ODT extractor on `path` with `tdir` as the output directory and
    return its result. ``opts.encoding`` is set to ``'utf-8'`` first.
    '''
    from calibre.ebooks.odt.to_oeb import Extract
    opts.encoding = 'utf-8'
    extractor = Extract()
    return extractor(path, tdir)
# Maps a lower-case file extension to the function that converts a file of
# that type. Every function is called as func(path, tdir, opts) and returns
# the path that is then handed on to the HTML-to-EPUB stage.
MAP = {
'lit' : lit2opf,
'mobi' : mobi2opf,
'prc' : mobi2opf,
'azw' : mobi2opf,
'fb2' : fb22opf,
'rtf' : rtf2opf,
'txt' : txt2opf,
'pdf' : pdf2opf,
'epub' : epub2opf,
'odt' : odt2epub,
}
# Extensions accepted as input: the MAP keys plus the archive types
# (rar, zip, oebzip) and plain HTML (htm, html).
SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf',
'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub']
def unarchive(path, tdir):
    '''
    Extract the archive at `path` into `tdir` and pick the ebook inside it.

    Extensions are tried in order: ``opf`` first, then every key of ``MAP``.
    The first file with a matching extension wins, except that ``txt`` and
    ``rtf`` files smaller than 2048 bytes are passed over. If nothing
    matches, the choice is delegated to ``find_html_index``.

    :return: A ``(path, extension)`` tuple.
    '''
    extract(path, tdir)
    extracted = list(walk(tdir))
    candidates = ['opf']
    candidates.extend(MAP.keys())
    for extension in candidates:
        suffix = '.' + extension
        size_matters = extension in ['txt', 'rtf']
        for fname in extracted:
            if not fname.lower().endswith(suffix):
                continue
            if size_matters and os.stat(fname).st_size < 2048:
                continue
            return fname, extension
    return find_html_index(extracted)
def any2epub(opts, path, notification=None, create_epub=True,
oeb_cover=False, extract_to=None):
path = run_plugins_on_preprocess(path)
ext = os.path.splitext(path)[1]
if not ext:
raise ValueError('Unknown file type: '+path)
ext = ext.lower()[1:]
if opts.output is None:
opts.output = os.path.splitext(os.path.basename(path))[0]+'.epub'
with nested(TemporaryDirectory('_any2epub1'), TemporaryDirectory('_any2epub2')) as (tdir1, tdir2):
if ext in ['rar', 'zip', 'oebzip']:
path, ext = unarchive(path, tdir1)
print 'Found %s file in archive'%(ext.upper())
if ext in MAP.keys():
path = MAP[ext](path, tdir2, opts)
ext = 'opf'
if re.match(r'((x){0,1}htm(l){0,1})|opf', ext) is None:
raise ValueError('Conversion from %s is not supported'%ext.upper())
print 'Creating EPUB file...'
html2epub(path, opts, notification=notification,
create_epub=create_epub, oeb_cover=oeb_cover,
extract_to=extract_to)
def config(defaults=None):
    '''
    Return the common EPUB conversion configuration built from `defaults`.
    '''
    conf = common_config(defaults=defaults)
    return conf
def formats():
    '''
    Return the list of input formats: HTML, the archive types and every
    extension registered in ``MAP``.
    '''
    supported = ['html', 'rar', 'zip', 'oebzip']
    supported.extend(MAP.keys())
    return supported
# Usage template for the command line parser. It is %-formatted once with
# (output format name, supported input formats); the doubled %% survives
# that formatting as a literal %prog.
USAGE = _('''\
%%prog [options] filename
Convert any of a large number of ebook formats to a %s file. Supported formats are: %s
''')
def option_parser(usage=USAGE):
    '''
    Build the command line parser, filling `usage` with the output format
    name (EPUB) and the list of supported input formats.
    '''
    conf = config()
    usage_text = usage%('EPUB', formats())
    return conf.option_parser(usage=usage_text)
def main(args=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(args)
if len(args) < 2:
parser.print_help()
print 'No input file specified.'
return 1
any2epub(opts, args[1])
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -1,21 +0,0 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'Convert a comic in CBR/CBZ format to epub'
import sys
from functools import partial
from calibre.ebooks.lrf.comic.convert_from import do_convert, option_parser, config, main as _main
# EPUB flavours of the comic converter: the generic comic conversion
# function and command line entry point with output_format fixed to 'epub'.
convert = partial(do_convert, output_format='epub')
main = partial(_main, output_format='epub')
if __name__ == '__main__':
sys.exit(main())
# Never executed; it only mentions the imported names option_parser and
# config (presumably so they are not reported as unused imports).
if False:
option_parser
config

View File

@ -1,71 +0,0 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Convert periodical content into EPUB ebooks.
'''
import sys, glob, os
from calibre.web.feeds.main import config as feeds2disk_config, USAGE, run_recipe
from calibre.ebooks.epub.from_html import config as html2epub_config
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.epub.from_html import convert as html2epub
from calibre import strftime, sanitize_file_name
def config(defaults=None):
    '''
    Build the feeds-to-EPUB configuration: the feeds2disk options without
    ``lrf``, ``epub`` and ``output_dir``, merged with the html2epub options
    without ``chapter_mark``.
    '''
    conf = feeds2disk_config(defaults=defaults)
    for unwanted in ('lrf', 'epub', 'output_dir'):
        conf.remove(unwanted)
    conf.update(html2epub_config(defaults=defaults))
    conf.remove('chapter_mark')
    return conf
def option_parser():
    '''
    Return a command line parser for the feeds-to-EPUB configuration.
    '''
    return config().option_parser(usage=USAGE)
def convert(opts, recipe_arg, notification=None):
opts.lrf = False
opts.epub = True
if opts.debug:
opts.verbose = 2
parser = option_parser()
with TemporaryDirectory('_feeds2epub') as tdir:
opts.output_dir = tdir
recipe = run_recipe(opts, recipe_arg, parser, notification=notification)
c = config()
recipe_opts = c.parse_string(recipe.html2epub_options)
c.smart_update(recipe_opts, opts)
opts = recipe_opts
opts.chapter_mark = 'none'
opts.dont_split_on_page_breaks = True
opf = glob.glob(os.path.join(tdir, '*.opf'))
if not opf:
raise Exception('Downloading of recipe: %s failed'%recipe_arg)
opf = opf[0]
if opts.output is None:
fname = recipe.title + strftime(recipe.timefmt) + '.epub'
opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname))
print 'Generating epub...'
opts.encoding = 'utf-8'
opts.remove_paragraph_spacing = True
html2epub(opf, opts, notification=notification)
def main(args=sys.argv, notification=None, handler=None):
    '''
    Command line entry point for feeds-to-EPUB conversion.

    Prints the help and returns 1 unless exactly one recipe argument was
    given or feeds were specified through the options; otherwise runs the
    conversion and returns 0. `handler` is accepted but not used.
    '''
    parser = option_parser()
    opts, positional = parser.parse_args(args)
    if len(positional) != 2 and opts.feeds is None:
        parser.print_help()
        return 1
    recipe_arg = None
    if len(positional) > 1:
        recipe_arg = positional[1]
    convert(opts, recipe_arg, notification=notification)
    return 0


if __name__ == '__main__':
    sys.exit(main())

View File

@ -1,547 +0,0 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Conversion of HTML/OPF files follows several stages:
* All links in the HTML files or in the OPF manifest are
followed to build up a list of HTML files to be converted.
This stage is implemented by
:function:`calibre.ebooks.html.traverse` and
:class:`calibre.ebooks.html.HTMLFile`.
* The HTML is pre-processed to make it more semantic.
All links in the HTML files to other resources like images,
stylesheets, etc. are relativized. The resources are copied
into the `resources` sub directory. This is accomplished by
:class:`calibre.ebooks.html.PreProcessor` and
:class:`calibre.ebooks.html.Parser`.
* The HTML is processed. Various operations are performed.
All style declarations are extracted and consolidated into
a single style sheet. Chapters are auto-detected and marked.
Various font related manipulations are performed. See
:class:`HTMLProcessor`.
* The processed HTML is saved and the
:module:`calibre.ebooks.epub.split` module is used to split up
large HTML files into smaller chunks.
* The EPUB container is created.
'''
import os, sys, cStringIO, logging, re, functools, shutil
from lxml.etree import XPath
from lxml import html, etree
from PyQt4.Qt import QApplication, QPixmap, Qt
from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\
opf_traverse, create_metadata, rebase_toc, Link, parser
from calibre.ebooks.epub import config as common_config, tostring
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.epub import initialize_container, PROFILES
from calibre.ebooks.epub.split import split
from calibre.ebooks.epub.pages import add_page_map
from calibre.ebooks.epub.fonts import Rationalizer
from calibre.constants import preferred_encoding
from calibre.customize.ui import run_plugins_on_postprocess
from calibre import walk, CurrentDir, to_unicode, fit_image
content = functools.partial(os.path.join, u'content')
def remove_bad_link(element, attribute, link, pos):
    '''
    Neutralise a broken link found by ``iterlinks``.

    A ``<link>`` element is removed from its parent; on any other element
    only the offending `attribute` is deleted. Nothing happens when
    `attribute` is None. `link` and `pos` are accepted but not used.
    '''
    if attribute is None:
        return
    if element.tag == 'link':
        element.getparent().remove(element)
        return
    element.set(attribute, '')
    del element.attrib[attribute]
# Scan the HTML files listed in an OPF manifest and strip links whose
# target does not exist on disk.
#   opf_path     -- path to the OPF file
#   pretty_print -- passed through to tostring() when the files are re-written
def check_links(opf_path, pretty_print):
'''
Find and remove all invalid links in the HTML files
'''
logger = logging.getLogger('html2epub')
logger.info('\tChecking files for bad links...')
pathtoopf = os.path.abspath(opf_path)
with CurrentDir(os.path.dirname(pathtoopf)):
opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
# Collect the absolute paths of all manifest items whose media type
# mentions html. Only the last component of the href is used; it is joined
# onto the 'content' directory.
html_files = []
for item in opf.itermanifest():
if 'html' in item.get('media-type', '').lower():
f = item.get('href').split('/')[-1]
if isinstance(f, str):
f = f.decode('utf-8')
html_files.append(os.path.abspath(content(f)))
for path in html_files:
# Skip manifest entries that are missing or unreadable
if not os.access(path, os.R_OK):
continue
base = os.path.dirname(path)
# path is already absolute, so os.path.join inside content() discards the
# 'content' prefix and content(path) is path itself.
root = html.fromstring(open(content(path), 'rb').read(), parser=parser)
# list() takes a snapshot so the tree can be modified while looping
for element, attribute, link, pos in list(root.iterlinks()):
link = to_unicode(link)
plink = Link(link, base)
bad = False
# A link is bad when it resolves to a path that does not exist; links
# for which Link reports no path are left alone.
if plink.path is not None and not os.path.exists(plink.path):
bad = True
if bad:
remove_bad_link(element, attribute, link, pos)
# Write the (possibly modified) tree back over the original file
open(content(path), 'wb').write(tostring(root, pretty_print))
def find_html_index(files):
    '''
    Given a list of files, find the most likely root HTML file in the
    list.

    Only names ending in .htm, .html, .xhtm or .xhtml (case-insensitive)
    are considered. A file whose base name is ``toc`` is preferred, then
    one called ``index``; when several share that name the smallest wins.
    Otherwise the largest HTML file is chosen.

    :param files: Iterable of paths to existing files.
    :return: A ``(path, extension)`` tuple; the extension is lower-cased
             and has no leading dot.
    :raises ValueError: If `files` contains no HTML file.
    '''
    html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE)
    html_files = [f for f in files if html_pat.search(f) is not None]
    if not html_files:
        raise ValueError(_('Could not find an ebook inside the archive'))

    # Ascending by size. The sort is stable, so files of equal size keep
    # their input order, and every file is stat()ed exactly once.
    html_files.sort(key=lambda f: os.stat(f).st_size)

    for q in ('toc', 'index'):
        for f in html_files:
            if os.path.splitext(os.path.basename(f))[0].lower() == q:
                return f, os.path.splitext(f)[1].lower()[1:]
    largest = html_files[-1]
    return largest, os.path.splitext(largest)[1].lower()[1:]
def rescale_images(imgdir, screen_size, log):
    '''
    Shrink every image in `imgdir` that ``fit_image`` reports as too large
    for `screen_size`, a ``(width, height)`` pair.

    Files ending in .css or .js and files Qt cannot load as an image are
    skipped. Rescaled images are written back to the same path in JPEG
    format and reported on `log`.
    '''
    max_width, max_height = screen_size
    if QApplication.instance() is None:
        QApplication([])
    for name in os.listdir(imgdir):
        full_path = os.path.join(imgdir, name)
        extension = os.path.splitext(name)[1]
        if extension in ('.css', '.js'):
            continue
        pixmap = QPixmap()
        pixmap.load(full_path)
        if pixmap.isNull():
            continue
        scaled, new_width, new_height = fit_image(pixmap.width(),
                pixmap.height(), max_width, max_height)
        if not scaled:
            continue
        log.info('Rescaling image: '+name)
        resized = pixmap.scaled(new_width, new_height, Qt.IgnoreAspectRatio,
                Qt.SmoothTransformation)
        resized.save(full_path, 'JPEG')
# Processes a single HTML file for EPUB output: detects chapters, extracts
# the CSS, optionally rationalizes font sizes (via the Rationalizer mixin)
# and applies markup fixes aimed at Adobe Digital Editions (ADE).
class HTMLProcessor(Processor, Rationalizer):
# All processing happens in the constructor; the arguments other than
# `stylesheets` are forwarded unchanged to Processor.__init__.
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, stylesheets):
Processor.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles,
name='html2epub')
# At high verbosity dump the tree as parsed, before any changes
if opts.verbose > 2:
self.debug_tree('parsed')
self.detect_chapters()
self.extract_css(stylesheets)
# Font rationalization is only performed when a base font size was requested
if self.opts.base_font_size2 > 0:
self.font_css = self.rationalize(self.external_stylesheets+[self.stylesheet],
self.root, self.opts)
if opts.verbose > 2:
self.debug_tree('nocss')
# Drop every <script> below <body>; the hasattr check guards against a
# body object that does not support xpath.
if hasattr(self.body, 'xpath'):
for script in list(self.body.xpath('descendant::script')):
script.getparent().remove(script)
self.fix_markup()
# Re-save the image referenced by `img` as <original path>_calibre_converted.jpg,
# delete the original file and point both resource_map and the src
# attribute at the new file. Nothing happens if the file does not exist or
# Qt cannot load it.
def convert_image(self, img):
rpath = img.get('src', '')
path = os.path.join(os.path.dirname(self.save_path()), *rpath.split('/'))
if os.path.exists(path) and os.path.isfile(path):
# QPixmap needs a QApplication instance; create one if none exists
if QApplication.instance() is None:
app = QApplication([])
app
p = QPixmap()
p.load(path)
if not p.isNull():
p.save(path + '_calibre_converted.jpg')
os.remove(path)
for key, val in self.resource_map.items():
if val == rpath:
self.resource_map[key] = rpath+'_calibre_converted.jpg'
img.set('src', rpath+'_calibre_converted.jpg')
def fix_markup(self):
'''
Perform various markup transforms to get the output to render correctly
in the quirky ADE.
'''
# Replace <br> that are children of <body> as ADE doesn't handle them
if hasattr(self.body, 'xpath'):
for br in self.body.xpath('./br'):
# Skip a <br> that an earlier iteration already removed from the tree
if br.getparent() is None:
continue
try:
sibling = br.itersiblings().next()
except:
sibling = None
# The <br> becomes a <p> containing a single non-breaking space
br.tag = 'p'
br.text = u'\u00a0'
# A lone <br> (followed by text, by nothing, or by a non-<br> element)
# is styled to take up no height ...
if (br.tail and br.tail.strip()) or sibling is None or \
getattr(sibling, 'tag', '') != 'br':
style = br.get('style', '').split(';')
style = filter(None, map(lambda x: x.strip(), style))
style.append('margin: 0pt; border:0pt; height:0pt')
br.set('style', '; '.join(style))
else:
# ... while a <br><br> pair collapses into the one <p>, keeping the
# second <br>'s tail text.
sibling.getparent().remove(sibling)
if sibling.tail:
if not br.tail:
br.tail = ''
br.tail += sibling.tail
# Remove <embed> and <object> tags (SVG objects are kept) when the output
# profile asks for it
if self.opts.profile.remove_object_tags:
for tag in self.root.xpath('//embed'):
tag.getparent().remove(tag)
for tag in self.root.xpath('//object'):
if tag.get('type', '').lower().strip() in ('image/svg+xml',):
continue
tag.getparent().remove(tag)
# Remove empty <title> and <style> tags
for tag in self.root.xpath('//title|//style'):
if not tag.text:
tag.getparent().remove(tag)
# Remove <script> tags that have neither content nor a src attribute
for tag in self.root.xpath('//script'):
if not tag.text and not tag.get('src', False):
tag.getparent().remove(tag)
# Forms are removed entirely
for tag in self.root.xpath('//form'):
tag.getparent().remove(tag)
# <center> is replaced by an equivalently styled <div>
for tag in self.root.xpath('//center'):
tag.tag = 'div'
tag.set('style', 'text-align:center')
# Optionally flatten tables by turning all table tags into <div>s
if self.opts.linearize_tables:
for tag in self.root.xpath('//table | //tr | //th | //td'):
tag.tag = 'div'
# ADE can't handle &amp; in an img url
for tag in self.root.xpath('//img[@src]'):
tag.set('src', tag.get('src', '').replace('&', ''))
# Save the processed file after removing all <meta> tags
def save(self):
for meta in list(self.root.xpath('//meta')):
meta.getparent().remove(meta)
# Strip all comments since Adobe DE is petrified of them
Processor.save(self, strip_comments=True)
# Remove the first <img> in the document. Returns True if an image was
# removed, False if the document contains none.
def remove_first_image(self):
images = self.root.xpath('//img')
if images:
images[0].getparent().remove(images[0])
return True
return False
def config(defaults=None):
    '''
    Return the common EPUB conversion configuration built from `defaults`.
    '''
    conf = common_config(defaults=defaults)
    return conf
def option_parser():
    '''
    Return the command line parser for the HTML-to-EPUB converter.
    '''
    usage = _('''\
%prog [options] file.html|opf
Convert a HTML file to an EPUB ebook. Recursively follows links in the HTML file.
If you specify an OPF file instead of an HTML file, the list of links is takes from
the <spine> element of the OPF file.
''')
    return config().option_parser(usage=usage)
# Run HTMLProcessor over every file in `filelist`, saving the results below
# <tdir>/content, then write out the collected stylesheets and prune the
# generated table of contents.
# Returns (resource_map, htmlfile_map, toc, stylesheet_map), where
# stylesheet_map maps the base name of each saved HTML file to the list of
# stylesheets that apply to it.
# NOTE(review): `hp` is only bound inside the loop, so the return statement
# presumably fails for an empty filelist -- confirm callers never pass one.
def parse_content(filelist, opts, tdir):
os.makedirs(os.path.join(tdir, 'content', 'resources'))
resource_map, stylesheets = {}, {}
toc = TOC(base_path=tdir, type='root')
stylesheet_map = {}
first_image_removed = False
for htmlfile in filelist:
logging.getLogger('html2epub').debug('Processing %s...'%htmlfile)
hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'),
resource_map, filelist, stylesheets)
# Only the first image encountered over all files is removed
if not first_image_removed and opts.remove_first_image:
first_image_removed = hp.remove_first_image()
hp.populate_toc(toc)
hp.save()
stylesheet_map[os.path.basename(hp.save_path())] = \
[s for s in hp.external_stylesheets + [hp.stylesheet, hp.font_css, hp.override_css] if s is not None]
logging.getLogger('html2epub').debug('Saving stylesheets...')
if opts.base_font_size2 > 0:
Rationalizer.remove_font_size_information(stylesheets.values())
# Values are either objects exposing cssText or raw CSS; unicode is
# encoded as UTF-8 before being written.
for path, css in stylesheets.items():
raw = getattr(css, 'cssText', css)
if isinstance(raw, unicode):
raw = raw.encode('utf-8')
open(path, 'wb').write(raw)
# Prune the TOC: when there are more entries than toc_threshold, drop the
# less important entry types; finally cap the number of 'link' entries.
if toc.count('chapter') > opts.toc_threshold:
toc.purge(['file', 'link', 'unknown'])
if toc.count('chapter') + toc.count('file') > opts.toc_threshold:
toc.purge(['link', 'unknown'])
toc.purge(['link'], max=opts.max_toc_links)
return resource_map, hp.htmlfile_map, toc, stylesheet_map
# XHTML template for the generated cover page. It is %-formatted with a
# single value, the path of the cover image used as the <img> src; the
# doubled %% yields a literal "100%" height.
TITLEPAGE = '''\
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<title>Cover</title>
<style type="text/css" title="override_css">
@page {padding: 0pt; margin:0pt}
body { text-align: center; padding:0pt; margin: 0pt; }
div { margin: 0pt; padding: 0pt; }
</style>
</head>
<body>
<div>
<img src="%s" alt="cover" style="height: 100%%" />
</div>
</body>
</html>
'''
def create_cover_image(src, dest, screen_size, rescale_cover=True):
    '''
    Load the image at `src`, optionally enlarge it to fit the screen and
    save it to `dest`.

    :param src: Path of the source image.
    :param dest: Path the cover is written to; Qt picks the output format
                 from the file name.
    :param screen_size: ``(width, height)`` of the target screen, or None
                        to disable rescaling.
    :param rescale_cover: If True and `screen_size` is given, an image
                          smaller than the screen in both dimensions is
                          scaled up until one dimension fills the screen.
                          Images are never scaled down here.
    :return: True if the cover was written, False on any failure (the
             traceback is printed).
    '''
    try:
        from PyQt4.Qt import QImage, Qt
        if QApplication.instance() is None:
            QApplication([])
        im = QImage()
        im.load(src)
        if im.isNull():
            raise ValueError('Invalid cover image')
        if rescale_cover and screen_size is not None:
            width, height = im.width(), im.height()
            # Relative growth needed in each dimension to reach the screen
            # size; the smaller one keeps the image within the screen.
            dw = (screen_size[0]-width)/float(width)
            dh = (screen_size[1]-height)/float(height)
            delta = min(dw, dh)
            if delta > 0:
                nwidth = int(width + delta*(width))
                nheight = int(height + delta*(height))
                im = im.scaled(nwidth, nheight, Qt.IgnoreAspectRatio,
                        Qt.SmoothTransformation)
        # QImage.save() reports failure through its return value instead of
        # raising, so it has to be checked explicitly.
        if not im.save(dest):
            raise ValueError('Failed to save cover image to: %s'%dest)
    except Exception:
        # Best effort: a broken cover must not abort the conversion, but
        # KeyboardInterrupt/SystemExit are allowed to propagate.
        import traceback
        traceback.print_exc()
        return False
    return True
# Create the cover image and the title page that displays it.
#   mi          -- metadata object; mi.cover is reset to None unless it is a string
#   filelist    -- HTML files of the book, the first being a potential title page
#   htmlfilemap -- maps source HTML paths to the names of the processed files
#   opts        -- conversion options (cover, prefer_metadata_cover, profile)
#   tdir        -- working directory; output goes below <tdir>/content
# Returns a tuple (title_page, has_title_page): title_page is the name of a
# newly created title page file, or None if none was created or an existing
# page was overwritten; has_title_page tells whether the book has a title
# page at all.
def process_title_page(mi, filelist, htmlfilemap, opts, tdir):
old_title_page = None
# Normalises paths so that they can be compared reliably
f = lambda x : os.path.normcase(os.path.normpath(x))
if not isinstance(mi.cover, basestring):
mi.cover = None
if mi.cover:
# The metadata cover is the first HTML file itself, i.e. the book
# already has a title page, which is overwritten further down.
if f(filelist[0].path) == f(mi.cover):
old_title_page = htmlfilemap[filelist[0].path]
#logger = logging.getLogger('html2epub')
metadata_cover = mi.cover
if metadata_cover and not os.path.exists(metadata_cover):
metadata_cover = None
cpath = '/'.join(('resources', '_cover_.jpg'))
cover_dest = os.path.join(tdir, 'content', *cpath.split('/'))
if metadata_cover is not None:
if not create_cover_image(metadata_cover, cover_dest,
opts.profile.screen_size):
metadata_cover = None
# A cover specified via the options is written to the same destination
# and therefore replaces the file created from the metadata cover.
specified_cover = opts.cover
if specified_cover and not os.path.exists(specified_cover):
specified_cover = None
if specified_cover is not None:
if not create_cover_image(specified_cover, cover_dest,
opts.profile.screen_size):
specified_cover = None
cover = metadata_cover if specified_cover is None or (opts.prefer_metadata_cover and metadata_cover is not None) else specified_cover
if cover is not None:
titlepage = TITLEPAGE%cpath
tp = 'calibre_title_page.html' if old_title_page is None else old_title_page
tppath = os.path.join(tdir, 'content', tp)
with open(tppath, 'wb') as f:
f.write(titlepage)
return tp if old_title_page is None else None, True
elif os.path.exists(cover_dest):
# No usable cover: remove whatever was written to the cover destination
os.remove(cover_dest)
return None, old_title_page is not None
def find_oeb_cover(htmlfile):
    '''
    Return the ``src`` of the first ``<img>`` tag in `htmlfile`.

    Files larger than 2048 bytes are not inspected at all.

    :param htmlfile: Path to an existing HTML file.
    :return: The value of the first image's src attribute, or None if the
             file is larger than 2048 bytes or contains no image.
    '''
    if os.stat(htmlfile).st_size > 2048:
        return None
    # Read through a context manager so the handle is closed right away
    with open(htmlfile, 'rb') as stream:
        raw = stream.read()
    match = re.search(r'(?i)<img[^<>]+src\s*=\s*[\'"](.+?)[\'"]', raw)
    if match:
        return match.group(1)
    return None
def condense_ncx(ncx_path):
    '''
    Shrink the NCX file at `ncx_path` in place by stripping leading and
    trailing whitespace from the text and tail of every element. The
    result is written back encoded as UTF-8.
    '''
    root = etree.parse(ncx_path).getroot()
    for node in root.iter(tag=etree.Element):
        for part in ('text', 'tail'):
            value = getattr(node, part)
            if value:
                setattr(node, part, value.strip())
    open(ncx_path, 'wb').write(etree.tostring(root, encoding='utf-8'))
# Convert an HTML or OPF file to EPUB.
#   htmlfile     -- path to the root HTML file, or to an OPF file
#   opts         -- conversion options; several are normalised in place
#                   (output, profile, override_css, from_opf, from_ncx,
#                   chapter, level[1-3]_toc)
#   notification -- accepted but not used in this function
#   create_epub  -- if False, the EPUB container is not written
#   oeb_cover    -- if True, no generated title page is added to the spine;
#                   the guide's cover entry points at the image found in
#                   the title page instead
#   extract_to   -- if not None, the working directory is copied there
# Raises ValueError if an OPF yields no files and no HTML file can be found.
def convert(htmlfile, opts, notification=None, create_epub=True,
oeb_cover=False, extract_to=None):
htmlfile = os.path.abspath(htmlfile)
if opts.output is None:
opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub'
# Replace the profile name by the profile object
opts.profile = PROFILES[opts.profile]
opts.output = os.path.abspath(opts.output)
# override_css may either name a file containing CSS or be the CSS itself:
# if it cannot be read as a file, the value is decoded and used as is.
if opts.override_css is not None:
try:
opts.override_css = open(opts.override_css, 'rb').read().decode(preferred_encoding, 'replace')
except:
opts.override_css = opts.override_css.decode(preferred_encoding, 'replace')
if opts.from_opf:
opts.from_opf = os.path.abspath(opts.from_opf)
if opts.from_ncx:
opts.from_ncx = os.path.abspath(opts.from_ncx)
if htmlfile.lower().endswith('.opf'):
opf = OPF(htmlfile, os.path.dirname(os.path.abspath(htmlfile)))
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
if not filelist:
# Bad OPF look for a HTML file instead
htmlfile = find_html_index(walk(os.path.dirname(htmlfile)))[0]
if htmlfile is None:
raise ValueError('Could not find suitable file to convert.')
filelist = get_filelist(htmlfile, opts)[1]
mi = merge_metadata(None, opf, opts)
else:
opf, filelist = get_filelist(htmlfile, opts)
mi = merge_metadata(htmlfile, opf, opts)
# Compile the XPath expressions once; the 're' prefix gives them access to
# EXSLT regular expressions.
opts.chapter = XPath(opts.chapter,
namespaces={'re':'http://exslt.org/regular-expressions'})
for x in (1, 2, 3):
attr = 'level%d_toc'%x
if getattr(opts, attr):
setattr(opts, attr, XPath(getattr(opts, attr),
namespaces={'re':'http://exslt.org/regular-expressions'}))
else:
setattr(opts, attr, None)
with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir:
if opts.keep_intermediate:
print 'Intermediate files in', tdir
resource_map, htmlfile_map, generated_toc, stylesheet_map = \
parse_content(filelist, opts, tdir)
logger = logging.getLogger('html2epub')
resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()]
title_page, has_title_page = process_title_page(mi, filelist, htmlfile_map, opts, tdir)
spine = [htmlfile_map[f.path] for f in filelist]
if not oeb_cover and title_page is not None:
spine = [title_page] + spine
# The cover is handled by the title page from here on, so it is cleared
# from the metadata.
mi.cover = None
mi.cover_data = (None, None)
mi = create_metadata(tdir, mi, spine, resources)
buf = cStringIO.StringIO()
# Choose the table of contents: the one from the metadata unless an
# automatically generated one was requested or the metadata TOC has fewer
# than two entries; a TOC read from --from-ncx overrides both.
if mi.toc:
rebase_toc(mi.toc, htmlfile_map, tdir)
if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2:
mi.toc = generated_toc
if opts.from_ncx:
toc = TOC()
toc.read_ncx_toc(opts.from_ncx)
mi.toc = toc
for item in mi.manifest:
if getattr(item, 'mime_type', None) == 'text/html':
item.mime_type = 'application/xhtml+xml'
opf_path = os.path.join(tdir, 'metadata.opf')
# Render the OPF into the file and the NCX into buf
with open(opf_path, 'wb') as f:
mi.render(f, buf, 'toc.ncx')
toc = buf.getvalue()
if toc:
with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f:
f.write(toc)
if opts.show_ncx:
print toc
split(opf_path, opts, stylesheet_map)
if opts.page:
logger.info('\tBuilding page map...')
add_page_map(opf_path, opts)
check_links(opf_path, opts.pretty_print)
# Re-read the OPF and rebuild its <guide> so that the cover entry refers
# to the title page or to the OEB cover image.
opf = OPF(opf_path, tdir)
opf.remove_guide()
oeb_cover_file = None
if oeb_cover and title_page is not None:
oeb_cover_file = find_oeb_cover(os.path.join(tdir, 'content', title_page))
if has_title_page or (oeb_cover and oeb_cover_file):
opf.create_guide_element()
if has_title_page and not oeb_cover:
opf.add_guide_item('cover', 'Cover', 'content/'+spine[0])
if oeb_cover and oeb_cover_file:
opf.add_guide_item('cover', 'Cover', 'content/'+oeb_cover_file)
cpath = os.path.join(tdir, 'content', 'resources', '_cover_.jpg')
if os.path.exists(cpath):
opf.add_path_to_manifest(cpath, 'image/jpeg')
with open(opf_path, 'wb') as f:
f.write(opf.render())
# An NCX larger than the profile's flow size is condensed by stripping
# whitespace; a warning is logged if it is still too large afterwards.
ncx_path = os.path.join(os.path.dirname(opf_path), 'toc.ncx')
if os.path.exists(ncx_path) and os.stat(ncx_path).st_size > opts.profile.flow_size:
logger.info('Condensing NCX from %d bytes...'%os.stat(ncx_path).st_size)
condense_ncx(ncx_path)
if os.stat(ncx_path).st_size > opts.profile.flow_size:
logger.warn('NCX still larger than allowed size at %d bytes. Menu based Table of Contents may not work on device.'%os.stat(ncx_path).st_size)
if opts.profile.screen_size is not None:
rescale_images(os.path.join(tdir, 'content', 'resources'),
opts.profile.screen_size, logger)
if create_epub:
epub = initialize_container(opts.output)
epub.add_dir(tdir)
epub.close()
run_plugins_on_postprocess(opts.output, 'epub')
logger.info(_('Output written to ')+opts.output)
if opts.show_opf:
print open(opf_path, 'rb').read()
# The working directory can be copied out twice: once to the location
# given in the options and once to the one passed as an argument. An
# existing target directory is deleted first.
if opts.extract_to is not None:
if os.path.exists(opts.extract_to):
shutil.rmtree(opts.extract_to)
shutil.copytree(tdir, opts.extract_to)
if extract_to is not None:
if os.path.exists(extract_to):
shutil.rmtree(extract_to)
shutil.copytree(tdir, extract_to)
def main(args=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(args)
if len(args) < 2:
parser.print_help()
print _('You must specify an input HTML file')
return 1
convert(args[1], opts)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -0,0 +1,127 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, re, uuid
from itertools import cycle
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
class EPUBInput(InputFormatPlugin):
    '''
    Input plugin that unpacks an EPUB into the current working directory
    and hands back a normalised ``content.opf`` for further conversion.
    '''

    name        = 'EPUB Input'
    author      = 'Kovid Goyal'
    description = 'Convert EPUB files (.epub) to HTML'
    file_types  = set(['epub'])

    @classmethod
    def decrypt_font(cls, key, path):
        '''
        De-obfuscate the font file at `path` in place.

        Only the first 1024 bytes are obfuscated: each of them is XORed
        with the bytes of `key` (a sequence of integers), repeating the
        key as often as needed. The rest of the file is copied unchanged.
        '''
        with open(path, 'rb') as f:
            raw = f.read()
        crypt = raw[:1024]
        key = cycle(iter(key))
        decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
        with open(path, 'wb') as f:
            f.write(decrypt)
            f.write(raw[1024:])

    @classmethod
    def process_encryption(cls, encfile, opf, log):
        '''
        Undo the font obfuscation described by an ``encryption.xml`` file.

        :param encfile: Path to ``META-INF/encryption.xml``.
        :param opf: Path to the OPF file; the first ``urn:uuid:`` found in
                    it provides the key.
        :param log: Logger (currently unused).
        :return: True if every encrypted resource used the Adobe font
                 obfuscation algorithm and was processed; False if another
                 algorithm is in use (i.e. the book is DRMed) or if an
                 error occurred (the traceback is printed).
        '''
        key = None
        with open(opf, 'rb') as f:
            m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', f.read())
        if m:
            key = m.group(1)
            key = list(map(ord, uuid.UUID(key).bytes))

        try:
            root = etree.parse(encfile)
            for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
                algorithm = em.get('Algorithm', '')
                if algorithm != 'http://ns.adobe.com/pdf/enc#RC':
                    return False
                cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
                uri = cr.get('URI')
                # URIs are relative to the root of the EPUB, which is the
                # parent of the directory containing encryption.xml
                path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
                if os.path.exists(path):
                    cls.decrypt_font(key, path)
            return True
        except Exception:
            # Anything going wrong (including a missing key) is reported
            # to the caller as a failure to decrypt.
            import traceback
            traceback.print_exc()
        return False

    # Backward compatible alias: the method used to be defined under this
    # misspelled name while convert() called it by the correct one.
    process_ecryption = process_encryption

    @classmethod
    def rationalize_cover(cls, opf):
        '''
        Replace an HTML cover page by a rasterised image of it.

        This only happens when the guide has a ``cover`` entry and the
        first spine item is also the first manifest item. In that case the
        first spine item is removed, the guide's cover entry is pointed at
        ``calibre_raster_cover.jpg``, the original cover page becomes the
        guide's (only) ``titlepage`` entry, and the page is rendered to
        ``calibre_raster_cover.jpg`` in the current directory if a
        renderer is available.
        '''
        guide_cover, guide_elem = None, None
        for guide_elem in opf.iterguide():
            if guide_elem.get('type', '').lower() == 'cover':
                guide_cover = guide_elem.get('href', '')
                break
        if not guide_cover:
            return
        spine = list(opf.iterspine())
        if not spine:
            return
        idref = spine[0].get('idref', '')
        manifest = list(opf.itermanifest())
        if not manifest:
            return
        if manifest[0].get('id', False) != idref:
            return
        spine[0].getparent().remove(spine[0])
        guide_elem.set('href', 'calibre_raster_cover.jpg')
        for elem in list(opf.iterguide()):
            if elem.get('type', '').lower() == 'titlepage':
                elem.getparent().remove(elem)
        from calibre.ebooks.oeb.base import OPF
        t = etree.SubElement(guide_elem.getparent(), OPF('reference'))
        t.set('type', 'titlepage')
        t.set('href', guide_cover)
        t.set('title', 'Title Page')
        from calibre.ebooks import render_html
        renderer = render_html(guide_cover)
        if renderer is not None:
            with open('calibre_raster_cover.jpg', 'wb') as f:
                f.write(renderer.data)

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Extract the EPUB in `stream` into the current working directory and
        return the absolute path of the ``content.opf`` written there.

        All manifest and guide hrefs are rewritten to be relative to the
        current directory, as the new OPF lives at the top level.

        :raises ValueError: If the archive contains no OPF file.
        :raises DRMError: If the book is encrypted with anything other than
                          font obfuscation.
        '''
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
        from calibre.ebooks.metadata.opf2 import OPF
        zf = ZipFile(stream)
        zf.extractall(os.getcwd())
        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
        opf = None
        for f in walk(u'.'):
            if f.lower().endswith('.opf'):
                opf = os.path.abspath(f)
                break
        path = getattr(stream, 'name', 'stream')

        if opf is None:
            raise ValueError('%s is not a valid EPUB file'%path)

        if os.path.exists(encfile):
            if not self.process_encryption(encfile, opf, log):
                raise DRMError(os.path.basename(path))

        opf = os.path.relpath(opf, os.getcwdu())
        parts = os.path.split(opf)
        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

        # The OPF was in a sub-directory: prefix all hrefs with it
        if len(parts) > 1 and parts[0]:
            delta = '/'.join(parts[:-1])+'/'
            for elem in opf.itermanifest():
                elem.set('href', delta+elem.get('href'))
            for elem in opf.iterguide():
                elem.set('href', delta+elem.get('href'))

        self.rationalize_cover(opf)

        with open('content.opf', 'wb') as nopf:
            nopf.write(opf.render())

        return os.path.abspath('content.opf')

View File

@ -0,0 +1,294 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, shutil, re
from urllib import unquote
from calibre.customize.conversion import OutputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.constants import __appname__, __version__
from calibre import strftime, guess_type
from calibre.customize.conversion import OptionRecommendation
from lxml import etree
class EPUBOutput(OutputFormatPlugin):
name = 'EPUB Output'
author = 'Kovid Goyal'
file_type = 'epub'
options = set([
OptionRecommendation(name='extract_to',
help=_('Extract the contents of the generated EPUB file to the '
'specified directory. The contents of the directory are first '
'deleted, so be careful.')),
OptionRecommendation(name='dont_split_on_page_breaks',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Turn off splitting at page breaks. Normally, input '
'files are automatically split at every page break into '
'two files. This gives an output ebook that can be '
'parsed faster and with less resources. However, '
'splitting is slow and if your source file contains a '
'very large number of page breaks, you should turn off '
'splitting on page breaks.'
)
),
OptionRecommendation(name='flow_size', recommended_value=260,
help=_('Split all HTML files larger than this size (in KB). '
'This is necessary as most EPUB readers cannot handle large '
'file sizes. The default of %defaultKB is the size required '
'for Adobe Digital Editions.')
),
])
TITLEPAGE_COVER = '''\
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<title>Cover</title>
<style type="text/css" title="override_css">
@page {padding: 0pt; margin:0pt}
body { text-align: center; padding:0pt; margin: 0pt; }
div { margin: 0pt; padding: 0pt; }
</style>
</head>
<body>
<div>
<img src="%s" alt="cover" style="height: 100%%" />
</div>
</body>
</html>
'''
TITLEPAGE = '''\
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<title>%(title)s</title>
<style type="text/css">
body {
background: white no-repeat fixed center center;
text-align: center;
vertical-align: center;
overflow: hidden;
font-size: 18px;
}
h1 { font-family: serif; }
h2, h4 { font-family: monospace; }
</style>
</head>
<body>
<h1>%(title)s</h1>
<br/><br/>
<div style="position:relative">
<div style="position: absolute; left: 0; top: 0; width:100%%; height:100%%; vertical-align:center">
<img src="%(img)s" alt="calibre" style="opacity:0.3"/>
</div>
<div style="position: absolute; left: 0; top: 0; width:100%%; height:100%%; vertical-align:center">
<h2>%(date)s</h2>
<br/><br/><br/><br/><br/>
<h3>%(author)s</h3>
<br/><br/><br/><br/><br/><br/><br/><br/><br/>
<h4>Produced by %(app)s</h4>
</div>
</div>
</body>
</html>
'''
def convert(self, oeb, output_path, input_plugin, opts, log):
    '''
    Write `oeb` to `output_path` as an EPUB file.

    The book is split, worked around ADE quirks, has its images rescaled
    and a cover inserted. It is then serialised with the OEB output plugin
    into a temporary directory that is packed into the EPUB container. If
    ``opts.extract_to`` is set, the EPUB is also extracted into that
    directory, which is deleted first if it exists.
    '''
    self.log, self.opts, self.oeb = log, opts, oeb

    from calibre.ebooks.oeb.transforms.split import Split
    splitter = Split(not self.opts.dont_split_on_page_breaks,
            max_flow_size=self.opts.flow_size*1024)
    splitter(self.oeb, self.opts)

    self.workaround_ade_quirks()

    from calibre.ebooks.oeb.transforms.rescale import RescaleImages
    rescaler = RescaleImages()
    rescaler(oeb, opts)
    self.insert_cover()

    with TemporaryDirectory('_epub_output') as tdir:
        from calibre.customize.ui import plugin_for_output_format
        oeb_output = plugin_for_output_format('oeb')
        oeb_output.convert(oeb, tdir, input_plugin, opts, log)

        opf_names = [x for x in os.listdir(tdir) if x.endswith('.opf')]
        opf = opf_names[0]
        ncx_paths = [os.path.join(tdir, x) for x in os.listdir(tdir)
                if x.endswith('.ncx')]
        self.condense_ncx(ncx_paths[0])

        from calibre.ebooks.epub import initialize_container
        epub = initialize_container(output_path, os.path.basename(opf))
        epub.add_dir(tdir)
        target = opts.extract_to
        if target is not None:
            if os.path.exists(target):
                shutil.rmtree(target)
            os.mkdir(target)
            epub.extractall(path=target)
            self.log.info('EPUB extracted to', target)
        epub.close()
def default_cover(self):
    '''
    Create a generic cover for books that don't have a cover.

    The cover is an XHTML page built from ``self.TITLEPAGE`` showing the
    title, the authors, the current date and the calibre logo. The logo
    and the page are both added to the manifest.

    :return: The manifest item of the title page, or None if the GUI
             resources or PyQt4 are not available.
    '''
    try:
        from calibre.gui2 import images_rc # Needed for access to logo
        from PyQt4.Qt import QApplication, QFile, QIODevice
    except Exception:
        return None
    from calibre.ebooks.metadata import authors_to_string
    # Title and author end up in markup that is parsed as XML, so
    # characters like & and < have to be escaped.
    from xml.sax.saxutils import escape
    images_rc
    m = self.oeb.metadata
    title = unicode(m.title[0])
    # Only creators with the role 'aut' are authors; the role is a
    # property of each creator entry, not of the metadata object.
    # NOTE(review): the attribute name `creators` is kept from the original
    # code -- confirm it matches the OEB metadata API.
    a = [unicode(x) for x in m.creators if x.role == 'aut']
    author = authors_to_string(a)
    if QApplication.instance() is None:
        QApplication([])
    f = QFile(':/library')
    f.open(QIODevice.ReadOnly)
    img_data = str(f.readAll())
    logo_id, logo_href = self.oeb.manifest.generate('calibre-logo',
            'calibre-logo.png')
    self.oeb.manifest.add(logo_id, logo_href, 'image/png', data=img_data)
    html = self.TITLEPAGE%dict(title=escape(title), author=escape(author),
            date=strftime('%d %b, %Y'),
            app=__appname__ +' '+__version__,
            img=logo_href)
    page_id, page_href = self.oeb.manifest.generate('calibre-titlepage',
            'calibre-titlepage.xhtml')
    return self.oeb.manifest.add(page_id, page_href,
            guess_type('t.xhtml')[0], data=etree.fromstring(html))
def insert_cover(self):
    '''
    Make sure the book starts with a title page.

    The page referenced by the guide's ``titlepage`` entry is used when
    present. Otherwise a page wrapping the guide's ``cover`` is built from
    ``self.TITLEPAGE_COVER``, and failing that :meth:`default_cover` is
    used. The chosen item is inserted at the front of the spine and the
    guide's ``cover`` (and ``titlepage``, if any) references are pointed
    at it. Nothing is changed when no item could be obtained.
    '''
    from calibre.ebooks.oeb.base import urldefrag
    from calibre import guess_type
    guide, manifest = self.oeb.guide, self.oeb.manifest

    if 'titlepage' in guide:
        item = manifest.hrefs[urldefrag(guide['titlepage'].href)[0]]
    elif 'cover' in guide:
        markup = self.TITLEPAGE_COVER%unquote(guide['cover'].href)
        id, href = manifest.generate('titlepage', 'titlepage.xhtml')
        item = manifest.add(id, href, guess_type('t.xhtml')[0],
                data=etree.fromstring(markup))
    else:
        item = self.default_cover()

    if item is None:
        return

    self.oeb.spine.insert(0, item, True)
    if 'cover' not in guide.refs:
        guide.add('cover', 'Title Page', 'a')
    guide.refs['cover'].href = item.href
    if 'titlepage' in guide.refs:
        guide.refs['titlepage'].href = item.href
def condense_ncx(self, ncx_path):
    '''
    Strip leading/trailing whitespace from the text and tail of every
    element in the NCX file at ``ncx_path`` and rewrite it in place.

    Does nothing when ``self.opts.pretty_print`` is set.
    '''
    if self.opts.pretty_print:
        return
    tree = etree.parse(ncx_path)
    for tag in tree.getroot().iter(tag=etree.Element):
        if tag.text:
            tag.text = tag.text.strip()
        if tag.tail:
            tag.tail = tag.tail.strip()
    compressed = etree.tostring(tree.getroot(), encoding='utf-8')
    # Close the file deterministically instead of relying on garbage
    # collection to flush it.
    with open(ncx_path, 'wb') as f:
        f.write(compressed)
def workaround_ade_quirks(self):
'''
Perform various markup transforms to get the output to render correctly
in the quirky ADE.

For every spine item this: turns <br> children of <body> into <p>
elements, removes <embed>, non-SVG <object>, <form>, empty <title>/<style>
and empty <script> elements without a src, converts <center> to a centred
<div>, strips '&' from <img> src values, adds link styling rules to
stylesheet.css and removes/replaces a few special characters in all text.
'''
from calibre.ebooks.oeb.base import XPNSMAP, XHTML
from lxml.etree import XPath as _XPath
from functools import partial
# XPath factory with the namespace prefixes (e.g. h:) pre-bound
XPath = partial(_XPath, namespaces=XPNSMAP)
for x in self.oeb.spine:
root = x.data
body = XPath('//h:body')(root)
if body:
body = body[0]
# Replace <br> that are children of <body> as ADE doesn't handle them
if hasattr(body, 'xpath'):
for br in XPath('./h:br')(body):
# Already detached from the tree by a previous iteration
if br.getparent() is None:
continue
try:
sibling = br.itersiblings().next()
except:
sibling = None
# The <br> becomes a <p> holding a single non-breaking space
br.tag = XHTML('p')
br.text = u'\u00a0'
# A lone <br> (followed by text, by nothing, or by a non-<br> element)
# gets a zero-height style; otherwise the following <br> is removed
# and its tail text is appended to this element's tail.
if (br.tail and br.tail.strip()) or sibling is None or \
getattr(sibling, 'tag', '') != XHTML('br'):
style = br.get('style', '').split(';')
style = filter(None, map(lambda x: x.strip(), style))
style.append('margin: 0pt; border:0pt; height:0pt')
br.set('style', '; '.join(style))
else:
sibling.getparent().remove(sibling)
if sibling.tail:
if not br.tail:
br.tail = ''
br.tail += sibling.tail
# NOTE(review): lxml's remove() also discards the tail text of the
# removed element; the removals below do not preserve it -- confirm
# that this is acceptable.
for tag in XPath('//h:embed')(root):
tag.getparent().remove(tag)
for tag in XPath('//h:object')(root):
# SVG objects are kept
if tag.get('type', '').lower().strip() in ('image/svg+xml',):
continue
tag.getparent().remove(tag)
for tag in XPath('//h:title|//h:style')(root):
if not tag.text:
tag.getparent().remove(tag)
for tag in XPath('//h:script')(root):
if not tag.text and not tag.get('src', False):
tag.getparent().remove(tag)
for tag in XPath('//h:form')(root):
tag.getparent().remove(tag)
for tag in XPath('//h:center')(root):
tag.tag = XHTML('div')
tag.set('style', 'text-align:center')
# ADE can't handle &amp; in an img url
for tag in XPath('//h:img[@src]')(root):
tag.set('src', tag.get('src', '').replace('&', ''))
# NOTE(review): if this runs inside the per-spine-item loop, the same
# two rules are added once per item -- confirm that is intended. A
# missing 'stylesheet.css' entry raises KeyError here.
stylesheet = self.oeb.manifest.hrefs['stylesheet.css']
stylesheet.data.add('a { color: inherit; text-decoration: inherit; '
'cursor: default; }')
stylesheet.data.add('a[href] { color: blue; '
'text-decoration: underline; cursor:pointer; }')
# U+200B (zero width space) and U+00AD (soft hyphen) are removed,
# U+2011 (non-breaking hyphen) is replaced by a plain hyphen
special_chars = re.compile(u'[\u200b\u00ad]')
for elem in root.iterdescendants():
if getattr(elem, 'text', False):
elem.text = special_chars.sub('', elem.text)
elem.text = elem.text.replace(u'\u2011', '-')
if getattr(elem, 'tail', False):
elem.tail = special_chars.sub('', elem.tail)
elem.tail = elem.tail.replace(u'\u2011', '-')

View File

@ -11,7 +11,7 @@ __docformat__ = 'restructuredtext en'
import os, re
from itertools import count, chain
from calibre.ebooks.oeb.base import XHTML, XHTML_NS
from calibre.ebooks.oeb.base import OEBBook, DirWriter
from calibre.ebooks.oeb.base import OEBBook
from lxml import etree, html
from lxml.etree import XPath

View File

@ -1,509 +0,0 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Split the flows in an epub file to conform to size limitations.
'''
import os, math, logging, functools, collections, re, copy, sys
from lxml.etree import XPath as _XPath
from lxml import etree, html
from lxml.cssselect import CSSSelector
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.epub import tostring, rules
from calibre import CurrentDir, LoggingInterface
XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'})
content = functools.partial(os.path.join, 'content')
SPLIT_ATTR = 'cs'
SPLIT_POINT_ATTR = 'csp'
class SplitError(ValueError):
    '''
    Raised when no reasonable split point can be found in a tree.

    The message names the file (base name of ``path``) and the serialized
    size of ``root`` in KB.
    '''

    def __init__(self, path, root):
        kilobytes = len(tostring(root))/1024.
        template = _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')
        ValueError.__init__(self, template%(os.path.basename(path), kilobytes))
class Splitter(LoggingInterface):
def __init__(self, path, opts, stylesheet_map, opf):
'''
Split the HTML file ``path`` (resolved via ``content()``), first on page
breaks and then to fit ``opts.profile.flow_size``.

:param path: Name of the HTML file to split.
:param opts: Conversion options; this method reads ``verbose``,
``preserve_tag_structure``, ``dont_split_on_page_breaks`` and
``profile.flow_size``.
:param stylesheet_map: Mapping consulted with ``path`` as key to obtain
the stylesheets used for locating page breaks.
:param opf: OPF object whose references are fixed if the file was split.

After construction ``self.was_split`` tells whether more than one tree
was produced; if so the parts have already been written by ``commit()``.
'''
LoggingInterface.__init__(self, logging.getLogger('htmlsplit'))
self.setup_cli_handler(opts.verbose)
self.path = path
# Drop emptied tags instead of keeping them when structure preservation
# is off or the file is more than five times the flow size
self.always_remove = not opts.preserve_tag_structure or \
os.stat(content(path)).st_size > 5*opts.profile.flow_size
# File name template for the parts; literal % signs are escaped because
# the template is later filled in with the part index
self.base = (os.path.splitext(path)[0].replace('%', '%%') + '_split_%d.html')
self.opts = opts
self.orig_size = os.stat(content(path)).st_size
self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
root = html.fromstring(open(content(path)).read())
self.page_breaks, self.trees = [], []
# Running total of the serialized size of all committed sub-trees
self.split_size = 0
# Split on page breaks
self.splitting_on_page_breaks = True
if not opts.dont_split_on_page_breaks:
self.log_info('\tSplitting on page breaks...')
if self.path in stylesheet_map:
self.find_page_breaks(stylesheet_map[self.path], root)
# NOTE(review): split_on_page_breaks() reads self.page_break_ids, which
# is only assigned in find_page_breaks() -- confirm it cannot be
# reached without that call having happened.
self.split_on_page_breaks(root.getroottree())
trees = list(self.trees)
else:
self.trees = [root.getroottree()]
trees = list(self.trees)
# Split any remaining over-sized trees
self.splitting_on_page_breaks = False
if self.opts.profile.flow_size < sys.maxint:
lt_found = False
self.log_info('\tLooking for large trees...')
# Maps an over-sized tree to the list of sub-trees that replace it
self.tree_map = {}
for i, tree in enumerate(list(trees)):
self.split_trees = []
size = len(tostring(tree.getroot()))
if size > self.opts.profile.flow_size:
lt_found = True
try:
self.split_to_size(tree)
self.tree_map[tree] = self.split_trees
except (SplitError, RuntimeError): # Splitting fails
# Retry once without preserving tag structure; if that mode was
# already active the error is propagated
if not self.always_remove:
self.always_remove = True
self.split_trees = []
self.split_to_size(tree)
self.tree_map[tree] = self.split_trees
else:
raise
# Rebuild the tree list, substituting split trees by their parts
t = []
for x in trees:
t.extend(self.tree_map.get(x, [x]))
trees = t
if not lt_found:
self.log_info('\tNo large trees found')
self.trees = trees
self.was_split = len(self.trees) > 1
if self.was_split:
self.commit()
self.log_info('\t\tSplit into %d parts.', len(self.trees))
if self.opts.verbose:
for f in self.files:
self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
self.fix_opf(opf)
# Release the parsed trees
self.trees = None
def split_text(self, text, root, size):
    '''
    Split ``text`` at blank lines into fragments for separate <pre> tags.

    Carriage returns are removed and the text is cut at every blank line
    (``\\n\\n``). Consecutive paragraphs are packed into one fragment while
    the running length stays below ``size``. Joining the returned
    fragments with ``\\n\\n`` reproduces the (carriage-return free) text.

    :param text: The text to split.
    :param root: Root of the tree being split, only used for the error.
    :param size: Target maximum fragment length in characters.
    :return: List of text fragments, in order. No text is dropped.
    :raises SplitError: If a single paragraph is longer than ``size``.
    '''
    self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
    rest = text.replace('\r', '')
    parts = re.split('\n\n', rest)
    self.log_debug('\t\t\t\tFound %d parts'%len(parts))
    if max(map(len, parts)) > size:
        raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
    ans = []
    buf = None
    for part in parts:
        if buf is None:
            # First paragraph: start the buffer without a leading separator
            buf = part
        elif len(buf) + len(part) < size:
            buf += '\n\n'+part
        else:
            ans.append(buf)
            buf = part
    # Flush the last buffer; previously the final fragment was discarded
    if buf is not None:
        ans.append(buf)
    return ans
def split_to_size(self, tree):
'''
Recursively split ``tree`` until every non-empty part serializes to at
most ``self.opts.profile.flow_size`` bytes. Accepted parts are appended
to ``self.split_trees``.

:raises SplitError: If no split point is found or the accumulated size
of the parts exceeds six times the original file size.
'''
self.log_debug('\t\tSplitting...')
root = tree.getroot()
# Split large <pre> tags
for pre in list(root.xpath('//pre')):
# Flatten the <pre> to plain text by dropping all child elements
text = u''.join(pre.xpath('descendant::text()'))
pre.text = text
for child in list(pre.iterchildren()):
pre.remove(child)
if len(pre.text) > self.opts.profile.flow_size*0.5:
frags = self.split_text(pre.text, root, int(0.2*self.opts.profile.flow_size))
new_pres = []
for frag in frags:
pre2 = copy.copy(pre)
pre2.text = frag
pre2.tail = u''
new_pres.append(pre2)
# Only the last replacement keeps the original tail text
new_pres[-1].tail = pre.tail
p = pre.getparent()
i = p.index(pre)
# Replace the single <pre> by the sequence of smaller ones
p[i:i+1] = new_pres
split_point, before = self.find_split_point(root)
if split_point is None or self.split_size > 6*self.orig_size:
if not self.always_remove:
self.log_warn(_('\t\tToo much markup. Re-splitting without '
'structure preservation. This may cause '
'incorrect rendering.'))
raise SplitError(self.path, root)
for t in self.do_split(tree, split_point, before):
r = t.getroot()
# Parts without visible content are dropped
if self.is_page_empty(r):
continue
size = len(tostring(r))
if size <= self.opts.profile.flow_size:
self.split_trees.append(t)
#print tostring(t.getroot(), pretty_print=True)
self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
len(self.split_trees), size/1024.)
self.split_size += size
else:
# Still too large: split this part again
self.split_to_size(t)
def is_page_empty(self, root):
    '''
    Return True if the page rooted at ``root`` has no visible content.

    A page counts as empty when its <body> holds at most four
    non-whitespace characters of text and every <img> in the tree has the
    style ``display:none``. A tree without a <body> is never empty.
    '''
    body = root.find('body')
    if body is None:
        return False
    text = html.tostring(body, method='text', encoding=unicode)
    if len(re.sub(r'\s+', '', text)) > 4:
        return False
    visible = [img for img in root.xpath('//img')
            if img.get('style', '') != 'display:none']
    return not visible
def do_split(self, tree, split_point, before):
'''
Split ``tree`` into a *before* and *after* tree at ``split_point``,
preserving tag structure, but not duplicating any text.
All tags that have had their text and tail
removed have the attribute named by ``SPLIT_ATTR`` set to 1. When
``self.always_remove`` is set, such tags are removed from the tree
instead of being emptied.
:param before: If True tree is split before split_point, otherwise after split_point
:return: before_tree, after_tree
'''
# The split point is located again in each deep copy via its path
path = tree.getpath(split_point)
tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree)
root = tree.getroot()
root2 = tree2.getroot()
body, body2 = root.body, root2.body
split_point = root.xpath(path)[0]
split_point2 = root2.xpath(path)[0]
def nix_element(elem, top=True):
# Either drop the element (or, with top=False, replace it by its
# children) or blank it and mark it with SPLIT_ATTR
if self.always_remove:
parent = elem.getparent()
index = parent.index(elem)
if top:
parent.remove(elem)
else:
index = parent.index(elem)
parent[index:index+1] = list(elem.iterchildren())
else:
elem.text = u''
elem.tail = u''
elem.set(SPLIT_ATTR, '1')
# These tags are hidden explicitly via CSS once emptied
if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']:
elem.set('style', 'display:none')
def fix_split_point(sp):
# When splitting for size, forbid page breaks around the split point
if not self.splitting_on_page_breaks:
sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')
# Tree 1
# Keeps what precedes the split point: everything after it is nixed
hit_split_point = False
for elem in list(body.iterdescendants(etree.Element)):
if elem.get(SPLIT_ATTR, '0') == '1':
continue
if elem is split_point:
hit_split_point = True
if before:
nix_element(elem)
fix_split_point(elem)
continue
if hit_split_point:
nix_element(elem)
# Tree 2
# Keeps what follows the split point: everything before it is nixed
hit_split_point = False
for elem in list(body2.iterdescendants(etree.Element)):
if elem.get(SPLIT_ATTR, '0') == '1':
continue
if elem is split_point2:
hit_split_point = True
if not before:
nix_element(elem, top=False)
fix_split_point(elem)
continue
if not hit_split_point:
nix_element(elem, top=False)
return tree, tree2
def split_on_page_breaks(self, orig_tree):
    '''
    Split ``orig_tree`` at every recorded page break, in document order.

    The resulting trees are stored in ``self.trees``; trees that
    :meth:`is_page_empty` reports as empty are discarded.
    '''
    # Page breaks ordered by the position of their element in the document
    breaks = []
    for elem in orig_tree.xpath('//*[@id]'):
        elem_id = elem.get('id')
        if elem_id in self.page_break_ids:
            position = self.page_break_ids.index(elem_id)
            breaks.append(self.page_breaks[position])

    self.trees = []
    remaining = orig_tree
    for pattern, split_before in breaks:
        self.log_info('\t\tSplitting on page-break')
        matches = pattern(remaining)
        if matches:
            head, tail = self.do_split(remaining, matches[0], split_before)
            self.trees.append(head)
            remaining = tail
    self.trees.append(remaining)
    self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
def find_page_breaks(self, stylesheets, root):
'''
Find all elements that have either page-break-before or page-break-after set.
Populates `self.page_breaks` with id based XPath selectors (for elements that don't
have ids, an id is created) paired with a flag telling whether the break
is before (True) or after (False) the element. The ids are recorded, in
the same order, in `self.page_break_ids`.
'''
page_break_selectors = set([])
for rule in rules(stylesheets):
before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
# Any value other than 'avoid' counts as a page break. Selectors that
# CSSSelector cannot handle are silently skipped.
try:
if before and before != 'avoid':
page_break_selectors.add((CSSSelector(rule.selectorText), True))
except:
pass
try:
if after and after != 'avoid':
page_break_selectors.add((CSSSelector(rule.selectorText), False))
except:
pass
page_breaks = set([])
for selector, before in page_break_selectors:
for elem in selector(root):
elem.pb_before = before
page_breaks.add(elem)
# Number all elements so the page breaks can be sorted in document order
for i, elem in enumerate(root.iter()):
elem.pb_order = i
page_breaks = list(page_breaks)
page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
self.page_break_ids = []
for i, x in enumerate(page_breaks):
# Keep an existing id, otherwise assign a generated one
x.set('id', x.get('id', 'calibre_pb_%d'%i))
id = x.get('id')
self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
self.page_break_ids.append(id)
def find_split_point(self, root):
'''
Find the tag at which to split the tree rooted at `root`.
Search order is:
* Heading tags
* <div> tags that are direct children of <body>
* <pre> tags
* <hr> tags
* <p> tags
* <div> tags
* <br> tags
* <li> tags
We try to split in the "middle" of the file (as defined by tag counts).

:return: ``(element, True)`` for the chosen element or ``(None, True)``
if no candidate is found; the second item is always True.
'''
def pick_elem(elems):
# Choose the middle candidate among those not already used as a split
# point and not already emptied by a split, and mark it as used
if elems:
elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') != '1'\
and i.get(SPLIT_ATTR, '0') != '1']
if elems:
i = int(math.floor(len(elems)/2.))
elems[i].set(SPLIT_POINT_ATTR, '1')
return elems[i]
for path in (
'//*[re:match(name(), "h[1-6]", "i")]',
'/html/body/div',
'//pre',
'//hr',
'//p',
'//div',
'//br',
'//li',
):
elems = root.xpath(path,
namespaces={'re':'http://exslt.org/regular-expressions'})
elem = pick_elem(elems)
if elem is not None:
# Skip candidates whose tree path cannot be compiled as an XPath
# expression; the path is what do_split() uses to find the element
try:
XPath(elem.getroottree().getpath(elem))
except:
continue
return elem, True
return None, True
def commit(self):
    '''
    Commit all changes caused by the split. This removes the previously
    introduced ``SPLIT_ATTR`` and ``SPLIT_POINT_ATTR`` attributes and
    calculates an *anchor_map* (id -> file name) for all anchors in the
    split trees. Anchors that are not found map to the first part.
    Internal links are re-directed, the split files are saved and the
    original file is deleted.
    '''
    self.anchor_map = collections.defaultdict(lambda :self.base%0)
    self.files = []

    for i, tree in enumerate(self.trees):
        root = tree.getroot()
        self.files.append(self.base%i)
        for elem in root.xpath('//*[@id]'):
            # Elements emptied by the split do not own their id in this part
            if elem.get(SPLIT_ATTR, '0') == '0':
                self.anchor_map[elem.get('id')] = self.files[-1]
        for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)):
            elem.attrib.pop(SPLIT_ATTR, None)
            elem.attrib.pop(SPLIT_POINT_ATTR, '0')

    for current, tree in zip(self.files, self.trees):
        for a in tree.getroot().xpath('//a[@href]'):
            href = a.get('href').strip()
            if href.startswith('#'):
                anchor = href[1:]
                target = self.anchor_map[anchor]
                if target != current:
                    a.set('href', target+href)
        # Serialize before opening so a serialization failure cannot leave
        # a truncated file behind, and close the file deterministically.
        data = tostring(tree.getroot(), pretty_print=self.opts.pretty_print)
        with open(content(current), 'wb') as f:
            f.write(data)

    os.remove(content(self.path))
def fix_opf(self, opf):
'''
Fix references to the split file in the OPF.

The manifest item of the original file is replaced by one item per
split part, the spine is updated accordingly and guide references
pointing into the original file are re-targeted using the anchor map.
'''
items = [item for item in opf.itermanifest() if item.get('href') == 'content/'+self.path]
new_items = [('content/'+f, None) for f in self.files]
# Maps the id of each replaced manifest item to whatever
# replace_manifest_item() returns for it
id_map = {}
for item in items:
id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)
for id in id_map.keys():
opf.replace_spine_items_by_idref(id, id_map[id])
for ref in opf.iterguide():
href = ref.get('href', '')
if href.startswith('content/'+self.path):
# From here on href is a list: [file, fragment] or [file]
href = href.split('#')
frag = None
if len(href) > 1:
frag = href[1]
# NOTE(review): anchor_map is a defaultdict, so this membership test is
# False for every fragment never recorded (including None when the
# check applies to fragment-less hrefs) -- confirm the intended nesting.
if frag not in self.anchor_map:
self.log_warning('\t\tUnable to re-map OPF link', href)
continue
new_file = self.anchor_map[frag]
ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))
def fix_content_links(html_files, changes, opts):
split_files = [f.path for f in changes]
anchor_maps = [f.anchor_map for f in changes]
files = list(html_files)
for j, f in enumerate(split_files):
try:
i = files.index(f)
files[i:i+1] = changes[j].files
except ValueError:
continue
for htmlfile in files:
changed = False
root = html.fromstring(open(content(htmlfile), 'rb').read())
for a in root.xpath('//a[@href]'):
href = a.get('href')
if not href.startswith('#'):
href = href.split('#')
anchor = href[1] if len(href) > 1 else None
href = href[0]
if href in split_files:
try:
newf = anchor_maps[split_files.index(href)][anchor]
except:
print '\t\tUnable to remap HTML link:', href, anchor
continue
frag = ('#'+anchor) if anchor else ''
a.set('href', newf+frag)
changed = True
if changed:
open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
def fix_ncx(path, changes):
split_files = [f.path for f in changes]
anchor_maps = [f.anchor_map for f in changes]
tree = etree.parse(path)
changed = False
for content in tree.getroot().xpath('//x:content[@src]',
namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
href = content.get('src')
if not href.startswith('#'):
href = href.split('#')
anchor = href[1] if len(href) > 1 else None
href = href[0].split('/')[-1]
if href in split_files:
try:
newf = anchor_maps[split_files.index(href)][anchor]
except:
print 'Unable to remap NCX link:', href, anchor
frag = ('#'+anchor) if anchor else ''
content.set('src', 'content/'+newf+frag)
changed = True
if changed:
open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True))
def find_html_files(opf):
    '''
    Find all HTML files referenced by `opf`.

    Only manifest items whose media type contains ``html`` and whose file
    exists in the content directory are returned (by file name). If a file
    exists only under the name with ``&`` replaced by ``%26``, that name is
    used and the item's href is updated to match.
    '''
    found = []
    for item in opf.itermanifest():
        if 'html' not in item.get('media-type', '').lower():
            continue
        name = item.get('href').split('/')[-1]
        escaped = name.replace('&', '%26')
        if not os.path.exists(content(name)) and os.path.exists(content(escaped)):
            name = escaped
            item.set('href', item.get('href').replace('&', '%26'))
        if os.path.exists(content(name)):
            found.append(name)
    return found
def split(pathtoopf, opts, stylesheet_map):
    '''
    Split all HTML files referenced by the OPF at ``pathtoopf`` and update
    the HTML links, the NCX and the OPF itself to match.

    :param pathtoopf: Path to the OPF file; it is rewritten in place.
    :param opts: Conversion options, passed on to the splitter.
    :param stylesheet_map: Per-file stylesheets, passed on to the splitter.
    '''
    pathtoopf = os.path.abspath(pathtoopf)
    # NOTE(review): the input stream is left open as before because it is
    # not visible here whether OPF reads it eagerly -- confirm and close it.
    opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))

    with CurrentDir(os.path.dirname(pathtoopf)):
        html_files = find_html_files(opf)
        changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files]
        changes = [c for c in changes if c.was_split]

        fix_content_links(html_files, changes, opts)
        # Only the first NCX item in the manifest is fixed
        for item in opf.itermanifest():
            if item.get('media-type', '') == 'application/x-dtbncx+xml':
                fix_ncx(item.get('href'), changes)
                break

    # Render before opening for writing so that a rendering failure cannot
    # leave a truncated OPF behind, and close the file deterministically.
    rendered = opf.render()
    with open(pathtoopf, 'wb') as stream:
        stream.write(rendered)

View File

@ -0,0 +1,154 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
'''
Transform OEB content into FB2 markup
'''
import os
import re
from base64 import b64encode
from calibre import entity_to_unicode
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.base import OEB_IMAGES
from calibre.constants import __appname__, __version__
from BeautifulSoup import BeautifulSoup
# Maps XHTML tag names to the FB2 tag emitted for them.
TAG_MAP = {
'b' : 'strong',
'i' : 'emphasis',
'p' : 'p',
'div' : 'p',
}
# Pairs of (CSS property, {property value: FB2 tag}); a matching computed
# style value causes the corresponding FB2 tag to be opened.
STYLES = [
('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}),
('font-style', {'italic' : 'emphasis'}),
]
class FB2MLizer(object):
    '''
    Serialize the XHTML content of an OEB book as FB2 (FictionBook) markup.
    '''

    def __init__(self, ignore_tables=False):
        self.ignore_tables = ignore_tables

    def extract_content(self, oeb_book, opts):
        '''
        Convert ``oeb_book`` and return the prettified FB2 document.

        ``opts.output_profile`` is used when computing element styles.
        '''
        oeb_book.logger.info('Converting XHTML to FB2 markup...')
        self.oeb_book = oeb_book
        self.opts = opts
        return self.fb2mlize_spine()

    def fb2mlize_spine(self):
        '''
        Build the complete document: header, title page (when it is not
        part of the spine), all spine items, embedded images and footer.
        '''
        output = self.fb2_header()
        if 'titlepage' in self.oeb_book.guide:
            href = self.oeb_book.guide['titlepage'].href
            item = self.oeb_book.manifest.hrefs[href]
            # A title page that is in the spine is emitted by the loop below
            if item.spine_position is None:
                stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
                output += self.dump_text(item.data.find(XHTML('body')), stylizer)
        for item in self.oeb_book.spine:
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
            output += self.dump_text(item.data.find(XHTML('body')), stylizer)
        output += self.fb2_body_footer()
        output += self.fb2mlize_images()
        output += self.fb2_footer()
        output = self.clean_text(output)
        return BeautifulSoup(output.encode('utf-8')).prettify()

    def fb2_header(self):
        '''Return the XML prolog, description block and opening body tags.'''
        return u'<?xml version="1.0" encoding="utf-8"?> ' \
        '<FictionBook xmlns:xlink="http://www.w3.org/1999/xlink" ' \
        'xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"> ' \
        '<description><title-info><book-title>%s</book-title> ' \
        '</title-info><document-info> ' \
        '<program-used>%s - %s</program-used></document-info> ' \
        '</description><body><section>' % (self.oeb_book.metadata.title[0].value, __appname__, __version__)

    def fb2_body_footer(self):
        return u'</section></body>'

    def fb2_footer(self):
        return u'</FictionBook>'

    def fb2mlize_images(self):
        '''
        Return one base64 encoded <binary> element per image in the
        manifest, identified by the base name of the image's href.
        '''
        images = u''
        for item in self.oeb_book.manifest:
            if item.media_type in OEB_IMAGES:
                data = b64encode(item.data)
                images += '<binary id="%s" content-type="%s">%s</binary>' % (os.path.basename(item.href), item.media_type, data)
        return images

    def clean_text(self, text):
        '''
        Replace entities in ``text`` via ``entity_to_unicode`` and remove
        every remaining ampersand.
        '''
        for entity in set(re.findall('&.+?;', text)):
            # The text between & and ; is arbitrary, so it has to be escaped
            # before being used as a pattern (unbalanced parentheses and the
            # like would otherwise raise re.error).
            mo = re.search('(%s)' % re.escape(entity[1:-1]), text)
            text = text.replace(entity, entity_to_unicode(mo))
        text = text.replace('&', '')
        return text

    def dump_text(self, elem, stylizer, tag_stack=None):
        '''
        Return the FB2 markup for ``elem`` and its descendants.

        :param elem: XHTML element to convert; anything that is not an
                     XHTML element, or that is hidden by its style,
                     produces an empty string.
        :param stylizer: Used to look up the computed style of elements.
        :param tag_stack: FB2 tags currently open in the output. Callers
                          normally omit it; it is threaded through the
                          recursion.
        '''
        # A fresh list per top-level call: a shared default list would keep
        # stale entries for all later calls if a conversion was interrupted
        # by an exception.
        if tag_stack is None:
            tag_stack = []

        if not isinstance(elem.tag, basestring) \
                or namespace(elem.tag) != XHTML_NS:
            return u''

        fb2_text = u''
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
                or style['visibility'] == 'hidden':
            return u''

        tag = barename(elem.tag)
        # Number of tags opened for this element, to be closed again below
        tag_count = 0

        if tag == 'img':
            src = elem.get('src')
            if src:
                # References the <binary> with the same id, see fb2mlize_images
                fb2_text += '<image xlink:href="#%s" />' % os.path.basename(src)

        # Tags are never nested inside a tag of the same name
        fb2_tag = TAG_MAP.get(tag, 'p')
        if fb2_tag and fb2_tag not in tag_stack:
            tag_count += 1
            fb2_text += '<%s>' % fb2_tag
            tag_stack.append(fb2_tag)

        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag:
                tag_count += 1
                fb2_text += '<%s>' % style_tag
                tag_stack.append(style_tag)

        if hasattr(elem, 'text') and elem.text is not None and elem.text.strip() != '':
            fb2_text += elem.text

        for item in elem:
            fb2_text += self.dump_text(item, stylizer, tag_stack)

        # Collect the tags opened above, in the order they were opened
        close_tag_list = []
        for i in range(0, tag_count):
            close_tag_list.insert(0, tag_stack.pop())

        fb2_text += self.close_tags(close_tag_list)

        if hasattr(elem, 'tail') and elem.tail is not None and elem.tail.strip() != '':
            # Tail text outside of any paragraph gets one of its own
            if 'p' not in tag_stack:
                fb2_text += '<p>%s</p>' % elem.tail
            else:
                fb2_text += elem.tail

        return fb2_text

    def close_tags(self, tags):
        '''
        Return closing tags for ``tags``, last one first. ``tags`` is
        emptied in the process.
        '''
        fb2_text = u''
        for i in range(0, len(tags)):
            fb2_tag = tags.pop()
            fb2_text += '</%s>' % fb2_tag
        return fb2_text

View File

@ -0,0 +1,74 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
"""
Convert .fb2 files to HTML
"""
import os
from base64 import b64decode
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre import guess_type
FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
class FB2Input(InputFormatPlugin):
    '''
    Input plugin that converts an FB2 file into ``index.xhtml`` plus a
    ``metadata.opf`` in the current working directory.
    '''

    name        = 'FB2 Input'
    author      = 'Anatoly Shipitsin'
    description = 'Convert FB2 files to HTML'
    file_types  = set(['fb2'])

    recommendations = set([
        ('level1_toc', '//h:h1', OptionRecommendation.MED),
        ('level2_toc', '//h:h2', OptionRecommendation.MED),
        ('level3_toc', '//h:h3', OptionRecommendation.MED),
        ])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Convert the FB2 document in ``stream``.

        Embedded binaries are extracted, the document is transformed to
        XHTML with the ``fb2_xsl`` stylesheet and an OPF describing all
        files in the current directory is written.

        :return: Absolute path to the generated ``metadata.opf``.
        '''
        from calibre.resources import fb2_xsl
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.oeb.base import XLINK_NS
        NAMESPACES = {'f':FB2NS, 'l':XLINK_NS}

        log.debug('Parsing XML...')
        parser = etree.XMLParser(recover=True, no_network=True)
        doc = etree.parse(stream, parser)
        self.extract_embedded_content(doc)
        log.debug('Converting XML to HTML...')
        styledoc = etree.fromstring(fb2_xsl)

        transform = etree.XSLT(styledoc)
        result = transform(doc)
        # Close the file before it is listed and referenced by the OPF
        with open('index.xhtml', 'wb') as f:
            f.write(transform.tostring(result))
        stream.seek(0)
        mi = get_metadata(stream, 'fb2')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        opf = OPFCreator(os.getcwdu(), mi)
        entries = [(f, guess_type(f)[0]) for f in os.listdir('.')]
        opf.create_manifest(entries)
        opf.create_spine(['index.xhtml'])

        for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
            href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
            if href is not None:
                # The reference is a fragment naming the embedded binary
                if href.startswith('#'):
                    href = href[1:]
                opf.guide.set_cover(os.path.abspath(href))

        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.join(os.getcwd(), 'metadata.opf')

    def extract_embedded_content(self, doc):
        '''
        Write every top-level ``binary`` element of ``doc`` that has an
        ``id`` to a file named after that id in the current directory.
        '''
        for elem in doc.xpath('./*'):
            if 'binary' in elem.tag and 'id' in elem.attrib:
                # The id comes from the (untrusted) input file: keep only
                # its base name so that it cannot address a location
                # outside of the current directory.
                fname = os.path.basename(elem.attrib['id'])
                if not fname:
                    continue
                # An empty <binary/> has no text at all
                data = b64decode((elem.text or '').strip())
                with open(fname, 'wb') as f:
                    f.write(data)

View File

@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin
from calibre.ebooks.fb2.fb2ml import FB2MLizer
class FB2Output(OutputFormatPlugin):
    '''
    Output plugin that writes an OEB book as a single FB2 file.
    '''

    name = 'FB2 Output'
    author = 'John Schember'
    file_type = 'fb2'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Convert ``oeb_book`` to FB2 and write it to ``output_path``.

        :param output_path: Either a path, in which case missing parent
                            directories are created and the file is
                            opened and closed here, or a file-like object
                            which is rewound, truncated, written to and
                            left open.
        '''
        fb2mlizer = FB2MLizer(ignore_tables=opts.linearize_tables)
        fb2_content = fb2mlizer.extract_content(oeb_book, opts)

        close = False
        if not hasattr(output_path, 'write'):
            close = True
            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
                os.makedirs(os.path.dirname(output_path))
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        # Make sure a file opened here is closed even if writing fails
        try:
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(fb2_content)
        finally:
            if close:
                out_stream.close()

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,30 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from lxml.etree import tostring as _tostring
def tostring(root, strip_comments=False, pretty_print=False):
    '''
    Serialize processed XHTML.

    Namespace declaration attributes are set on ``root`` and on every
    ``svg`` element (the tree is modified in place), the tree is
    serialized as UTF-8 and an XML declaration is prepended.

    :param strip_comments: If True, XML comments are removed from the
                           serialized output.
    :param pretty_print: Passed on to the lxml serializer.
    :return: The serialized document.
    '''
    root.set('xmlns', 'http://www.w3.org/1999/xhtml')
    root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink')
    svg_nodes = [node for node in root.iter()
            if hasattr(node.tag, 'rpartition') and
            node.tag.rpartition('}')[-1].lower() == 'svg']
    for node in svg_nodes:
        node.set('xmlns', 'http://www.w3.org/2000/svg')

    body = _tostring(root, encoding='utf-8', pretty_print=pretty_print)
    if strip_comments:
        comment_pat = re.compile(r'<!--.*?-->', re.DOTALL)
        body = comment_pat.sub('', body)
    return '<?xml version="1.0" encoding="utf-8" ?>\n'+body

View File

@ -0,0 +1,300 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Input plugin for HTML or OPF ebooks.
'''
import os, re, sys
from urlparse import urlparse, urlunparse
from urllib import unquote
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.chardet import xml_to_unicode
from calibre.customize.conversion import OptionRecommendation
from calibre import unicode_path
class Link(object):
    '''
    Represents a link in a HTML file.

    Two links compare equal when they resolve to the same local path.
    '''

    @classmethod
    def url_to_local_path(cls, url, base):
        '''
        Convert the parsed ``url`` to an absolute, unquoted local path.
        Relative paths are resolved against ``base``; the fragment is
        dropped while params and query are kept.
        '''
        raw = urlunparse(('', '', url.path, url.params, url.query, ''))
        local = unquote(raw)
        if not os.path.isabs(local):
            local = os.path.abspath(os.path.join(base, local))
        return local

    def __init__(self, url, base):
        '''
        :param url:  The url this link points to. Must be an unquoted unicode string.
        :param base: The base directory that relative URLs are with respect to.
                     Must be a unicode string.
        '''
        assert isinstance(url, unicode) and isinstance(base, unicode)
        parsed = urlparse(url)
        local = parsed.scheme in ('', 'file')
        # A local link without a path only carries a fragment
        internal = local and not bool(parsed.path)

        self.url = url
        self.parsed_url = parsed
        self.is_local = local
        self.is_internal = internal
        self.fragment = unquote(parsed.fragment)
        # Only links to other local files get a path
        self.path = None
        if local and not internal:
            self.path = self.url_to_local_path(parsed, base)

    def __hash__(self):
        key = self.url if self.path is None else self.path
        return hash(key)

    def __eq__(self, other):
        # ``other`` may be another link-like object or a plain path
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return u'Link: %s --> %s'%(self.url, self.path)
class IgnoreFile(Exception):
    '''
    Raised for a linked file that should be left out of the traversal.

    ``errno`` is stored as given; ``doesnt_exist`` is True when it is 2.
    '''

    def __init__(self, msg, errno):
        Exception.__init__(self, msg)
        self.errno = errno
        self.doesnt_exist = (errno == 2)
class HTMLFile(object):
    '''
    Contains basic information about an HTML file. This
    includes a list of links to other files as well as
    the encoding of each file. Also tries to detect if the file is not a HTML
    file in which case :member:`is_binary` is set to True.

    The encoding of the file is available as :member:`encoding`.
    '''

    HTML_PAT  = re.compile(r'<\s*html', re.IGNORECASE)
    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
    LINK_PAT  = re.compile(
    r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
    re.DOTALL|re.IGNORECASE)

    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
        '''
        :param level:    The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML.
        :param referrer: The :class:`HTMLFile` that first refers to this file.
        '''
        self.path     = unicode_path(path_to_html_file, abs=True)
        # Fallback title: the file name without extension
        self.title    = os.path.splitext(os.path.basename(self.path))[0]
        self.base     = os.path.dirname(self.path)
        self.level    = level
        self.referrer = referrer
        self.links    = []

        try:
            with open(self.path, 'rb') as f:
                src = f.read()
        except IOError as err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
            # An unreadable root file is fatal, linked files are just skipped
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

        # Anything without an <html tag in its first KB is treated as binary
        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
        if self.is_binary:
            return

        if encoding is None:
            encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
        self.encoding = encoding

        src = src.decode(encoding, 'replace')
        match = self.TITLE_PAT.search(src)
        if match is not None:
            self.title = match.group(1)
        self.find_links(src)

    def __eq__(self, other):
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        kind = 'b' if self.is_binary else 'a'
        return u'HTMLFile:%d:%s:%s'%(self.level, kind, self.path)

    def __repr__(self):
        return str(self)

    def find_links(self, src):
        '''
        Append a link for every anchor href found in ``src`` to
        ``self.links``, skipping links equal to one already present.
        '''
        for match in self.LINK_PAT.finditer(src):
            # Exactly one of the three quoting variants can have matched
            url = match.group('url1') or match.group('url2') or match.group('url3')
            link = self.resolve(url)
            if link not in self.links:
                self.links.append(link)

    def resolve(self, url):
        return Link(url, self.base)
def depth_first(root, flat, visited=None):
    '''
    Yield ``root`` and the files reachable from it in depth first order.

    :param root: The file to start from; it is always yielded first.
    :param flat: List of all known files. Links are matched against it by
                 equality; links without a match are skipped.
    :param visited: Set of files already yielded. Callers normally omit
                    it; it is shared with the recursive calls.
    '''
    # A fresh set per top-level call. A set created once as the default
    # value would be shared by every call in the process, keeping all files
    # alive and leaking state from one traversal into the next.
    if visited is None:
        visited = set([])
    yield root
    visited.add(root)
    for link in root.links:
        if link.path is not None and link not in visited:
            try:
                index = flat.index(link)
            except ValueError: # Can happen if max_levels is used
                continue
            hf = flat[index]
            if hf not in visited:
                yield hf
                visited.add(hf)
                # The recursive generator yields its own root again, hence
                # the membership test before passing items on
                for hf in depth_first(hf, flat, visited):
                    if hf not in visited:
                        yield hf
                        visited.add(hf)
def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
    '''
    Recursively traverse all links in the HTML file.

    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
                       implies that no links in the root HTML file are followed.
    :param verbose: Passed to :class:`HTMLFile`; values above 1 also report
                    ignored files that do not exist.
    :param encoding: Specify character encoding of HTML files. If `None` it is
                     auto-detected.
    :return: A pair of lists (breadth_first, depth_first). Each list contains
             :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    level = 0
    flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
    pending = list(flat)
    while level < max_levels and pending:
        level += 1
        discovered = []
        for parent in pending:
            rejects = []
            for link in parent.links:
                if link.path is None or link.path in flat:
                    continue
                try:
                    child = HTMLFile(link.path, level, encoding, verbose,
                                     referrer=parent)
                    if child.is_binary:
                        raise IgnoreFile('%s is a binary file'%child.path, -1)
                    discovered.append(child)
                    flat.append(child)
                except IgnoreFile as err:
                    rejects.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print(repr(err))
            # Links that could not be loaded are dropped from the parent
            # only after its link list has been fully iterated.
            for link in rejects:
                parent.links.remove(link)
        pending = discovered

    # The depth first walk is recursive, so the interpreter limit is raised
    # for its duration and restored afterwards.
    saved_limit = sys.getrecursionlimit()
    sys.setrecursionlimit(500000)
    try:
        return flat, list(depth_first(flat[0], flat))
    finally:
        sys.setrecursionlimit(saved_limit)
def get_filelist(htmlfile, dir, opts, log):
    '''
    Build list of files referenced by html file or try to detect and use an
    OPF file instead.

    The traversal order is chosen by `opts.breadth_first`. `dir` is accepted
    but not used here.
    '''
    log.info('Building file list...')
    breadth_order, depth_order = traverse(htmlfile,
            max_levels=int(opts.max_levels),
            verbose=opts.verbose,
            encoding=opts.input_encoding)
    filelist = breadth_order if opts.breadth_first else depth_order
    if opts.verbose:
        log.debug('\tFound files...')
        for found in filelist:
            log.debug('\t\t', found)
    return filelist
class HTMLInput(InputFormatPlugin):
    '''
    Input plugin that accepts either an OPF file or an HTML file. For HTML
    the linked files are collected with `get_filelist` and an OPF describing
    them is generated first.
    '''

    name        = 'HTML Input'
    author      = 'Kovid Goyal'
    description = 'Convert HTML and OPF files to an OEB'
    file_types  = set(['opf', 'html', 'htm', 'xhtml', 'xhtm'])

    options = set([
        OptionRecommendation(name='breadth_first',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Traverse links in HTML files breadth first. Normally, '
                   'they are traversed depth first.'
                  )
        ),

        OptionRecommendation(name='max_levels',
            recommended_value=5, level=OptionRecommendation.LOW,
            help=_('Maximum levels of recursion when following links in '
                   'HTML files. Must be non-negative. 0 implies that no '
                   'links in the root HTML file are followed. Default is '
                   '%default.'
                  )
        ),

        OptionRecommendation(name='dont_package',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Normally this input plugin re-arranges all the input '
                'files into a standard folder hierarchy. Only use this option '
                'if you know what you are doing as it can result in various '
                'nasty side effects in the rest of of the conversion pipeline.'
                )
        ),
    ])

    def convert(self, stream, opts, file_ext, log,
                accelerators):
        '''
        Convert the OPF or HTML document in `stream`.

        :param stream: Open input file. Its `name` attribute is used as the
                       path of the document.
        :param opts: Conversion options; `input_encoding`, `dont_package`
                     and the options read by `get_filelist` are used.
        :param file_ext: Extension of the input; ``'opf'`` means `stream`
                         already is the OPF to use.
        :param log: Logger passed on to `get_filelist` and `create_oebbook`.
        :param accelerators: Not used by this plugin.
        :return: The path of the OPF file when `opts.dont_package` is set,
                 otherwise the object returned by `create_oebbook`.
        '''
        from calibre.ebooks.metadata.html import get_metadata_

        basedir = os.getcwd()
        self.opts = opts

        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)

        if file_ext == 'opf':
            opfpath = stream.name
        else:
            filelist = get_filelist(stream.name, basedir, opts, log)
            mi = get_metadata_(stream.read(), opts.input_encoding)
            mi = OPFCreator(os.getcwdu(), mi)
            mi.guide = None
            entries = [(f.path, 'application/xhtml+xml') for f in filelist]
            mi.create_manifest(entries)
            mi.create_spine([f.path for f in filelist])

            # The generated OPF is read back below, so make sure it is
            # flushed and closed deterministically instead of relying on
            # garbage collection of an anonymous file object.
            with open('metadata.opf', 'wb') as opf_file:
                mi.render(opf_file, encoding=opts.input_encoding)
            opfpath = os.path.abspath('metadata.opf')

        if opts.dont_package:
            return opfpath

        from calibre.ebooks.conversion.plumber import create_oebbook
        oeb = create_oebbook(log, opfpath, opts, self,
                encoding=opts.input_encoding)

        from calibre.ebooks.oeb.transforms.package import Package
        Package(os.getcwdu())(oeb, opts)

        return oeb

View File

@ -0,0 +1,312 @@
# -*- coding: utf-8 -*-
'''
Mapping of non-ASCII symbols and their corresponding HTML entity number and name
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
# http://www.w3schools.com/tags/ref_symbols.asp
# Maps a single character to the list of HTML references that denote it:
# the numeric reference first, followed by the named entity when one exists.
#
# Keys outside Latin-1 (and a few fragile ones such as the pilcrow and the
# no-break space) are written as escapes so that they cannot be silently
# dropped or confused with look-alike characters. Each escape is the code
# point given by the entry's own numeric reference.
HTML_SYMBOLS = {
    # Math Symbols
    u'\u2200' : ['&#8704;', '&forall;'], # for all
    u'\u2202' : ['&#8706;', '&part;'], # part
    u'\u2203' : ['&#8707;', '&exists;'], # exists
    u'\u2205' : ['&#8709;', '&empty;'], # empty
    u'\u2207' : ['&#8711;', '&nabla;'], # nabla
    u'\u2208' : ['&#8712;', '&isin;'], # isin
    u'\u2209' : ['&#8713;', '&notin;'], # notin
    u'\u220b' : ['&#8715;', '&ni;'], # ni
    u'\u220f' : ['&#8719;', '&prod;'], # prod
    u'\u2211' : ['&#8721;', '&sum;'], # sum
    u'\u2212' : ['&#8722;', '&minus;'], # minus
    u'\u2217' : ['&#8727;', '&lowast;'], # lowast
    u'\u221a' : ['&#8730;', '&radic;'], # square root
    u'\u221d' : ['&#8733;', '&prop;'], # proportional to
    u'\u221e' : ['&#8734;', '&infin;'], # infinity
    u'\u2220' : ['&#8736;', '&ang;'], # angle
    u'\u2227' : ['&#8743;', '&and;'], # and
    u'\u2228' : ['&#8744;', '&or;'], # or
    u'\u2229' : ['&#8745;', '&cap;'], # cap
    u'\u222a' : ['&#8746;', '&cup;'], # cup
    u'\u222b' : ['&#8747;', '&int;'], # integral
    u'\u2234' : ['&#8756;', '&there4;'], # therefore
    u'\u223c' : ['&#8764;', '&sim;'], # similar to
    u'\u2245' : ['&#8773;', '&cong;'], # approximately equal
    u'\u2248' : ['&#8776;', '&asymp;'], # almost equal
    u'\u2260' : ['&#8800;', '&ne;'], # not equal
    u'\u2261' : ['&#8801;', '&equiv;'], # equivalent
    u'\u2264' : ['&#8804;', '&le;'], # less or equal
    u'\u2265' : ['&#8805;', '&ge;'], # greater or equal
    u'\u2282' : ['&#8834;', '&sub;'], # subset of
    u'\u2283' : ['&#8835;', '&sup;'], # superset of
    u'\u2284' : ['&#8836;', '&nsub;'], # not subset of
    u'\u2286' : ['&#8838;', '&sube;'], # subset or equal
    u'\u2287' : ['&#8839;', '&supe;'], # superset or equal
    u'\u2295' : ['&#8853;', '&oplus;'], # circled plus
    u'\u2297' : ['&#8855;', '&otimes;'], # circled times
    u'\u22a5' : ['&#8869;', '&perp;'], # perpendicular
    u'\u22c5' : ['&#8901;', '&sdot;'], # dot operator
    # Greek Letters
    u'Α' : ['&#913;', '&Alpha;'], # Alpha
    u'Β' : ['&#914;', '&Beta;'], # Beta
    u'Γ' : ['&#915;', '&Gamma;'], # Gamma
    u'Δ' : ['&#916;', '&Delta;'], # Delta
    u'Ε' : ['&#917;', '&Epsilon;'], # Epsilon
    u'Ζ' : ['&#918;', '&Zeta;'], # Zeta
    u'Η' : ['&#919;', '&Eta;'], # Eta
    u'Θ' : ['&#920;', '&Theta;'], # Theta
    u'Ι' : ['&#921;', '&Iota;'], # Iota
    u'Κ' : ['&#922;', '&Kappa;'], # Kappa
    u'Λ' : ['&#923;', '&Lambda;'], # Lambda
    u'Μ' : ['&#924;', '&Mu;'], # Mu
    u'Ν' : ['&#925;', '&Nu;'], # Nu
    u'Ξ' : ['&#926;', '&Xi;'], # Xi
    u'Ο' : ['&#927;', '&Omicron;'], # Omicron
    u'Π' : ['&#928;', '&Pi;'], # Pi
    u'Ρ' : ['&#929;', '&Rho;'], # Rho
    u'Σ' : ['&#931;', '&Sigma;'], # Sigma
    u'Τ' : ['&#932;', '&Tau;'], # Tau
    u'Υ' : ['&#933;', '&Upsilon;'], # Upsilon
    u'Φ' : ['&#934;', '&Phi;'], # Phi
    u'Χ' : ['&#935;', '&Chi;'], # Chi
    u'Ψ' : ['&#936;', '&Psi;'], # Psi
    u'ω' : ['&#969;', '&omega;'], # omega
    u'ϑ' : ['&#977;', '&thetasym;'], # theta symbol
    u'ϒ' : ['&#978;', '&upsih;'], # upsilon symbol
    u'ϖ' : ['&#982;', '&piv;'], # pi symbol
    # Other
    u'Œ' : ['&#338;', '&OElig;'], # capital ligature OE
    u'œ' : ['&#339;', '&oelig;'], # small ligature oe
    u'Š' : ['&#352;', '&Scaron;'], # capital S with caron
    u'š' : ['&#353;', '&scaron;'], # small S with caron
    u'Ÿ' : ['&#376;', '&Yuml;'], # capital Y with diaeresis
    u'ƒ' : ['&#402;', '&fnof;'], # f with hook
    u'ˆ' : ['&#710;', '&circ;'], # modifier letter circumflex accent
    u'˜' : ['&#732;', '&tilde;'], # small tilde
    u'\u2013' : ['&#8211;', '&ndash;'], # en dash
    u'\u2014' : ['&#8212;', '&mdash;'], # em dash
    u'\u2018' : ['&#8216;', '&lsquo;'], # left single quotation mark
    u'\u2019' : ['&#8217;', '&rsquo;'], # right single quotation mark
    u'\u201a' : ['&#8218;', '&sbquo;'], # single low-9 quotation mark
    u'\u201c' : ['&#8220;', '&ldquo;'], # left double quotation mark
    u'\u201d' : ['&#8221;', '&rdquo;'], # right double quotation mark
    u'\u201e' : ['&#8222;', '&bdquo;'], # double low-9 quotation mark
    u'\u2020' : ['&#8224;', '&dagger;'], # dagger
    u'\u2021' : ['&#8225;', '&Dagger;'], # double dagger
    u'\u2022' : ['&#8226;', '&bull;'], # bullet
    u'\u2026' : ['&#8230;', '&hellip;'], # horizontal ellipsis
    u'\u2030' : ['&#8240;', '&permil;'], # per mille
    u'\u2032' : ['&#8242;', '&prime;'], # minutes
    u'\u2033' : ['&#8243;', '&Prime;'], # seconds
    u'\u2039' : ['&#8249;', '&lsaquo;'], # single left angle quotation
    u'\u203a' : ['&#8250;', '&rsaquo;'], # single right angle quotation
    u'\u203e' : ['&#8254;', '&oline;'], # overline
    u'\u20ac' : ['&#8364;', '&euro;'], # euro
    u'\u2122' : ['&#8482;', '&trade;'], # trademark
    u'\u2190' : ['&#8592;', '&larr;'], # left arrow
    u'\u2191' : ['&#8593;', '&uarr;'], # up arrow
    u'\u2192' : ['&#8594;', '&rarr;'], # right arrow
    u'\u2193' : ['&#8595;', '&darr;'], # down arrow
    u'\u2194' : ['&#8596;', '&harr;'], # left right arrow
    u'\u21b5' : ['&#8629;', '&crarr;'], # carriage return arrow
    u'\u2308' : ['&#8968;', '&lceil;'], # left ceiling
    u'\u2309' : ['&#8969;', '&rceil;'], # right ceiling
    u'\u230a' : ['&#8970;', '&lfloor;'], # left floor
    u'\u230b' : ['&#8971;', '&rfloor;'], # right floor
    u'\u25ca' : ['&#9674;', '&loz;'], # lozenge
    u'\u2660' : ['&#9824;', '&spades;'], # spade
    u'\u2663' : ['&#9827;', '&clubs;'], # club
    u'\u2665' : ['&#9829;', '&hearts;'], # heart
    u'\u2666' : ['&#9830;', '&diams;'], # diamond
    # Extra http://www.ascii.cl/htmlcodes.htm
    u' ' : ['&#32;'], # space
    u'!' : ['&#33;'], # exclamation point
    u'#' : ['&#35;'], # number sign
    u'$' : ['&#36;'], # dollar sign
    u'%' : ['&#37;'], # percent sign
    u'\'' : ['&#39;'], # single quote
    u'(' : ['&#40;'], # opening parenthesis
    u')' : ['&#41;'], # closing parenthesis
    u'*' : ['&#42;'], # asterisk
    u'+' : ['&#43;'], # plus sign
    u',' : ['&#44;'], # comma
    u'-' : ['&#45;'], # minus sign - hyphen
    u'.' : ['&#46;'], # period
    u'/' : ['&#47;'], # slash
    u'0' : ['&#48;'], # zero
    u'1' : ['&#49;'], # one
    u'2' : ['&#50;'], # two
    u'3' : ['&#51;'], # three
    u'4' : ['&#52;'], # four
    u'5' : ['&#53;'], # five
    u'6' : ['&#54;'], # six
    u'7' : ['&#55;'], # seven
    u'8' : ['&#56;'], # eight
    u'9' : ['&#57;'], # nine
    u':' : ['&#58;'], # colon
    u';' : ['&#59;'], # semicolon
    u'=' : ['&#61;'], # equal sign
    u'?' : ['&#63;'], # question mark
    u'@' : ['&#64;'], # at symbol
    u'A' : ['&#65;'], #
    u'B' : ['&#66;'], #
    u'C' : ['&#67;'], #
    u'D' : ['&#68;'], #
    u'E' : ['&#69;'], #
    u'F' : ['&#70;'], #
    u'G' : ['&#71;'], #
    u'H' : ['&#72;'], #
    u'I' : ['&#73;'], #
    u'J' : ['&#74;'], #
    u'K' : ['&#75;'], #
    u'L' : ['&#76;'], #
    u'M' : ['&#77;'], #
    u'N' : ['&#78;'], #
    u'O' : ['&#79;'], #
    u'P' : ['&#80;'], #
    u'Q' : ['&#81;'], #
    u'R' : ['&#82;'], #
    u'S' : ['&#83;'], #
    u'T' : ['&#84;'], #
    u'U' : ['&#85;'], #
    u'V' : ['&#86;'], #
    u'W' : ['&#87;'], #
    u'X' : ['&#88;'], #
    u'Y' : ['&#89;'], #
    u'Z' : ['&#90;'], #
    u'[' : ['&#91;'], # opening bracket
    u'\\' : ['&#92;'], # backslash
    u']' : ['&#93;'], # closing bracket
    u'^' : ['&#94;'], # caret - circumflex
    u'_' : ['&#95;'], # underscore
    u'`' : ['&#96;'], # grave accent
    u'a' : ['&#97;'], #
    u'b' : ['&#98;'], #
    u'c' : ['&#99;'], #
    u'd' : ['&#100;'], #
    u'e' : ['&#101;'], #
    u'f' : ['&#102;'], #
    u'g' : ['&#103;'], #
    u'h' : ['&#104;'], #
    u'i' : ['&#105;'], #
    u'j' : ['&#106;'], #
    u'k' : ['&#107;'], #
    u'l' : ['&#108;'], #
    u'm' : ['&#109;'], #
    u'n' : ['&#110;'], #
    u'o' : ['&#111;'], #
    u'p' : ['&#112;'], #
    u'q' : ['&#113;'], #
    u'r' : ['&#114;'], #
    u's' : ['&#115;'], #
    u't' : ['&#116;'], #
    u'u' : ['&#117;'], #
    u'v' : ['&#118;'], #
    u'w' : ['&#119;'], #
    u'x' : ['&#120;'], #
    u'y' : ['&#121;'], #
    u'z' : ['&#122;'], #
    u'{' : ['&#123;'], # opening brace
    u'|' : ['&#124;'], # vertical bar
    u'}' : ['&#125;'], # closing brace
    u'~' : ['&#126;'], # equivalency sign - tilde
    u'<' : ['&#60;', '&lt;'], # less than sign
    u'>' : ['&#62;', '&gt;'], # greater than sign
    u'¡' : ['&#161;', '&iexcl;'], # inverted exclamation mark
    u'¢' : ['&#162;', '&cent;'], # cent sign
    u'£' : ['&#163;', '&pound;'], # pound sign
    u'¤' : ['&#164;', '&curren;'], # currency sign
    u'¥' : ['&#165;', '&yen;'], # yen sign
    u'¦' : ['&#166;', '&brvbar;'], # broken vertical bar
    u'§' : ['&#167;', '&sect;'], # section sign
    u'¨' : ['&#168;', '&uml;'], # spacing diaeresis - umlaut
    u'©' : ['&#169;', '&copy;'], # copyright sign
    u'ª' : ['&#170;', '&ordf;'], # feminine ordinal indicator
    u'«' : ['&#171;', '&laquo;'], # left double angle quotes
    u'¬' : ['&#172;', '&not;'], # not sign
    u'®' : ['&#174;', '&reg;'], # registered trade mark sign
    u'¯' : ['&#175;', '&macr;'], # spacing macron - overline
    u'°' : ['&#176;', '&deg;'], # degree sign
    u'±' : ['&#177;', '&plusmn;'], # plus-or-minus sign
    u'²' : ['&#178;', '&sup2;'], # superscript two - squared
    u'³' : ['&#179;', '&sup3;'], # superscript three - cubed
    u'´' : ['&#180;', '&acute;'], # acute accent - spacing acute
    u'µ' : ['&#181;', '&micro;'], # micro sign
    u'\xb6' : ['&#182;', '&para;'], # pilcrow sign - paragraph sign
    u'·' : ['&#183;', '&middot;'], # middle dot - Georgian comma
    u'¸' : ['&#184;', '&cedil;'], # spacing cedilla
    u'¹' : ['&#185;', '&sup1;'], # superscript one
    u'º' : ['&#186;', '&ordm;'], # masculine ordinal indicator
    u'»' : ['&#187;', '&raquo;'], # right double angle quotes
    u'¼' : ['&#188;', '&frac14;'], # fraction one quarter
    u'½' : ['&#189;', '&frac12;'], # fraction one half
    u'¾' : ['&#190;', '&frac34;'], # fraction three quarters
    u'¿' : ['&#191;', '&iquest;'], # inverted question mark
    u'À' : ['&#192;', '&Agrave;'], # latin capital letter A with grave
    u'Á' : ['&#193;', '&Aacute;'], # latin capital letter A with acute
    u'Â' : ['&#194;', '&Acirc;'], # latin capital letter A with circumflex
    u'Ã' : ['&#195;', '&Atilde;'], # latin capital letter A with tilde
    u'Ä' : ['&#196;', '&Auml;'], # latin capital letter A with diaeresis
    u'Å' : ['&#197;', '&Aring;'], # latin capital letter A with ring above
    u'Æ' : ['&#198;', '&AElig;'], # latin capital letter AE
    u'Ç' : ['&#199;', '&Ccedil;'], # latin capital letter C with cedilla
    u'È' : ['&#200;', '&Egrave;'], # latin capital letter E with grave
    u'É' : ['&#201;', '&Eacute;'], # latin capital letter E with acute
    u'Ê' : ['&#202;', '&Ecirc;'], # latin capital letter E with circumflex
    u'Ë' : ['&#203;', '&Euml;'], # latin capital letter E with diaeresis
    u'Ì' : ['&#204;', '&Igrave;'], # latin capital letter I with grave
    u'Í' : ['&#205;', '&Iacute;'], # latin capital letter I with acute
    u'Î' : ['&#206;', '&Icirc;'], # latin capital letter I with circumflex
    u'Ï' : ['&#207;', '&Iuml;'], # latin capital letter I with diaeresis
    u'Ð' : ['&#208;', '&ETH;'], # latin capital letter ETH
    u'Ñ' : ['&#209;', '&Ntilde;'], # latin capital letter N with tilde
    u'Ò' : ['&#210;', '&Ograve;'], # latin capital letter O with grave
    u'Ó' : ['&#211;', '&Oacute;'], # latin capital letter O with acute
    u'Ô' : ['&#212;', '&Ocirc;'], # latin capital letter O with circumflex
    u'Õ' : ['&#213;', '&Otilde;'], # latin capital letter O with tilde
    u'Ö' : ['&#214;', '&Ouml;'], # latin capital letter O with diaeresis
    u'×' : ['&#215;', '&times;'], # multiplication sign
    u'Ø' : ['&#216;', '&Oslash;'], # latin capital letter O with slash
    u'Ù' : ['&#217;', '&Ugrave;'], # latin capital letter U with grave
    u'Ú' : ['&#218;', '&Uacute;'], # latin capital letter U with acute
    u'Û' : ['&#219;', '&Ucirc;'], # latin capital letter U with circumflex
    u'Ü' : ['&#220;', '&Uuml;'], # latin capital letter U with diaeresis
    u'Ý' : ['&#221;', '&Yacute;'], # latin capital letter Y with acute
    u'Þ' : ['&#222;', '&THORN;'], # latin capital letter THORN
    u'ß' : ['&#223;', '&szlig;'], # latin small letter sharp s - ess-zed
    u'à' : ['&#224;', '&agrave;'], # latin small letter a with grave
    u'á' : ['&#225;', '&aacute;'], # latin small letter a with acute
    u'â' : ['&#226;', '&acirc;'], # latin small letter a with circumflex
    u'ã' : ['&#227;', '&atilde;'], # latin small letter a with tilde
    u'ä' : ['&#228;', '&auml;'], # latin small letter a with diaeresis
    u'å' : ['&#229;', '&aring;'], # latin small letter a with ring above
    u'æ' : ['&#230;', '&aelig;'], # latin small letter ae
    u'ç' : ['&#231;', '&ccedil;'], # latin small letter c with cedilla
    u'è' : ['&#232;', '&egrave;'], # latin small letter e with grave
    u'é' : ['&#233;', '&eacute;'], # latin small letter e with acute
    u'ê' : ['&#234;', '&ecirc;'], # latin small letter e with circumflex
    u'ë' : ['&#235;', '&euml;'], # latin small letter e with diaeresis
    u'ì' : ['&#236;', '&igrave;'], # latin small letter i with grave
    u'í' : ['&#237;', '&iacute;'], # latin small letter i with acute
    u'î' : ['&#238;', '&icirc;'], # latin small letter i with circumflex
    u'ï' : ['&#239;', '&iuml;'], # latin small letter i with diaeresis
    u'ð' : ['&#240;', '&eth;'], # latin small letter eth
    u'ñ' : ['&#241;', '&ntilde;'], # latin small letter n with tilde
    u'ò' : ['&#242;', '&ograve;'], # latin small letter o with grave
    u'ó' : ['&#243;', '&oacute;'], # latin small letter o with acute
    u'ô' : ['&#244;', '&ocirc;'], # latin small letter o with circumflex
    u'õ' : ['&#245;', '&otilde;'], # latin small letter o with tilde
    u'ö' : ['&#246;', '&ouml;'], # latin small letter o with diaeresis
    u'÷' : ['&#247;', '&divide;'], # division sign
    u'ø' : ['&#248;', '&oslash;'], # latin small letter o with slash
    u'ù' : ['&#249;', '&ugrave;'], # latin small letter u with grave
    u'ú' : ['&#250;', '&uacute;'], # latin small letter u with acute
    u'û' : ['&#251;', '&ucirc;'], # latin small letter u with circumflex
    u'ü' : ['&#252;', '&uuml;'], # latin small letter u with diaeresis
    u'ý' : ['&#253;', '&yacute;'], # latin small letter y with acute
    u'þ' : ['&#254;', '&thorn;'], # latin small letter thorn
    u'ÿ' : ['&#255;', '&yuml;'], # latin small letter y with diaeresis
    # More
    u'\xa0' : ['&#160;'], # no-break space
}

View File

@ -0,0 +1,24 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
class LITInput(InputFormatPlugin):
    '''Input plugin for LIT files; reading is delegated to `LitReader`.'''

    name        = 'LIT Input'
    author      = 'Marshall T. Vandegrift'
    description = 'Convert LIT files to HTML'
    file_types  = set(['lit'])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Return the result of `create_oebbook` for `stream`, using
        `LitReader` as the reader. `file_ext` and `accelerators` are not
        used.
        '''
        from calibre.ebooks.lit.reader import LitReader
        from calibre.ebooks.conversion.plumber import create_oebbook
        book = create_oebbook(log, stream, options, self, reader=LitReader)
        return book

View File

@ -0,0 +1,45 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
class LITOutput(OutputFormatPlugin):
    '''Output plugin that writes the book as a LIT file via `LitWriter`.'''

    name = 'LIT Output'
    author = 'Marshall T. Vandegrift'
    file_type = 'lit'

    recommendations = set([
        ('dont_split_on_page_breaks', False, OptionRecommendation.HIGH),
    ])

    def convert(self, oeb, output_path, input_plugin, opts, log):
        '''
        Run the split, HTML TOC, case mangling and SVG rasterizing
        transforms on `oeb`, in that order, then write it to `output_path`.
        `input_plugin` is not used.
        '''
        self.log = log
        self.opts = opts
        self.oeb = oeb
        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
        from calibre.ebooks.lit.writer import LitWriter
        from calibre.ebooks.oeb.transforms.split import Split

        splitter = Split(not opts.dont_split_on_page_breaks, max_flow_size=0)
        splitter(oeb, opts)

        # Each remaining transform is instantiated without arguments and
        # applied to the book right away.
        for transform in (HTMLTOCAdder, CaseMangler, SVGRasterizer):
            transform()(oeb, opts)

        writer = LitWriter()
        writer(oeb, output_path)

View File

@ -7,21 +7,24 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
'and Marshall T. Vandegrift <llasram@gmail.com>'
import sys, struct, cStringIO, os
import struct, os
import functools
import re
from urlparse import urldefrag
from cStringIO import StringIO
from urllib import unquote as urlunquote
from lxml import etree
from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
import calibre.ebooks.lit.mssha1 as mssha1
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks import DRMError
from calibre import plugins
lzx, lxzerror = plugins['lzx']
msdes, msdeserror = plugins['msdes']
__all__ = ["LitReader"]
XML_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
"""
OPF_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
@ -109,6 +112,9 @@ def consume_sized_utf8_string(bytes, zpad=False):
pos += 1
return u''.join(result), bytes[pos:]
def encode(string):
    '''
    Return `string` converted to unicode and encoded as ASCII, with every
    non-ASCII character replaced by an XML numeric character reference.
    '''
    text = unicode(string)
    return text.encode('ascii', 'xmlcharrefreplace')
class UnBinary(object):
AMPERSAND_RE = re.compile(
r'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9.-_:]+);)')
@ -120,14 +126,14 @@ class UnBinary(object):
def __init__(self, bin, path, manifest={}, map=HTML_MAP, atoms=EMPTY_ATOMS):
self.manifest = manifest
self.tag_map, self.attr_map, self.tag_to_attr_map = map
self.is_html = map is HTML_MAP
self.tag_atoms, self.attr_atoms = atoms
self.opf = map is OPF_MAP
self.bin = bin
self.dir = os.path.dirname(path)
self.buf = cStringIO.StringIO()
self.binary_to_text()
self.raw = self.buf.getvalue().lstrip().decode('utf-8')
buf = StringIO()
self.binary_to_text(bin, buf)
self.raw = buf.getvalue().lstrip()
self.escape_reserved()
self._tree = None
def escape_reserved(self):
raw = self.raw
@ -154,18 +160,20 @@ class UnBinary(object):
return '/'.join(relpath)
def __unicode__(self):
return self.raw.decode('utf-8')
def __str__(self):
return self.raw
def binary_to_text(self, base=0, depth=0):
def binary_to_text(self, bin, buf, index=0, depth=0):
tag_name = current_map = None
dynamic_tag = errors = 0
in_censorship = is_goingdown = False
state = 'text'
index = base
flags = 0
while index < len(self.bin):
c, index = read_utf8_char(self.bin, index)
while index < len(bin):
c, index = read_utf8_char(bin, index)
oc = ord(c)
if state == 'text':
@ -178,7 +186,7 @@ class UnBinary(object):
c = '>>'
elif c == '<':
c = '<<'
self.buf.write(c.encode('ascii', 'xmlcharrefreplace'))
buf.write(encode(c))
elif state == 'get flags':
if oc == 0:
@ -191,7 +199,7 @@ class UnBinary(object):
state = 'text' if oc == 0 else 'get attr'
if flags & FLAG_OPENING:
tag = oc
self.buf.write('<')
buf.write('<')
if not (flags & FLAG_CLOSING):
is_goingdown = True
if tag == 0x8000:
@ -199,7 +207,8 @@ class UnBinary(object):
continue
if flags & FLAG_ATOM:
if not self.tag_atoms or tag not in self.tag_atoms:
raise LitError("atom tag %d not in atom tag list" % tag)
raise LitError(
"atom tag %d not in atom tag list" % tag)
tag_name = self.tag_atoms[tag]
current_map = self.attr_atoms
elif tag < len(self.tag_map):
@ -211,7 +220,7 @@ class UnBinary(object):
tag_name = '?'+unichr(tag)+'?'
current_map = self.tag_to_attr_map[tag]
print 'WARNING: tag %s unknown' % unichr(tag)
self.buf.write(unicode(tag_name).encode('utf-8'))
buf.write(encode(tag_name))
elif flags & FLAG_CLOSING:
if depth == 0:
raise LitError('Extra closing tag')
@ -223,15 +232,14 @@ class UnBinary(object):
if not is_goingdown:
tag_name = None
dynamic_tag = 0
self.buf.write(' />')
buf.write(' />')
else:
self.buf.write('>')
index = self.binary_to_text(base=index, depth=depth+1)
buf.write('>')
index = self.binary_to_text(bin, buf, index, depth+1)
is_goingdown = False
if not tag_name:
raise LitError('Tag ends before it begins.')
self.buf.write(u''.join(
('</', tag_name, '>')).encode('utf-8'))
buf.write(encode(u''.join(('</', tag_name, '>'))))
dynamic_tag = 0
tag_name = None
state = 'text'
@ -251,7 +259,7 @@ class UnBinary(object):
in_censorship = True
state = 'get value length'
continue
self.buf.write(' ' + unicode(attr).encode('utf-8') + '=')
buf.write(' ' + encode(attr) + '=')
if attr in ['href', 'src']:
state = 'get href length'
else:
@ -259,24 +267,24 @@ class UnBinary(object):
elif state == 'get value length':
if not in_censorship:
self.buf.write('"')
buf.write('"')
count = oc - 1
if count == 0:
if not in_censorship:
self.buf.write('"')
buf.write('"')
in_censorship = False
state = 'get attr'
continue
state = 'get value'
if oc == 0xffff:
continue
if count < 0 or count > (len(self.bin) - index):
if count < 0 or count > (len(bin) - index):
raise LitError('Invalid character count %d' % count)
elif state == 'get value':
if count == 0xfffe:
if not in_censorship:
self.buf.write('%s"' % (oc - 1))
buf.write('%s"' % (oc - 1))
in_censorship = False
state = 'get attr'
elif count > 0:
@ -285,17 +293,17 @@ class UnBinary(object):
c = '&quot;'
elif c == '<':
c = '&lt;'
self.buf.write(c.encode('ascii', 'xmlcharrefreplace'))
buf.write(c.encode('ascii', 'xmlcharrefreplace'))
count -= 1
if count == 0:
if not in_censorship:
self.buf.write('"')
buf.write('"')
in_censorship = False
state = 'get attr'
elif state == 'get custom length':
count = oc - 1
if count <= 0 or count > len(self.bin)-index:
if count <= 0 or count > len(bin)-index:
raise LitError('Invalid character count %d' % count)
dynamic_tag += 1
state = 'get custom'
@ -305,26 +313,26 @@ class UnBinary(object):
tag_name += c
count -= 1
if count == 0:
self.buf.write(unicode(tag_name).encode('utf-8'))
buf.write(encode(tag_name))
state = 'get attr'
elif state == 'get attr length':
count = oc - 1
if count <= 0 or count > (len(self.bin) - index):
if count <= 0 or count > (len(bin) - index):
raise LitError('Invalid character count %d' % count)
self.buf.write(' ')
buf.write(' ')
state = 'get custom attr'
elif state == 'get custom attr':
self.buf.write(unicode(c).encode('utf-8'))
buf.write(encode(c))
count -= 1
if count == 0:
self.buf.write('=')
buf.write('=')
state = 'get value length'
elif state == 'get href length':
count = oc - 1
if count <= 0 or count > (len(self.bin) - index):
if count <= 0 or count > (len(bin) - index):
raise LitError('Invalid character count %d' % count)
href = ''
state = 'get href'
@ -338,10 +346,11 @@ class UnBinary(object):
if frag:
path = '#'.join((path, frag))
path = urlnormalize(path)
self.buf.write((u'"%s"' % path).encode('utf-8'))
buf.write(encode(u'"%s"' % path))
state = 'get attr'
return index
class DirectoryEntry(object):
def __init__(self, name, section, offset, size):
self.name = name
@ -356,6 +365,7 @@ class DirectoryEntry(object):
def __str__(self):
return repr(self)
class ManifestItem(object):
def __init__(self, original, internal, mime_type, offset, root, state):
self.original = original
@ -383,65 +393,87 @@ class ManifestItem(object):
% (self.internal, self.path, self.mime_type, self.offset,
self.root, self.state)
def preserve(function):
def wrapper(self, *args, **kwargs):
opos = self._stream.tell()
opos = self.stream.tell()
try:
return function(self, *args, **kwargs)
finally:
self._stream.seek(opos)
self.stream.seek(opos)
functools.update_wrapper(wrapper, function)
return wrapper
class LitReader(object):
class LitFile(object):
PIECE_SIZE = 16
XML_PARSER = etree.XMLParser(
recover=True, resolve_entities=False)
def __init__(self, filename_or_stream):
if hasattr(filename_or_stream, 'read'):
self.stream = filename_or_stream
else:
self.stream = open(filename_or_stream, 'rb')
try:
self.opf_path = os.path.splitext(
os.path.basename(self.stream.name))[0] + '.opf'
except AttributeError:
self.opf_path = 'content.opf'
if self.magic != 'ITOLITLS':
raise LitError('Not a valid LIT file')
if self.version != 1:
raise LitError('Unknown LIT version %d' % (self.version,))
self.read_secondary_header()
self.read_header_pieces()
self.read_section_names()
self.read_manifest()
self.read_drm()
def warn(self, msg):
print "WARNING: %s" % (msg,)
def magic():
@preserve
def fget(self):
self._stream.seek(0)
return self._stream.read(8)
self.stream.seek(0)
return self.stream.read(8)
return property(fget=fget)
magic = magic()
def version():
def fget(self):
self._stream.seek(8)
return u32(self._stream.read(4))
self.stream.seek(8)
return u32(self.stream.read(4))
return property(fget=fget)
version = version()
def hdr_len():
@preserve
def fget(self):
self._stream.seek(12)
return int32(self._stream.read(4))
self.stream.seek(12)
return int32(self.stream.read(4))
return property(fget=fget)
hdr_len = hdr_len()
def num_pieces():
@preserve
def fget(self):
self._stream.seek(16)
return int32(self._stream.read(4))
self.stream.seek(16)
return int32(self.stream.read(4))
return property(fget=fget)
num_pieces = num_pieces()
def sec_hdr_len():
@preserve
def fget(self):
self._stream.seek(20)
return int32(self._stream.read(4))
self.stream.seek(20)
return int32(self.stream.read(4))
return property(fget=fget)
sec_hdr_len = sec_hdr_len()
def guid():
@preserve
def fget(self):
self._stream.seek(24)
return self._stream.read(16)
self.stream.seek(24)
return self.stream.read(16)
return property(fget=fget)
guid = guid()
@ -451,44 +483,27 @@ class LitReader(object):
size = self.hdr_len \
+ (self.num_pieces * self.PIECE_SIZE) \
+ self.sec_hdr_len
self._stream.seek(0)
return self._stream.read(size)
self.stream.seek(0)
return self.stream.read(size)
return property(fget=fget)
header = header()
def __init__(self, filename_or_stream):
if hasattr(filename_or_stream, 'read'):
self._stream = filename_or_stream
else:
self._stream = open(filename_or_stream, 'rb')
if self.magic != 'ITOLITLS':
raise LitError('Not a valid LIT file')
if self.version != 1:
raise LitError('Unknown LIT version %d' % (self.version,))
self.entries = {}
self._read_secondary_header()
self._read_header_pieces()
self._read_section_names()
self._read_manifest()
self._read_meta()
self._read_drm()
@preserve
def __len__(self):
self._stream.seek(0, 2)
return self._stream.tell()
self.stream.seek(0, 2)
return self.stream.tell()
@preserve
def _read_raw(self, offset, size):
self._stream.seek(offset)
return self._stream.read(size)
def read_raw(self, offset, size):
self.stream.seek(offset)
return self.stream.read(size)
def _read_content(self, offset, size):
return self._read_raw(self.content_offset + offset, size)
def read_content(self, offset, size):
return self.read_raw(self.content_offset + offset, size)
def _read_secondary_header(self):
def read_secondary_header(self):
offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE)
bytes = self._read_raw(offset, self.sec_hdr_len)
bytes = self.read_raw(offset, self.sec_hdr_len)
offset = int32(bytes[4:])
while offset < len(bytes):
blocktype = bytes[offset:offset+4]
@ -516,21 +531,21 @@ class LitReader(object):
if not hasattr(self, 'content_offset'):
raise LitError('Could not figure out the content offset')
def _read_header_pieces(self):
def read_header_pieces(self):
src = self.header[self.hdr_len:]
for i in xrange(self.num_pieces):
piece = src[i * self.PIECE_SIZE:(i + 1) * self.PIECE_SIZE]
if u32(piece[4:]) != 0 or u32(piece[12:]) != 0:
raise LitError('Piece %s has 64bit value' % repr(piece))
offset, size = u32(piece), int32(piece[8:])
piece = self._read_raw(offset, size)
piece = self.read_raw(offset, size)
if i == 0:
continue # Dont need this piece
elif i == 1:
if u32(piece[8:]) != self.entry_chunklen or \
u32(piece[12:]) != self.entry_unknown:
raise LitError('Secondary header does not match piece')
self._read_directory(piece)
self.read_directory(piece)
elif i == 2:
if u32(piece[8:]) != self.count_chunklen or \
u32(piece[12:]) != self.count_unknown:
@ -541,12 +556,13 @@ class LitReader(object):
elif i == 4:
self.piece4_guid = piece
def _read_directory(self, piece):
def read_directory(self, piece):
if not piece.startswith('IFCM'):
raise LitError('Header piece #1 is not main directory.')
chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28])
if (32 + (num_chunks * chunk_size)) != len(piece):
raise LitError('IFCM HEADER has incorrect length')
raise LitError('IFCM header has incorrect length')
self.entries = {}
for i in xrange(num_chunks):
offset = 32 + (i * chunk_size)
chunk = piece[offset:offset + chunk_size]
@ -580,17 +596,17 @@ class LitReader(object):
entry = DirectoryEntry(name, section, offset, size)
self.entries[name] = entry
def _read_section_names(self):
def read_section_names(self):
if '::DataSpace/NameList' not in self.entries:
raise LitError('Lit file does not have a valid NameList')
raw = self.get_file('::DataSpace/NameList')
if len(raw) < 4:
raise LitError('Invalid Namelist section')
pos = 4
self.num_sections = u16(raw[2:pos])
self.section_names = [""]*self.num_sections
self.section_data = [None]*self.num_sections
for section in xrange(self.num_sections):
num_sections = u16(raw[2:pos])
self.section_names = [""] * num_sections
self.section_data = [None] * num_sections
for section in xrange(num_sections):
size = u16(raw[pos:pos+2])
pos += 2
size = size*2 + 2
@ -600,11 +616,12 @@ class LitReader(object):
raw[pos:pos+size].decode('utf-16-le').rstrip('\000')
pos += size
def _read_manifest(self):
def read_manifest(self):
if '/manifest' not in self.entries:
raise LitError('Lit file does not have a valid manifest')
raw = self.get_file('/manifest')
self.manifest = {}
self.paths = {self.opf_path: None}
while raw:
slen, raw = ord(raw[0]), raw[1:]
if slen == 0: break
@ -645,28 +662,9 @@ class LitReader(object):
for item in mlist:
if item.path[0] == '/':
item.path = os.path.basename(item.path)
self.paths[item.path] = item
def _pretty_print(self, xml):
f = cStringIO.StringIO(xml.encode('utf-8'))
doc = etree.parse(f, parser=self.XML_PARSER)
pretty = etree.tostring(doc, encoding='ascii', pretty_print=True)
return XML_DECL + unicode(pretty)
def _read_meta(self):
path = 'content.opf'
raw = self.get_file('/meta')
xml = OPF_DECL
try:
xml += unicode(UnBinary(raw, path, self.manifest, OPF_MAP))
except LitError:
if 'PENGUIN group' not in raw: raise
print "WARNING: attempting PENGUIN malformed OPF fix"
raw = raw.replace(
'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1)
xml += unicode(UnBinary(raw, path, self.manifest, OPF_MAP))
self.meta = xml
def _read_drm(self):
def read_drm(self):
self.drmlevel = 0
if '/DRMStorage/Licenses/EUL' in self.entries:
self.drmlevel = 5
@ -677,7 +675,7 @@ class LitReader(object):
else:
return
if self.drmlevel < 5:
msdes.deskey(self._calculate_deskey(), msdes.DE1)
msdes.deskey(self.calculate_deskey(), msdes.DE1)
bookkey = msdes.des(self.get_file('/DRMStorage/DRMSealed'))
if bookkey[0] != '\000':
raise LitError('Unable to decrypt title key!')
@ -685,7 +683,7 @@ class LitReader(object):
else:
raise DRMError("Cannot access DRM-protected book")
def _calculate_deskey(self):
def calculate_deskey(self):
hashfiles = ['/meta', '/DRMStorage/DRMSource']
if self.drmlevel == 3:
hashfiles.append('/DRMStorage/DRMBookplate')
@ -709,18 +707,18 @@ class LitReader(object):
def get_file(self, name):
entry = self.entries[name]
if entry.section == 0:
return self._read_content(entry.offset, entry.size)
return self.read_content(entry.offset, entry.size)
section = self.get_section(entry.section)
return section[entry.offset:entry.offset+entry.size]
def get_section(self, section):
data = self.section_data[section]
if not data:
data = self._get_section(section)
data = self.get_section_uncached(section)
self.section_data[section] = data
return data
def _get_section(self, section):
def get_section_uncached(self, section):
name = self.section_names[section]
path = '::DataSpace/Storage/' + name
transform = self.get_file(path + '/Transform/List')
@ -732,29 +730,29 @@ class LitReader(object):
raise LitError("ControlData is too short")
guid = msguid(transform)
if guid == DESENCRYPT_GUID:
content = self._decrypt(content)
content = self.decrypt(content)
control = control[csize:]
elif guid == LZXCOMPRESS_GUID:
reset_table = self.get_file(
'/'.join(('::DataSpace/Storage', name, 'Transform',
LZXCOMPRESS_GUID, 'InstanceData/ResetTable')))
content = self._decompress(content, control, reset_table)
content = self.decompress(content, control, reset_table)
control = control[csize:]
else:
raise LitError("Unrecognized transform: %s." % repr(guid))
transform = transform[16:]
return content
def _decrypt(self, content):
def decrypt(self, content):
length = len(content)
extra = length & 0x7
if extra > 0:
self._warn("content length not a multiple of block size")
self.warn("content length not a multiple of block size")
content += "\0" * (8 - extra)
msdes.deskey(self.bookkey, msdes.DE1)
return msdes.des(content)
def _decompress(self, content, control, reset_table):
def decompress(self, content, control, reset_table):
if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG+4] != "LZXC":
raise LitError("Invalid ControlData tag value")
if len(reset_table) < (RESET_INTERVAL + 8):
@ -795,7 +793,7 @@ class LitReader(object):
result.append(
lzx.decompress(content[base:size], window_bytes))
except lzx.LZXError:
self._warn("LZX decompression error; skipping chunk")
self.warn("LZX decompression error; skipping chunk")
bytes_remaining -= window_bytes
base = size
accum += int32(reset_table[RESET_INTERVAL:])
@ -805,7 +803,7 @@ class LitReader(object):
try:
result.append(lzx.decompress(content[base:], bytes_remaining))
except lzx.LZXError:
self._warn("LZX decompression error; skipping chunk")
self.warn("LZX decompression error; skipping chunk")
bytes_remaining = 0
if bytes_remaining > 0:
raise LitError("Failed to completely decompress section")
@ -842,75 +840,56 @@ class LitReader(object):
self._warn("damaged or invalid atoms attributes table")
return (tags, attrs)
def get_entry_content(self, entry, pretty_print=False):
if 'spine' in entry.state:
name = '/'.join(('/data', entry.internal, 'content'))
path = entry.path
raw = self.get_file(name)
decl, map = (OPF_DECL, OPF_MAP) \
if name == '/meta' else (HTML_DECL, HTML_MAP)
atoms = self.get_atoms(entry)
content = decl + unicode(UnBinary(raw, path, self.manifest, map, atoms))
if pretty_print:
content = self._pretty_print(content)
content = content.encode('utf-8')
class LitContainer(object):
"""Simple Container-interface, read-only accessor for LIT files."""
def __init__(self, filename_or_stream):
self._litfile = LitFile(filename_or_stream)
def namelist(self):
return self._litfile.paths.keys()
def exists(self, name):
return urlunquote(name) in self._litfile.paths
def read(self, name):
entry = self._litfile.paths[urlunquote(name)] if name else None
if entry is None:
content = OPF_DECL + self._read_meta()
elif 'spine' in entry.state:
internal = '/'.join(('/data', entry.internal, 'content'))
raw = self._litfile.get_file(internal)
manifest = self._litfile.manifest
atoms = self._litfile.get_atoms(entry)
unbin = UnBinary(raw, name, manifest, HTML_MAP, atoms)
content = HTML_DECL + str(unbin)
else:
name = '/'.join(('/data', entry.internal))
content = self.get_file(name)
internal = '/'.join(('/data', entry.internal))
content = self._litfile.get_file(internal)
return content
def extract_content(self, output_dir=os.getcwdu(), pretty_print=False):
output_dir = os.path.abspath(output_dir)
def _read_meta(self):
path = 'content.opf'
raw = self._litfile.get_file('/meta')
try:
opf_path = os.path.splitext(
os.path.basename(self._stream.name))[0] + '.opf'
except AttributeError:
opf_path = 'content.opf'
opf_path = os.path.join(output_dir, opf_path)
self._ensure_dir(opf_path)
with open(opf_path, 'wb') as f:
xml = self.meta
if pretty_print:
xml = self._pretty_print(xml)
f.write(xml.encode('utf-8'))
for entry in self.manifest.values():
path = os.path.join(output_dir, entry.path)
self._ensure_dir(path)
with open(path, 'wb') as f:
f.write(self.get_entry_content(entry, pretty_print))
unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP)
except LitError:
if 'PENGUIN group' not in raw: raise
print "WARNING: attempting PENGUIN malformed OPF fix"
raw = raw.replace(
'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1)
unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP)
return str(unbin)
def _ensure_dir(self, path):
dir = os.path.dirname(path)
if not os.path.isdir(dir):
os.makedirs(dir)
def get_metadata(self):
return self._read_meta()
def _warn(self, msg):
print "WARNING: %s" % (msg,)
def option_parser():
from calibre.utils.config import OptionParser
parser = OptionParser(usage=_('%prog [options] LITFILE'))
parser.add_option(
'-o', '--output-dir', default='.',
help=_('Output directory. Defaults to current directory.'))
parser.add_option(
'-p', '--pretty-print', default=False, action='store_true',
help=_('Legibly format extracted markup. May modify meaningful whitespace.'))
parser.add_option(
'--verbose', default=False, action='store_true',
help=_('Useful for debugging.'))
return parser
class LitReader(OEBReader):
Container = LitContainer
DEFAULT_PROFILE = 'MSReader'
def main(args=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
return 1
lr = LitReader(args[1])
lr.extract_content(opts.output_dir, opts.pretty_print)
print _('OEB ebook created in'), opts.output_dir
return 0
try:
import psyco
@ -918,6 +897,3 @@ try:
psyco.bind(UnBinary.binary_to_text)
except ImportError:
pass
if __name__ == '__main__':
sys.exit(main())

View File

@ -6,8 +6,6 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
from cStringIO import StringIO
from struct import pack
from itertools import izip, count, chain
@ -17,7 +15,6 @@ import re
import copy
import uuid
import functools
import logging
from urlparse import urldefrag
from urllib import unquote as urlunquote
from lxml import etree
@ -25,22 +22,14 @@ from calibre.ebooks.lit.reader import DirectoryEntry
import calibre.ebooks.lit.maps as maps
from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_MIME, OEB_STYLES, \
CSS_MIME, OPF_MIME, XML_NS, XML
from calibre.ebooks.oeb.base import namespace, barename, prefixname, \
urlnormalize, xpath
from calibre.ebooks.oeb.base import Logger, OEBBook
from calibre.ebooks.oeb.profile import Context
from calibre.ebooks.oeb.base import prefixname, \
urlnormalize
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer
from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
from calibre.ebooks.lit.lzx import Compressor
import calibre
from calibre import plugins
msdes, msdeserror = plugins['msdes']
import calibre.ebooks.lit.mssha1 as mssha1
from calibre.customize.ui import run_plugins_on_postprocess
__all__ = ['LitWriter']
@ -277,7 +266,7 @@ class ReBinary(object):
def build_ahc(self):
if len(self.anchors) > 6:
self.logger.log_warn("More than six anchors in file %r. " \
self.logger.warn("More than six anchors in file %r. " \
"Some links may not work properly." % self.item.href)
data = StringIO()
data.write(unichr(len(self.anchors)).encode('utf-8'))
@ -308,18 +297,18 @@ class LitWriter(object):
def _litize_oeb(self):
oeb = self._oeb
oeb.metadata.add('calibre-oeb2lit-version', calibre.__version__)
oeb.metadata.add('calibre-version', calibre.__version__)
cover = None
if oeb.metadata.cover:
id = str(oeb.metadata.cover[0])
cover = oeb.manifest[id]
cover = oeb.manifest.ids[id]
for type, title in ALL_MS_COVER_TYPES:
if type not in oeb.guide:
oeb.guide.add(type, title, cover.href)
else:
self._logger.warn('No suitable cover image found.')
def dump(self, oeb, path):
def __call__(self, oeb, path):
if hasattr(path, 'write'):
return self._dump_stream(oeb, path)
with open(path, 'w+b') as stream:
@ -468,7 +457,7 @@ class LitWriter(object):
self._add_folder('/data')
for item in self._oeb.manifest.values():
if item.media_type not in LIT_MIMES:
self._logger.log_warn("File %r of unknown media-type %r " \
self._logger.warn("File %r of unknown media-type %r " \
"excluded from output." % (item.href, item.media_type))
continue
name = '/data/' + item.id
@ -485,6 +474,8 @@ class LitWriter(object):
secnum = 1
elif isinstance(data, unicode):
data = data.encode('utf-8')
elif hasattr(data, 'cssText'):
data = str(data)
self._add_file(name, data, secnum)
item.size = len(data)
@ -720,53 +711,3 @@ class LitWriter(object):
return dcounts, dchunks, ichunk
def option_parser():
from calibre.utils.config import OptionParser
parser = OptionParser(usage=_('%prog [options] OPFFILE'))
parser.add_option(
'-o', '--output', default=None,
help=_('Output file. Default is derived from input filename.'))
parser.add_option(
'-v', '--verbose', default=0, action='count',
help=_('Useful for debugging.'))
return parser
def oeb2lit(opts, inpath):
logger = Logger(logging.getLogger('oeb2lit'))
logger.setup_cli_handler(opts.verbose)
outpath = opts.output
if outpath is None:
outpath = os.path.basename(inpath)
outpath = os.path.splitext(outpath)[0] + '.lit'
outpath = os.path.abspath(outpath)
context = Context('Browser', 'MSReader')
oeb = OEBBook(inpath, logger=logger)
tocadder = HTMLTOCAdder()
tocadder.transform(oeb, context)
mangler = CaseMangler()
mangler.transform(oeb, context)
fbase = context.dest.fbase
flattener = CSSFlattener(fbase=fbase, unfloat=True, untable=True)
flattener.transform(oeb, context)
rasterizer = SVGRasterizer()
rasterizer.transform(oeb, context)
trimmer = ManifestTrimmer()
trimmer.transform(oeb, context)
lit = LitWriter()
lit.dump(oeb, outpath)
run_plugins_on_postprocess(outpath, 'lit')
logger.info(_('Output written to ') + outpath)
def main(argv=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(argv[1:])
if len(args) != 1:
parser.print_help()
return 1
inpath = args[0]
oeb2lit(opts, inpath)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -4,40 +4,16 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
This package contains logic to read and write LRF files.
The LRF file format is documented at U{http://www.sven.de/librie/Librie/LrfFormat}.
"""
import sys, os
from optparse import OptionValueError
from htmlentitydefs import name2codepoint
from uuid import uuid4
from calibre.ebooks.lrf.pylrs.pylrs import Book as _Book
from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Header, PutObj, \
Paragraph, TextStyle, BlockStyle
from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Header, \
TextStyle, BlockStyle
from calibre.ebooks.lrf.fonts import FONT_FILE_MAP
from calibre.ebooks import ConversionError
from calibre import __appname__, __version__, __author__, iswindows
from calibre.utils.config import OptionParser
__docformat__ = "epytext"
preferred_source_formats = [
'LIT',
'MOBI',
'EPUB',
'ODT',
'HTML',
'HTM',
'XHTM',
'XHTML',
'PRC',
'AZW',
'FB2',
'RTF',
'PDF',
'TXT',
'ZIP',
'RAR'
]
class LRFParseError(Exception):
pass
@ -58,172 +34,6 @@ class PRS500_PROFILE(object):
name = 'prs500'
profile_map = {
PRS500_PROFILE.name : PRS500_PROFILE,
}
def profile_from_string(option, opt_str, value, parser):
try:
profile = profile_map[value]
setattr(parser.values, option.dest, profile)
except KeyError:
raise OptionValueError('Profile: '+value+' is not implemented. Implemented profiles: %s'%(profile_map.keys()))
def option_parser(usage, gui_mode=False):
parser = OptionParser(usage=usage, gui_mode=gui_mode)
metadata = parser.add_option_group('METADATA OPTIONS')
metadata.add_option("-t", "--title", action="store", type="string", default=None,\
dest="title", help=_("Set the title. Default: filename."))
metadata.add_option("-a", "--author", action="store", type="string", \
dest="author", help=_("Set the author(s). Multiple authors should be set as a comma separated list. Default: %default"),
default=_('Unknown'))
metadata.add_option("--comment", action="store", type="string", \
dest="freetext", help=_("Set the comment."), default=_('Unknown'))
metadata.add_option("--category", action="store", type="string", \
dest="category", help=_("Set the category"), default=_('Unknown'))
metadata.add_option('--title-sort', action='store', default='', dest='title_sort',
help=_('Sort key for the title'))
metadata.add_option('--author-sort', action='store', default='', dest='author_sort',
help=_('Sort key for the author'))
metadata.add_option('--publisher', action='store', default=_('Unknown'), dest='publisher',
help=_('Publisher'))
metadata.add_option('--cover', action='store', dest='cover', default=None, \
help=_('Path to file containing image to be used as cover'))
metadata.add_option('--use-metadata-cover', action='store_true', default=False,
help=_('If there is a cover graphic detected in the source file, use that instead of the specified cover.'))
parser.add_option('-o', '--output', action='store', default=None, \
help=_('Output file name. Default is derived from input filename'))
parser.add_option('--ignore-tables', action='store_true', default=False, dest='ignore_tables',
help=_('Render HTML tables as blocks of text instead of actual tables. This is neccessary if the HTML contains very large or complex tables.'))
laf = parser.add_option_group('LOOK AND FEEL')
laf.add_option('--base-font-size', action='store', type='float', default=10.,
help=_('''Specify the base font size in pts. All fonts are rescaled accordingly. This option obsoletes the --font-delta option and takes precedence over it. To use --font-delta, set this to 0. Default: %defaultpt'''))
laf.add_option('--enable-autorotation', action='store_true', default=False,
help=_('Enable autorotation of images that are wider than the screen width.'),
dest='autorotation')
laf.add_option('--wordspace', dest='wordspace', default=2.5, type='float',
help=_('Set the space between words in pts. Default is %default'))
laf.add_option('--blank-after-para', action='store_true', default=False,
dest='blank_after_para', help=_('Separate paragraphs by blank lines.'))
laf.add_option('--header', action='store_true', default=False, dest='header',
help=_('Add a header to all the pages with title and author.'))
laf.add_option('--headerformat', default="%t by %a", dest='headerformat', type='string',
help=_('Set the format of the header. %a is replaced by the author and %t by the title. Default is %default'))
laf.add_option('--header-separation', default=0, type='int',
help=_('Add extra spacing below the header. Default is %default px.'))
laf.add_option('--override-css', default=None, dest='_override_css', type='string',
help=_('Override the CSS. Can be either a path to a CSS stylesheet or a string. If it is a string it is interpreted as CSS.'))
laf.add_option('--use-spine', default=False, dest='use_spine', action='store_true',
help=_('Use the <spine> element from the OPF file to determine the order in which the HTML files are appended to the LRF. The .opf file must be in the same directory as the base HTML file.'))
laf.add_option('--minimum-indent', default=0, type='float',
help=_('Minimum paragraph indent (the indent of the first line of a paragraph) in pts. Default: %default'))
laf.add_option('--font-delta', action='store', type='float', default=0., \
help=_("""Increase the font size by 2 * FONT_DELTA pts and """
'''the line spacing by FONT_DELTA pts. FONT_DELTA can be a fraction.'''
"""If FONT_DELTA is negative, the font size is decreased."""),
dest='font_delta')
laf.add_option('--ignore-colors', action='store_true', default=False, dest='ignore_colors',
help=_('Render all content as black on white instead of the colors specified by the HTML or CSS.'))
page = parser.add_option_group('PAGE OPTIONS')
profiles = profile_map.keys()
page.add_option('-p', '--profile', default=PRS500_PROFILE, dest='profile', type='choice',
choices=profiles, action='callback', callback=profile_from_string,
help=_('''Profile of the target device for which this LRF is '''
'''being generated. The profile determines things like the '''
'''resolution and screen size of the target device. '''
'''Default: %s Supported profiles: ''')%(PRS500_PROFILE.name,)+\
', '.join(profiles))
page.add_option('--left-margin', default=20, dest='left_margin', type='int',
help=_('''Left margin of page. Default is %default px.'''))
page.add_option('--right-margin', default=20, dest='right_margin', type='int',
help=_('''Right margin of page. Default is %default px.'''))
page.add_option('--top-margin', default=10, dest='top_margin', type='int',
help=_('''Top margin of page. Default is %default px.'''))
page.add_option('--bottom-margin', default=0, dest='bottom_margin', type='int',
help=_('''Bottom margin of page. Default is %default px.'''))
page.add_option('--render-tables-as-images', default=False, action='store_true',
help=_('Render tables in the HTML as images (useful if the document has large or complex tables)'))
page.add_option('--text-size-multiplier-for-rendered-tables', type='float', default=1.0,
help=_('Multiply the size of text in rendered tables by this factor. Default is %default'))
link = parser.add_option_group('LINK PROCESSING OPTIONS')
link.add_option('--link-levels', action='store', type='int', default=sys.maxint, \
dest='link_levels',
help=_(r'''The maximum number of levels to recursively process '''
'''links. A value of 0 means thats links are not followed. '''
'''A negative value means that <a> tags are ignored.'''))
link.add_option('--link-exclude', dest='link_exclude', default='@',
help=_('''A regular expression. <a> tags whose href '''
'''matches will be ignored. Defaults to %default'''))
link.add_option('--no-links-in-toc', action='store_true', default=False,
dest='no_links_in_toc',
help=_('''Don't add links to the table of contents.'''))
chapter = parser.add_option_group('CHAPTER OPTIONS')
chapter.add_option('--disable-chapter-detection', action='store_true',
default=False, dest='disable_chapter_detection',
help=_('''Prevent the automatic detection chapters.'''))
chapter.add_option('--chapter-regex', dest='chapter_regex',
default='chapter|book|appendix',
help=_('''The regular expression used to detect chapter titles.'''
''' It is searched for in heading tags (h1-h6). Defaults to %default'''))
chapter.add_option('--chapter-attr', default='$,,$',
help=_('Detect a chapter beginning at an element having the specified attribute. The format for this option is tagname regexp,attribute name,attribute value regexp. For example to match all heading tags that have the attribute class="chapter" you would use "h\d,class,chapter". You can set the attribute to "none" to match only on tag names. So for example, to match all h2 tags, you would use "h2,none,". Default is %default'''))
chapter.add_option('--page-break-before-tag', dest='page_break', default='h[12]',
help=_('''If html2lrf does not find any page breaks in the '''
'''html file and cannot detect chapter headings, it will '''
'''automatically insert page-breaks before the tags whose '''
'''names match this regular expression. Defaults to %default. '''
'''You can disable it by setting the regexp to "$". '''
'''The purpose of this option is to try to ensure that '''
'''there are no really long pages as this degrades the page '''
'''turn performance of the LRF. Thus this option is ignored '''
'''if the current page has only a few elements.'''))
chapter.add_option('--force-page-break-before-tag', dest='force_page_break',
default='$', help=_('Force a page break before tags whose names match this regular expression.'))
chapter.add_option('--force-page-break-before-attr', dest='force_page_break_attr',
default='$,,$', help=_('Force a page break before an element having the specified attribute. The format for this option is tagname regexp,attribute name,attribute value regexp. For example to match all heading tags that have the attribute class="chapter" you would use "h\d,class,chapter". Default is %default'''))
chapter.add_option('--add-chapters-to-toc', action='store_true',
default=False, dest='add_chapters_to_toc',
help=_('''Add detected chapters to the table of contents.'''))
prepro = parser.add_option_group('PREPROCESSING OPTIONS')
prepro.add_option('--baen', action='store_true', default=False, dest='baen',
help=_('''Preprocess Baen HTML files to improve generated LRF.'''))
prepro.add_option('--pdftohtml', action='store_true', default=False, dest='pdftohtml',
help=_('''You must add this option if processing files generated by pdftohtml, otherwise conversion will fail.'''))
prepro.add_option('--book-designer', action='store_true', default=False, dest='book_designer',
help=_('''Use this option on html0 files from Book Designer.'''))
fonts = parser.add_option_group('FONT FAMILIES',
_('''Specify trutype font families for serif, sans-serif and monospace fonts. '''
'''These fonts will be embedded in the LRF file. Note that custom fonts lead to '''
'''slower page turns. '''
'''For example: '''
'''--serif-family "Times New Roman"
'''))
fonts.add_option('--serif-family',
default=None, dest='serif_family', type='string',
help=_('The serif family of fonts to embed'))
fonts.add_option('--sans-family',
default=None, dest='sans_family', type='string',
help=_('The sans-serif family of fonts to embed'))
fonts.add_option('--mono-family',
default=None, dest='mono_family', type='string',
help=_('The monospace family of fonts to embed'))
debug = parser.add_option_group('DEBUG OPTIONS')
debug.add_option('--verbose', dest='verbose', action='store_true', default=False,
help=_('''Be verbose while processing'''))
debug.add_option('--lrs', action='store_true', dest='lrs', \
help=_('Convert to LRS'), default=False)
parser.add_option('--minimize-memory-usage', action='store_true', default=False,
help=_('Minimize memory usage at the cost of longer processing times. Use this option if you are on a memory constrained machine.'))
parser.add_option('--encoding', default=None,
help=_('Specify the character encoding of the source file. If the output LRF file contains strange characters, try changing this option. A common encoding for files from windows computers is cp-1252. Another common choice is utf-8. The default is to try and guess the encoding.'))
return parser
def find_custom_fonts(options, logger):
from calibre.utils.fontconfig import files_for_family
fonts = {'serif' : None, 'sans' : None, 'mono' : None}
@ -299,4 +109,3 @@ def Book(options, logger, font_delta=0, header=None,
raise ConversionError, 'Could not find the normal version of the ' + family + ' font'
return book, fonts
from calibre import entity_to_unicode

View File

@ -1,2 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@ -1,199 +0,0 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Convert any ebook file into a LRF file.'''
import sys, os, logging, shutil, tempfile, re
from calibre.ebooks import UnknownFormatError
from calibre.ebooks.lrf import option_parser as _option_parser
from calibre import __appname__, setup_cli_handlers, extract
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.lrf.lit.convert_from import process_file as lit2lrf
from calibre.ebooks.lrf.pdf.convert_from import process_file as pdf2lrf
from calibre.ebooks.lrf.rtf.convert_from import process_file as rtf2lrf
from calibre.ebooks.lrf.txt.convert_from import process_file as txt2lrf
from calibre.ebooks.lrf.html.convert_from import process_file as html2lrf
from calibre.ebooks.lrf.epub.convert_from import process_file as epub2lrf
from calibre.ebooks.lrf.mobi.convert_from import process_file as mobi2lrf
from calibre.ebooks.lrf.fb2.convert_from import process_file as fb22lrf
from calibre.customize.ui import run_plugins_on_postprocess, run_plugins_on_preprocess
def largest_file(files):
    '''
    Return the entry of `files` that occupies the most bytes on disk.

    Each path is examined with os.stat. None is returned when `files` is
    empty or when no file is larger than zero bytes. When several files share
    the largest size, the one that appears first in `files` is returned.
    '''
    best_path, best_size = None, 0
    for candidate in files:
        candidate_size = os.stat(candidate).st_size
        if candidate_size <= best_size:
            # Not strictly bigger than the current winner: keep the winner.
            continue
        best_path, best_size = candidate, candidate_size
    return best_path
def find_htmlfile(dir):
    '''
    Pick the most likely "main" HTML file below the directory `dir`.

    Every file whose extension matches .htm/.html/.xhtm/.xhtml (case
    insensitive) is sorted into one of three groups depending on whether its
    absolute path contains "toc", else "index", else neither. The largest file
    of the first non-empty group (in that order) is returned. Returns None
    when no HTML file is found.
    '''
    ext_pat = re.compile(r'\.(x){0,1}htm(l){0,1}', re.IGNORECASE)
    toc_pat = re.compile(r'toc', re.IGNORECASE)
    index_pat = re.compile(r'index', re.IGNORECASE)
    toc_files, index_files, other_files = [], [], []
    for root, dirs, names in os.walk(dir):
        for name in names:
            full = os.path.abspath(os.path.join(root, name))
            ext = os.path.splitext(full)[1]
            if not ext or not ext_pat.match(ext):
                continue
            # The patterns are searched for in the whole absolute path.
            if toc_pat.search(full):
                toc_files.append(full)
            elif index_pat.search(full):
                index_files.append(full)
            else:
                other_files.append(full)
    for group in (toc_files, index_files, other_files):
        if group:
            return largest_file(group)
    return None
def number_of_unhidden_files(base, listing):
    '''
    Count the entries of `listing` that are visible, non-directory items.

    Each name in `listing` is joined onto `base`. An entry is skipped when the
    joined path is a directory or when its basename starts with a dot;
    everything else is counted.
    '''
    def counts(name):
        full = os.path.join(base, name)
        if os.path.isdir(full):
            return False
        return not os.path.basename(full).startswith('.')

    return len([name for name in listing if counts(name)])
def unhidden_directories(base, listing):
    '''
    Return the names from `listing` that are visible sub-directories of `base`.

    A name is kept when `base`/name is a directory and the name starts with
    neither "__" nor ".". The order of `listing` is preserved.
    '''
    return [name for name in listing
            if os.path.isdir(os.path.join(base, name))
            and not name.startswith('__')
            and not name.startswith('.')]
def traverse_subdirs(tdir):
    '''
    Descend from `tdir` to the first directory that holds a visible file.

    While a directory contains no unhidden files, the search continues in its
    first unhidden sub-directory (in os.listdir order). The directory at which
    the descent stops is returned; that is `tdir` itself when it already has
    unhidden files or has no unhidden sub-directories.
    '''
    entries = os.listdir(tdir)
    if number_of_unhidden_files(tdir, entries) != 0:
        return tdir
    subdirs = unhidden_directories(tdir, entries)
    if not subdirs:
        # Nothing to descend into.
        return tdir
    return traverse_subdirs(os.path.join(tdir, subdirs[0]))
def handle_archive(path):
    '''
    Unpack the archive at `path` and locate the ebook inside it.

    The archive is extracted into a freshly created temporary directory. The
    search starts in the first directory (found by traverse_subdirs) that
    contains visible files. The largest file with a known ebook extension is
    chosen; if there is none, find_htmlfile is used as a fallback.

    :param path: Path to the archive file.
    :return: A tuple ``(tdir, file)``. ``tdir`` is the temporary directory,
             which the caller is responsible for deleting. ``file`` is the
             path of the ebook found (a byte string path is decoded with the
             filesystem encoding) or None if nothing suitable was found.
    :raises: Whatever the extraction or the directory scan raises. In that
             case the temporary directory is removed first, as the caller
             never gets to see its path and so could not clean it up.
    '''
    tdir = tempfile.mkdtemp(prefix=__appname__+'_'+'archive_')
    try:
        extract(path, tdir)
        cdir = traverse_subdirs(tdir)
        exts = ['lit', 'rtf', 'fb2','pdf', 'txt', 'epub', 'mobi', 'prc']
        candidates = [os.path.join(cdir, x) for x in os.listdir(cdir)]
        # Grouped by extension in the order of `exts`; largest_file() breaks
        # ties in favour of the earliest entry, so this order matters.
        files = [f for ext in exts for f in candidates
                 if f.lower().endswith('.'+ext)]
        ebook = largest_file(files)
        if not ebook:
            ebook = find_htmlfile(cdir)
        if isinstance(ebook, str):
            ebook = ebook.decode(sys.getfilesystemencoding())
    except:
        # Clean up and re-raise unchanged: nothing is swallowed here.
        shutil.rmtree(tdir, ignore_errors=True)
        raise
    return tdir, ebook
def odt2lrf(path, options, logger):
    '''
    Convert the ODT file at `path` to LRF by way of the HTML converter.

    The document is extracted into a temporary directory, `options` is
    switched to spine ordering and utf-8 encoding (both attributes are
    overwritten), and the HTML converter is run on the extraction result with
    'metadata.opf' replaced by 'index.html' in its path. When `logger` is
    None an 'odt2lrf' logger is set up for command line use, at DEBUG level
    if ``options.verbose`` is set and INFO level otherwise.
    '''
    from calibre.ebooks.odt.to_oeb import Extract
    from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
    if logger is None:
        if options.verbose:
            level = logging.DEBUG
        else:
            level = logging.INFO
        logger = logging.getLogger('odt2lrf')
        setup_cli_handlers(logger, level)
    with TemporaryDirectory('_odt2lrf') as tdir:
        extractor = Extract()
        opf = extractor(path, tdir)
        options.use_spine = True
        options.encoding = 'utf-8'
        index_html = opf.replace('metadata.opf', 'index.html')
        html_process_file(index_html, options, logger)
def process_file(path, options, logger=None):
    '''
    Convert the ebook at `path` to LRF (or LRS), choosing the converter from
    the file extension.

    :param path: Path to the input file. ``~`` is expanded, the path is made
                 absolute and the preprocess plugins are run on it. ZIP, RAR
                 and OEBZIP archives are unpacked and searched for an ebook.
    :param options: Conversion options. ``options.verbose`` and
                    ``options.lrs`` are read. ``options.output`` is derived
                    from the input file name when it is empty and is always
                    made absolute.
    :param logger: Logger to use. When None an 'any2lrf' logger is set up for
                   command line use.
    :return: 0 on success, 1 if the input is unreadable or has no extension.
    :raises UnknownFormatError: If no ebook is found in an archive or there is
                                no converter for the extension.
    '''
    path = os.path.abspath(os.path.expanduser(path))
    path = run_plugins_on_preprocess(path)
    tdir = None
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('any2lrf')
        setup_cli_handlers(logger, level)
    if not os.access(path, os.R_OK):
        logger.critical('Cannot read from %s', path)
        return 1
    ext = os.path.splitext(path)[1]
    if not ext or ext == '.':
        logger.critical('Unknown file type: %s', path)
        return 1
    ext = ext[1:].lower()
    cwd = os.getcwd()
    if not options.output:
        fmt = '.lrs' if options.lrs else '.lrf'
        options.output = os.path.splitext(os.path.basename(path))[0] + fmt
    options.output = os.path.abspath(os.path.expanduser(options.output))
    # The archive handling lives inside the try block so that the directory
    # the archive was extracted into is removed even when no ebook is found
    # in it.
    try:
        if ext in ['zip', 'rar', 'oebzip']:
            newpath = None
            try:
                tdir, newpath = handle_archive(path)
            except Exception:
                # Best effort: log the failure and report it below as an
                # unknown format. KeyboardInterrupt/SystemExit pass through.
                logger.exception(' ')
            if not newpath:
                raise UnknownFormatError('Could not find ebook in archive')
            path = newpath
            logger.info('Found ebook in archive: %s', repr(path))
        ext = os.path.splitext(path)[1][1:].lower()
        convertor = None
        # Substring test: matches htm, html, xhtm, xhtml, ...
        if 'htm' in ext:
            convertor = html2lrf
        elif 'lit' == ext:
            convertor = lit2lrf
        elif 'pdf' == ext:
            convertor = pdf2lrf
        elif 'rtf' == ext:
            convertor = rtf2lrf
        elif 'txt' == ext:
            convertor = txt2lrf
        elif 'epub' == ext:
            convertor = epub2lrf
        elif ext in ['mobi', 'prc', 'azw']:
            convertor = mobi2lrf
        elif ext == 'fb2':
            convertor = fb22lrf
        elif ext == 'odt':
            convertor = odt2lrf
        if not convertor:
            raise UnknownFormatError(_('Converting from %s to LRF is not supported.')%ext)
        convertor(path, options, logger)
    finally:
        # Restore the working directory that was current before conversion.
        os.chdir(cwd)
        if tdir and os.path.exists(tdir):
            shutil.rmtree(tdir)
    return 0
def option_parser(gui_mode=False):
    '''
    Build the any2lrf option parser: the common LRF option parser with the
    any2lrf usage message. `gui_mode` is passed through unchanged.
    '''
    usage = _('''\
any2lrf [options] myfile
Convert any ebook format into LRF. Supported formats are:
LIT, RTF, TXT, HTML, EPUB, MOBI, PRC and PDF. any2lrf will also process a RAR or
ZIP archive, looking for an ebook inside the archive.
''')
    return _option_parser(usage=usage, gui_mode=gui_mode)
def main(args=sys.argv, logger=None, gui_mode=False):
    '''
    Command line entry point of any2lrf.

    `args` is the full argument vector including the program name; exactly
    one positional argument (the file to convert) must remain after option
    parsing. Returns 1 after printing the help text when that is not the
    case, otherwise the result of process_file. A byte string file name is
    decoded with the filesystem encoding first.
    '''
    parser = option_parser(gui_mode)
    options, args = parser.parse_args(args)
    if len(args) == 2:
        src = args[1]
        if not isinstance(src, unicode):
            src = src.decode(sys.getfilesystemencoding())
        return process_file(src, options, logger)
    parser.print_help()
    print('')
    print(_('No file to convert specified.'))
    return 1
if __name__ == '__main__':
sys.exit(main())

View File

@ -1,562 +0,0 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Based on ideas from comiclrf created by FangornUK.
'''
import os, sys, shutil, traceback, textwrap, fnmatch
from uuid import uuid4
from calibre import extract, terminal_controller, __appname__, __version__
from calibre.utils.config import Config, StringConfig
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.parallel import Server, ParallelJob
from calibre.utils.terminfo import ProgressBar
from calibre.ebooks.lrf.pylrs.pylrs import Book, BookSetting, ImageStream, ImageBlock
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
from calibre.ebooks.epub.from_html import config as html2epub_config, convert as html2epub
from calibre.customize.ui import run_plugins_on_preprocess
try:
from calibre.utils.PythonMagickWand import \
NewMagickWand, NewPixelWand, \
MagickSetImageBorderColor, \
MagickReadImage, MagickRotateImage, \
MagickTrimImage, PixelSetColor,\
MagickNormalizeImage, MagickGetImageWidth, \
MagickGetImageHeight, \
MagickResizeImage, MagickSetImageType, \
GrayscaleType, CatromFilter, MagickSetImagePage, \
MagickBorderImage, MagickSharpenImage, MagickDespeckleImage, \
MagickQuantizeImage, RGBColorspace, \
MagickWriteImage, DestroyPixelWand, \
DestroyMagickWand, CloneMagickWand, \
MagickThumbnailImage, MagickCropImage, ImageMagick
_imagemagick_loaded = True
except:
_imagemagick_loaded = False
# Output device profiles. The keys are the values accepted by the --profile
# option; each value is the target image size looked up when pages are
# rendered and when the output book is laid out.
PROFILES = {
# Name : (width, height) in pixels
'prs500':(584, 754),
# SONY's LRF renderer (on the PRS500) only uses the first 800x600 block of the image
'prs500-landscape': (784, 1012)
}
def extract_comic(path_to_comic_file):
    '''
    Unpack the comic archive into a newly created temporary directory.

    :param path_to_comic_file: Path to the archive to unpack
    :return: The directory the archive was extracted into
    '''
    destination = PersistentTemporaryDirectory(suffix='_comic_extract')
    extract(path_to_comic_file, destination)
    return destination
def find_pages(dir, sort_on_mtime=False, verbose=False):
    '''
    Find valid comic pages in a previously un-archived comic.

    Every file below `dir` whose name ends (case insensitively) in one of the
    supported image extensions is a page. Anything whose path contains
    ``__MACOSX`` is ignored.

    :param dir: Directory in which extracted comic lives
    :param sort_on_mtime: If True sort pages based on their last modified time.
                          Otherwise, sort alphabetically by file name.
    :param verbose: If True print the names of the pages that were found
    :return: List of paths to the pages, in reading order
    '''
    extensions = ('.jpeg', '.jpg', '.gif', '.png')
    pages = []
    for dirpath, dirnames, filenames in os.walk(dir):
        for name in filenames:
            path = os.path.join(dirpath, name)
            if '__MACOSX' in path:
                continue
            if path.lower().endswith(extensions):
                pages.append(path)

    # A key function is evaluated once per page, whereas a cmp function would
    # stat both files again on every single comparison.
    if sort_on_mtime:
        sort_key = lambda page: os.stat(page).st_mtime
    else:
        sort_key = os.path.basename
    pages.sort(key=sort_key)

    if verbose:
        print('Found comic pages...')
        print('\t'+'\n\t'.join([os.path.basename(p) for p in pages]))
    return pages
class PageProcessor(list):
    '''
    Contains the actual image rendering logic. See :method:`render` and
    :method:`process_pages`.

    The instance is itself a list: once constructed it contains the paths of
    the PNG files written for the source page (two when a landscape page was
    split into two portrait pages, otherwise one).
    '''

    def __init__(self, path_to_page, dest, opts, num):
        '''
        Render the page immediately.

        :param path_to_page: Path to the source image
        :param dest: Directory the rendered PNG files are written to
        :param opts: Conversion options
        :param num: Index of this page; page 0 also produces ``thumbnail.png``
        '''
        list.__init__(self)
        self.path_to_page = path_to_page
        self.opts = opts
        self.num = num
        self.dest = dest
        self.rotate = False
        self.render()

    @staticmethod
    def _fit_to_box(aspect, box_width, box_height, box_aspect):
        '''
        Scale an image of the given aspect ratio to fit inside a box.

        :return: ``(new_width, new_height, border_x, border_y)`` where the
                 borders are the padding needed on each side to fill the box
        '''
        if aspect <= box_aspect:
            new_height = box_height
            new_width = int(new_height * aspect)
            return new_width, new_height, (box_width - new_width) // 2, 0
        new_width = box_width
        new_height = int(new_width / aspect)
        return new_width, new_height, 0, (box_height - new_height) // 2

    def render(self):
        '''
        Load the source image, write the thumbnail for the first page, decide
        whether a landscape image is rotated or split and then hand the
        resulting wand(s) to :method:`process_pages`.

        :raises RuntimeError: If a wand could not be created
        :raises IOError: If the source image could not be read
        '''
        img = NewMagickWand()
        if img < 0:
            raise RuntimeError('Cannot create wand.')
        if not MagickReadImage(img, self.path_to_page):
            # Release the wand, nothing else will get the chance to do so
            DestroyMagickWand(img)
            raise IOError('Failed to read image from: %s'%self.path_to_page)
        width = MagickGetImageWidth(img)
        height = MagickGetImageHeight(img)
        if self.num == 0: # First image so create a thumbnail from it
            thumb = CloneMagickWand(img)
            if thumb < 0:
                raise RuntimeError('Cannot create wand.')
            MagickThumbnailImage(thumb, 60, 80)
            MagickWriteImage(thumb, os.path.join(self.dest, 'thumbnail.png'))
            DestroyMagickWand(thumb)
        self.pages = [img]
        if width > height:
            if self.opts.landscape:
                self.rotate = True
            else:
                # Split the landscape image into two portrait halves
                split1, split2 = map(CloneMagickWand, (img, img))
                DestroyMagickWand(img)
                if split1 < 0 or split2 < 0:
                    raise RuntimeError('Cannot create wand.')
                MagickCropImage(split1, (width//2)-1, height, 0, 0)
                MagickCropImage(split2, (width//2)-1, height, width//2, 0)
                self.pages = [split2, split1] if self.opts.right2left else [split1, split2]
        self.process_pages()

    def process_pages(self):
        '''
        Apply rotation, trimming, normalization, resizing, sharpening,
        grayscale conversion, despeckling and quantization (as selected by
        the options) to every wand in ``self.pages``, write each result as a
        PNG into ``self.dest`` and append its path to this list. Every wand
        is destroyed once it has been handled.
        '''
        for i, wand in enumerate(self.pages):
            pw = NewPixelWand()
            try:
                if pw < 0:
                    raise RuntimeError('Cannot create wand.')
                PixelSetColor(pw, 'white')

                MagickSetImageBorderColor(wand, pw)
                if self.rotate:
                    MagickRotateImage(wand, pw, -90)

                # 25 percent fuzzy trim?
                if not self.opts.disable_trim:
                    MagickTrimImage(wand, 25*65535//100)
                MagickSetImagePage(wand, 0,0,0,0) #Clear page after trim, like a "+repage"
                # Do the Photoshop "Auto Levels" equivalent
                if not self.opts.dont_normalize:
                    MagickNormalizeImage(wand)
                sizex = MagickGetImageWidth(wand)
                sizey = MagickGetImageHeight(wand)

                SCRWIDTH, SCRHEIGHT = PROFILES[self.opts.profile]

                if self.opts.keep_aspect_ratio:
                    # Preserve the aspect ratio by adding border
                    aspect = float(sizex) / float(sizey)
                    newsizex, newsizey, deltax, deltay = self._fit_to_box(
                        aspect, SCRWIDTH, SCRHEIGHT,
                        float(SCRWIDTH) / float(SCRHEIGHT))
                    MagickResizeImage(wand, newsizex, newsizey, CatromFilter, 1.0)
                    MagickSetImageBorderColor(wand, pw)
                    MagickBorderImage(wand, pw, deltax, deltay)
                elif self.opts.wide:
                    # Keep aspect and Use device height as scaled image width so landscape mode is clean
                    aspect = float(sizex) / float(sizey)
                    screen_aspect = float(SCRWIDTH) / float(SCRHEIGHT)
                    # Get dimensions of the landscape mode screen
                    # Add 25px back to height for the battery bar.
                    wscreenx = SCRHEIGHT + 25
                    wscreeny = int(wscreenx / screen_aspect)
                    newsizex, newsizey, deltax, deltay = self._fit_to_box(
                        aspect, wscreenx, wscreeny, screen_aspect)
                    MagickResizeImage(wand, newsizex, newsizey, CatromFilter, 1.0)
                    MagickSetImageBorderColor(wand, pw)
                    MagickBorderImage(wand, pw, deltax, deltay)
                else:
                    MagickResizeImage(wand, SCRWIDTH, SCRHEIGHT, CatromFilter, 1.0)

                if not self.opts.dont_sharpen:
                    MagickSharpenImage(wand, 0.0, 1.0)

                MagickSetImageType(wand, GrayscaleType)

                if self.opts.despeckle:
                    MagickDespeckleImage(wand)

                MagickQuantizeImage(wand, self.opts.colors, RGBColorspace, 0, 1, 0)
                dest = '%d_%d.png'%(self.num, i)
                dest = os.path.join(self.dest, dest)
                # Written under a temporary name first and then renamed
                MagickWriteImage(wand, dest+'8')
                os.rename(dest+'8', dest)
                self.append(dest)
            finally:
                if pw > 0:
                    DestroyPixelWand(pw)
                DestroyMagickWand(wand)
def render_pages(tasks, dest, opts, notification=None):
    '''
    Entry point for the job server.

    Renders every page in `tasks`. A page that fails to render is recorded
    and does not stop the remaining pages from being processed.

    :param tasks: Iterable of ``(page number, path to page)`` pairs
    :param dest: Directory the rendered pages are written to
    :param opts: Conversion options
    :param notification: Optional callable, called as
                         ``notification(0.5, message)`` after every page
    :return: ``(rendered page paths, paths of pages that failed)``
    '''
    failures, pages = [], []
    with ImageMagick():
        for num, path in tasks:
            try:
                pages.extend(PageProcessor(path, dest, opts, num))
                msg = _('Rendered %s')%path
            except Exception:
                # Only genuine errors are treated as a failed page.
                # KeyboardInterrupt and SystemExit must still be able to
                # stop the worker.
                failures.append(path)
                msg = _('Failed %s')%path
                if opts.verbose:
                    msg += '\n' + traceback.format_exc()
            if notification is not None:
                notification(0.5, msg)

    return pages, failures
class JobManager(object):
    '''
    Simple job manager responsible for keeping track of overall progress.
    '''

    def __init__(self, total, update):
        '''
        :param total: Number of status updates that make up the whole job
        :param update: Callable invoked as ``update(fraction done, message)``
        '''
        self.total = total
        self.update = update
        self.done = 0
        # These hooks simply hand their argument back
        for hook in ('add_job', 'output', 'start_work', 'job_done'):
            setattr(self, hook, lambda j: j)

    def status_update(self, job):
        '''
        Count one more finished unit of work and report the fraction
        completed so far, together with the message carried by `job`.
        '''
        self.done += 1
        fraction = float(self.done)/self.total
        self.update(fraction, job.msg)
def process_pages(pages, opts, update):
    '''
    Render all identified comic pages.

    The pages are divided into tasks by the job server and each task is run
    as a ``render_pages`` job.

    :param pages: Paths of the pages to render
    :param opts: Conversion options, handed on to every job
    :param update: Progress callback, handed to the :class:`JobManager`
    :return: ``(rendered pages, failed pages, output directory)``
    :raises RuntimeError: If ImageMagick could not be loaded
    :raises Exception: If any job finished without a result
    '''
    if not _imagemagick_loaded:
        raise RuntimeError('Failed to load ImageMagick')

    tdir = PersistentTemporaryDirectory('_comic2lrf_pp')
    job_manager = JobManager(len(pages), update)
    server = Server()
    jobs = []
    for task in server.split(pages):
        job = ParallelJob('render_pages', lambda s:s, job_manager=job_manager,
                          args=[task, tdir, opts])
        jobs.append(job)
        server.add_job(job)
    server.wait()
    server.killall()
    server.close()

    rendered, failed = [], []
    for job in jobs:
        if job.result is None:
            raise Exception(_('Failed to process comic: %s\n\n%s')%(job.exception, job.traceback))
        job_pages, job_failures = job.result
        rendered += job_pages
        failed += job_failures
    return rendered, failed, tdir
def config(defaults=None,output_format='lrf'):
    '''
    Create the configuration object that declares every comic conversion
    option.

    :param defaults: If None a :class:`Config` named ``comic`` is used,
                     otherwise a :class:`StringConfig` built from `defaults`
    :param output_format: When ``'pdf'`` the extra ``--no_process`` option
                          is declared as well
    :return: The populated configuration object
    '''
    desc = _('Options to control the conversion of comics (CBR, CBZ) files into ebooks')
    cfg = Config('comic', desc) if defaults is None else StringConfig(defaults, desc)
    add = cfg.add_opt
    profile_names = PROFILES.keys()

    add('title', ['-t', '--title'],
        help=_('Title for generated ebook. Default is to use the filename.'))
    add('author', ['-a', '--author'],
        help=_('Set the author in the metadata of the generated ebook. Default is %default'),
        default=_('Unknown'))
    add('output', ['-o', '--output'],
        help=_('Path to output file. By default a file is created in the current directory.'))
    add('colors', ['-c', '--colors'], type='int', default=64,
        help=_('Number of colors for grayscale image conversion. Default: %default'))
    add('dont_normalize', ['-n', '--disable-normalize'], default=False,
        help=_('Disable normalize (improve contrast) color range for pictures. Default: False'))
    add('keep_aspect_ratio', ['-r', '--keep-aspect-ratio'], default=False,
        help=_('Maintain picture aspect ratio. Default is to fill the screen.'))
    add('dont_sharpen', ['-s', '--disable-sharpen'], default=False,
        help=_('Disable sharpening.'))
    add('disable_trim', ['--disable-trim'], default=False,
        help=_('Disable trimming of comic pages. For some comics, '
               'trimming might remove content as well as borders.'))
    add('landscape', ['-l', '--landscape'], default=False,
        help=_("Don't split landscape images into two portrait images"))
    add('wide', ['-w', '--wide-aspect'], default=False,
        help=_("Keep aspect ratio and scale image using screen height as image width for viewing in landscape mode."))
    add('right2left', ['--right2left'], default=False, action='store_true',
        help=_('Used for right-to-left publications like manga. Causes landscape pages to be split into portrait pages from right to left.'))
    add('despeckle', ['-d', '--despeckle'], default=False,
        help=_('Enable Despeckle. Reduces speckle noise. May greatly increase processing time.'))
    add('no_sort', ['--no-sort'], default=False,
        help=_("Don't sort the files found in the comic alphabetically by name. Instead use the order they were added to the comic."))
    add('profile', ['-p', '--profile'], default='prs500', choices=profile_names,
        help=_('Choose a profile for the device you are generating this file for. The default is the SONY PRS-500 with a screen size of 584x754 pixels. This is suitable for any reader with the same screen size. Choices are %s')%PROFILES.keys())
    add('verbose', ['-v', '--verbose'], default=0, action='count',
        help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.'))
    add('no_progress_bar', ['--no-progress-bar'], default=False,
        help=_("Don't show progress bar."))
    if output_format == 'pdf':
        # Only PDF output can embed the pages without processing them
        add('no_process',['--no_process'], default=False,
            help=_("Apply no processing to the image"))
    return cfg
# Build the command line option parser for the comic converters.
# The options themselves are declared by config(); this function only turns
# that configuration into a parser and attaches the translated usage text.
# output_format is forwarded to config() unchanged.
def option_parser(output_format='lrf'):
c = config(output_format=output_format)
return c.option_parser(usage=_('''\
%prog [options] comic.cb[z|r]
Convert a comic in a CBZ or CBR file to an ebook.
'''))
def create_epub(pages, profile, opts, thumbnail=None):
    '''
    Create an EPUB file from the rendered comic pages.

    One HTML wrapper is written per page, next to the page images. An OPF
    file listing the wrappers is then written and handed to the HTML to EPUB
    converter, which writes the book to ``opts.output``.

    :param pages: Paths of the rendered page images, in reading order
    :param profile: Unused by this function
    :param opts: Conversion options; title, author and output are read
    :param thumbnail: Unused by this function
    '''
    wrappers = []
    WRAPPER = textwrap.dedent('''\
        <html>
        <head>
        <title>Page #%d</title>
        <style type="text/css">@page {margin:0pt; padding: 0pt;}</style>
        </head>
        <body style="margin: 0pt; padding: 0pt">
        <div style="text-align:center">
        <img src="%s" alt="comic page #%d" />
        </div>
        </body>
        </html>
        ''')
    base_dir = os.path.dirname(pages[0])
    for i, page in enumerate(pages):
        wrapper = WRAPPER%(i+1, os.path.basename(page), i+1)
        wrapper_path = os.path.join(base_dir, 'page_%d.html'%(i+1))
        # Close every file explicitly so that its contents are guaranteed to
        # be on disk before the converter below reads them.
        with open(wrapper_path, 'wb') as f:
            f.write(wrapper)
        wrappers.append(wrapper_path)

    mi = MetaInformation(opts.title, [opts.author])
    opf = OPFCreator(base_dir, mi)
    opf.create_manifest([(w, None) for w in wrappers])
    opf.create_spine(wrappers)
    metadata = os.path.join(base_dir, 'metadata.opf')
    with open(metadata, 'wb') as f:
        opf.render(f)
    opts2 = html2epub_config('margin_left=0\nmargin_right=0\nmargin_top=0\nmargin_bottom=0').parse()
    opts2.output = opts.output
    html2epub(metadata, opts2)
def create_lrf(pages, profile, opts, thumbnail=None):
width, height = PROFILES[profile]
ps = {}
ps['topmargin'] = 0
ps['evensidemargin'] = 0
ps['oddsidemargin'] = 0
ps['textwidth'] = width
ps['textheight'] = height
book = Book(title=opts.title, author=opts.author,
bookid=uuid4().hex,
publisher='%s %s'%(__appname__, __version__), thumbnail=thumbnail,
category='Comic', pagestyledefault=ps,
booksetting=BookSetting(screenwidth=width, screenheight=height))
for page in pages:
imageStream = ImageStream(page)
_page = book.create_page()
_page.append(ImageBlock(refstream=imageStream,
blockwidth=width, blockheight=height, xsize=width,
ysize=height, x1=width, y1=height))
book.append(_page)
book.renderLrf(open(opts.output, 'wb'))
print _('Output written to'), opts.output
def create_pdf(pages, profile, opts, thumbnail=None,toc=None):
    '''
    Create a PDF file with one comic page per PDF page and write it to
    ``opts.output``.

    When more than one comic was converted, `toc` is used to build the PDF
    outline. The part of the comic paths up to the last path separator they
    all share is dropped, as are the last four characters of each path; the
    remaining path components become nested outline entries.

    :param pages: Paths of the page images, in reading order
    :param profile: Key into PROFILES giving the page size
    :param opts: Conversion options; output, author, title and
                 keep_aspect_ratio are read
    :param thumbnail: Unused by this function
    :param toc: Optional list of ``(comic path, index of its first page)``
                tuples. NOTE: a sentinel entry is appended to this list.
    :raises RuntimeError: If a wand could not be created
    :raises IOError: If a page image could not be read
    '''
    width, height = PROFILES[profile]

    from reportlab.pdfgen import canvas

    cur_page = 0
    heading = []
    if toc is not None:
        if len(toc) == 1:
            # A single comic needs no outline
            toc = None
        else:
            toc_index = 0
            # Position of the last path separator inside the prefix shared
            # by all comic paths (0 if there is none).
            prefix = os.path.commonprefix([entry[0] for entry in toc])
            rem = max(prefix.rfind(os.sep), 0)
            # Sentinel, so that toc[toc_index] is always valid below
            toc.append(("Not seen",-1))

    # NOTE(review): the page is 15 units taller than the profile height;
    # the reason is not visible here.
    pdf = canvas.Canvas(filename=opts.output, pagesize=(width,height+15))
    pdf.setAuthor(opts.author)
    pdf.setTitle(opts.title)

    for page in pages:
        if opts.keep_aspect_ratio:
            img = NewMagickWand()
            if img < 0:
                raise RuntimeError('Cannot create wand.')
            try:
                if not MagickReadImage(img, page):
                    raise IOError('Failed to read image from: %s'%page)
                sizex = MagickGetImageWidth(img)
                sizey = MagickGetImageHeight(img)
            finally:
                # The wand is only needed to measure the image
                DestroyMagickWand(img)
            # Preserve the aspect ratio by adding border
            aspect = float(sizex) / float(sizey)
            if aspect <= (float(width) / float(height)):
                newsizey = height
                newsizex = int(newsizey * aspect)
                deltax = (width - newsizex) // 2
                deltay = 0
            else:
                newsizex = width
                newsizey = int(newsizex / aspect)
                deltax = 0
                deltay = (height - newsizey) // 2
            pdf.drawImage(page, x=deltax,y=deltay,width=newsizex, height=newsizey)
        else:
            pdf.drawImage(page, x=0,y=0,width=width, height=height)

        if toc is not None and toc[toc_index][1] == cur_page:
            # This page is the first page of the next comic
            tmp = toc[toc_index][0]
            toc_current = tmp[rem:len(tmp)-4]
            index = 0
            while True:
                key = 'page%d-%d' % (cur_page, index)
                pdf.bookmarkPage(key)
                head, dummy, rest = toc_current.partition(os.sep)
                try:
                    # Only add an entry when the heading at this depth changes
                    if heading[index] != head:
                        heading[index] = head
                        pdf.addOutlineEntry(title=head,key=key,level=index)
                except IndexError:
                    # First heading seen at this depth
                    heading.append(head)
                    pdf.addOutlineEntry(title=head,key=key,level=index)
                index += 1
                toc_current = rest
                if dummy == "":
                    break
            toc_index += 1
        cur_page += 1
        pdf.showPage()
    # Write the document to disk
    pdf.save()
def do_convert(path_to_file, opts, notification=lambda m, p: p, output_format='lrf'):
path_to_file = run_plugins_on_preprocess(path_to_file)
source = path_to_file
to_delete = []
toc = []
list = []
pages = []
if not opts.title:
opts.title = os.path.splitext(os.path.basename(source))[0]
if not opts.output:
opts.output = os.path.abspath(os.path.splitext(os.path.basename(source))[0]+'.'+output_format)
if os.path.isdir(source):
for path in all_files( source , '*.cbr|*.cbz' ):
list.append( path )
else:
list= [ os.path.abspath(source) ]
for source in list:
tdir = extract_comic(source)
new_pages = find_pages(tdir, sort_on_mtime=opts.no_sort, verbose=opts.verbose)
thumbnail = None
if not new_pages:
raise ValueError('Could not find any pages in the comic: %s'%source)
if not getattr(opts, 'no_process', False):
new_pages, failures, tdir2 = process_pages(new_pages, opts, notification)
if not new_pages:
raise ValueError('Could not find any valid pages in the comic: %s'%source)
if failures:
print 'Could not process the following pages (run with --verbose to see why):'
for f in failures:
print '\t', f
thumbnail = os.path.join(tdir2, 'thumbnail.png')
if not os.access(thumbnail, os.R_OK):
thumbnail = None
toc.append((source,len(pages)))
pages.extend(new_pages)
to_delete.append(tdir)
if output_format == 'lrf':
create_lrf(pages, opts.profile, opts, thumbnail=thumbnail)
if output_format == 'epub':
create_epub(pages, opts.profile, opts, thumbnail=thumbnail)
if output_format == 'pdf':
create_pdf(pages, opts.profile, opts, thumbnail=thumbnail,toc=toc)
for tdir in to_delete:
shutil.rmtree(tdir)
def all_files(root, patterns='*'):
    '''
    Walk `root` recursively and yield the path of every file whose name
    matches at least one of the shell style patterns. Within a directory
    the files are visited in sorted order.

    :param root: Directory to search
    :param patterns: One or more patterns, separated by ``|``
    '''
    wanted = patterns.split('|')
    for dirpath, subdirs, filenames in os.walk(root):
        for filename in sorted(filenames):
            if any(fnmatch.fnmatch(filename, glob) for glob in wanted):
                yield os.path.join(dirpath, filename)
def main(args=sys.argv, notification=None, output_format='lrf'):
parser = option_parser(output_format=output_format)
opts, args = parser.parse_args(args)
if len(args) < 2:
parser.print_help()
print '\nYou must specify a file to convert'
return 1
if not callable(notification):
pb = ProgressBar(terminal_controller, _('Rendering comic pages...'),
no_progress_bar=opts.no_progress_bar or getattr(opts, 'no_process', False))
notification = pb.update
source = os.path.abspath(args[1])
do_convert(source, opts, notification, output_format=output_format)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -1,3 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@ -1,75 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, sys, shutil, logging
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks import ConversionError, DRMError
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre.ebooks.metadata.opf import OPF
from calibre.ebooks.metadata.epub import OCFDirReader
from calibre.utils.zipfile import ZipFile
from calibre import setup_cli_handlers
from calibre.ptempfile import PersistentTemporaryDirectory
# Build the command line option parser for epub2lrf.
# The options come from the shared LRF option parser (lrf_option_parser);
# this wrapper only supplies the translated usage message.
def option_parser():
return lrf_option_parser(
_('''Usage: %prog [options] mybook.epub
%prog converts mybook.epub to mybook.lrf''')
)
def generate_html(pathtoepub, logger):
    '''
    Unpack an EPUB file into a newly created temporary directory.

    :param pathtoepub: Path to the EPUB file
    :param logger: Unused by this function
    :return: The directory the EPUB was extracted into
    :raises ConversionError: If the file is unreadable or cannot be extracted
    :raises DRMError: If the book contains ``META-INF/encryption.xml``
    '''
    if not os.access(pathtoepub, os.R_OK):
        raise ConversionError('Cannot read from ' + pathtoepub)
    tdir = PersistentTemporaryDirectory('_epub2lrf')
    try:
        ZipFile(pathtoepub).extractall(tdir)
    except Exception:
        # Only real errors are reported as a failed extraction;
        # KeyboardInterrupt and SystemExit are left alone.
        raise ConversionError('.epub extraction failed')
    if os.path.exists(os.path.join(tdir, 'META-INF', 'encryption.xml')):
        raise DRMError(os.path.basename(pathtoepub))
    return tdir
def process_file(path, options, logger=None):
    '''
    Convert the EPUB file at `path` to LRF.

    The book is unpacked, its first spine item is handed to the HTML to LRF
    converter and the unpacked files are removed afterwards.

    :param path: Path to the EPUB file
    :param options: Parsed options; opf, output and use_spine are set here
    :param logger: Logger to use. A logger named ``epub2lrf`` is set up when
                   None is passed.
    '''
    if logger is None:
        logger = logging.getLogger('epub2lrf')
        setup_cli_handlers(logger, logging.DEBUG if options.verbose else logging.INFO)

    tdir = generate_html(os.path.abspath(os.path.expanduser(path)), logger)
    try:
        ocf = OCFDirReader(tdir)
        htmlfile = ocf.opf.spine[0].path
        options.opf = os.path.join(tdir, ocf.container[OPF.MIMETYPE])

        output = options.output
        if not output:
            # Default to the name of the input file, in the current directory
            stem = os.path.basename(os.path.splitext(path)[0])
            output = os.path.abspath(stem + ('.lrs' if options.lrs else '.lrf'))
        options.output = os.path.abspath(os.path.expanduser(output))
        options.use_spine = True

        html_process_file(htmlfile, options, logger=logger)
    finally:
        # Failing to clean up must never hide the result of the conversion
        try:
            shutil.rmtree(tdir)
        except:
            logger.warning('Failed to delete temporary directory '+tdir)
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
print
print 'No epub file specified'
return 1
process_file(args[1], options, logger)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -1,125 +0,0 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
"""
Convert .fb2 files to .lrf
"""
import os, sys, shutil, logging
from base64 import b64decode
from lxml import etree
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre import setup_cli_handlers
from calibre.resources import fb2_xsl
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.metadata.opf import OPFCreator
from calibre.ebooks.metadata import MetaInformation
# Build the command line option parser for fb22lrf: the shared LRF option
# parser with a translated usage message, plus two fb2 specific flags.
def option_parser():
parser = lrf_option_parser(
_('''%prog [options] mybook.fb2
%prog converts mybook.fb2 to mybook.lrf'''))
# NOTE(review): nothing in the fb22lrf code visible here reads
# debug_html_generation -- confirm that the flag still has an effect.
parser.add_option('--debug-html-generation', action='store_true', default=False,
dest='debug_html_generation', help=_('Print generated HTML to stdout and quit.'))
# Read by process_file(), which keeps the temporary directory when it is set.
parser.add_option('--keep-intermediate-files', action='store_true', default=False,
help=_('Keep generated HTML files after completing conversion to LRF.'))
return parser
def extract_embedded_content(doc):
    '''
    Write every embedded binary object of an FB2 document to a file in the
    current directory.

    Each child of the document root whose tag contains ``binary`` and that
    has an ``id`` attribute is base64 decoded and written to a file named
    after the id. Elements without any content are skipped, as are ids that
    would place the file outside the current directory.

    :param doc: Parsed document, providing an ``xpath`` method
    '''
    for elem in doc.xpath('./*'):
        if 'binary' not in elem.tag or 'id' not in elem.attrib:
            continue
        if not elem.text:
            # An empty element has nothing to decode
            continue
        fname = elem.attrib['id']
        # The id comes from the (untrusted) document. Never let it select a
        # location outside the directory we are extracting into.
        if os.path.isabs(fname) or os.path.splitdrive(fname)[0] or \
                os.pardir in fname.replace('\\', '/').split('/'):
            continue
        data = b64decode(elem.text.strip())
        with open(fname, 'wb') as f:
            f.write(data)
def to_html(fb2file, tdir):
fb2file = os.path.abspath(fb2file)
cwd = os.getcwd()
try:
os.chdir(tdir)
print 'Parsing XML...'
parser = etree.XMLParser(recover=True, no_network=True)
doc = etree.parse(fb2file, parser)
extract_embedded_content(doc)
print 'Converting XML to HTML...'
styledoc = etree.fromstring(fb2_xsl)
transform = etree.XSLT(styledoc)
result = transform(doc)
open('index.html', 'wb').write(transform.tostring(result))
try:
mi = get_metadata(open(fb2file, 'rb'), 'fb2')
except:
mi = MetaInformation(None, None)
if not mi.title:
mi.title = os.path.splitext(os.path.basename(fb2file))[0]
if not mi.authors:
mi.authors = [_('Unknown')]
opf = OPFCreator(tdir, mi)
opf.create_manifest([('index.html', None)])
opf.create_spine(['index.html'])
opf.render(open('metadata.opf', 'wb'))
return os.path.join(tdir, 'metadata.opf')
finally:
os.chdir(cwd)
def generate_html(fb2file, encoding, logger):
    '''
    Convert an FB2 file to HTML inside a newly created temporary directory.

    :param fb2file: Path to the FB2 file
    :param encoding: Unused by this function
    :param logger: Unused by this function
    :return: Path to the generated ``index.html``
    '''
    workdir = PersistentTemporaryDirectory('_fb22lrf')
    to_html(fb2file, workdir)
    return os.path.join(workdir, 'index.html')
def process_file(path, options, logger=None):
    '''
    Convert the FB2 file at `path` to LRF.

    The book is converted to HTML in a temporary directory, which is then
    handed to the HTML to LRF converter. Title, author, category and free
    text that were not given explicitly are taken from the book's metadata.
    The temporary directory is removed afterwards, unless the
    keep_intermediate_files option is set.

    :param path: Path to the FB2 file
    :param options: Parsed options; updated in place
    :param logger: Logger to use. A logger named ``fb22lrf`` is set up when
                   None is passed.
    '''
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('fb22lrf')
        setup_cli_handlers(logger, level)
    fb2 = os.path.abspath(os.path.expanduser(path))
    # The file is closed even when reading the metadata fails
    with open(fb2, 'rb') as f:
        mi = get_metadata(f, 'fb2')
    htmlfile = generate_html(fb2, options.encoding, logger)
    tdir = os.path.dirname(htmlfile)
    cwd = os.getcwdu()
    try:
        if not options.output:
            ext = '.lrs' if options.lrs else '.lrf'
            options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
        options.output = os.path.abspath(os.path.expanduser(options.output))
        if not mi.title:
            mi.title = os.path.splitext(os.path.basename(fb2))[0]
        if (not options.title or options.title == _('Unknown')):
            options.title = mi.title
        if (not options.author or options.author == _('Unknown')) and mi.authors:
            options.author = mi.authors.pop()
        if (not options.category or options.category == _('Unknown')) and mi.category:
            options.category = mi.category
        if (not options.freetext or options.freetext == _('Unknown')) and mi.comments:
            options.freetext = mi.comments
        os.chdir(tdir)
        html_process_file(htmlfile, options, logger)
    finally:
        os.chdir(cwd)
        if getattr(options, 'keep_intermediate_files', False):
            logger.debug('Intermediate files in '+ tdir)
        else:
            shutil.rmtree(tdir)
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
print
print 'No fb2 file specified'
return 1
process_file(args[1], options, logger)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -1,4 +0,0 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@ -1,59 +0,0 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Convert web feeds to LRF files.
'''
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.lrf.html.convert_from import process_file
from calibre.web.feeds.main import option_parser as feeds_option_parser
from calibre.web.feeds.main import run_recipe
from calibre.ptempfile import TemporaryDirectory
from calibre import sanitize_file_name, strftime
import sys, os
def option_parser():
    '''
    Build the command line option parser for feeds2lrf.

    Starts from the feeds2disk parser, removes the two options that
    feeds2lrf sets itself and merges in the LRF options. Both sets of
    options are presented as their own group.
    '''
    feeds_parser = feeds_option_parser()
    for unwanted in ('--output-dir', '--lrf'):
        feeds_parser.remove_option(unwanted)
    feeds_parser.subsume('FEEDS2DISK OPTIONS', _('Options to control the behavior of feeds2disk'))

    html2lrf_parser = lrf_option_parser('')
    html2lrf_parser.subsume('HTML2LRF OPTIONS', _('Options to control the behavior of html2lrf'))

    feeds_parser.merge(html2lrf_parser)
    return feeds_parser
def main(args=sys.argv, notification=None, handler=None):
parser = option_parser()
opts, args = parser.parse_args(args)
opts.lrf = True
if len(args) != 2 and opts.feeds is None:
parser.print_help()
return 1
recipe_arg = args[1] if len(args) > 1 else None
with TemporaryDirectory('_feeds2lrf') as tdir:
opts.output_dir = tdir
recipe = run_recipe(opts, recipe_arg, parser, notification=notification, handler=handler)
htmlfile = os.path.join(tdir, 'index.html')
if not os.access(htmlfile, os.R_OK):
raise RuntimeError(_('Fetching of recipe failed: ')+recipe_arg)
lparser = lrf_option_parser('')
ropts = lparser.parse_args(['html2lrf']+recipe.html2lrf_options)[0]
parser.merge_options(ropts, opts)
if not opts.output:
ext = '.lrs' if opts.lrs else '.lrf'
fname = recipe.title + strftime(recipe.timefmt)+ext
opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname))
print 'Generating LRF...'
process_file(htmlfile, opts)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -6,7 +6,7 @@ Code to convert HTML ebooks into LRF ebooks.
I am indebted to esperanc for the initial CSS->Xylog Style conversion code
and to Falstaff for pylrs.
"""
import os, re, sys, copy, glob, logging, tempfile
import os, re, sys, copy, glob, tempfile
from collections import deque
from urllib import unquote
from urlparse import urlparse
@ -16,6 +16,7 @@ from calibre.customize.ui import run_plugins_on_postprocess
try:
from PIL import Image as PILImage
PILImage
except ImportError:
import Image as PILImage
@ -26,14 +27,13 @@ from calibre.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
LrsError, Sup, Sub, EmpLine
from calibre.ebooks.lrf.pylrs.pylrs import Span
from calibre.ebooks.lrf import Book, entity_to_unicode
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.lrf import Book
from calibre.ebooks import ConversionError
from calibre.ebooks.lrf.html.table import Table
from calibre import filename_to_utf8, setup_cli_handlers, __appname__, \
fit_image, LoggingInterface, preferred_encoding
from calibre import filename_to_utf8, __appname__, \
fit_image, preferred_encoding, entity_to_unicode
from calibre.ptempfile import PersistentTemporaryFile
from calibre.devices.interface import Device
from calibre.devices.interface import DevicePlugin as Device
from calibre.ebooks.lrf.html.color_map import lrs_color
from calibre.ebooks.chardet import xml_to_unicode
@ -78,7 +78,7 @@ def tag_regex(tagname):
return dict(open=r'(?:<\s*%(t)s\s+[^<>]*?>|<\s*%(t)s\s*>)'%dict(t=tagname), \
close=r'</\s*%(t)s\s*>'%dict(t=tagname))
class HTMLConverter(object, LoggingInterface):
class HTMLConverter(object):
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
@ -213,7 +213,7 @@ class HTMLConverter(object, LoggingInterface):
'''
# Defaults for various formatting tags
object.__setattr__(self, 'options', options)
LoggingInterface.__init__(self, logger)
self.log = logger
self.fonts = fonts #: dict specifying font families to use
# Memory
self.scaled_images = {} #: Temporary files with scaled version of images
@ -288,9 +288,9 @@ class HTMLConverter(object, LoggingInterface):
if link['path'] == path:
self.links.remove(link)
break
self.log_warn('Could not process '+path)
self.log.warn('Could not process '+path)
if self.verbose:
self.log_exception(' ')
self.log.exception(' ')
self.links = self.process_links()
self.link_level += 1
paths = [link['path'] for link in self.links]
@ -302,7 +302,7 @@ class HTMLConverter(object, LoggingInterface):
self.book.addTocEntry(text, tb)
if self.base_font_size > 0:
self.log_info('\tRationalizing font sizes...')
self.log.info('\tRationalizing font sizes...')
self.book.rationalize_font_sizes(self.base_font_size)
def is_baen(self, soup):
@ -318,9 +318,9 @@ class HTMLConverter(object, LoggingInterface):
if not self.book_designer and self.is_book_designer(raw):
self.book_designer = True
self.log_info(_('\tBook Designer file detected.'))
self.log.info(_('\tBook Designer file detected.'))
self.log_info(_('\tParsing HTML...'))
self.log.info(_('\tParsing HTML...'))
if self.baen:
nmassage.extend(HTMLConverter.BAEN)
@ -343,7 +343,7 @@ class HTMLConverter(object, LoggingInterface):
raise
if not self.baen and self.is_baen(soup):
self.baen = True
self.log_info(_('\tBaen file detected. Re-parsing...'))
self.log.info(_('\tBaen file detected. Re-parsing...'))
return self.preprocess(raw)
if self.book_designer:
t = soup.find(id='BookTitle')
@ -359,7 +359,7 @@ class HTMLConverter(object, LoggingInterface):
try:
dump = open(os.path.join(tdir, 'html2lrf-verbose.html'), 'wb')
dump.write(unicode(soup).encode('utf-8'))
self.log_info(_('Written preprocessed HTML to ')+dump.name)
self.log.info(_('Written preprocessed HTML to ')+dump.name)
dump.close()
except:
pass
@ -377,7 +377,7 @@ class HTMLConverter(object, LoggingInterface):
upath = path.encode(sys.getfilesystemencoding()) if isinstance(path, unicode) else path
self.file_name = os.path.basename(upath.decode(sys.getfilesystemencoding()))
self.log_info(_('Processing %s'), repr(upath) if self.verbose else repr(self.file_name))
self.log.info(_('Processing %s')%( repr(upath) if self.verbose else repr(self.file_name)))
if not os.path.exists(upath):
upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names
@ -391,7 +391,7 @@ class HTMLConverter(object, LoggingInterface):
raw = xml_to_unicode(raw, self.verbose)[0]
f.close()
soup = self.preprocess(raw)
self.log_info(_('\tConverting to BBeB...'))
self.log.info(_('\tConverting to BBeB...'))
self.current_style = {}
self.page_break_found = False
if not isinstance(path, unicode):
@ -542,7 +542,7 @@ class HTMLConverter(object, LoggingInterface):
try:
index = self.book.pages().index(opage)
except ValueError:
self.log_warning(_('%s is an empty file')%self.file_name)
self.log.warning(_('%s is an empty file')%self.file_name)
tb = self.book.create_text_block()
self.current_page.append(tb)
return tb
@ -606,7 +606,7 @@ class HTMLConverter(object, LoggingInterface):
hasattr(target.parent, 'objId'):
self.book.addTocEntry(ascii_text, tb)
else:
self.log_debug(_("Cannot add link %s to TOC"), ascii_text)
self.log.debug(_("Cannot add link %s to TOC")%ascii_text)
def get_target_block(fragment, targets):
@ -937,7 +937,7 @@ class HTMLConverter(object, LoggingInterface):
try:
im = PILImage.open(path)
except IOError, err:
self.log_warning('Unable to process image: %s\n%s', original_path, err)
self.log.warning('Unable to process image: %s\n%s'%( original_path, err))
return
encoding = detect_encoding(im)
@ -955,7 +955,7 @@ class HTMLConverter(object, LoggingInterface):
self.scaled_images[path] = pt
return pt.name
except (IOError, SystemError), err: # PIL chokes on interlaced PNG images as well a some GIF images
self.log_warning(_('Unable to process image %s. Error: %s')%(path, err))
self.log.warning(_('Unable to process image %s. Error: %s')%(path, err))
if width == None or height == None:
width, height = im.size
@ -1000,7 +1000,7 @@ class HTMLConverter(object, LoggingInterface):
self.rotated_images[path] = pt
width, height = im.size
except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
self.log_debug(_('Unable to process interlaced PNG %s'), original_path)
self.log.debug(_('Unable to process interlaced PNG %s')% original_path)
finally:
pt.close()
@ -1015,7 +1015,8 @@ class HTMLConverter(object, LoggingInterface):
try:
self.images[path] = ImageStream(path, encoding=encoding)
except LrsError, err:
self.log_warning(_('Could not process image: %s\n%s'), original_path, err)
self.log.warning(_('Could not process image: %s\n%s')%(
original_path, err))
return
im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
@ -1077,7 +1078,7 @@ class HTMLConverter(object, LoggingInterface):
if number_of_paragraphs > 2:
self.end_page()
self.log_debug('Forcing page break at %s', tagname)
self.log.debug('Forcing page break at %s'%tagname)
return end_page
def block_properties(self, tag_css):
@ -1467,7 +1468,7 @@ class HTMLConverter(object, LoggingInterface):
(self.chapter_attr[1].lower() == 'none' or \
(tag.has_key(self.chapter_attr[1]) and \
self.chapter_attr[2].match(tag[self.chapter_attr[1]])))):
self.log_debug('Detected chapter %s', tagname)
self.log.debug('Detected chapter %s'%tagname)
self.end_page()
self.page_break_found = True
@ -1507,7 +1508,7 @@ class HTMLConverter(object, LoggingInterface):
self.targets[self.target_prefix+tag[key]] = self.current_block
self.current_block.must_append = True
else:
self.log_debug('Could not follow link to '+tag['href'])
self.log.debug('Could not follow link to '+tag['href'])
self.process_children(tag, tag_css, tag_pseudo_css)
elif tag.has_key('name') or tag.has_key('id'):
self.process_anchor(tag, tag_css, tag_pseudo_css)
@ -1529,9 +1530,9 @@ class HTMLConverter(object, LoggingInterface):
self.process_image(path, tag_css, width, height,
dropcaps=dropcaps, rescale=True)
elif not urlparse(tag['src'])[0]:
self.log_warn('Could not find image: '+tag['src'])
self.log.warn('Could not find image: '+tag['src'])
else:
self.log_debug("Failed to process: %s", str(tag))
self.log.debug("Failed to process: %s"%str(tag))
elif tagname in ['style', 'link']:
ncss, npcss = {}, {}
if tagname == 'style':
@ -1552,7 +1553,7 @@ class HTMLConverter(object, LoggingInterface):
self.page_break_found = True
ncss, npcss = self.parse_css(src)
except IOError:
self.log_warn('Could not read stylesheet: '+tag['href'])
self.log.warn('Could not read stylesheet: '+tag['href'])
if ncss:
update_css(ncss, self.css)
self.css.update(self.override_css)
@ -1687,7 +1688,7 @@ class HTMLConverter(object, LoggingInterface):
if not self.disable_chapter_detection and tagname.startswith('h'):
if self.chapter_regex.search(src):
self.log_debug('Detected chapter %s', src)
self.log.debug('Detected chapter %s'%src)
self.end_page()
self.page_break_found = True
@ -1769,9 +1770,9 @@ class HTMLConverter(object, LoggingInterface):
try:
self.process_table(tag, tag_css)
except Exception, err:
self.log_warning(_('An error occurred while processing a table: %s. Ignoring table markup.'), repr(err))
self.log_debug('', exc_info=True)
self.log_debug(_('Bad table:\n%s'), unicode(tag)[:300])
self.log.warning(_('An error occurred while processing a table: %s. Ignoring table markup.')%repr(err))
self.log.exception('')
self.log.debug(_('Bad table:\n%s')%unicode(tag)[:300])
self.in_table = False
self.process_children(tag, tag_css, tag_pseudo_css)
finally:
@ -1821,14 +1822,7 @@ class HTMLConverter(object, LoggingInterface):
for _file in self.scaled_images.values() + self.rotated_images.values():
_file.__del__()
def process_file(path, options, logger=None):
if re.match('http://|https://', path):
raise ConversionError, _('You have to save the website %s as an html file first and then run html2lrf on it.')%(path,)
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('html2lrf')
setup_cli_handlers(logger, level)
def process_file(path, options, logger):
if not isinstance(path, unicode):
path = path.decode(sys.getfilesystemencoding())
path = os.path.abspath(path)
@ -1939,7 +1933,6 @@ def process_file(path, options, logger=None):
oname = os.path.abspath(os.path.expanduser(oname))
conv.writeto(oname, lrs=options.lrs)
run_plugins_on_postprocess(oname, 'lrf')
conv.log_info('Output written to %s', oname)
conv.cleanup()
return oname
@ -1998,38 +1991,3 @@ def try_opf(path, options, logger):
def option_parser():
    '''
    Build the command line option parser for html2lrf.

    The usage text is passed through `_` for translation and handed to
    `lrf_option_parser`, whose result is returned unchanged.
    '''
    usage = _('''Usage: %prog [options] mybook.html
%prog converts mybook.html to mybook.lrf.
%prog follows all links in mybook.html that point
to local files recursively. Thus, you can use it to
convert a whole tree of HTML files.''')
    return lrf_option_parser(usage)
def main(args=sys.argv):
try:
parser = option_parser()
options, args = parser.parse_args(args)
if options.output:
options.output = os.path.abspath(os.path.expanduser(options.output))
if len(args) != 2:
parser.print_help()
return 1
src = args[1]
if options.verbose:
import warnings
warnings.defaultaction = 'error'
except Exception, err:
print >> sys.stderr, err
return 1
if not isinstance(src, unicode):
src = src.decode(sys.getfilesystemencoding())
process_file(src, options)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -1,3 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@ -1,90 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, sys, shutil, glob, logging
from tempfile import mkdtemp
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks import ConversionError
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre.ebooks.metadata.opf import OPFReader
from calibre import __appname__, setup_cli_handlers
def option_parser():
    '''
    Build the command line option parser for lit2lrf.

    Returns the parser created by `lrf_option_parser` for the (translated)
    lit2lrf usage text.
    '''
    usage = _('''Usage: %prog [options] mybook.lit
%prog converts mybook.lit to mybook.lrf''')
    return lrf_option_parser(usage)
def generate_html(pathtolit, logger):
if not os.access(pathtolit, os.R_OK):
raise ConversionError, 'Cannot read from ' + pathtolit
tdir = mkdtemp(prefix=__appname__+'_'+'lit2oeb_')
lr = LitReader(pathtolit)
print 'Extracting LIT file to', tdir
lr.extract_content(tdir)
return tdir
def process_file(path, options, logger=None):
    '''
    Convert the LIT file at `path` to LRF (or LRS).

    The LIT file is extracted to a temporary directory, the entry HTML file
    is located and passed to the HTML to LRF converter. The temporary
    directory is removed afterwards on a best-effort basis.

    :param path: Path to the LIT file.
    :param options: Parsed lrf options. `output`, `use_spine` and (when an
                    OPF file is found) `opf` are set on it.
    :param logger: Logger to use. If None, a 'lit2lrf' logger with CLI
                   handlers is created.
    :raises ConversionError: If no HTML (or text) file can be found in the
                             extracted contents.
    '''
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('lit2lrf')
        setup_cli_handlers(logger, level)
    lit = os.path.abspath(os.path.expanduser(path))
    tdir = generate_html(lit, logger)
    try:
        opf = glob.glob(os.path.join(tdir, '*.opf'))
        if opf:
            # NOTE: `path` is rebound here, so a default output name below is
            # derived from the OPF file name rather than the LIT file name.
            path = opf[0]
            opf = OPFReader(path)
            htmlfile = opf.spine[0].path.replace('&', '%26') #convertlit replaces & with %26
            options.opf = path
        else:
            htmlfile = None
            # Files that look like a table of contents are used directly
            for pattern in ('*toc*.htm*', '*top*.htm*', '*contents*.htm*'):
                l = glob.glob(os.path.join(tdir, pattern))
                if l:
                    htmlfile = l[0]
                    break
            if htmlfile is None:
                l = glob.glob(os.path.join(tdir, '*.htm*'))
                if not l:
                    l = glob.glob(os.path.join(tdir, '*.txt*')) # Some lit file apparently have .txt files in them
                if not l:
                    raise ConversionError('Conversion of lit to html failed. Cannot find html file.')
                # Otherwise use the largest file. max() returns the first
                # maximal entry, so when all files are empty we still get a
                # usable path instead of None.
                htmlfile = max(l, key=os.path.getsize)

        if not options.output:
            ext = '.lrs' if options.lrs else '.lrf'
            options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
        options.output = os.path.abspath(os.path.expanduser(options.output))
        options.use_spine = True
        html_process_file(htmlfile, options, logger=logger)
    finally:
        try:
            shutil.rmtree(tdir)
        except Exception:
            # Cleanup is best effort, but do not swallow KeyboardInterrupt
            # or SystemExit as a bare except would.
            logger.warning('Failed to delete temporary directory '+tdir)
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
print
print 'No lit file specified'
return 1
process_file(args[1], options, logger)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -229,6 +229,9 @@ def get_metadata(stream):
mi.author = lrf.author.strip()
mi.comments = lrf.free_text.strip()
mi.category = lrf.category.strip()+', '+lrf.classification.strip()
tags = [x.strip() for x in mi.category.split(',') if x.strip()]
if tags:
mi.tags = tags
mi.publisher = lrf.publisher.strip()
mi.cover_data = lrf.get_cover()
try:
@ -624,7 +627,9 @@ def set_metadata(stream, mi):
lrf.title = mi.title
if mi.authors:
lrf.author = ', '.join(mi.authors)
if mi.category:
if mi.tags:
lrf.category = mi.tags[0]
if getattr(mi, 'category', False):
lrf.category = mi.category
if mi.comments:
lrf.free_text = mi.comments

View File

@ -1,63 +0,0 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
''''''
import sys, tempfile, os, logging, shutil
from calibre import setup_cli_handlers, __appname__
from calibre.ebooks.mobi.reader import MobiReader
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
def generate_html(mobifile, tdir):
    '''
    Extract the MOBI file `mobifile` into the directory `tdir`.

    :return: The `htmlfile` attribute of the reader after extraction.
    '''
    reader = MobiReader(mobifile)
    reader.extract_content(tdir)
    return reader.htmlfile
def process_file(path, options, logger=None):
    '''
    Convert the MOBI file at `path` to LRF (or LRS).

    The file is extracted to a temporary directory, the resulting HTML is
    passed to the HTML to LRF converter and the temporary directory is
    removed afterwards on a best-effort basis.

    :param path: Path to the MOBI/PRC file.
    :param options: Parsed lrf options. `output` and `use_spine` are set
                    on it.
    :param logger: Logger to use. If None, a 'mobi2lrf' logger with CLI
                   handlers is created.
    '''
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        # This used to be 'lit2lrf', copied over from the LIT converter
        logger = logging.getLogger('mobi2lrf')
        setup_cli_handlers(logger, level)
    mobi = os.path.abspath(os.path.expanduser(path))
    tdir = tempfile.mkdtemp('mobi2lrf', __appname__)
    try:
        htmlfile = generate_html(mobi, tdir)
        if not options.output:
            ext = '.lrs' if options.lrs else '.lrf'
            options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
        options.output = os.path.abspath(os.path.expanduser(options.output))
        options.use_spine = True
        html_process_file(htmlfile, options, logger=logger)
    finally:
        try:
            shutil.rmtree(tdir)
        except Exception:
            # Cleanup is best effort, but do not swallow KeyboardInterrupt
            # or SystemExit as a bare except would.
            logger.warning('Failed to delete temporary directory '+tdir)
def option_parser():
    '''
    Build the command line option parser for mobi2lrf.

    Returns the parser created by `lrf_option_parser` for the (translated)
    mobi2lrf usage text.
    '''
    usage = _('''Usage: %prog [options] mybook.mobi|prc
%prog converts mybook.mobi to mybook.lrf''')
    return lrf_option_parser(usage)
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
print
print 'No mobi file specified'
return 1
process_file(args[1], options, logger)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -2,7 +2,8 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import struct, array, zlib, cStringIO, collections, re
from calibre.ebooks.lrf import LRFParseError, PRS500_PROFILE, entity_to_unicode
from calibre.ebooks.lrf import LRFParseError, PRS500_PROFILE
from calibre import entity_to_unicode
from calibre.ebooks.lrf.tags import Tag
ruby_tags = {

View File

@ -0,0 +1,170 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os
from calibre.customize.conversion import OutputFormatPlugin
from calibre.customize.conversion import OptionRecommendation
class LRFOptions(object):
    '''
    Options object handed to the HTML to LRF converter by the LRF output
    plugin.

    Most attributes are fixed values, the rest are copied (and where needed
    scaled by the profile's dpi) from the conversion pipeline options.
    '''

    def __init__(self, output, opts, oeb):
        '''
        :param output: Path the LRF file is written to.
        :param opts: Conversion pipeline options.
        :param oeb: The book being converted, only its `metadata` is read.
        '''
        def f2s(f):
            # First entry of a metadata field as unicode, '' if there is none
            try:
                return unicode(f[0])
            except Exception:
                return ''

        m = oeb.metadata
        self.title = None
        self.author = self.publisher = _('Unknown')
        self.freetext = f2s(m.description)
        self.category = f2s(m.tags)
        self.title_sort = self.author_sort = ''
        self.cover = None
        self.use_metadata_cover = True
        self.output = output
        self.ignore_tables = opts.linearize_tables
        self.base_font_size = 0
        self.blank_after_para = opts.insert_blank_line
        self.use_spine = True
        self.font_delta = 0
        self.ignore_colors = False
        from calibre.ebooks.lrf import PRS500_PROFILE
        self.profile = PRS500_PROFILE
        self.link_levels = sys.maxint
        self.link_exclude = '@'
        self.no_links_in_toc = True
        self.disable_chapter_detection = True
        self.chapter_regex = 'dsadcdswcdec'
        self.chapter_attr = '$,,$'
        self.override_css = self._override_css = ''
        self.page_break = 'h[12]'
        self.force_page_break = '$'
        self.force_page_break_attr = '$'
        self.add_chapters_to_toc = False
        self.baen = self.pdftohtml = self.book_designer = False
        self.verbose = opts.verbose
        self.encoding = 'utf-8'
        self.lrs = False
        self.minimize_memory_usage = False
        self.autorotation = opts.enable_autorotation

        # Lengths from the pipeline options are multiplied by dpi/72
        scale = self.profile.dpi/72.
        self.header_separation = scale * opts.header_separation
        for x in ('top', 'bottom', 'left', 'right'):
            setattr(self, x+'_margin', scale * getattr(opts, 'margin_'+x))

        # These are passed through unchanged
        for x in ('wordspace', 'header', 'header_format',
                'minimum_indent', 'serif_family',
                'render_tables_as_images', 'sans_family', 'mono_family',
                'text_size_multiplier_for_rendered_tables'):
            setattr(self, x, getattr(opts, x))
class LRFOutput(OutputFormatPlugin):
    '''
    Output plugin that creates LRF files.

    Image collections are written directly as books with one image per
    page. Everything else is first written out via the 'oeb' output plugin
    and the resulting OPF is run through the HTML to LRF converter.
    '''

    name = 'LRF Output'
    author = 'Kovid Goyal'
    file_type = 'lrf'

    options = set([
        OptionRecommendation(name='enable_autorotation', recommended_value=False,
            help=_('Enable autorotation of images that are wider than the screen width.')
        ),
        OptionRecommendation(name='wordspace',
            recommended_value=2.5, level=OptionRecommendation.LOW,
            help=_('Set the space between words in pts. Default is %default')
        ),
        OptionRecommendation(name='header', recommended_value=False,
            help=_('Add a header to all the pages with title and author.')
        ),
        OptionRecommendation(name='header_format', recommended_value="%t by %a",
            help=_('Set the format of the header. %a is replaced by the author '
            'and %t by the title. Default is %default')
        ),
        OptionRecommendation(name='header_separation', recommended_value=0,
            help=_('Add extra spacing below the header. Default is %default pt.')
        ),
        OptionRecommendation(name='minimum_indent', recommended_value=0,
            help=_('Minimum paragraph indent (the indent of the first line '
            'of a paragraph) in pts. Default: %default')
        ),
        OptionRecommendation(name='render_tables_as_images',
            recommended_value=False,
            help=_('Render tables in the HTML as images (useful if the '
                'document has large or complex tables)')
        ),
        OptionRecommendation(name='text_size_multiplier_for_rendered_tables',
            recommended_value=1.0,
            help=_('Multiply the size of text in rendered tables by this '
            'factor. Default is %default')
        ),
        OptionRecommendation(name='serif_family', recommended_value=None,
            help=_('The serif family of fonts to embed')
        ),
        OptionRecommendation(name='sans_family', recommended_value=None,
            help=_('The sans-serif family of fonts to embed')
        ),
        OptionRecommendation(name='mono_family', recommended_value=None,
            help=_('The monospace family of fonts to embed')
        ),

        ])

    recommendations = set([
        ('dont_justify', True, OptionRecommendation.HIGH),
        ])

    def convert_images(self, pages, opts, wide):
        '''
        Write a book consisting of one full screen image per page.

        :param pages: Iterable of images, each is passed to `ImageStream`.
        :param opts: Options providing `title`, `author` and `output`.
        :param wide: If True use a 784x1012 page, otherwise 584x754.
        '''
        from calibre.ebooks.lrf.pylrs.pylrs import Book, BookSetting, ImageStream, ImageBlock
        from uuid import uuid4
        from calibre.constants import __appname__, __version__

        width, height = (784, 1012) if wide else (584, 754)

        ps = {}
        ps['topmargin'] = 0
        ps['evensidemargin'] = 0
        ps['oddsidemargin'] = 0
        ps['textwidth'] = width
        ps['textheight'] = height
        book = Book(title=opts.title, author=opts.author,
                bookid=uuid4().hex,
                publisher='%s %s'%(__appname__, __version__),
                category=_('Comic'), pagestyledefault=ps,
                booksetting=BookSetting(screenwidth=width, screenheight=height))
        for page in pages:
            imageStream = ImageStream(page)
            _page = book.create_page()
            _page.append(ImageBlock(refstream=imageStream,
                        blockwidth=width, blockheight=height, xsize=width,
                        ysize=height, x1=width, y1=height))
            book.append(_page)

        # Close the output file deterministically instead of relying on
        # garbage collection to flush it.
        with open(opts.output, 'wb') as out:
            book.renderLrf(out)

    def convert(self, oeb, output_path, input_plugin, opts, log):
        '''
        Convert `oeb` to an LRF file at `output_path`.

        :param input_plugin: The plugin that produced `oeb`. If its
            `is_image_collection` is true, its images are converted
            directly via `convert_images`.
        '''
        self.log, self.opts, self.oeb = log, opts, oeb

        lrf_opts = LRFOptions(output_path, opts, oeb)

        if input_plugin.is_image_collection:
            self.convert_images(input_plugin.get_images(), lrf_opts,
                    getattr(opts, 'wide', False))
            return

        from calibre.ptempfile import TemporaryDirectory
        with TemporaryDirectory('_lrf_output') as tdir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            # The first OPF found in the OEB output is the conversion source
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            from calibre.ebooks.lrf.html.convert_from import process_file
            process_file(os.path.join(tdir, opf), lrf_opts, self.log)

View File

@ -1,2 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@ -1,131 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
''''''
import sys, os, subprocess, logging
import errno
from functools import partial
from calibre import isosx, setup_cli_handlers, filename_to_utf8, iswindows, islinux
from calibre.ebooks import ConversionError, DRMError
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
from calibre.ebooks.metadata.pdf import get_metadata
# Command used to run pdftohtml. Defaults to looking it up by name and is
# replaced by a full path in the frozen/bundled builds detected below.
PDFTOHTML = 'pdftohtml'
# Callable used to launch pdftohtml, subprocess.Popen unless overridden below
popen = subprocess.Popen
if isosx and hasattr(sys, 'frameworks_dir'):
    PDFTOHTML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOHTML)
if iswindows and hasattr(sys, 'frozen'):
    PDFTOHTML = os.path.join(os.path.dirname(sys.executable), 'pdftohtml.exe')
    popen = partial(subprocess.Popen, creationflags=0x08) # CREATE_NO_WINDOW=0x08 so that no ugly console is popped up
if islinux and getattr(sys, 'frozen_path', False):
    PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml')
def generate_html(pathtopdf, tdir):
'''
Convert the pdf into html.
@return: Path to a temporary file containing the HTML.
'''
if isinstance(pathtopdf, unicode):
pathtopdf = pathtopdf.encode(sys.getfilesystemencoding())
if not os.access(pathtopdf, os.R_OK):
raise ConversionError, 'Cannot read from ' + pathtopdf
index = os.path.join(tdir, 'index.html')
# This is neccessary as pdftohtml doesn't always (linux) respect absolute paths
pathtopdf = os.path.abspath(pathtopdf)
cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
'-nodrm', pathtopdf, os.path.basename(index))
cwd = os.getcwd()
try:
os.chdir(tdir)
try:
p = popen(cmd, stderr=subprocess.PIPE)
except OSError, err:
if err.errno == 2:
raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True)
else:
raise
'''
print p.stdout.read()
'''
while True:
try:
ret = p.wait()
break
except OSError, e:
if e.errno == errno.EINTR:
continue
else:
raise
if ret != 0:
err = p.stderr.read()
raise ConversionError, err
if not os.path.exists(index) or os.stat(index).st_size < 100:
raise DRMError()
raw = open(index, 'rb').read()
open(index, 'wb').write('<!-- created by calibre\'s pdftohtml -->\n'+raw)
if not '<br' in raw[:4000]:
raise ConversionError(os.path.basename(pathtopdf) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True)
try:
mi = get_metadata(open(pathtopdf, 'rb'))
except:
mi = MetaInformation(None, None)
if not mi.title:
mi.title = os.path.splitext(os.path.basename(pathtopdf))[0]
if not mi.authors:
mi.authors = [_('Unknown')]
opf = OPFCreator(tdir, mi)
opf.create_manifest([('index.html', None)])
opf.create_spine(['index.html'])
opf.render(open('metadata.opf', 'wb'))
finally:
os.chdir(cwd)
return index
def option_parser():
    '''
    Build the command line option parser for pdf2lrf.

    Returns the parser created by `lrf_option_parser` for the (translated)
    pdf2lrf usage text.
    '''
    usage = _('''%prog [options] mybook.pdf
%prog converts mybook.pdf to mybook.lrf''')
    return lrf_option_parser(usage)
def process_file(path, options, logger=None):
    '''
    Convert the PDF file at `path` to LRF (or LRS).

    The PDF is converted to HTML in a persistent temporary directory and
    the HTML is then passed to the HTML to LRF converter.

    :param path: Path to the PDF file.
    :param options: Parsed lrf options. `output`, `pdftohtml` and, if
                    unset, `title` are set on it.
    :param logger: Logger to use. If None, a 'pdf2lrf' logger with CLI
                   handlers is created.
    '''
    if logger is None:
        if options.verbose:
            level = logging.DEBUG
        else:
            level = logging.INFO
        logger = logging.getLogger('pdf2lrf')
        setup_cli_handlers(logger, level)
    pdf = os.path.abspath(os.path.expanduser(path))
    tdir = PersistentTemporaryDirectory('_pdf2lrf')
    htmlfile = generate_html(pdf, tdir)
    if options.output:
        options.output = os.path.abspath(options.output)
    else:
        # Default to <name of the pdf>.lrf (or .lrs) in the current directory
        ext = '.lrs' if options.lrs else '.lrf'
        stem = os.path.basename(os.path.splitext(path)[0])
        options.output = os.path.abspath(stem + ext)
    options.pdftohtml = True
    if not options.title:
        output_name = os.path.basename(options.output)
        options.title = filename_to_utf8(os.path.splitext(output_name)[0])
    html_process_file(htmlfile, options, logger)
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
print
print 'No pdf file specified'
return 1
process_file(args[1], options, logger)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -1,426 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Convert PDF to a reflowable format using pdftoxml.exe as the PDF parsing backend.
'''
import sys, os, re, tempfile, subprocess, atexit, shutil, logging, xml.parsers.expat
from xml.etree.ElementTree import parse
from calibre import isosx, setup_cli_handlers, __appname__
from calibre.utils.config import OptionParser
from calibre.ebooks import ConversionError
# Command used to run the pdftoxml backend. Looked up by name, except in
# the OS X build that defines sys.frameworks_dir where the full path is used.
PDFTOXML = 'pdftoxml.exe'
if isosx and hasattr(sys, 'frameworks_dir'):
    PDFTOXML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOXML)
class StyleContainer(object):
    '''
    Mixin for objects that hoist the most common style of their children
    onto themselves.
    '''

    def set_style(self, iterator):
        '''
        Set `self.style` to the most frequent style among the objects in
        `iterator` and reset the style of every object that uses it to None.

        Objects without a `style` attribute, or whose style is None, are
        ignored. If no object has a style, `self.style` is set to None. Ties
        are resolved in favour of the style encountered first.

        Styles are compared with == rather than hashed, so style classes
        only need to implement equality.
        '''
        items = list(iterator)
        styles, counts = [], []
        for tok in items:
            style = getattr(tok, 'style', None)
            if style is None:
                continue
            for i, known in enumerate(styles):
                if known == style:
                    counts[i] += 1
                    break
            else:
                styles.append(style)
                counts.append(1)

        if not styles:
            self.style = None
            return

        # index() returns the first maximal count, i.e. the earliest style
        self.style = styles[counts.index(max(counts))]
        for obj in items:
            if getattr(obj, 'style', None) == self.style:
                obj.style = None
class Page(object):
    '''
    A single page of the PDF: its geometry, the lines of text on it and the
    groups those lines are partitioned into.
    '''

    def __init__(self, attrs):
        '''
        :param attrs: Attributes of the PAGE element. 'number', 'width' and
                      'height' are stored as floats, 'id' is stored as is.
        '''
        for a in ('number', 'width', 'height'):
            setattr(self, a, float(attrs[a]))
        self.id = attrs['id']
        self.current_line = None
        self.lines = []

    def end_line(self):
        '''Finalize the line currently being built (if any) and store it.'''
        if self.current_line is not None:
            self.current_line.finalize()
            self.lines.append(self.current_line)
            self.current_line = None

    def finalize(self):
        '''Called when the page is complete, computes the groups and
        `page_break_after`.'''
        # Flush the last line of the page, which would otherwise be lost as
        # lines are only ended when the next one starts.
        self.end_line()
        self.identify_groups()
        self.look_for_page_break()

    def identify_groups(self):
        '''
        Partition `self.lines` into `self.groups`. A new group is started at
        every line whose left edge and width both differ from those of the
        previous line. If there is more than one group, the first is tested
        for being a header and the last for being a footer.
        '''
        starts = []
        for i, cl in enumerate(self.lines):
            if i == 0:
                starts.append(i)
                continue
            pl = self.lines[i-1]
            if cl.left != pl.left and cl.width != pl.width:
                starts.append(i)

        # Each group runs up to the start of the next one (or the last line)
        stops = starts[1:] + [len(self.lines)]
        self.groups = [Group(self.lines[start:stop])
                       for start, stop in zip(starts, stops)]

        if len(self.groups) > 1:
            self.groups[0].test_header(self.width, self.height)
            self.groups[-1].test_footer(self.width, self.height)

    def look_for_page_break(self):
        '''
        Set `page_break_after` to True if the lowest non footer group ends
        in the upper 80% of the page.
        '''
        lowest = 0
        for g in self.groups:
            # is_footer is only set on groups that were tested for it
            if not getattr(g, 'is_footer', False) and g.bottom > lowest:
                lowest = g.bottom
        self.page_break_after = lowest < 0.8*self.height
class Group(StyleContainer):
    '''
    A run of consecutive lines on a page, with aggregate geometry and
    paragraph start detection.
    '''

    def __init__(self, lines):
        '''
        :param lines: Non empty list of lines, each providing `left`,
                      `width`, `top` and `bottom`.
        '''
        self.lines = lines
        # Only the first/last group of a page is ever tested, so default
        # these to False for all other groups.
        self.is_header = self.is_footer = False
        self.set_style(self.lines)
        self.width = max([i.width for i in self.lines])
        self.bottom = max([i.bottom for i in self.lines])
        tot, ltot = 0, 0
        for i in range(1, len(self.lines)):
            bot = self.lines[i-1].bottom
            top = self.lines[i].top
            tot += abs(top - bot)
            ltot += self.lines[i].left
        # A single line has no line spacing, avoid dividing by zero
        gaps = len(self.lines) - 1
        self.average_line_spacing = tot/float(gaps) if gaps else 0.0
        ltot += self.lines[0].left
        self.average_left_margin = ltot/float(len(self.lines))
        self.left_margin = min([i.left for i in self.lines])
        self.detect_paragraphs()

    def detect_paragraphs(self):
        '''
        Set `is_para_start` on every line. A line starts a paragraph if it
        is indented beyond the average left margin, the previous line is
        shorter than 80% of the group width, or the vertical gap test below
        succeeds.
        '''
        if not self.lines:
            return
        indent_buffer = 5
        self.lines[0].is_para_start = self.lines[0].left > self.average_left_margin+indent_buffer
        for i in range(1, len(self.lines)):
            pl, l = self.lines[i-1:i+1]
            # NOTE(review): the average spacing is computed from
            # abs(top - bottom), while this compares the signed value
            # bottom - top. Confirm the sign against the coordinate system.
            c1 = pl.bottom - l.top > self.average_line_spacing
            c2 = l.left > self.average_left_margin+indent_buffer
            c3 = pl.width < 0.8 * self.width
            l.is_para_start = c1 or c2 or c3

    def test_header(self, page_width, page_height):
        '''Mark this group as a header if it is a single line narrower than
        half the page.'''
        self.is_header = len(self.lines) == 1 and self.lines[0].width < 0.5*page_width

    def test_footer(self, page_width, page_height):
        '''Mark this group as a footer if it is a single line narrower than
        half the page.'''
        self.is_footer = len(self.lines) == 1 and self.lines[0].width < 0.5*page_width
class Text(object):
def __init__(self, attrs):
for a in ('x', 'y', 'width', 'height'):
setattr(self, a, float(attrs[a]))
self.id = attrs['id']
self.objects = []
def add_token(self, tok):
if not self.objects:
self.objects.append(tok)
else:
ptok = self.objects[-1]
if tok == ptok:
ptok.text += ' ' + tok.text
else:
self.objects.append(tok)
def add(self, object):
if isinstance(object, Token):
self.add_token(object)
else:
print 'WARNING: Unhandled object', object.__class__.__name__
def to_xhtml(self):
res = []
for obj in self.objects:
if isinstance(obj, Token):
res.append(obj.to_xhtml())
return ' '.join(res)
class Line(list, StyleContainer):
    '''
    A line of text: a list of the text boxes that make it up.
    '''

    def calculate_geometry(self):
        '''Compute `left`, `width`, `top` and `bottom` from the contained
        text boxes. The line must not be empty.'''
        first, last = self[0], self[-1]
        self.left = first.x
        self.width = last.x + last.width - self.left
        self.top = min(o.y for o in self)
        self.bottom = max(o.height+o.y for o in self)

    def finalize(self):
        '''Called once the line is complete.'''
        self.calculate_geometry()
        self.set_style(self)

    def to_xhtml(self, group_id):
        '''Return this line as a <span> with class `group_id`, carrying an
        inline style if the line has one.'''
        pieces = ['<span class="%s" '%group_id]
        if self.style is not None:
            pieces.append('style="%s"'%self.style.to_css(inline=True))
        pieces.append('>%s</span>')
        template = ''.join(pieces)
        body = ' '.join([o.to_xhtml() for o in self if isinstance(o, Text)])
        return template%body
class TextStyle(object):
    '''
    The font properties of a token. Two styles are equal if all their
    properties are equal, and equal styles hash equal.
    '''

    # Attributes that define the identity of a style
    FIELDS = ('font_size', 'bold', 'italic', 'font_name', 'color')

    def __init__(self, tok):
        '''
        :param tok: Object providing `bold`, `italic`, `font_name`,
                    `font_size` and `font_color`.
        '''
        self.bold = tok.bold
        self.italic = tok.italic
        self.font_name = tok.font_name
        self.font_size = tok.font_size
        self.color = tok.font_color

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            for a in self.FIELDS:
                if getattr(self, a) != getattr(other, a):
                    return False
            return True
        return False

    def __ne__(self, other):
        # Python 2 does not derive != from ==
        return not self.__eq__(other)

    def __hash__(self):
        # Keep hashing consistent with __eq__ so that equal styles collapse
        # in sets and dicts.
        return hash(tuple(getattr(self, a) for a in self.FIELDS))

    def to_css(self, inline=False):
        '''
        Return the style as CSS declarations. With `inline` they are joined
        by spaces, otherwise they are wrapped in braces, one per line.
        '''
        fw = 'bold' if self.bold else 'normal'
        fs = 'italic' if self.italic else 'normal'
        fsz = '%dpt'%self.font_size
        props = ['font-weight: %s;'%fw, 'font-style: %s;'%fs, 'font-size: %s;'%fsz,
                 'color: rgb(%d, %d, %d);'%self.color]
        joiner = ' '
        if not inline:
            joiner = '\n'
            props = ['{'] + props + ['}']
        return joiner.join(props)
class Token(object):
    '''
    A TOKEN element: a piece of text together with its geometry and font
    properties. Tokens compare equal if their rotation, angle and font
    properties are equal, regardless of position and text.
    '''

    # Matches colors of the form #rrggbb, compiled once for all tokens
    FONT_COLOR_PAT = re.compile(r'#([a-f0-9]{2})([a-f0-9]{2})([a-f0-9]{2})', re.IGNORECASE)

    def __init__(self, attrs):
        '''
        :param attrs: Attributes of the TOKEN element. Numeric attributes
            are stored as floats (with '-' replaced by '_' in the name),
            'bold' and 'italic' as booleans and 'font-color' as an
            (r, g, b) tuple of ints.
        '''
        for a in ('x', 'y', 'width', 'height', 'rotation', 'angle', 'font-size'):
            setattr(self, a.replace('-', '_'), float(attrs[a]))
        for a in ('bold', 'italic'):
            setattr(self, a, attrs[a]=='yes')
        self.font_name = attrs['font-name']
        fc = self.FONT_COLOR_PAT.match(attrs['font-color'])
        self.font_color = (int(fc.group(1), 16), int(fc.group(2), 16), int(fc.group(3), 16))
        self.id = attrs['id']
        self.text = u''
        self.style = TextStyle(self)

    def handle_char_data(self, data):
        '''Append character data to the text of this token.'''
        self.text += data

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            for a in ('rotation', 'angle', 'font_size', 'bold', 'italic', 'font_name', 'font_color'):
                if getattr(self, a) != getattr(other, a):
                    return False
            return True
        return False

    def __ne__(self, other):
        # Python 2 does not derive != from ==
        return not self.__eq__(other)

    def to_xhtml(self):
        '''Return the text, wrapped in a styled <span> if the token still
        has a style of its own.'''
        if self.style is not None:
            ans = u'<span style="%s">%s</span>'%(self.style.to_css(inline=True), self.text)
        else:
            ans = self.text
        return ans
class PDFDocument(object):
    '''
    In-memory representation of the XML produced by pdftoxml. The XML is
    parsed in the constructor into a list of pages, which can then be
    rendered with `to_xhtml`.
    '''

    # Elements that are recognized but deliberately ignored
    SKIPPED_TAGS = ('DOCUMENT', 'METADATA', 'PDFFILENAME', 'PROCESS', 'VERSION',
                    'COMMENT', 'CREATIONDATE')

    def __init__(self, filename):
        '''
        :param filename: Path to the XML file to parse.
        '''
        parser = xml.parsers.expat.ParserCreate('UTF-8')
        parser.buffer_text = True
        parser.returns_unicode = True
        parser.StartElementHandler = self.start_element
        parser.EndElementHandler = self.end_element
        self.pages = []
        self.current_page = None
        self.current_token = None
        src = open(filename, 'rb').read()
        # The parser is kept around as the element handlers below switch its
        # CharacterDataHandler on and off.
        self.parser = parser
        parser.Parse(src)

    def start_element(self, name, attrs):
        '''Expat handler for the start of an element.'''
        if name == 'TOKEN':
            self.current_token = Token(attrs)
            # Character data is only collected while inside a TOKEN
            self.parser.CharacterDataHandler = self.current_token.handle_char_data
        elif name == 'TEXT':
            text = Text(attrs)
            if self.current_page.current_line is None:
                self.current_page.current_line = Line()
                self.current_page.current_line.append(text)
            else:
                # Text that shares its top or bottom edge with the first text
                # of the current line belongs to that line, anything else
                # starts a new line.
                y, height = self.current_page.current_line[0].y, self.current_page.current_line[0].height
                if y == text.y or y+height == text.y + text.height:
                    self.current_page.current_line.append(text)
                else:
                    self.current_page.end_line()
                    self.current_page.current_line = Line()
                    self.current_page.current_line.append(text)
        elif name == 'PAGE':
            self.current_page = Page(attrs)
        elif name.lower() == 'xi:include':
            print 'WARNING: Skipping vector image'
        elif name in self.SKIPPED_TAGS:
            pass
        else:
            print 'WARNING: Unhandled element', name

    def end_element(self, name):
        '''Expat handler for the end of an element.'''
        if name == 'TOKEN':
            # Rotated tokens are dropped
            if self.current_token.angle == 0 and self.current_token.rotation == 0:
                self.current_page.current_line[-1].add(self.current_token)
            self.current_token = None
            self.parser.CharacterDataHandler = None
        elif name == 'PAGE':
            # NOTE(review): nothing here ends the line still in progress, it
            # looks like the last line of each page depends on
            # Page.finalize() to be stored. Confirm.
            self.current_page.finalize()
            self.pages.append(self.current_page)
            self.current_page = None

    def to_xhtml(self):
        '''
        Render the document as a UTF-8 encoded XHTML string. Header and
        footer groups are skipped.
        '''
        header = u'''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.w3.org/MarkUp/SCHEMA/xhtml11.xsd" >
<head>
<style type="text/css">
%(style)s
</style>
</head>
<body>
%(body)s
</body>
</html>
'''
        res = []
        para = []
        styles = []
        for page in self.pages:
            res.append(u'<a name="%s" />'%page.id)
            for group in page.groups:
                if group.is_header or group.is_footer:
                    continue
                if group.style is not None:
                    # NOTE(review): group.id is read here (and below), confirm
                    # that groups are given an id somewhere.
                    styles.append(u'.%s %s\n'%(group.id, group.style.to_css()))
                for line in group.lines:
                    if line.is_para_start:
                        # NOTE(review): the indent is computed from the line
                        # that starts the new paragraph but is applied to the
                        # paragraph being closed. Confirm this is intended.
                        indent = group.left_margin - line.left
                        if para:
                            res.append(u'<p style="text-indent: %dpt">%s</p>'%(indent, ''.join(para)))
                            para = []
                    para.append(line.to_xhtml(group.id))
            if page.page_break_after:
                res.append(u'<br style="page-break-after:always" />')
                if para:
                    res.append(u'<p>%s</p>'%(''.join(para)))
                    para = []
        return (header%dict(style='\n'.join(styles), body='\n'.join(res))).encode('utf-8')
class PDFConverter(object):
@classmethod
def generate_xml(cls, pathtopdf, logger):
pathtopdf = os.path.abspath(pathtopdf)
tdir = tempfile.mkdtemp('pdf2xml', __appname__)
atexit.register(shutil.rmtree, tdir)
xmlfile = os.path.basename(pathtopdf)+'.xml'
os.chdir(tdir)
cmd = PDFTOXML + ' -outline "%s" "%s"'%(pathtopdf, xmlfile)
p = subprocess.Popen(cmd, shell=True, stderr=subprocess.STDOUT,
stdout=subprocess.PIPE)
log = p.stdout.read()
ret = p.wait()
if ret != 0:
raise ConversionError, log
xmlfile = os.path.join(tdir, xmlfile)
if os.stat(xmlfile).st_size < 20:
raise ConversionError(os.path.basename(pathtopdf) + ' does not allow copying of text.')
return xmlfile
def __init__(self, pathtopdf, logger, opts):
self.cwd = os.getcwdu()
self.logger = logger
self.opts = opts
try:
self.logger.info('Converting PDF to XML')
self.xmlfile = self.generate_xml(pathtopdf, self.logger)
self.tdir = os.path.dirname(self.xmlfile)
self.data_dir = self.xmlfile + '_data'
outline_file = self.xmlfile.rpartition('.')[0]+'_outline.xml'
self.logger.info('Parsing XML')
self.document = PDFDocument(self.xmlfile)
self.outline = parse(outline_file)
finally:
os.chdir(self.cwd)
def convert(self, output_dir):
doc = self.document.to_xhtml()
open(os.path.join(output_dir, 'document.html'), 'wb').write(doc)
def option_parser():
    '''
    Build the command line option parser for the PDF to HTML converter,
    with the --output-dir and --verbose options.
    '''
    usage = '''
%prog [options] myfile.pdf
Convert a PDF file to a HTML file.
'''
    parser = OptionParser(usage=usage)
    parser.add_option('-o', '--output-dir', default='.',
                      help=_('Path to output directory in which to create the HTML file. Defaults to current directory.'))
    parser.add_option('--verbose', default=False, action='store_true',
                      help=_('Be more verbose.'))
    return parser
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args()
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('pdf2html')
setup_cli_handlers(logger, level)
if len(args) != 1:
parser.print_help()
print _('You must specify a single PDF file.')
return 1
options.output_dir = os.path.abspath(options.output_dir)
converter = PDFConverter(os.path.abspath(args[0]), logger, options)
converter.convert(options.output_dir)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -1,190 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, sys, shutil, logging, glob
from lxml import etree
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre import setup_cli_handlers
from calibre.libwand import convert, WandException
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
from calibre.ebooks.lrf.rtf.xsl import xhtml
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
def option_parser():
    '''
    Build the command line option parser for rtf2lrf: the standard lrf
    options plus --keep-intermediate-files.
    '''
    usage = _('''%prog [options] mybook.rtf
%prog converts mybook.rtf to mybook.lrf''')
    parser = lrf_option_parser(usage)
    parser.add_option('--keep-intermediate-files', action='store_true', default=False)
    return parser
def convert_images(html, logger):
    '''
    Convert every WMF image in the current directory to JPEG and rewrite the
    references to it in C{html}.

    @param html: markup that may refer to the WMF files by base name
    @param logger: receives a warning for each image that cannot be converted
    @return: C{html} with the names of the converted images replaced
    '''
    for wmf in glob.glob('*.wmf') + glob.glob('*.WMF'):
        folder, fname = os.path.split(wmf)
        jpg = os.path.join(folder, os.path.splitext(fname)[0] + '.jpg')
        try:
            convert(wmf, jpg)
            html = html.replace(fname, os.path.basename(jpg))
        except WandException as err:
            # Leave the reference untouched and carry on with the next image
            logger.warning(u'Unable to convert image %s with error: %s'%(wmf, unicode(err)))
    return html
def process_file(path, options, logger=None):
    '''
    Convert the RTF file at C{path} to LRF/LRS via an intermediate HTML file.

    Options that are unset (or set to 'Unknown') are filled in from the
    metadata embedded in the RTF file.

    @param path: path to the RTF file, ``~`` is expanded
    @param options: parsed options as produced by L{option_parser}; modified in place
    @param logger: logger to use; an ``rtf2lrf`` logger is created when None
    '''
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('rtf2lrf')
        setup_cli_handlers(logger, level)
    rtf = os.path.abspath(os.path.expanduser(path))
    # Close the stream even when metadata extraction fails
    with open(rtf, 'rb') as f:
        mi = get_metadata(f, 'rtf')
    tdir = PersistentTemporaryDirectory('_rtf2lrf')
    cwd = os.getcwdu()
    try:
        # Inside the try block so that the temporary directory is removed
        # when the RTF -> HTML step fails as well.
        html = generate_html(rtf, tdir)
        if not options.output:
            ext = '.lrs' if options.lrs else '.lrf'
            options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
        options.output = os.path.abspath(os.path.expanduser(options.output))
        if not mi.title:
            mi.title = os.path.splitext(os.path.basename(rtf))[0]
        if (not options.title or options.title == 'Unknown'):
            options.title = mi.title
        if (not options.author or options.author == 'Unknown') and mi.author:
            options.author = mi.author
        if (not options.category or options.category == 'Unknown') and mi.category:
            options.category = mi.category
        if (not options.freetext or options.freetext == 'Unknown') and mi.comments:
            options.freetext = mi.comments
        # Relative links in the generated HTML are resolved against tdir
        os.chdir(tdir)
        html_process_file(html, options, logger)
    finally:
        os.chdir(cwd)
        if getattr(options, 'keep_intermediate_files', False):
            logger.debug('Intermediate files in '+ tdir)
        else:
            shutil.rmtree(tdir)
def main(args=sys.argv, logger=None):
    '''
    Command line entry point of rtf2lrf.

    @param args: argument vector, program name first
    @param logger: optional logger handed on to L{process_file}
    @return: 0 on success, 1 when no (or more than one) RTF file is given
    '''
    cli = option_parser()
    options, positional = cli.parse_args(args)
    if len(positional) == 2:
        process_file(positional[1], options, logger)
        return 0
    cli.print_help()
    print('')
    print('No rtf file specified')
    return 1
def generate_xml(rtfpath, tdir):
    '''
    Run the rtf2xml parser on C{rtfpath}, writing ``index.xml`` into C{tdir}.

    The working directory is switched to C{tdir} while the parser runs and is
    always restored afterwards.

    @param rtfpath: path to the RTF file
    @param tdir: directory that receives the generated XML
    @return: path of the generated ``index.xml``
    '''
    from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
    ofile = os.path.join(tdir, 'index.xml')
    # Resolve the input path *before* changing directory, otherwise a
    # relative rtfpath would be resolved against tdir.
    rtfpath = os.path.abspath(rtfpath)
    cwd = os.getcwdu()
    os.chdir(tdir)
    try:
        parser = ParseRtf(
            in_file = rtfpath,
            out_file = ofile,
            # Convert symbol fonts to unicode equivalents. Default
            # is 1
            convert_symbol = 1,
            # Convert Zapf fonts to unicode equivalents. Default
            # is 1.
            convert_zapf = 1,
            # Convert Wingding fonts to unicode equivalents.
            # Default is 1.
            convert_wingdings = 1,
            # Convert RTF caps to real caps.
            # Default is 1.
            convert_caps = 1,
            # Indent resulting XML.
            # Default is 0 (no indent).
            indent = 1,
            # Form lists from RTF. Default is 1.
            form_lists = 1,
            # Convert headings to sections. Default is 0.
            headings_to_sections = 1,
            # Group paragraphs with the same style name. Default is 1.
            group_styles = 1,
            # Group borders. Default is 1.
            group_borders = 1,
            # Write or do not write paragraphs. Default is 0.
            empty_paragraphs = 0,
        )
        parser.parse_rtf()
    finally:
        os.chdir(cwd)
    return ofile
def generate_html(rtfpath, tdir):
    '''
    Convert an RTF file to ``index.html`` plus a ``metadata.opf`` in C{tdir}.

    The RTF is first converted to XML (see L{generate_xml}) which is then
    transformed to XHTML with the bundled XSL stylesheet.

    @param rtfpath: path to the RTF file
    @param tdir: directory that receives all generated files
    @return: path to the generated ``index.html``
    @raise Exception: if the RTF file uses a feature the parser rejects
    '''
    print('Converting RTF to XML...')
    rtfpath = os.path.abspath(rtfpath)
    try:
        xml = generate_xml(rtfpath, tdir)
    except RtfInvalidCodeException:
        raise Exception(_('This RTF file has a feature calibre does not support. Convert it to HTML and then convert it.'))
    tdir = os.path.dirname(xml)
    cwd = os.getcwdu()
    os.chdir(tdir)
    try:
        print('Parsing XML...')
        parser = etree.XMLParser(recover=True, no_network=True)
        # Parse errors propagate to the caller. The former BeautifulSoup
        # based fallback sat behind an unconditional re-raise and could never
        # run, so it has been removed.
        doc = etree.parse(xml, parser)
        print('Converting XML to HTML...')
        styledoc = etree.fromstring(xhtml)
        transform = etree.XSLT(styledoc)
        result = transform(doc)
        html = os.path.join(tdir, 'index.html')
        res = transform.tostring(result)
        # Only the namespace declaration near the start of the document is
        # rewritten, the body is left untouched.
        res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
        with open(html, 'wb') as f:
            f.write(res)
        try:
            with open(rtfpath, 'rb') as stream:
                mi = get_metadata(stream, 'rtf')
        except Exception:
            # Metadata is optional, fall back to an empty record
            mi = MetaInformation(None, None)
        if not mi.title:
            mi.title = os.path.splitext(os.path.basename(rtfpath))[0]
        if not mi.authors:
            mi.authors = [_('Unknown')]
        opf = OPFCreator(tdir, mi)
        opf.create_manifest([('index.html', None)])
        opf.create_spine(['index.html'])
        # cwd is tdir here, so the OPF ends up next to index.html
        with open('metadata.opf', 'wb') as opf_file:
            opf.render(opf_file)
    finally:
        os.chdir(cwd)
    return html
if __name__ == '__main__':
sys.exit(main())

View File

@ -207,32 +207,32 @@ class Tag(object):
s += " at %08X, contents: %s" % (self.offset, repr(self.contents))
return s
@apply
def byte():
@dynamic_property
def byte(self):
def fget(self):
if len(self.contents) != 1:
raise LRFParseError("Bad parameter for tag ID: %04X" % self.id)
return struct.unpack("<B", self.contents)[0]
return property(fget=fget)
@apply
def word():
@dynamic_property
def word(self):
def fget(self):
if len(self.contents) != 2:
raise LRFParseError("Bad parameter for tag ID: %04X" % self.id)
return struct.unpack("<H", self.contents)[0]
return property(fget=fget)
@apply
def sword():
@dynamic_property
def sword(self):
def fget(self):
if len(self.contents) != 2:
raise LRFParseError("Bad parameter for tag ID: %04X" % self.id)
return struct.unpack("<h", self.contents)[0]
return property(fget=fget)
@apply
def dword():
@dynamic_property
def dword(self):
def fget(self):
if len(self.contents) != 4:
raise LRFParseError("Bad parameter for tag ID: %04X" % self.id)

View File

@ -1,2 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@ -1,112 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
"""
Convert .txt files to .lrf
"""
import os, sys, codecs, logging, re, shutil
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks import ConversionError
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre.ebooks.markdown import markdown
from calibre import setup_cli_handlers
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
def option_parser():
    '''
    Return the LRF option parser extended with the txt2lrf specific
    ``--debug-html-generation`` switch.
    '''
    usage = _('''%prog [options] mybook.txt
%prog converts mybook.txt to mybook.lrf''')
    cli = lrf_option_parser(usage)
    cli.add_option(
        '--debug-html-generation', dest='debug_html_generation',
        default=False, action='store_true',
        help=_('Print generated HTML to stdout and quit.'))
    return cli
def fix_image_includes(sdir, tdir, match):
    '''
    Copy an image referenced from the generated HTML into the target tree.

    @param sdir: directory the reference is relative to (source side)
    @param tdir: directory the HTML is written to (target side)
    @param match: regexp match whose first group is the '/' separated image path
    '''
    parts = match.group(1).split('/')
    target = os.path.join(tdir, *parts)
    parent = os.path.dirname(target)
    if not os.path.exists(parent):
        os.makedirs(parent)
    if os.path.exists(target):
        # Never overwrite a file that is already in place
        return
    shutil.copyfile(os.path.join(sdir, *parts), target)
def generate_html(txtfile, encoding, tdir):
    '''
    Convert txtfile (markdown flavoured text) to ``index.html`` in C{tdir}
    and write a matching ``metadata.opf`` next to it. Images referenced from
    the text are copied into C{tdir} as well.

    @param txtfile: path to the text file
    @param encoding: encoding of the text file; guessed when empty or None
    @param tdir: directory that receives the generated files
    @return: path to the generated ``index.html``
    @raise ConversionError: if none of the candidate encodings can decode the file
    '''
    def read_text(enc):
        # Decode the whole file, closing the stream even on decode errors
        stream = codecs.open(txtfile, 'rb', enc)
        try:
            return stream.read()
        finally:
            stream.close()

    txtfile = os.path.abspath(txtfile)
    enc = encoding
    if not encoding:
        # utf8 is tried first: it is the only strict candidate, whereas
        # latin-1 accepts every byte sequence and therefore ends the search.
        encodings = ['utf8', 'cp1252', 'latin-1', 'iso-8859-1', 'koi8_r', 'koi8_u']
        txt, enc = None, None
        for encoding in encodings:
            try:
                txt = read_text(encoding)
            except UnicodeDecodeError:
                continue
            enc = encoding
            break
        if txt is None:
            raise ConversionError('Could not detect encoding of %s'%(txtfile,))
    else:
        txt = read_text(enc)
    print('Converting text to HTML...')
    md = markdown.Markdown(
        extensions=['footnotes', 'tables', 'toc'],
        safe_mode=False,
    )
    html = '<html><body>'+md.convert(txt)+'</body></html>'
    for match in re.finditer(r'<img\s+[^>]*src="([^"]+)"', html):
        fix_image_includes(os.path.dirname(txtfile), tdir, match)
    p = os.path.join(tdir, 'index.html')
    with open(p, 'wb') as f:
        f.write(html.encode('utf-8'))
    mi = MetaInformation(os.path.splitext(os.path.basename(txtfile))[0], [_('Unknown')])
    opf = OPFCreator(tdir, mi)
    opf.create_manifest([(os.path.join(tdir, 'index.html'), None)])
    opf.create_spine([os.path.join(tdir, 'index.html')])
    with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f:
        opf.render(f)
    return p
def process_file(path, options, logger=None):
    '''
    Convert the text file at C{path} to LRF/LRS, or just dump the
    intermediate HTML to stdout when ``options.debug_html_generation`` is set.

    @param path: path to the text file, ``~`` is expanded
    @param options: parsed options as produced by L{option_parser}; modified in place
    @param logger: logger to use; a ``txt2lrf`` logger is created when None
    '''
    if logger is None:
        logger = logging.getLogger('txt2lrf')
        setup_cli_handlers(logger, logging.DEBUG if options.verbose else logging.INFO)
    source = os.path.abspath(os.path.expanduser(path))
    if not hasattr(options, 'debug_html_generation'):
        options.debug_html_generation = False
    workdir = PersistentTemporaryDirectory('_txt2lrf')
    htmlfile = generate_html(source, options.encoding, workdir)
    # The intermediate HTML is always written as UTF-8
    options.encoding = 'utf-8'
    if options.debug_html_generation:
        print(open(htmlfile, 'rb').read())
        return
    options.force_page_break = 'h2'
    if not options.output:
        stem = os.path.basename(os.path.splitext(path)[0])
        suffix = '.lrs' if options.lrs else '.lrf'
        options.output = os.path.abspath(stem + suffix)
    options.output = os.path.abspath(os.path.expanduser(options.output))
    if not options.title:
        options.title = os.path.splitext(os.path.basename(path))[0]
    html_process_file(htmlfile, options, logger)
def main(args=sys.argv, logger=None):
    '''
    Command line entry point of txt2lrf.

    @param args: argument vector, program name first
    @param logger: optional logger handed on to L{process_file}
    @return: 0 on success, 1 when no (or more than one) text file is given
    '''
    cli = option_parser()
    options, positional = cli.parse_args(args)
    if len(positional) == 2:
        process_file(positional[1], options, logger)
        return 0
    cli.print_help()
    print('')
    print('No txt file specified')
    return 1
if __name__ == '__main__':
sys.exit(main())

View File

@ -1,89 +0,0 @@
Demonstration of `txt2lrf`
==========================
`txt2lrf` provides a convenient way to create LRF files with good formatting.
`txt2lrf` recognizes a simple markup language called *markdown*.
The idea is to provide a lightweight markup that can be used to create
TXT files that can be read by themselves or automatically converted to LRF.
[{@name=toc}]()
<br /><br />
///Table of Contents///
Text formatting
---------------
**Bold** and *italic* text are easily specified.
> Blockquotes are also very simple to specify.
> This is a basic blockquote paragraph. I absolutely
> love block quotes, don't you?
This is a preformatted code block. No formatting rules are applied to text in this block and it is rendered in a monospaced font.
For details on the text formatting syntax visit
http://daringfireball.net/projects/markdown/syntax
___
[Table of Contents](#toc)
Lists
-----
Both ordered and unordered lists are supported.
### Unordered lists
+ What a
+ *nice*
+ list
### Ordered lists
1. One
2. Two
3. Three
**Note:** Nested lists are not supported
___
[Table of Contents](#toc)
Tables
------
Simple tables are easily generated
| |* Col 1 *|* Col 2 *|
|* Row 1 *| (1, 1) | (1, 2) |
|* Row 2 *| (2, 1) | (2, 2) |
**Note:** Nested tables are not supported
___
[Table of Contents](#toc)
Images
------
`txt2lrf` also has support for inline images like
![this one](small.jpg) this one.
___
[Table of Contents](#toc)
Automatic TOC Creation
----------------------
By inserting `///Table of Contents///` into the text at some point
a table of contents is automatically generated with links that point
to all headings underlined with `-------`.
___
[Table of Contents](#toc)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.0 KiB

View File

@ -1,6 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
# Registry of profile classes known to web2lrf. It starts out empty here.
builtin_profiles = []
# Name of each registered profile: the last dotted component of the module
# that defines the corresponding class in builtin_profiles (same order).
available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]

View File

@ -1,183 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Convert websites into LRF files.'''
import sys, tempfile, shutil, os, logging, imp, inspect, re
from urlparse import urlsplit
from calibre import __appname__, setup_cli_handlers, CommandLineError, strftime
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.lrf.html.convert_from import process_file
from calibre.web.fetch.simple import create_fetcher
from calibre.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile, create_class
from calibre.ebooks.lrf.web import builtin_profiles, available_profiles
def option_parser():
    '''
    Return the LRF option parser extended with the web2lrf download options.

    The defaults quoted in the help texts are read from L{DefaultProfile}.
    '''
    parser = lrf_option_parser(usage='''%prog [options] website_profile\n\n'''
                          '''%prog downloads a site from the web and converts it '''
                          '''into a LRF file for use with the SONY Reader. '''
                          '''website_profile is one of '''+str(available_profiles)+\
                          ''' If you specify a website_profile of default or do not specify '''
                          '''it, you must specify the --url option.'''
                          )
    parser.add_option('-u', '--url', dest='url', default=None,
                      help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
    parser.add_option('--user-profile', default=None,
                      help='Path to a python file containing a user created profile. For help visit http://%s.kovidgoyal.net/wiki/UserProfiles'%__appname__)
    parser.add_option('--username', dest='username', default=None,
                      help='Specify the username to be used while downloading. Only used if the profile supports it.')
    parser.add_option('--password', dest='password', default=None,
                      help='Specify the password to be used while downloading. Only used if the profile supports it.')
    parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %d s'%DefaultProfile.timeout,
                      default=None, type='int', dest='timeout')
    # Each help text quotes the default of the option it describes (they all
    # used to quote DefaultProfile.timeout).
    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %d'%DefaultProfile.max_recursions,
                      default=None, type='int', dest='max_recursions')
    parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files',
                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %d'%DefaultProfile.max_files)
    parser.add_option('--delay', default=None, dest='delay', type='int',
                      help='Minimum interval in seconds between consecutive fetches. Default is %d s'%DefaultProfile.delay)
    parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
                      help='Do not download CSS stylesheets.', dest='no_stylesheets')
    parser.add_option('--match-regexp', dest='match_regexps', default=[], action='append',
                      help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
    parser.add_option('--keep-downloaded-files', default=False, action='store_true',
                      help='''Do not delete the downloaded files after creating the LRF''')
    return parser
def fetch_website(options, logger):
    '''
    Download ``options.url`` into a freshly created temporary directory.

    @param options: download options; ``options.dir`` is set to the new directory
    @param logger: logger handed on to the fetcher
    @return: tuple of (value returned by the fetcher's start_fetch, temporary directory)
    '''
    download_dir = tempfile.mkdtemp(prefix=__appname__+'_', suffix='_web2lrf')
    options.dir = download_dir
    fetcher = create_fetcher(options, logger)
    fetcher.preprocess_regexps = options.preprocess_regexps
    fetched = fetcher.start_fetch(options.url)
    return fetched, download_dir
def create_lrf(htmlfile, options, logger):
    '''
    Convert the downloaded C{htmlfile} to LRF/LRS.

    Falls back to the application name as author, forces a page header and
    makes ``options.output`` absolute (deriving it from the title if unset).
    '''
    author = options.author
    if not author or author.lower() == 'unknown':
        options.author = __appname__
    options.header = True
    target = options.output
    if not target:
        target = options.title + ('.lrs' if options.lrs else '.lrf')
    options.output = os.path.abspath(os.path.expanduser(target))
    process_file(htmlfile, options, logger)
def process_profile(args, options, logger=None):
    '''
    Download the site described by a profile and convert it to LRF/LRS.

    @param args: argument vector, program name first. ``args[1]`` is either the
                 name of a profile or the python source of a profile class.
                 It is filled in from ``options.user_profile`` when missing.
    @param options: parsed options as produced by L{option_parser}; modified in place
    @param logger: logger to use; a ``web2lrf`` logger is created when None
    @raise CommandLineError: on unknown/invalid profiles or a missing URL
    '''
    tdir = None
    # Bound up front so that the cleanup below never sees an undefined name
    profile = None
    try:
        if logger is None:
            level = logging.DEBUG if options.verbose else logging.INFO
            logger = logging.getLogger('web2lrf')
            setup_cli_handlers(logger, level)
        if len(args) == 2 and re.search(r'class\s+\S+\(\S+\)\s*\:', args[1]):
            # args[1] looks like python source defining a profile class
            profile = create_class(args[1])
        else:
            if options.user_profile is not None:
                path = os.path.abspath(options.user_profile)
                name = os.path.splitext(os.path.basename(path))[0]
                res = imp.find_module(name, [os.path.dirname(path)])
                module = imp.load_module(name, *res)
                classes = inspect.getmembers(module,
                    lambda x : inspect.isclass(x) and issubclass(x, DefaultProfile)\
                                and x is not DefaultProfile and x is not FullContentProfile)
                if not classes:
                    raise CommandLineError('Invalid user profile '+path)
                # Register the user profile so it can be looked up by name
                builtin_profiles.append(classes[0][1])
                available_profiles.append(name)
                if len(args) < 2:
                    args.append(name)
                args[1] = name
            index = -1
            if len(args) == 2:
                try:
                    if isinstance(args[1], basestring):
                        if args[1] != 'default':
                            index = available_profiles.index(args[1])
                except ValueError:
                    raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], available_profiles))
            else:
                raise CommandLineError('Only one profile at a time is allowed.')
            profile = DefaultProfile if index == -1 else builtin_profiles[index]

        profile = profile(logger, options.verbose, options.username, options.password)
        if profile.browser is not None:
            options.browser = profile.browser

        # Anything not given on the command line comes from the profile
        for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
            val = getattr(options, opt)
            if val is None:
                setattr(options, opt, getattr(profile, opt))

        if not options.url:
            options.url = profile.url

        if not options.url:
            raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,))

        if not options.title:
            title = profile.title
            if not title:
                title = urlsplit(options.url).netloc
            options.title = title + strftime(profile.timefmt)

        options.match_regexps += profile.match_regexps
        options.preprocess_regexps = profile.preprocess_regexps
        options.filter_regexps += profile.filter_regexps

        options.encoding = profile.encoding if options.encoding is None else options.encoding

        if len(args) == 2 and args[1] != 'default':
            options.anchor_ids = False

        htmlfile, tdir = fetch_website(options, logger)
        options.encoding = 'utf-8'
        cwd = os.getcwd()
        if not options.output:
            title = options.title.encode(sys.getfilesystemencoding()) if isinstance(options.title, unicode) else options.title
            # Use the filesystem-encoded title: joining the unicode title with
            # a byte string cwd fails for non-ASCII paths.
            options.output = os.path.join(cwd, title+('.lrs' if options.lrs else '.lrf'))
        if not os.path.isabs(options.output):
            options.output = os.path.join(cwd, options.output)

        # Let the profile override conversion options
        option_parser().parse_args(profile.html2lrf_options, options)

        try:
            os.chdir(os.path.dirname(htmlfile))
            create_lrf(os.path.basename(htmlfile), options, logger)
        finally:
            os.chdir(cwd)
    finally:
        if profile is not None:
            try:
                profile.cleanup()
            except Exception:
                # Best effort only: a failing cleanup must not mask the real
                # error or prevent removal of the downloaded files.
                pass
        if tdir and os.path.isdir(tdir):
            if options.keep_downloaded_files:
                # Two spaces: same output as the former `print a, b` statement
                print('Downloaded files in  %s' % (tdir,))
            else:
                shutil.rmtree(tdir)
def main(args=sys.argv, logger=None):
    '''
    Command line entry point of web2lrf.

    @param args: argument vector, program name first
    @param logger: optional logger handed on to L{process_profile}
    @return: 0 on success, 1 on invalid arguments or a command line error
    '''
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) > 2 or (len(args) == 1 and not options.user_profile):
        parser.print_help()
        return 1
    try:
        process_profile(args, options, logger=logger)
    except CommandLineError as err:
        sys.stderr.write(str(err) + '\n')
        # Report the failure to the caller/shell instead of claiming success
        return 1
    return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -1,572 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Contains the Base Profiles that can be used to easily create profiles to download
particular websites.
'''
import tempfile, time, calendar, re, operator, atexit, shutil, os
from htmlentitydefs import name2codepoint
from email.utils import formatdate
from calibre import __appname__, iswindows, browser, strftime
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, CData, Tag
class DefaultProfile(object):
    '''
    Base class for download profiles.

    A profile is configured through the class attributes below. On
    instantiation it builds a local ``index.html`` from the RSS feeds returned
    by L{get_feeds} and exposes it as ``self.url``.
    '''

    #: The title to use for the LRF file
    #: @type: string
    title = 'Default Profile'

    #: Maximum number of articles to download from each feed
    #: @type: integer
    max_articles_per_feed = 10

    #: If True process the <description> element of the feed as HTML
    #: @type: boolean
    html_description = True

    #: How many days old should the oldest article downloaded from the feeds be
    #: @type: integer
    oldest_article = 7

    #: Recommended frequency at which to download this profile. In days.
    recommended_frequency = 7

    #: Number of levels of links to follow
    #: @type: integer
    max_recursions = 1

    #: Maximum number of files to download
    #: @type: integer
    max_files = 3000

    #: Delay between consecutive downloads in seconds
    #: @type: integer
    delay = 0

    #: Timeout for fetching files from server in seconds
    #: @type: integer
    timeout = 10

    #: The format string for the date shown on the first page
    #: @type: string
    timefmt = ' [%a %d %b %Y]'

    #: The order of elements to search for a URL when parsing the RSS feed. You
    #: can replace these elements by completely arbitrary elements to customize
    #: feed processing.
    #: @type: list of strings
    url_search_order = ['guid', 'link']

    #: The format string used to parse the publication date in the RSS feed.
    #: If set to None some default heuristics are used, these may fail,
    #: in which case set this to the correct string or re-implement
    #: L{DefaultProfile.strptime} in your subclass.
    #: @type: string or None
    pubdate_fmt = None

    #: If True will look for a publication date for each article.
    #: If False assumes the publication date is the current time.
    #: @type: boolean
    use_pubdate = True

    #: Max number of characters in the short description.
    #: Used by L{FullContentProfile}
    #: @type: integer
    summary_length = 500

    #: If True stylesheets are not downloaded and processed
    #: Convenient flag to disable loading of stylesheets for websites
    #: that have overly complex stylesheets unsuitable for conversion
    #: to ebooks formats
    #: @type: boolean
    no_stylesheets = False

    #: If False articles with the same title in the same feed
    #: are not downloaded multiple times
    #: @type: boolean
    allow_duplicates = False

    #: If True the GUI will ask the user for a username and password
    #: to use while downloading
    #: @type: boolean
    needs_subscription = False

    #: Specify an override encoding for sites that have an incorrect
    #: charset specification. The most common being specifying latin1 and
    #: using cp1252
    encoding = None

    #: List of regular expressions that determines which links to follow
    #: If empty, it is ignored.
    #: Only one of L{match_regexps} or L{filter_regexps} should be defined
    #: @type: list of strings
    match_regexps = []

    #: List of regular expressions that determines which links to ignore
    #: If empty it is ignored
    #: Only one of L{match_regexps} or L{filter_regexps} should be defined
    #: @type: list of strings
    filter_regexps = []

    #: List of options to pass to html2lrf, to customize conversion
    #: to LRF
    #: @type: list of strings
    html2lrf_options = []

    #: List of regexp substitution rules to run on the downloaded HTML. Each element of the
    #: list should be a two element tuple. The first element of the tuple should
    #: be a compiled regular expression and the second a callable that takes
    #: a single match object and returns a string to replace the match.
    #: @type: list of tuples
    preprocess_regexps = []

    # See the built-in profiles for examples of these settings.

    #: The URL of the website
    #: @type: string
    url = ''

    feeds = []
    CDATA_PAT = re.compile(r'<\!\[CDATA\[(.*?)\]\]>', re.DOTALL)

    def get_feeds(self):
        '''
        Return a list of RSS feeds to fetch for this profile. Each element of the list
        must be a 2-element tuple of the form (title, url).
        @raise NotImplementedError: if the profile defines no feeds
        '''
        if not self.feeds:
            raise NotImplementedError
        return self.feeds

    @classmethod
    def print_version(cls, url):
        '''
        Take a URL pointing to an article and returns the URL pointing to the
        print version of the article.
        '''
        return url

    @classmethod
    def get_browser(cls):
        '''
        Return a browser instance used to fetch documents from the web.
        If your profile requires that you login first, override this method
        in your subclass. See for example the nytimes profile.
        '''
        return browser()

    def __init__(self, logger, verbose=False, username=None, password=None, lrf=True):
        '''
        @param logger: logger used to report download problems
        @param verbose: if True log tracebacks for failures
        @param username: login name for profiles that need a subscription
        @param password: password for profiles that need a subscription
        @param lrf: stored as ``self.lrf``; not used by this class itself
        '''
        self.logger = logger
        self.username = username
        self.password = password
        self.verbose = verbose
        self.lrf = lrf
        self.temp_dir = tempfile.mkdtemp(prefix=__appname__+'_')
        # Registered before anything else can fail, so that the directory is
        # removed at exit even if building the index raises.
        atexit.register(cleanup, self.temp_dir)
        self.browser = self.get_browser()
        try:
            self.url = 'file:'+ ('' if iswindows else '//') + self.build_index()
        except NotImplementedError:
            # Raised by get_feeds() for profiles without feeds
            self.url = None

    def build_index(self):
        '''
        Build an RSS based index.html

        One ``categoryN.html`` is written per feed, plus an ``index.html``
        linking to all of them.
        @return: path to the generated ``index.html``
        '''
        articles = self.parse_feeds()
        encoding = 'utf-8' if self.encoding is None else self.encoding

        def build_sub_index(title, items):
            ilist = ''
            li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
            u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
            for item in items:
                if 'date' not in item:
                    item['date'] = time.strftime('%a, %d %b', time.localtime())
                ilist += li%item
            return u'''\
<html>
<body>
<h2>%(title)s</h2>
<ul>
%(items)s
</ul>
</body>
</html>
'''%dict(title=title, items=ilist.rstrip())

        clist = ''
        for cnum, category in enumerate(sorted(articles), 1):
            cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
            prefix = 'file:' if iswindows else ''
            clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
            src = build_sub_index(category, articles[category])
            with open(cfile, 'wb') as f:
                f.write(src.encode(encoding))

        title = self.title
        if not isinstance(title, unicode):
            title = unicode(title, 'utf-8', 'replace')
        src = u'''\
<html>
<body>
<h1>%(title)s</h1>
<div style='text-align: right; font-weight: bold'>%(date)s</div>
<ul>
%(categories)s
</ul>
</body>
</html>
'''%dict(date=strftime('%a, %d %B, %Y'),
         categories=clist, title=title)
        index = os.path.join(self.temp_dir, 'index.html')
        with open(index, 'wb') as f:
            f.write(src.encode(encoding))
        return index

    @classmethod
    def tag_to_string(cls, tag, use_alt=True):
        '''
        Convenience method to take a BeautifulSoup Tag and extract the text from it
        recursively, including any CDATA sections and alt tag attributes.
        @param use_alt: If True try to use the alt attribute for tags that don't have any textual content
        @type use_alt: boolean
        @return: A unicode (possibly empty) object
        @rtype: unicode string
        '''
        if not tag:
            return ''
        if isinstance(tag, basestring):
            return tag
        strings = []
        for item in tag.contents:
            if isinstance(item, (NavigableString, CData)):
                strings.append(item.string)
            elif isinstance(item, Tag):
                # Hand use_alt down so that use_alt=False also applies to
                # nested tags.
                res = cls.tag_to_string(item, use_alt)
                if res:
                    strings.append(res)
                elif use_alt and item.has_key('alt'):
                    strings.append(item['alt'])
        return u''.join(strings)

    def get_article_url(self, item):
        '''
        Return the article URL given an item Tag from a feed, or None if no valid URL is found
        @type item: BeautifulSoup.Tag
        @param item: A BeautifulSoup Tag instance corresponding to the <item> tag from a feed.
        @rtype: string or None
        '''
        url = None
        for element in self.url_search_order:
            url = item.find(element.lower())
            if url:
                break
        return url

    def parse_feeds(self, require_url=True):
        '''
        Create list of articles from a list of feeds.
        @param require_url: If True skip articles that don't have a link to a HTML page with the full article contents.
        @type require_url: boolean
        @rtype: dictionary
        @return: A dictionary whose keys are feed titles and whose values are each
        a list of dictionaries. Each list contains dictionaries of the form::
            {
            'title'       : article title,
            'url'         : URL of print version,
            'timestamp'   : publication time in seconds since the epoch,
            'date'        : The publication date of the article as a string,
            'description' : A summary of the article
            'content'     : The full article (can be an empty string). This is used by FullContentProfile
            }
        '''
        added_articles = {}
        feeds = self.get_feeds()
        articles = {}
        for title, url in feeds:
            try:
                src = self.browser.open(url).read()
            except Exception as err:
                self.logger.error('Could not fetch feed: %s\nError: %s'%(url, err))
                if self.verbose:
                    self.logger.exception(' ')
                continue

            articles[title] = []
            added_articles[title] = []
            soup = BeautifulStoneSoup(src)
            for item in soup.findAll('item'):
                try:
                    atitle = item.find('title')
                    if not atitle:
                        continue
                    atitle = self.tag_to_string(atitle)
                    if self.use_pubdate:
                        pubdate = item.find('pubdate')
                        if not pubdate:
                            pubdate = item.find('dc:date')
                        if not pubdate or not pubdate.string:
                            pubdate = formatdate()
                        pubdate = self.tag_to_string(pubdate)
                        pubdate = pubdate.replace('+0000', 'GMT')

                    url = self.get_article_url(item)
                    url = self.tag_to_string(url)
                    if require_url and not url:
                        self.logger.debug('Skipping article %s as it does not have a link url'%atitle)
                        continue
                    purl = url
                    try:
                        purl = self.print_version(url)
                    except Exception as err:
                        self.logger.debug('Skipping %s as could not find URL for print version. Error:\n%s'%(url, err))
                        continue

                    content = item.find('content:encoded')
                    if not content:
                        content = item.find('description')
                    if content:
                        content = self.process_html_description(content, strip_links=False)
                    else:
                        content = ''

                    d = {
                        'title'    : atitle,
                        'url'      : purl,
                        'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
                        'date'     : pubdate if self.use_pubdate else formatdate(),
                        'content'  : content,
                        }
                    delta = time.time() - d['timestamp']
                    if not self.allow_duplicates:
                        if d['title'] in added_articles[title]:
                            continue
                        added_articles[title].append(d['title'])
                    if delta > self.oldest_article*3600*24:
                        continue

                except Exception as err:
                    # A single malformed item must not abort the whole feed
                    if self.verbose:
                        self.logger.exception('Error parsing article:\n%s'%(item,))
                    continue
                try:
                    desc = ''
                    for c in item.findAll('description'):
                        desc = self.tag_to_string(c)
                        if desc:
                            break
                    # desc is already plain text here (it used to be
                    # dereferenced with .string, which always failed and
                    # blanked the description).
                    d['description'] = self.process_html_description(desc) if self.html_description else desc
                except Exception:
                    d['description'] = ''
                articles[title].append(d)

            # Newest first, then honour the documented per-feed maximum
            articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
            articles[title] = articles[title][:self.max_articles_per_feed]
            if not articles[title]:
                articles.pop(title)

        return articles

    def cleanup(self):
        '''
        Called after LRF file has been generated. Use it to do any cleanup like
        logging out of subscription sites, etc.
        '''
        pass

    @classmethod
    def process_html_description(cls, tag, strip_links=True):
        '''
        Process a <description> tag that contains HTML markup, either
        entity encoded or escaped in a CDATA section.
        @param tag: a tag (anything with a ``contents`` list) or a string
        @param strip_links: if True replace every <a> element by its contents
        @return: HTML
        @rtype: string
        '''
        src = '\n'.join(tag.contents) if hasattr(tag, 'contents') else tag
        match = cls.CDATA_PAT.match(src.lstrip())
        if match:
            src = match.group(1)
        else:
            replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
            for e in replaced_entities:
                ent = '&'+e+';'
                src = src.replace(ent, unichr(name2codepoint[e]))
        if strip_links:
            src = re.compile(r'<a.*?>(.*?)</a>', re.IGNORECASE|re.DOTALL).sub(r'\1', src)

        return src

    DAY_MAP   = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
    FULL_DAY_MAP = dict(Sunday=0, Monday=1, Tuesday=2, Wednesday=3, Thursday=4, Friday=5, Saturday=6)
    MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
    FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
                      July=7, August=8, September=9, October=10,
                      November=11, December=12)

    # UTC offset such as +0530, +05:30 or -0500. A negative offset is only
    # recognized after whitespace or directly after a time of day, so that
    # the hyphens of dates like 05-01-2009 are not mistaken for one.
    _ZONE_PAT = re.compile(
        r'\s*(\+\d\d\:{0,1}\d\d|(?:(?<=\s)|(?<=\:\d\d))-\d\d\:{0,1}\d\d(?!\d))')

    @classmethod
    def strptime(cls, src):
        '''
        Take a string and return the date that string represents, in UTC as
        an epoch (i.e. number of seconds since Jan 1, 1970). This function uses
        a bunch of heuristics and is a prime candidate for being overridden in a
        subclass.
        @param src: Timestamp as a string
        @type src: string
        @return: time as an epoch
        @rtype: number
        '''
        delta = 0
        zone = cls._ZONE_PAT.search(src)
        if zone:
            delta = zone.group(1)
            hrs, mins = int(delta[1:3]), int(delta[-2:].rstrip())
            delta = 60*(hrs*60 + mins) * (-1 if delta.startswith('-') else 1)
            src = src.replace(zone.group(), '')
        if cls.pubdate_fmt is None:
            # Expected layout: "<day name>, <day> <month name> <year> <time>"
            src = src.strip().split()
            try:
                src[0] = str(cls.DAY_MAP[src[0][:-1]])+','
            except KeyError:
                src[0] = str(cls.FULL_DAY_MAP[src[0][:-1]])+','
            try:
                src[2] = str(cls.MONTH_MAP[src[2]])
            except KeyError:
                src[2] = str(cls.FULL_MONTH_MAP[src[2]])
            fmt = '%w, %d %m %Y %H:%M:%S'
            src = src[:5] # Discard extra information
            try:
                time_t = time.strptime(' '.join(src), fmt)
            except ValueError:
                # Two digit year
                time_t = time.strptime(' '.join(src), fmt.replace('%Y', '%y'))
            return calendar.timegm(time_t)-delta
        else:
            # NOTE(review): the zone offset is stripped above but not applied
            # on this path; kept as is, confirm whether that is intended.
            return calendar.timegm(time.strptime(src, cls.pubdate_fmt))

    def command_line_options(self):
        '''
        Return the recursion, delay, file limit and link regexp settings of
        this profile as a list of command line arguments.
        '''
        args = []
        args.append('--max-recursions='+str(self.max_recursions))
        args.append('--delay='+str(self.delay))
        args.append('--max-files='+str(self.max_files))
        for i in self.match_regexps:
            args.append('--match-regexp="'+i+'"')
        for i in self.filter_regexps:
            args.append('--filter-regexp="'+i+'"')
        return args
class FullContentProfile(DefaultProfile):
    '''
    This profile is designed for feeds that embed the full article content in the RSS file.
    '''

    max_recursions = 0
    # Number of article files written so far; used to name article%d.html
    article_counter = 0

    def build_index(self):
        '''
        Build an RSS based index.html.

        Writes one article%d.html for every article that has content, one
        category%d.html per category returned by parse_feeds() and a top
        level index.html linking to the categories, all inside self.temp_dir.

        @return: Path to the generated index.html
        @rtype: string
        '''
        articles = self.parse_feeds(require_url=False)

        def build_sub_index(title, items):
            # Writes the article files for one category and returns the HTML
            # (unicode) of that category's listing page.
            li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
                 u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
            entries = []
            for item in items:
                content = item['content']
                if not content:
                    self.logger.debug('Skipping article as it has no content:%s'%item['title'])
                    continue
                item['description'] = cutoff(item['description'], self.summary_length)+'&hellip;'
                self.article_counter = self.article_counter + 1
                url = os.path.join(self.temp_dir, 'article%d.html'%self.article_counter)
                item['url'] = url
                page = (u'''\
<html>
<body>
<h2>%s</h2>
<div>
%s
</div>
</body>
</html>'''%(item['title'], content)).encode('utf-8')
                # Close the file deterministically instead of relying on
                # garbage collection to flush it.
                with open(url, 'wb') as f:
                    f.write(page)
                entries.append(li%item)
            return u'''\
<html>
<body>
<h2>%(title)s</h2>
<ul>
%(items)s
</ul>
</body>
</html>
'''%dict(title=title, items=u''.join(entries).rstrip())

        prefix = 'file:' if iswindows else ''
        links = []
        for cnum, category in enumerate(sorted(articles.keys()), 1):
            cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
            links.append(u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category))
            src = build_sub_index(category, articles[category])
            with open(cfile, 'wb') as f:
                f.write(src.encode('utf-8'))

        src = '''\
<html>
<body>
<h1>%(title)s</h1>
<div style='text-align: right; font-weight: bold'>%(date)s</div>
<ul>
%(categories)s
</ul>
</body>
</html>
'''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
         categories=u''.join(links), title=self.title)
        index = os.path.join(self.temp_dir, 'index.html')
        with open(index, 'wb') as f:
            f.write(src.encode('utf-8'))
        return index
def cutoff(src, pos, fuzz=50):
    '''
    Truncate src near index pos, preferring to end just after a ';' or '>'.

    The first ';' and the first '>' at or after pos are candidates unless they
    lie more than fuzz characters beyond pos. The text up to and including the
    later surviving candidate is returned; with no candidate the text up to
    and including index pos is returned.

    @param src: Text to truncate
    @param pos: Index at which to start looking for a cut point
    @param fuzz: Maximum distance past pos at which a cut point is accepted
    @return: The truncated text
    '''
    candidates = []
    for marker in (';', '>'):
        idx = src.find(marker, pos)
        if idx > 0 and idx - pos > fuzz:
            # Too far past the requested position, ignore this marker
            idx = -1
        candidates.append(idx)
    end = max(candidates)
    if end < 0:
        end = pos
    return src[:end+1]
def create_class(src):
    '''
    Execute profile source code and return the profile it defines.

    The source is run in a namespace that already contains DefaultProfile and
    FullContentProfile. The first object in the resulting namespace that has a
    build_index attribute and is not named after one of those two base
    profiles is returned; if there is none the function returns None.

    @param src: Python source code of a profile
    @return: The profile class defined by src, or None
    '''
    base_names = ('DefaultProfile', 'FullContentProfile')
    namespace = {'FullContentProfile':FullContentProfile, 'DefaultProfile':DefaultProfile}
    # NOTE(review): this runs src as arbitrary Python code; only pass source
    # that is trusted.
    exec(src, namespace)
    for candidate in namespace.values():
        if not hasattr(candidate, 'build_index'):
            continue
        if candidate.__name__ not in base_names:
            return candidate
def cleanup(tdir):
    '''
    Best-effort removal of the directory tree rooted at tdir.

    Nothing happens if tdir is not an existing directory. Errors raised while
    checking or deleting are ignored, but KeyboardInterrupt and SystemExit are
    allowed to propagate (the previous bare except swallowed them too).

    @param tdir: Path of the directory to delete
    '''
    try:
        if os.path.isdir(tdir):
            shutil.rmtree(tdir)
    except Exception:
        # Failing to delete a temporary directory must not abort the caller
        pass

View File

@ -1,38 +0,0 @@
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class AssociatedPress(DefaultProfile):
    '''
    Profile for the Associated Press RSS feeds hosted on hosted.ap.org.
    '''

    title = 'Associated Press'
    max_recursions = 2
    max_articles_per_feed = 15
    html2lrf_options = ['--force-page-break-before-tag="chapter"']

    # (pattern, replacement function) pairs; every pattern is compiled case
    # insensitively with '.' matching newlines.
    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            (r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
            # The parentheses of setup(null) must be escaped, otherwise they
            # form a regex group and the literal onLoad attribute never matches.
            (r'<body class="apple-rss-no-unread-mode" onLoad="setup\(null\)">.*?<!-- start Entries -->', lambda match : '<body>'),
            (r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
            (r'<script.*?>.*?</script>', lambda match : ''),
            # Insert the <chapter> tag that html2lrf_options breaks pages on
            (r'<body.*?>.*?<span class="headline">', lambda match : '<body><span class="headline"><chapter>'),
            (r'<tr><td><div class="body">.*?<p class="ap-story-p">', lambda match : '<p class="ap-story-p">'),
            (r'<p class="ap-story-p">', lambda match : '<p>'),
            (r'Learn more about our <a href="http://apdigitalnews.com/privacy.html">Privacy Policy</a>.*?</body>', lambda match : '</body>'),
        ]
    ]

    def get_feeds(self):
        '''
        Return the feeds to download.

        @return: list of (feed title, feed URL) tuples
        '''
        return [ ('AP Headlines', 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=ORAST&SECTION=HOME'),
                 ('AP US News', 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=CAVIC&SECTION=HOME'),
                 ('AP World News', 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml?SITE=SCAND&SECTION=HOME'),
                 ('AP Political News', 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml?SITE=ORMED&SECTION=HOME'),
                 ('AP Washington State News', 'http://hosted.ap.org/lineups/WASHINGTONHEADS-rss_2.0.xml?SITE=NYPLA&SECTION=HOME'),
                 ('AP Technology News', 'http://hosted.ap.org/lineups/TECHHEADS-rss_2.0.xml?SITE=CTNHR&SECTION=HOME'),
                 ('AP Health News', 'http://hosted.ap.org/lineups/HEALTHHEADS-rss_2.0.xml?SITE=FLDAY&SECTION=HOME'),
                 ('AP Science News', 'http://hosted.ap.org/lineups/SCIENCEHEADS-rss_2.0.xml?SITE=OHCIN&SECTION=HOME'),
                 ('AP Strange News', 'http://hosted.ap.org/lineups/STRANGEHEADS-rss_2.0.xml?SITE=WCNC&SECTION=HOME'),
               ]

View File

@ -1,47 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Atlantic(DefaultProfile):
    '''
    Profile for The Atlantic. The article list is scraped from the current
    issue page (INDEX) rather than read from an RSS feed.
    '''

    title = 'The Atlantic'
    max_recursions = 2
    INDEX = 'http://www.theatlantic.com/doc/current'

    preprocess_regexps = [
        (re.compile(r'<body.*?<div id="storytop"', re.DOTALL|re.IGNORECASE),
         lambda m: '<body><div id="storytop"')
    ]

    def parse_feeds(self):
        '''
        Scrape the current issue page for articles.

        Also sets self.timefmt from the text of the "issue" span, when present.

        @return: dict with the single key 'Current Issue' mapped to a list of
                 article dicts (keys: title, date, url, description)
        '''
        found = []
        index_html = self.browser.open(self.INDEX).read()
        soup = BeautifulSoup(index_html)

        issue = soup.find('span', attrs={'class':'issue'})
        if issue:
            # Use whatever follows the last '|' of the issue text as the date
            issue_date = self.tag_to_string(issue).rpartition('|')[-1].strip()
            self.timefmt = ' [%s]'%issue_date.replace('/', '-')

        for item in soup.findAll('div', attrs={'class':'item'}):
            link = item.find('a')
            if not (link and link.has_key('href')):
                continue
            # Point at the print version of the article
            url = 'http://www.theatlantic.com/'+link['href'].replace('/doc', 'doc/print')
            title = self.tag_to_string(link)
            byline = item.find(attrs={'class':'byline'})
            if byline:
                date = self.tag_to_string(byline)
            else:
                date = ''
            found.append({
                'title':title,
                'date':date,
                'url':url,
                'description':''
            })

        return {'Current Issue' : found }

View File

@ -1,75 +0,0 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os
from calibre.ebooks.lrf.web.profiles import DefaultProfile
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre import iswindows
from calibre.ebooks.chardet import xml_to_unicode
class AutomaticRSSProfile(DefaultProfile):
    '''
    Make downloading of RSS feeds completely automatic. Only input
    required is the URL of the feed.
    '''

    max_recursions = 2

    def __init__(self, *args, **kwargs):
        # Counter used to number the generated content%d.html files
        self.cindex = 1
        # The base class initializer is called unbound, so the instance has to
        # be passed explicitly (it was previously omitted).
        DefaultProfile.__init__(self, *args, **kwargs)

    def fetch_content(self, index):
        '''
        Replace every link in the HTML file index by a link to a local,
        simplified copy of the page it points to.

        Links to readable local files are processed recursively. For every
        other link the target is downloaded and its <style> tags plus its
        paragraphs and headings that are not inside a table are saved as
        content%d.html next to index. Links that cannot be downloaded or that
        have no <body> are left unchanged. Finally index is rewritten.

        @param index: Path to the HTML file to process
        '''
        with open(index, 'rb') as f:
            raw = f.read()
        if self.encoding:
            raw = raw.decode(self.encoding)
            enc = self.encoding
        else:
            raw, enc = xml_to_unicode(raw)
        isoup = BeautifulSoup(raw)
        for a in isoup.findAll('a', href=True):
            src = a['href']
            if src.startswith('file:'):
                src = src[5:]
            if os.access(src, os.R_OK):
                # A local file (e.g. a category index): recurse into it
                self.fetch_content(src)
                continue
            try:
                src = self.browser.open(src).read()
            except Exception:
                # Best effort: skip links that cannot be fetched
                continue
            soup = BeautifulSoup(src)
            header, content = [], []
            head = soup.find('head')
            if head is not None:
                for style in head('style'):
                    header.append(unicode(style))
            body = soup.find('body')
            if body is None:
                continue
            for tag in body(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
                # Ignore text that lives anywhere inside a table
                in_table = False
                c = tag.parent
                while c is not None:
                    if c.name == 'table':
                        in_table = True
                        break
                    c = c.parent
                if in_table:
                    continue
                content.append(unicode(tag))

            cfile = 'content%d.html'%self.cindex
            self.cindex += 1
            cfile = os.path.join(os.path.dirname(index), cfile)
            html = '<html>\n<head>%s</head>\n<body>%s</body></html>'%('\n'.join(header), '\n'.join(content))

            with open(cfile, 'wb') as f:
                f.write(html.encode(enc))
            a['href'] = ('file:' if iswindows else '') + cfile
        with open(index, 'wb') as f:
            f.write(unicode(isoup).encode(enc))

    def build_index(self):
        '''
        Build the index via DefaultProfile.build_index() and then localize the
        pages it links to with fetch_content().

        @return: Whatever DefaultProfile.build_index() returned (the value is
                 used here as the path of the index file)
        '''
        index = DefaultProfile.build_index(self)
        self.fetch_content(index)
        # Previously nothing was returned, so callers received None
        return index

View File

@ -1,90 +0,0 @@
##
## web2lrf profile to download articles from Barrons.com
## can download subscriber-only content if username and
## password are supplied.
##
'''
'''
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class Barrons(DefaultProfile):
    '''
    Profile for Barrons.com. Logs in first when a username and password are
    set on the profile.
    '''

    title = "Barron's"
    max_recursions = 3
    max_articles_per_feed = 50
    needs_subscription = True
    timefmt = ' [%a, %b %d, %Y]'
    html_description = True
    no_stylesheets = False
    match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
    html2lrf_options = ['--ignore-tables', '--base-font-size=10']
    ##delay = 1

    ## Don't grab articles more than 7 days old
    oldest_article = 7

    ## (pattern, replacement function) pairs, compiled case insensitively
    ## with '.' matching newlines.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), handler)
        for pattern, handler in
        [
            ## Remove anything before the body of the article.
            (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),

            ## Remove any insets from the body of the article.
            (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),

            ## Remove any reprint info from the body of the article.
            (r'<hr size.*?<p', lambda match : '<p'),

            ## Remove anything after the end of the article.
            (r'<!-- article end.*?</body>', lambda match : '</body>'),
        ]
    ]

    def get_browser(self):
        '''
        Return a browser, logged in to barrons.com when both self.username
        and self.password are set.
        '''
        # NOTE(review): called without self -- this only works if
        # DefaultProfile.get_browser does not need an instance; confirm.
        br = DefaultProfile.get_browser()
        if self.username is None or self.password is None:
            return br
        br.open('http://commerce.barrons.com/auth/login')
        br.select_form(name='login_form')
        br['user'] = self.username
        br['password'] = self.password
        br.submit()
        return br

    ## Use the print version of a page when available.
    def print_version(self, url):
        printable = url.replace('/article/', '/article_print/')
        return printable

    ## Comment out the feeds you don't want retrieved.
    ## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
    def get_feeds(self):
        base = 'http://online.barrons.com/xml/rss/'
        return [
            ("This Week's Magazine", base + '3_7510.xml'),
            ('Online Exclusives', base + '3_7515.xml'),
            ('Companies', base + '3_7516.xml'),
            ('Markets', base + '3_7517.xml'),
            ('Technology', base + '3_7518.xml'),
            ('Funds/Q&A', base + '3_7519.xml'),
        ]

    ## Logout of website
    ## NOT CURRENTLY WORKING
    # def cleanup(self):
    #     try:
    #         self.browser.set_debug_responses(True)
    #         import sys, logging
    #         logger = logging.getLogger("mechanize")
    #         logger.addHandler(logging.StreamHandler(sys.stdout))
    #         logger.setLevel(logging.INFO)
    #         res = self.browser.open('http://online.barrons.com/logout')
    #     except:
    #         import traceback
    #         traceback.print_exc()

View File

@ -1,45 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetch the BBC.
'''
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class BBC(DefaultProfile):
    '''
    Profile for BBC News. The list of feeds is scraped from the BBC help page
    that lists its RSS feeds.
    '''

    title = 'The BBC'
    max_recursions = 2
    timefmt = ' [%a, %d %b, %Y]'
    no_stylesheets = True

    # (pattern, replacement function) pairs, compiled case insensitively with
    # '.' matching newlines.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), handler)
        for pattern, handler in
        [
            # Remove footer from individual stories
            (r'<div class=.footer.>.*?Published',
             lambda match : '<p></p><div class="footer">Published'),
            # Add some style info in place of disabled stylesheet
            (r'<link.*?type=.text/css.*?>',
             lambda match : '<style type="text/css">\n'
                            '.headline {font-size: x-large;}\n'
                            '.fact { padding-top: 10pt }\n'
                            '</style>'),
        ]
    ]

    def print_version(self, url):
        '''Return the URL of the printable version of the page at url.'''
        print_prefix = 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/'
        return url.replace('http://', print_prefix)

    def get_feeds(self):
        '''
        Scrape the feed list from the <ul class="rss"> element of the help page.

        @return: list of (link text, link href) tuples
        '''
        page = self.browser.open('http://news.bbc.co.uk/1/hi/help/3223484.stm').read()
        # Parse from the opening <html tag onwards, dropping whatever precedes it
        soup = BeautifulSoup(page[page.index('<html'):])
        rss_list = soup.find('ul', attrs={'class':'rss'})
        return [(link.string, link['href']) for link in rss_list.findAll('a')]

View File

@ -1,46 +0,0 @@
import re, time
from calibre.ebooks.lrf.web.profiles import DefaultProfile
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class ChristianScienceMonitor(DefaultProfile):
    '''
    Profile for the Christian Science Monitor. Articles are scraped from the
    text edition page rather than read from an RSS feed.
    '''

    title = 'Christian Science Monitor'
    max_recursions = 2
    max_articles_per_feed = 20
    no_stylesheets = True

    # (pattern, replacement function) pairs, compiled case insensitively with
    # '.' matching newlines.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), handler)
        for pattern, handler in
        [
            (r'<body.*?<div id="story"', lambda match : '<body><div id="story"'),
            (r'<div class="pubdate">.*?</div>', lambda m: ''),
            (r'Full HTML version of this story which may include photos, graphics, and related links.*</body>',
             lambda match : '</body>'),
        ]
    ]

    def parse_feeds(self):
        '''
        Scrape the text edition: every <h2> starts a new section and every
        <p class="story"> containing a link becomes an article of the current
        section.

        @return: dict mapping section title to a list of article dicts
                 (keys: title, url, date, content, description)
        '''
        soup = BeautifulSoup(self.browser.open('http://www.csmonitor.com/textedition'))
        sections = {}
        # Stories seen before the first <h2> land in this list, which is not
        # part of the returned dict.
        current = []
        for tag in soup.findAll(['h2', 'p']):
            if tag.name == 'h2':
                heading = self.tag_to_string(tag)
                current = []
                sections[heading] = current
                continue
            if not (tag.has_key('class') and tag['class'] == 'story'):
                continue
            link = tag.find('a')
            if link is None or not link.has_key('href'):
                continue
            entry = {
                'title': self.tag_to_string(link),
                'url' : 'http://www.csmonitor.com'+link['href'],
                'date' : time.strftime('%d %b'),
                'content' : '',
            }
            current.append(entry)
            # With the link removed, the remaining text is the description
            link.extract()
            entry['description'] = self.tag_to_string(tag).strip()
        return sections

View File

@ -1,51 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Profile to download CNN
'''
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class CNN(DefaultProfile):
    '''
    Profile for the CNN RSS feeds. Articles are fetched through the
    clickability "print this" service.
    '''

    title = 'CNN'
    max_recursions = 2
    timefmt = ' [%d %b %Y]'
    html_description = True
    no_stylesheets = True
    oldest_article = 15

    # (pattern, replacement function) pairs, compiled case insensitively with
    # '.' matching newlines.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), handler)
        for pattern, handler in [
            (r'<head>.*?<title', lambda match : '<head><title'),
            (r'</title>.*?</head>', lambda match : '</title></head>'),
            (r'<body.*?<\!\-\-Article.*?>', lambda match : ''),
            (r'<\!\-\-Article End\-\->.*?</body>', lambda match : '</body>'),
            # drop story highlights
            (r'(</h\d>)<ul>.*?</ul>', lambda match : match.group(1)),
            # sports uses h2 for main title and h1 for subtitle (???) switch these around
            (r'<h2>(.*?)</h2><h1>(.*?)</h1>',
             lambda match : '<h1>' + match.group(1) + '</h1><h2>' + match.group(2) + '</h2>'),
            # drop 'watch more' links
            (r'<span class="cnnEmbeddedMosLnk">.*?</span>', lambda match : ''),
            # drop sports photos
            (r'(<div class="cnnstorybody">).*?(<p)', lambda match : match.group(1) + match.group(2)),
            # drop table formatting
            (r'</?table.*?>|</?tr.*?>|</?td.*?>', lambda match : ''),
            # drop extra business links
            (r'<div class="cnnendofstorycontent".*?>.*?</div>', lambda match : ''),
            # drop business 'to top' link
            (r'<a href="#TOP">.*?</a>', lambda match : ''),
        ]
    ]

    def print_version(self, url):
        '''Return the "print this" URL for the article at url.'''
        service = 'http://www.printthis.clickability.com/pt/printThis?clickMap=printThis&fb=Y&url='
        return service + url

    def get_feeds(self):
        '''
        Return the feeds to download.

        @return: list of (feed title, feed URL) tuples
        '''
        base = 'http://rss.cnn.com/rss/'
        return [
            ('Top News', base + 'cnn_topstories.rss'),
            ('World', base + 'cnn_world.rss'),
            ('U.S.', base + 'cnn_us.rss'),
            ('Sports', base + 'si_topstories.rss'),
            ('Business', base + 'money_latest.rss'),
            ('Politics', base + 'cnn_allpolitics.rss'),
            ('Law', base + 'cnn_law.rss'),
            ('Technology', base + 'cnn_tech.rss'),
            ('Science & Space', base + 'cnn_space.rss'),
            ('Health', base + 'cnn_health.rss'),
            ('Entertainment', base + 'cnn_showbiz.rss'),
            ('Education', base + 'cnn_education.rss'),
            ('Offbeat', base + 'cnn_offbeat.rss'),
            ('Most Popular', base + 'cnn_mostpopular.rss'),
        ]

Some files were not shown because too many files have changed in this diff Show More