diff --git a/.bzrignore b/.bzrignore index 47f754ef3c..0a44159b1e 100644 --- a/.bzrignore +++ b/.bzrignore @@ -13,6 +13,7 @@ src/calibre/manual/cli/ build dist docs +nbproject/ src/calibre/gui2/pictureflow/Makefile.Debug src/calibre/gui2/pictureflow/Makefile.Release src/calibre/gui2/pictureflow/debug/ diff --git a/.pydevproject b/.pydevproject index aaa4cc3986..509137a36a 100644 --- a/.pydevproject +++ b/.pydevproject @@ -2,9 +2,9 @@ -python 2.5 +python 2.6 -/calibre/src +/calibre-pluginize/src Default diff --git a/setup.py b/setup.py index cba8c17610..ee2d54cc5a 100644 --- a/setup.py +++ b/setup.py @@ -72,6 +72,9 @@ if __name__ == '__main__': library_dirs=[os.environ.get('PODOFO_LIB_DIR', podofo_lib)], include_dirs=\ [os.environ.get('PODOFO_INC_DIR', podofo_inc)])) + else: + print 'WARNING: PoDoFo not found on your system. Various PDF related', + print 'functionality will not work.' ext_modules = optional + [ @@ -88,6 +91,9 @@ if __name__ == '__main__': 'src/calibre/utils/msdes/des.c'], include_dirs=['src/calibre/utils/msdes']), + Extension('calibre.plugins.cPalmdoc', + sources=['src/calibre/ebooks/compression/palmdoc.c']), + PyQtExtension('calibre.plugins.pictureflow', ['src/calibre/gui2/pictureflow/pictureflow.cpp', 'src/calibre/gui2/pictureflow/pictureflow.h'], diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index a0dc41009a..79dc659f34 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -2,11 +2,11 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' __docformat__ = 'restructuredtext en' - -import sys, os, re, logging, time, subprocess, atexit, mimetypes, warnings +import sys, os, re, logging, time, mimetypes, \ + __builtin__, warnings, multiprocessing +__builtin__.__dict__['dynamic_property'] = lambda(func): func(None) from htmlentitydefs import name2codepoint from math import floor -from logging import Formatter warnings.simplefilter('ignore', DeprecationWarning) @@ -45,6 +45,13 @@ def to_unicode(raw, 
encoding='utf-8', errors='strict'): return raw return raw.decode(encoding, errors) +def patheq(p1, p2): + p = os.path + d = lambda x : p.normcase(p.normpath(p.realpath(p.normpath(x)))) + if not p1 or not p2: + return False + return d(p1) == d(p2) + def unicode_path(path, abs=False): if not isinstance(path, unicode): path = path.decode(sys.getfilesystemencoding()) @@ -71,7 +78,7 @@ def sanitize_file_name(name, substitute='_', as_unicode=False): **WARNING:** This function also replaces path separators, so only pass file names and not full paths to it. *NOTE:* This function always returns byte strings, not unicode objects. The byte strings - are encoded in the filesystem encoding of the platform, or UTF-8. + are encoded in the filesystem encoding of the platform, or UTF-8. ''' if isinstance(name, unicode): name = name.encode(filesystem_encoding, 'ignore') @@ -83,26 +90,33 @@ def sanitize_file_name(name, substitute='_', as_unicode=False): return one.replace('..', '_') +def prints(*args, **kwargs): + ''' + Print unicode arguments safely by encoding them to preferred_encoding + Has the same signature as the print function from Python 3. 
+ ''' + file = kwargs.get('file', sys.stdout) + sep = kwargs.get('sep', ' ') + end = kwargs.get('end', '\n') + enc = preferred_encoding + if 'CALIBRE_WORKER' in os.environ: + enc = 'utf-8' + for i, arg in enumerate(args): + if isinstance(arg, unicode): + arg = arg.encode(enc) + if not isinstance(arg, str): + arg = str(arg) + if not isinstance(arg, unicode): + arg = arg.decode(preferred_encoding, 'replace').encode(enc) + file.write(arg) + if i != len(args)-1: + file.write(sep) + file.write(end) + class CommandLineError(Exception): pass -class ColoredFormatter(Formatter): - def format(self, record): - ln = record.__dict__['levelname'] - col = '' - if ln == 'CRITICAL': - col = terminal_controller.YELLOW - elif ln == 'ERROR': - col = terminal_controller.RED - elif ln in ['WARN', 'WARNING']: - col = terminal_controller.BLUE - elif ln == 'INFO': - col = terminal_controller.GREEN - elif ln == 'DEBUG': - col = terminal_controller.CYAN - record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL - return Formatter.format(self, record) def setup_cli_handlers(logger, level): @@ -157,7 +171,7 @@ def extract(path, dir): def get_proxies(): proxies = {} - + for q in ('http', 'ftp'): proxy = os.environ.get(q+'_proxy', None) if not proxy: continue @@ -192,8 +206,8 @@ def get_proxies(): def browser(honor_time=True, max_time=2, mobile_browser=False): ''' Create a mechanize browser for web scraping. The browser handles cookies, - refresh requests and ignores robots.txt. Also uses proxy if avaialable. - + refresh requests and ignores robots.txt. Also uses proxy if avaialable. 
+ :param honor_time: If True honors pause time in refresh requests :param max_time: Maximum time in seconds to wait during a refresh request ''' @@ -230,21 +244,21 @@ def fit_image(width, height, pwidth, pheight): return scaled, int(width), int(height) class CurrentDir(object): - + def __init__(self, path): self.path = path self.cwd = None - + def __enter__(self, *args): self.cwd = os.getcwd() os.chdir(self.path) return self.cwd - + def __exit__(self, *args): os.chdir(self.cwd) -class FileWrapper(object): +class StreamReadWrapper(object): ''' Used primarily with pyPdf to ensure the stream is properly closed. ''' @@ -263,40 +277,7 @@ class FileWrapper(object): def detect_ncpus(): """Detects the number of effective CPUs in the system""" - try: - from PyQt4.QtCore import QThread - ans = QThread.idealThreadCount() - if ans > 0: - return ans - except: - pass - #for Linux, Unix and MacOS - if hasattr(os, "sysconf"): - if os.sysconf_names.has_key("SC_NPROCESSORS_ONLN"): - #Linux and Unix - ncpus = os.sysconf("SC_NPROCESSORS_ONLN") - if isinstance(ncpus, int) and ncpus > 0: - return ncpus - else: - #MacOS X - try: - return int(subprocess.Popen(('sysctl', '-n', 'hw.cpu'), stdout=subprocess.PIPE).stdout.read()) - except IOError: # Occassionally the system call gets interrupted - try: - return int(subprocess.Popen(('sysctl', '-n', 'hw.cpu'), stdout=subprocess.PIPE).stdout.read()) - except IOError: - return 1 - except ValueError: # On some systems the sysctl call fails - return 1 - - #for Windows - if os.environ.has_key("NUMBER_OF_PROCESSORS"): - ncpus = int(os.environ["NUMBER_OF_PROCESSORS"]); - if ncpus > 0: - return ncpus - #return the default value - return 1 - + return multiprocessing.cpu_count() def launch(path_or_url): if os.path.exists(path_or_url): @@ -343,67 +324,6 @@ def english_sort(x, y): ''' return cmp(_spat.sub('', x), _spat.sub('', y)) -class LoggingInterface: - - def __init__(self, logger): - self.__logger = self.logger = logger - - def setup_cli_handler(self, 
verbosity): - for handler in self.__logger.handlers: - if isinstance(handler, logging.StreamHandler): - return - if os.environ.get('CALIBRE_WORKER', None) is not None and self.__logger.handlers: - return - stream = sys.stdout - formatter = logging.Formatter() - level = logging.INFO - if verbosity > 0: - formatter = ColoredFormatter('[%(levelname)s] %(message)s') if verbosity > 1 else \ - ColoredFormatter('%(levelname)s: %(message)s') - level = logging.DEBUG - if verbosity > 1: - stream = sys.stderr - - handler = logging.StreamHandler(stream) - handler.setFormatter(formatter) - handler.setLevel(level) - self.__logger.addHandler(handler) - self.__logger.setLevel(level) - - - def ___log(self, func, msg, args, kwargs): - args = [msg] + list(args) - for i in range(len(args)): - if not isinstance(args[i], basestring): - continue - if sys.version_info[:2] > (2, 5): - if not isinstance(args[i], unicode): - args[i] = args[i].decode(preferred_encoding, 'replace') - elif isinstance(args[i], unicode): - args[i] = args[i].encode(preferred_encoding, 'replace') - func(*args, **kwargs) - - def log_debug(self, msg, *args, **kwargs): - self.___log(self.__logger.debug, msg, args, kwargs) - - def log_info(self, msg, *args, **kwargs): - self.___log(self.__logger.info, msg, args, kwargs) - - def log_warning(self, msg, *args, **kwargs): - self.___log(self.__logger.warning, msg, args, kwargs) - - def log_warn(self, msg, *args, **kwargs): - self.___log(self.__logger.warning, msg, args, kwargs) - - def log_error(self, msg, *args, **kwargs): - self.___log(self.__logger.error, msg, args, kwargs) - - def log_critical(self, msg, *args, **kwargs): - self.___log(self.__logger.critical, msg, args, kwargs) - - def log_exception(self, msg, *args): - self.___log(self.__logger.exception, msg, args, {}) - def walk(dir): ''' A nice interface to os.walk ''' for record in os.walk(dir): diff --git a/src/calibre/constants.py b/src/calibre/constants.py index 52f85cc20c..e03d5e5edc 100644 --- 
a/src/calibre/constants.py +++ b/src/calibre/constants.py @@ -53,7 +53,7 @@ if plugins is None: plugin_path = getattr(pkg_resources, 'resource_filename')('calibre', 'plugins') sys.path.insert(0, plugin_path) - for plugin in ['pictureflow', 'lzx', 'msdes', 'podofo'] + \ + for plugin in ['pictureflow', 'lzx', 'msdes', 'podofo', 'cPalmdoc'] + \ (['winutil'] if iswindows else []) + \ (['usbobserver'] if isosx else []): try: diff --git a/src/calibre/customize/__init__.py b/src/calibre/customize/__init__.py index 3d48f42535..0e6bad8d2e 100644 --- a/src/calibre/customize/__init__.py +++ b/src/calibre/customize/__init__.py @@ -220,4 +220,5 @@ class MetadataWriterPlugin(Plugin): ''' pass - + + diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index d4470b16fd..d107413e38 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -1,8 +1,9 @@ -from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import textwrap, os +import textwrap +import os +import glob from calibre.customize import FileTypePlugin, MetadataReaderPlugin, MetadataWriterPlugin from calibre.constants import __version__ @@ -18,170 +19,41 @@ every time you add an HTML file to the library.\ file_types = set(['html', 'htm', 'xhtml', 'xhtm']) supported_platforms = ['windows', 'osx', 'linux'] on_import = True - + def run(self, htmlfile): - of = self.temporary_file('_plugin_html2zip.zip') - from calibre.ebooks.html import gui_main as html2oeb - html2oeb(htmlfile, of) + from calibre.ptempfile import TemporaryDirectory + from calibre.gui2.convert.gui_conversion import gui_convert + from calibre.customize.conversion import OptionRecommendation + from calibre.ebooks.epub import initialize_container + + with TemporaryDirectory('_plugin_html2zip') as tdir: + gui_convert(htmlfile, tdir, [('debug_input', tdir, + OptionRecommendation.HIGH)]) + of = self.temporary_file('_plugin_html2zip.zip') + opf = 
glob.glob(os.path.join(tdir, '*.opf'))[0] + ncx = glob.glob(os.path.join(tdir, '*.ncx')) + if ncx: + os.remove(ncx[0]) + epub = initialize_container(of.name, os.path.basename(opf)) + epub.add_dir(tdir) + epub.close() + return of.name -class OPFMetadataReader(MetadataReaderPlugin): - - name = 'Read OPF metadata' - file_types = set(['opf']) - description = _('Read metadata from %s files')%'OPF' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.opf2 import OPF - from calibre.ebooks.metadata import MetaInformation - return MetaInformation(OPF(stream, os.getcwd())) - -class RTFMetadataReader(MetadataReaderPlugin): - - name = 'Read RTF metadata' - file_types = set(['rtf']) - description = _('Read metadata from %s files')%'RTF' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.rtf import get_metadata - return get_metadata(stream) - -class FB2MetadataReader(MetadataReaderPlugin): - - name = 'Read FB2 metadata' - file_types = set(['fb2']) - description = _('Read metadata from %s files')%'FB2' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.fb2 import get_metadata - return get_metadata(stream) - - -class LRFMetadataReader(MetadataReaderPlugin): - - name = 'Read LRF metadata' - file_types = set(['lrf']) - description = _('Read metadata from %s files')%'LRF' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.lrf.meta import get_metadata - return get_metadata(stream) - -class PDFMetadataReader(MetadataReaderPlugin): - - name = 'Read PDF metadata' - file_types = set(['pdf']) - description = _('Read metadata from %s files')%'PDF' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.pdf import get_metadata - return get_metadata(stream) - -class LITMetadataReader(MetadataReaderPlugin): - - name = 'Read LIT metadata' - file_types = set(['lit']) - description = _('Read metadata from %s files')%'LIT' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.lit 
import get_metadata - return get_metadata(stream) - -class IMPMetadataReader(MetadataReaderPlugin): - - name = 'Read IMP metadata' - file_types = set(['imp']) - description = _('Read metadata from %s files')%'IMP' - author = 'Ashish Kulkarni' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.imp import get_metadata - return get_metadata(stream) - -class RBMetadataReader(MetadataReaderPlugin): - - name = 'Read RB metadata' - file_types = set(['rb']) - description = _('Read metadata from %s files')%'RB' - author = 'Ashish Kulkarni' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.rb import get_metadata - return get_metadata(stream) - -class EPUBMetadataReader(MetadataReaderPlugin): - - name = 'Read EPUB metadata' - file_types = set(['epub']) - description = _('Read metadata from %s files')%'EPUB' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.epub import get_metadata - return get_metadata(stream) - -class HTMLMetadataReader(MetadataReaderPlugin): - - name = 'Read HTML metadata' - file_types = set(['html']) - description = _('Read metadata from %s files')%'HTML' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.html import get_metadata - return get_metadata(stream) - -class MOBIMetadataReader(MetadataReaderPlugin): - - name = 'Read MOBI metadata' - file_types = set(['mobi', 'prc', 'azw']) - description = _('Read metadata from %s files')%'MOBI' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.mobi.reader import get_metadata - return get_metadata(stream) - - -class TOPAZMetadataReader(MetadataReaderPlugin): - - name = 'Read Topaz metadata' - file_types = set(['tpz', 'azw1']) - description = _('Read metadata from %s files')%'MOBI' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.topaz import get_metadata - return get_metadata(stream) - -class ODTMetadataReader(MetadataReaderPlugin): - - name = 'Read ODT metadata' - file_types = 
set(['odt']) - description = _('Read metadata from %s files')%'ODT' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.odt import get_metadata - return get_metadata(stream) - -class LRXMetadataReader(MetadataReaderPlugin): - - name = 'Read LRX metadata' - file_types = set(['lrx']) - description = _('Read metadata from %s files')%'LRX' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.lrx import get_metadata - return get_metadata(stream) class ComicMetadataReader(MetadataReaderPlugin): - + name = 'Read comic metadata' file_types = set(['cbr', 'cbz']) description = _('Extract cover from comic files') - + def get_metadata(self, stream, ftype): if ftype == 'cbr': from calibre.libunrar import extract_member as extract_first + extract_first else: from calibre.libunzip import extract_member as extract_first - from calibre.ebooks.metadata import MetaInformation + from calibre.ebooks.metadata import MetaInformation ret = extract_first(stream) mi = MetaInformation(None, None) if ret is not None: @@ -189,83 +61,346 @@ class ComicMetadataReader(MetadataReaderPlugin): ext = os.path.splitext(path)[1][1:] mi.cover_data = (ext.lower(), data) return mi - -class ZipMetadataReader(MetadataReaderPlugin): - - name = 'Read ZIP metadata' - file_types = set(['zip', 'oebzip']) - description = _('Read metadata from ebooks in ZIP archives') - + +class EPUBMetadataReader(MetadataReaderPlugin): + + name = 'Read EPUB metadata' + file_types = set(['epub']) + description = _('Read metadata from %s files')%'EPUB' + def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.zip import get_metadata + from calibre.ebooks.metadata.epub import get_metadata + return get_metadata(stream) + +class FB2MetadataReader(MetadataReaderPlugin): + + name = 'Read FB2 metadata' + file_types = set(['fb2']) + description = _('Read metadata from %s files')%'FB2' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.fb2 import get_metadata + 
return get_metadata(stream) + +class HTMLMetadataReader(MetadataReaderPlugin): + + name = 'Read HTML metadata' + file_types = set(['html']) + description = _('Read metadata from %s files')%'HTML' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.html import get_metadata + return get_metadata(stream) + +class IMPMetadataReader(MetadataReaderPlugin): + + name = 'Read IMP metadata' + file_types = set(['imp']) + description = _('Read metadata from %s files')%'IMP' + author = 'Ashish Kulkarni' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.imp import get_metadata + return get_metadata(stream) + +class LITMetadataReader(MetadataReaderPlugin): + + name = 'Read LIT metadata' + file_types = set(['lit']) + description = _('Read metadata from %s files')%'LIT' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.lit import get_metadata + return get_metadata(stream) + +class LRFMetadataReader(MetadataReaderPlugin): + + name = 'Read LRF metadata' + file_types = set(['lrf']) + description = _('Read metadata from %s files')%'LRF' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.lrf.meta import get_metadata + return get_metadata(stream) + +class LRXMetadataReader(MetadataReaderPlugin): + + name = 'Read LRX metadata' + file_types = set(['lrx']) + description = _('Read metadata from %s files')%'LRX' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.lrx import get_metadata + return get_metadata(stream) + +class MOBIMetadataReader(MetadataReaderPlugin): + + name = 'Read MOBI metadata' + file_types = set(['mobi', 'prc', 'azw']) + description = _('Read metadata from %s files')%'MOBI' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.mobi.reader import get_metadata + return get_metadata(stream) + +class ODTMetadataReader(MetadataReaderPlugin): + + name = 'Read ODT metadata' + file_types = set(['odt']) + description = _('Read metadata from %s files')%'ODT' + + def 
get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.odt import get_metadata + return get_metadata(stream) + +class OPFMetadataReader(MetadataReaderPlugin): + + name = 'Read OPF metadata' + file_types = set(['opf']) + description = _('Read metadata from %s files')%'OPF' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.opf2 import OPF + from calibre.ebooks.metadata import MetaInformation + return MetaInformation(OPF(stream, os.getcwd())) + +class PDBMetadataReader(MetadataReaderPlugin): + + name = 'Read PDB metadata' + file_types = set(['pdb']) + description = _('Read metadata from %s files') % 'PDB' + author = 'John Schember' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.pdb import get_metadata + return get_metadata(stream) + +class PDFMetadataReader(MetadataReaderPlugin): + + name = 'Read PDF metadata' + file_types = set(['pdf']) + description = _('Read metadata from %s files')%'PDF' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.pdf import get_metadata return get_metadata(stream) class RARMetadataReader(MetadataReaderPlugin): - + name = 'Read RAR metadata' file_types = set(['rar']) description = _('Read metadata from ebooks in RAR archives') - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.rar import get_metadata return get_metadata(stream) +class RBMetadataReader(MetadataReaderPlugin): + + name = 'Read RB metadata' + file_types = set(['rb']) + description = _('Read metadata from %s files')%'RB' + author = 'Ashish Kulkarni' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.rb import get_metadata + return get_metadata(stream) + +class RTFMetadataReader(MetadataReaderPlugin): + + name = 'Read RTF metadata' + file_types = set(['rtf']) + description = _('Read metadata from %s files')%'RTF' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.rtf import get_metadata + return get_metadata(stream) + +class 
TOPAZMetadataReader(MetadataReaderPlugin): + + name = 'Read Topaz metadata' + file_types = set(['tpz', 'azw1']) + description = _('Read metadata from %s files')%'MOBI' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.topaz import get_metadata + return get_metadata(stream) + +class TXTMetadataReader(MetadataReaderPlugin): + + name = 'Read TXT metadata' + file_types = set(['txt']) + description = _('Read metadata from %s files') % 'TXT' + author = 'John Schember' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.txt import get_metadata + return get_metadata(stream) + +class ZipMetadataReader(MetadataReaderPlugin): + + name = 'Read ZIP metadata' + file_types = set(['zip', 'oebzip']) + description = _('Read metadata from ebooks in ZIP archives') + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.zip import get_metadata + return get_metadata(stream) + class EPUBMetadataWriter(MetadataWriterPlugin): - + name = 'Set EPUB metadata' file_types = set(['epub']) description = _('Set metadata in %s files')%'EPUB' - + def set_metadata(self, stream, mi, type): from calibre.ebooks.metadata.epub import set_metadata set_metadata(stream, mi) - + class LRFMetadataWriter(MetadataWriterPlugin): - + name = 'Set LRF metadata' file_types = set(['lrf']) description = _('Set metadata in %s files')%'LRF' - + def set_metadata(self, stream, mi, type): from calibre.ebooks.lrf.meta import set_metadata set_metadata(stream, mi) -class RTFMetadataWriter(MetadataWriterPlugin): - - name = 'Set RTF metadata' - file_types = set(['rtf']) - description = _('Set metadata in %s files')%'RTF' - - def set_metadata(self, stream, mi, type): - from calibre.ebooks.metadata.rtf import set_metadata - set_metadata(stream, mi) - class MOBIMetadataWriter(MetadataWriterPlugin): - + name = 'Set MOBI metadata' file_types = set(['mobi', 'prc', 'azw']) description = _('Set metadata in %s files')%'MOBI' author = 'Marshall T. 
Vandegrift' - + def set_metadata(self, stream, mi, type): from calibre.ebooks.metadata.mobi import set_metadata set_metadata(stream, mi) - + +class PDBMetadataWriter(MetadataWriterPlugin): + + name = 'Set PDB metadata' + file_types = set(['pdb']) + description = _('Set metadata from %s files') % 'PDB' + author = 'John Schember' + + def set_metadata(self, stream, mi, type): + from calibre.ebooks.metadata.pdb import set_metadata + set_metadata(stream, mi) + class PDFMetadataWriter(MetadataWriterPlugin): name = 'Set PDF metadata' file_types = set(['pdf']) description = _('Set metadata in %s files') % 'PDF' - author = 'John Schember' - + author = 'Kovid Goyal' + def set_metadata(self, stream, mi, type): from calibre.ebooks.metadata.pdf import set_metadata set_metadata(stream, mi) +class RTFMetadataWriter(MetadataWriterPlugin): -plugins = [HTML2ZIP] + name = 'Set RTF metadata' + file_types = set(['rtf']) + description = _('Set metadata in %s files')%'RTF' + + def set_metadata(self, stream, mi, type): + from calibre.ebooks.metadata.rtf import set_metadata + set_metadata(stream, mi) + + +from calibre.ebooks.comic.input import ComicInput +from calibre.ebooks.epub.input import EPUBInput +from calibre.ebooks.fb2.input import FB2Input +from calibre.ebooks.html.input import HTMLInput +from calibre.ebooks.lit.input import LITInput +from calibre.ebooks.mobi.input import MOBIInput +from calibre.ebooks.odt.input import ODTInput +from calibre.ebooks.pdb.input import PDBInput +from calibre.ebooks.pdf.input import PDFInput +from calibre.ebooks.pml.input import PMLInput +from calibre.ebooks.rb.input import RBInput +from calibre.web.feeds.input import RecipeInput +from calibre.ebooks.rtf.input import RTFInput +from calibre.ebooks.txt.input import TXTInput + +from calibre.ebooks.epub.output import EPUBOutput +from calibre.ebooks.fb2.output import FB2Output +from calibre.ebooks.lit.output import LITOutput +from calibre.ebooks.lrf.output import LRFOutput +from calibre.ebooks.mobi.output 
import MOBIOutput +from calibre.ebooks.oeb.output import OEBOutput +from calibre.ebooks.pdb.output import PDBOutput +from calibre.ebooks.pdf.output import PDFOutput +from calibre.ebooks.pml.output import PMLOutput +from calibre.ebooks.rb.output import RBOutput +from calibre.ebooks.rtf.output import RTFOutput +from calibre.ebooks.txt.output import TXTOutput + +from calibre.customize.profiles import input_profiles, output_profiles + + +from calibre.devices.bebook.driver import BEBOOK, BEBOOK_MINI +from calibre.devices.blackberry.driver import BLACKBERRY +from calibre.devices.cybookg3.driver import CYBOOKG3 +from calibre.devices.eb600.driver import EB600 +from calibre.devices.jetbook.driver import JETBOOK +from calibre.devices.kindle.driver import KINDLE +from calibre.devices.kindle.driver import KINDLE2 +from calibre.devices.prs500.driver import PRS500 +from calibre.devices.prs505.driver import PRS505 +from calibre.devices.prs700.driver import PRS700 + + +plugins = [] +plugins += [ + ComicInput, + EPUBInput, + FB2Input, + HTMLInput, + LITInput, + MOBIInput, + ODTInput, + PDBInput, + PDFInput, + PMLInput, + RBInput, + RecipeInput, + RTFInput, + TXTInput, +] +plugins += [ + EPUBOutput, + FB2Output, + LITOutput, + LRFOutput, + MOBIOutput, + OEBOutput, + PDBOutput, + PDFOutput, + PMLOutput, + RBOutput, + RTFOutput, + TXTOutput, +] +plugins += [ + BEBOOK, + BEBOOK_MINI, + BLACKBERRY, + CYBOOKG3, + EB600, + JETBOOK, + KINDLE, + KINDLE2, + PRS500, + PRS505, + PRS700, +] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataWriter')] +plugins += input_profiles + output_profiles diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py new file mode 100644 index 0000000000..4d19ba4fad --- /dev/null +++ b/src/calibre/customize/conversion.py @@ -0,0 +1,293 @@ +from __future__ import 
with_statement +''' +Defines the plugin system for conversions. +''' +import re, os, shutil + +from calibre import CurrentDir +from calibre.customize import Plugin + +class ConversionOption(object): + + ''' + Class representing conversion options + ''' + + def __init__(self, name=None, help=None, long_switch=None, + short_switch=None, choices=None): + self.name = name + self.help = help + self.long_switch = long_switch + self.short_switch = short_switch + self.choices = choices + + if self.long_switch is None: + self.long_switch = self.name.replace('_', '-') + + self.validate_parameters() + + def validate_parameters(self): + ''' + Validate the parameters passed to :method:`__init__`. + ''' + if re.match(r'[a-zA-Z_]([a-zA-Z0-9_])*', self.name) is None: + raise ValueError(self.name + ' is not a valid Python identifier') + if not self.help: + raise ValueError('You must set the help text') + + def __hash__(self): + return hash(self.name) + + def __eq__(self, other): + return hash(self) == hash(other) + + def clone(self): + return ConversionOption(name=self.name, help=self.help, + long_switch=self.long_switch, short_switch=self.short_switch, + choices=self.choices) + +class OptionRecommendation(object): + LOW = 1 + MED = 2 + HIGH = 3 + + def __init__(self, recommended_value=None, level=LOW, **kwargs): + ''' + An option recommendation. That is, an option as well as its recommended + value and the level of the recommendation. 
+ ''' + self.level = level + self.recommended_value = recommended_value + self.option = kwargs.pop('option', None) + if self.option is None: + self.option = ConversionOption(**kwargs) + + self.validate_parameters() + + @property + def help(self): + return self.option.help + + def clone(self): + return OptionRecommendation(recommended_value=self.recommended_value, + level=self.level, option=self.option.clone()) + + def validate_parameters(self): + if self.option.choices and self.recommended_value not in \ + self.option.choices: + raise ValueError('OpRec: %s: Recommended value not in choices'% + self.option.name) + if not (isinstance(self.recommended_value, (int, float, str, unicode))\ + or self.recommended_value is None): + raise ValueError('OpRec: %s:'%self.option.name + + repr(self.recommended_value) + + ' is not a string or a number') + +class DummyReporter(object): + + def __call__(self, percent, msg=''): + pass + +class InputFormatPlugin(Plugin): + ''' + InputFormatPlugins are responsible for converting a document into + HTML+OPF+CSS+etc. + The results of the conversion *must* be encoded in UTF-8. + The main action happens in :method:`convert`. + ''' + + type = _('Conversion Input') + can_be_disabled = False + supported_platforms = ['windows', 'osx', 'linux'] + + #: Set of file types for which this plugin should be run + #: For example: ``set(['azw', 'mobi', 'prc'])`` + file_types = set([]) + + #: If True, this input plugin generates a collection of images, + #: one per HTML file. You can obtain access to the images via + #: convenience method, :method:`get_image_collection`. + is_image_collection = False + + #: Options shared by all Input format plugins. Do not override + #: in sub-classes. Use :member:`options` instead. Every option must be an + #: instance of :class:`OptionRecommendation`. 
+ common_options = set([ + OptionRecommendation(name='debug_input', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Save the output from the input plugin to the specified ' + 'directory. Useful if you are unsure at which stage ' + 'of the conversion process a bug is occurring. ' + 'WARNING: This completely deletes the contents of ' + 'the specified directory.') + ), + + OptionRecommendation(name='input_encoding', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Specify the character encoding of the input document. If ' + 'set this option will override any encoding declared by the ' + 'document itself. Particularly useful for documents that ' + 'do not declare an encoding or that have erroneous ' + 'encoding declarations.') + ), + + ]) + + #: Options to customize the behavior of this plugin. Every option must be an + #: instance of :class:`OptionRecommendation`. + options = set([]) + + #: A set of 3-tuples of the form + #: (option_name, recommended_value, recommendation_level) + recommendations = set([]) + + def __init__(self, *args): + Plugin.__init__(self, *args) + self.report_progress = DummyReporter() + + def get_images(self): + ''' + Return a list of absolute paths to the images, if this input plugin + represents an image collection. The list of images is in the same order + as the spine and the TOC. + ''' + raise NotImplementedError() + + def preprocess_html(self, html): + ''' + This method is called by the conversion pipeline on all HTML before it + is parsed. It is meant to be used to do any required preprocessing on + the HTML, like removing hard line breaks, etc. + + :param html: A unicode string + :return: A unicode string + ''' + return html + + + def convert(self, stream, options, file_ext, log, accelerators): + ''' + This method must be implemented in sub-classes. It must return + the path to the created OPF file or an :class:`OEBBook` instance. + All output should be contained in the current directory. 
+ If this plugin creates files outside the current + directory they must be deleted/marked for deletion before this method + returns. + + :param stream: A file like object that contains the input file. + + :param options: Options to customize the conversion process. + Guaranteed to have attributes corresponding + to all the options declared by this plugin. In + addition, it will have a verbose attribute that + takes integral values from zero upwards. Higher numbers + mean be more verbose. Another useful attribute is + ``input_profile`` that is an instance of + :class:`calibre.customize.profiles.InputProfile`. + + :param file_ext: The extension (without the .) of the input file. It + is guaranteed to be one of the `file_types` supported + by this plugin. + + :param log: A :class:`calibre.utils.logging.Log` object. All output + should use this object. + + :param accelarators: A dictionary of various information that the input + plugin can get easily that would speed up the + subsequent stages of the conversion. 
+ + ''' + raise NotImplementedError + + def __call__(self, stream, options, file_ext, log, + accelerators, output_dir): + log('InputFormatPlugin: %s running'%self.name, end=' ') + if hasattr(stream, 'name'): + log('on', stream.name) + + with CurrentDir(output_dir): + for x in os.listdir('.'): + shutil.rmtree(x) if os.path.isdir(x) else os.remove(x) + + ret = self.convert(stream, options, file_ext, + log, accelerators) + + if options.debug_input is not None: + options.debug_input = os.path.abspath(options.debug_input) + if not os.path.exists(options.debug_input): + os.makedirs(options.debug_input) + if isinstance(ret, basestring): + shutil.rmtree(options.debug_input) + shutil.copytree(output_dir, options.debug_input) + else: + from calibre.ebooks.oeb.writer import OEBWriter + w = OEBWriter(pretty_print=options.pretty_print) + w(ret, options.debug_input) + + log.info('Input debug saved to:', options.debug_input) + + return ret + + +class OutputFormatPlugin(Plugin): + ''' + OutputFormatPlugins are responsible for converting an OEB document + (OPF+HTML) into an output ebook. + + The OEB document can be assumed to be encoded in UTF-8. + The main action happens in :method:`convert`. + ''' + + type = _('Conversion Output') + can_be_disabled = False + supported_platforms = ['windows', 'osx', 'linux'] + + #: The file type (extension without leading period) that this + #: plugin outputs + file_type = None + + #: Options shared by all Input format plugins. Do not override + #: in sub-classes. Use :member:`options` instead. Every option must be an + #: instance of :class:`OptionRecommendation`. + common_options = set([ + OptionRecommendation(name='pretty_print', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('If specified, the output plugin will try to create output ' + 'that is as human readable as possible. May not have any effect ' + 'for some output plugins.') + ), + ]) + + #: Options to customize the behavior of this plugin. 
Every option must be an + #: instance of :class:`OptionRecommendation`. + options = set([]) + + #: A set of 3-tuples of the form + #: (option_name, recommended_value, recommendation_level) + recommendations = set([]) + + def __init__(self, *args): + Plugin.__init__(self, *args) + self.report_progress = DummyReporter() + + + def convert(self, oeb_book, output, input_plugin, opts, log): + ''' + Render the contents of `oeb_book` (which is an instance of + :class:`calibre.ebooks.oeb.OEBBook` to the file specified by output. + + :param output: Either a file like object or a string. If it is a string + it is the path to a directory that may or may not exist. The output + plugin should write its output into that directory. If it is a file like + object, the output plugin should write its output into the file. + + :param input_plugin: The input plugin that was used at the beginning of + the conversion pipeline. + + :param opts: Conversion options. Guaranteed to have attributes + corresponding to the OptionRecommendations of this plugin. + + :param log: The logger. Print debug/info messages etc. using this. 
+ ''' + raise NotImplementedError + diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py new file mode 100644 index 0000000000..4c184ca36d --- /dev/null +++ b/src/calibre/customize/profiles.py @@ -0,0 +1,241 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from itertools import izip + +from calibre.customize import Plugin as _Plugin + +FONT_SIZES = [('xx-small', 1), + ('x-small', None), + ('small', 2), + ('medium', 3), + ('large', 4), + ('x-large', 5), + ('xx-large', 6), + (None, 7)] + + +class Plugin(_Plugin): + + fbase = 12 + fsizes = [5, 7, 9, 12, 13.5, 17, 20, 22, 24] + screen_size = (1600, 1200) + dpi = 100 + + def __init__(self, *args, **kwargs): + _Plugin.__init__(self, *args, **kwargs) + self.width, self.height = self.screen_size + fsizes = list(self.fsizes) + self.fkey = list(self.fsizes) + self.fsizes = [] + for (name, num), size in izip(FONT_SIZES, fsizes): + self.fsizes.append((name, num, float(size))) + self.fnames = dict((name, sz) for name, _, sz in self.fsizes if name) + self.fnums = dict((num, sz) for _, num, sz in self.fsizes if num) + + +class InputProfile(Plugin): + + author = 'Kovid Goyal' + supported_platforms = set(['windows', 'osx', 'linux']) + can_be_disabled = False + type = _('Input profile') + + name = 'Default Input Profile' + short_name = 'default' # Used in the CLI so dont use spaces etc. in it + description = _('This profile tries to provide sane defaults and is useful ' + 'if you know nothing about the input document.') + + +class SonyReaderInput(InputProfile): + + name = 'Sony Reader' + short_name = 'sony' + description = _('This profile is intended for the SONY PRS line. 
' + 'The 500/505/700 etc.') + + screen_size = (584, 754) + dpi = 168.451 + fbase = 12 + fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24] + + +class MSReaderInput(InputProfile): + + name = 'Microsoft Reader' + short_name = 'msreader' + description = _('This profile is intended for the Microsoft Reader.') + + screen_size = (480, 652) + dpi = 96 + fbase = 13 + fsizes = [10, 11, 13, 16, 18, 20, 22, 26] + +class MobipocketInput(InputProfile): + + name = 'Mobipocket Books' + short_name = 'mobipocket' + description = _('This profile is intended for the Mobipocket books.') + + # Unfortunately MOBI books are not narrowly targeted, so this information is + # quite likely to be spurious + screen_size = (600, 800) + dpi = 96 + fbase = 18 + fsizes = [14, 14, 16, 18, 20, 22, 24, 26] + +class HanlinV3Input(InputProfile): + + name = 'Hanlin V3' + short_name = 'hanlinv3' + description = _('This profile is intended for the Hanlin V3 and its clones.') + + # Screen size is a best guess + screen_size = (584, 754) + dpi = 168.451 + fbase = 16 + fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + +class CybookG3Input(InputProfile): + + name = 'Cybook G3' + short_name = 'cybookg3' + description = _('This profile is intended for the Cybook G3.') + + # Screen size is a best guess + screen_size = (600, 800) + dpi = 168.451 + fbase = 16 + fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + +class KindleInput(InputProfile): + + name = 'Kindle' + short_name = 'kindle' + description = _('This profile is intended for the Amazon Kindle.') + + # Screen size is a best guess + screen_size = (525, 640) + dpi = 168.451 + fbase = 16 + fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + + +input_profiles = [InputProfile, SonyReaderInput, MSReaderInput, + MobipocketInput, HanlinV3Input, CybookG3Input, KindleInput] + + +class OutputProfile(Plugin): + + author = 'Kovid Goyal' + supported_platforms = set(['windows', 'osx', 'linux']) + can_be_disabled = False + type = _('Output profile') + + name = 'Default Output Profile' + short_name = 
'default' # Used in the CLI so dont use spaces etc. in it + description = _('This profile tries to provide sane defaults and is useful ' + 'if you want to produce a document intended to be read at a ' + 'computer or on a range of devices.') + + # The image size for comics + comic_screen_size = (584, 754) + + @classmethod + def tags_to_string(cls, tags): + return ', '.join(tags) + +class SonyReaderOutput(OutputProfile): + + name = 'Sony Reader' + short_name = 'sony' + description = _('This profile is intended for the SONY PRS line. ' + 'The 500/505/700 etc.') + + screen_size = (600, 775) + dpi = 168.451 + fbase = 12 + fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24] + +class SonyReaderLandscapeOutput(SonyReaderOutput): + + name = 'Sony Reader Landscape' + short_name = 'sony-landscape' + description = _('This profile is intended for the SONY PRS line. ' + 'The 500/505/700 etc, in landscape mode. Mainly useful ' + 'for comics.') + + screen_size = (784, 1012) + comic_screen_size = (784, 1012) + + +class MSReaderOutput(OutputProfile): + + name = 'Microsoft Reader' + short_name = 'msreader' + description = _('This profile is intended for the Microsoft Reader.') + + screen_size = (480, 652) + dpi = 96 + fbase = 13 + fsizes = [10, 11, 13, 16, 18, 20, 22, 26] + +class MobipocketOutput(OutputProfile): + + name = 'Mobipocket Books' + short_name = 'mobipocket' + description = _('This profile is intended for the Mobipocket books.') + + # Unfortunately MOBI books are not narrowly targeted, so this information is + # quite likely to be spurious + screen_size = (600, 800) + dpi = 96 + fbase = 18 + fsizes = [14, 14, 16, 18, 20, 22, 24, 26] + +class HanlinV3Output(OutputProfile): + + name = 'Hanlin V3' + short_name = 'hanlinv3' + description = _('This profile is intended for the Hanlin V3 and its clones.') + + # Screen size is a best guess + screen_size = (584, 754) + dpi = 168.451 + fbase = 16 + fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + +class CybookG3Output(OutputProfile): + + name = 
'Cybook G3' + short_name = 'cybookg3' + description = _('This profile is intended for the Cybook G3.') + + # Screen size is a best guess + screen_size = (600, 800) + dpi = 168.451 + fbase = 16 + fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + +class KindleOutput(OutputProfile): + + name = 'Kindle' + short_name = 'kindle' + description = _('This profile is intended for the Amazon Kindle.') + + # Screen size is a best guess + screen_size = (525, 640) + dpi = 168.451 + fbase = 16 + fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + + @classmethod + def tags_to_string(cls, tags): + return 'ttt '.join(tags)+'ttt ' + + +output_profiles = [OutputProfile, SonyReaderOutput, MSReaderOutput, + MobipocketOutput, HanlinV3Output, CybookG3Output, KindleOutput, + SonyReaderLandscapeOutput] diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index e19c17a169..f6ab19a910 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -6,13 +6,15 @@ import os, shutil, traceback, functools, sys, re from calibre.customize import Plugin, FileTypePlugin, MetadataReaderPlugin, \ MetadataWriterPlugin +from calibre.customize.conversion import InputFormatPlugin, OutputFormatPlugin +from calibre.customize.profiles import InputProfile, OutputProfile from calibre.customize.builtins import plugins as builtin_plugins from calibre.constants import __version__, iswindows, isosx +from calibre.devices.interface import DevicePlugin from calibre.ebooks.metadata import MetaInformation from calibre.utils.config import make_config_dir, Config, ConfigProxy, \ plugin_dir, OptionParser - version = tuple([int(x) for x in __version__.split('.')]) platform = 'linux' @@ -47,7 +49,7 @@ def load_plugin(path_to_zip_file): :return: A :class:`Plugin` instance. 
''' - print 'Loading plugin from', path_to_zip_file + #print 'Loading plugin from', path_to_zip_file if not os.access(path_to_zip_file, os.R_OK): raise PluginNotFound zf = ZipFile(path_to_zip_file) @@ -77,6 +79,15 @@ _on_import = {} _on_preprocess = {} _on_postprocess = {} +def input_profiles(): + for plugin in _initialized_plugins: + if isinstance(plugin, InputProfile): + yield plugin + +def output_profiles(): + for plugin in _initialized_plugins: + if isinstance(plugin, OutputProfile): + yield plugin def reread_filetype_plugins(): @@ -121,7 +132,19 @@ def reread_metadata_plugins(): _metadata_writers[ft] = [] _metadata_writers[ft].append(plugin) +def metadata_readers(): + ans = set([]) + for plugins in _metadata_readers.values(): + for plugin in plugins: + ans.add(plugin) + return ans +def metadata_writers(): + ans = set([]) + for plugins in _metadata_writers.values(): + for plugin in plugins: + ans.add(plugin) + return ans def get_file_type_metadata(stream, ftype): mi = MetaInformation(None, None) @@ -229,6 +252,47 @@ def find_plugin(name): if plugin.name == name: return plugin +def input_format_plugins(): + for plugin in _initialized_plugins: + if isinstance(plugin, InputFormatPlugin): + yield plugin + +def plugin_for_input_format(fmt): + for plugin in input_format_plugins(): + if fmt.lower() in plugin.file_types: + return plugin + +def available_input_formats(): + formats = set([]) + for plugin in input_format_plugins(): + if not is_disabled(plugin): + for format in plugin.file_types: + formats.add(format) + return formats + +def output_format_plugins(): + for plugin in _initialized_plugins: + if isinstance(plugin, OutputFormatPlugin): + yield plugin + +def plugin_for_output_format(fmt): + for plugin in output_format_plugins(): + if fmt.lower() == plugin.file_type: + return plugin + +def available_output_formats(): + formats = set([]) + for plugin in output_format_plugins(): + if not is_disabled(plugin): + formats.add(plugin.file_type) + return formats + +def 
device_plugins(): + for plugin in _initialized_plugins: + if isinstance(plugin, DevicePlugin): + if not is_disabled(plugin): + yield plugin + def disable_plugin(plugin_or_name): x = getattr(plugin_or_name, 'name', plugin_or_name) plugin = find_plugin(x) diff --git a/src/calibre/devices/__init__.py b/src/calibre/devices/__init__.py index 06efbc6434..874de7c070 100644 --- a/src/calibre/devices/__init__.py +++ b/src/calibre/devices/__init__.py @@ -5,21 +5,6 @@ __copyright__ = '2008, Kovid Goyal ' Device drivers. ''' -def devices(): - from calibre.devices.prs500.driver import PRS500 - from calibre.devices.prs505.driver import PRS505 - from calibre.devices.prs700.driver import PRS700 - from calibre.devices.cybookg3.driver import CYBOOKG3 - from calibre.devices.kindle.driver import KINDLE - from calibre.devices.kindle.driver import KINDLE2 - from calibre.devices.bebook.driver import BEBOOK - from calibre.devices.bebook.driver import BEBOOKMINI - from calibre.devices.blackberry.driver import BLACKBERRY - from calibre.devices.eb600.driver import EB600 - from calibre.devices.jetbook.driver import JETBOOK - return (PRS500, PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, - BEBOOK, BEBOOKMINI, BLACKBERRY, EB600, JETBOOK) - import time DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6) diff --git a/src/calibre/devices/bebook/driver.py b/src/calibre/devices/bebook/driver.py index 7313c24253..b6655d37ec 100644 --- a/src/calibre/devices/bebook/driver.py +++ b/src/calibre/devices/bebook/driver.py @@ -7,19 +7,25 @@ Device driver for BeBook from calibre.devices.usbms.driver import USBMS class BEBOOK(USBMS): + name = 'BeBook driver' + description = _('Communicate with the BeBook eBook reader.') + author = _('Tijmen Ruizendaal') + supported_platforms = ['windows', 'osx', 'linux'] + + # Ordered list of supported formats FORMATS = ['mobi', 'epub', 'pdf', 'txt'] VENDOR_ID = [0x0525] PRODUCT_ID = [0x8803, 0x6803] - BCD = [0x312] + BCD = [0x312] - VENDOR_NAME = 'LINUX' + VENDOR_NAME 
= 'LINUX' WINDOWS_MAIN_MEM = 'FILE-STOR_GADGET' WINDOWS_CARD_MEM = 'FILE-STOR_GADGET' OSX_MAIN_MEM = 'BeBook Internal Memory' - OSX_CARD_MEM = 'BeBook Storage Card' + OSX_CARD_A_MEM = 'BeBook Storage Card' MAIN_MEMORY_VOLUME_LABEL = 'BeBook Internal Memory' STORAGE_CARD_VOLUME_LABEL = 'BeBook Storage Card' @@ -30,20 +36,22 @@ class BEBOOK(USBMS): def windows_sort_drives(self, drives): main = drives.get('main', None) - card = drives.get('card', None) + card = drives.get('carda', None) if card and main and card < main: drives['main'] = card - drives['card'] = main + drives['carda'] = main return drives +class BEBOOK_MINI(BEBOOK): + name = 'BeBook Mini driver' + description = _('Communicate with the BeBook Mini eBook reader.') -class BEBOOKMINI(BEBOOK): VENDOR_ID = [0x0492] PRODUCT_ID = [0x8813] - BCD = [0x319] + BCD = [0x319] OSX_MAIN_MEM = 'BeBook Mini Internal Memory' OSX_CARD_MEM = 'BeBook Mini Storage Card' diff --git a/src/calibre/devices/blackberry/driver.py b/src/calibre/devices/blackberry/driver.py index f6c615b0de..da2328419a 100644 --- a/src/calibre/devices/blackberry/driver.py +++ b/src/calibre/devices/blackberry/driver.py @@ -7,6 +7,12 @@ __docformat__ = 'restructuredtext en' from calibre.devices.usbms.driver import USBMS class BLACKBERRY(USBMS): + + name = 'Blackberry Device Interface' + description = _('Communicate with the Blackberry smart phone.') + author = _('Kovid Goyal') + supported_platforms = ['windows', 'linux'] + # Ordered list of supported formats FORMATS = ['mobi', 'prc'] @@ -16,15 +22,11 @@ class BLACKBERRY(USBMS): VENDOR_NAME = 'RIM' WINDOWS_MAIN_MEM = 'BLACKBERRY_SD' - #WINDOWS_CARD_MEM = 'CARD_STORAGE' #OSX_MAIN_MEM = 'Kindle Internal Storage Media' - #OSX_CARD_MEM = 'Kindle Card Storage Media' MAIN_MEMORY_VOLUME_LABEL = 'Blackberry Main Memory' - #STORAGE_CARD_VOLUME_LABEL = 'Kindle Storage Card' EBOOK_DIR_MAIN = 'ebooks' - #EBOOK_DIR_CARD = "documents" SUPPORTS_SUB_DIRS = True diff --git a/src/calibre/devices/cybookg3/driver.py 
b/src/calibre/devices/cybookg3/driver.py index dcde8b873c..5dde9ab51d 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -7,11 +7,17 @@ Device driver for Bookeen's Cybook Gen 3 import os, shutil from itertools import cycle -from calibre.devices.errors import FreeSpaceError +from calibre.devices.errors import DeviceError, FreeSpaceError from calibre.devices.usbms.driver import USBMS import calibre.devices.cybookg3.t2b as t2b class CYBOOKG3(USBMS): + name = 'Cybook Gen 3 Device Interface' + description = _('Communicate with the Cybook eBook reader.') + author = _('John Schember') + supported_platforms = ['windows', 'osx', 'linux'] + + # Ordered list of supported formats # Be sure these have an entry in calibre.devices.mime FORMATS = ['mobi', 'prc', 'html', 'pdf', 'rtf', 'txt'] @@ -22,60 +28,45 @@ class CYBOOKG3(USBMS): VENDOR_NAME = 'BOOKEEN' WINDOWS_MAIN_MEM = 'CYBOOK_GEN3__-FD' - WINDOWS_CARD_MEM = 'CYBOOK_GEN3__-SD' + WINDOWS_CARD_A_MEM = 'CYBOOK_GEN3__-SD' OSX_MAIN_MEM = 'Bookeen Cybook Gen3 -FD Media' - OSX_CARD_MEM = 'Bookeen Cybook Gen3 -SD Media' + OSX_CARD_A_MEM = 'Bookeen Cybook Gen3 -SD Media' MAIN_MEMORY_VOLUME_LABEL = 'Cybook Gen 3 Main Memory' STORAGE_CARD_VOLUME_LABEL = 'Cybook Gen 3 Storage Card' EBOOK_DIR_MAIN = "eBooks" - EBOOK_DIR_CARD = "eBooks" + EBOOK_DIR_CARD_A = "eBooks" THUMBNAIL_HEIGHT = 144 SUPPORTS_SUB_DIRS = True - def upload_books(self, files, names, on_card=False, end_session=True, + def upload_books(self, files, names, on_card=None, end_session=True, metadata=None): - if on_card and not self._card_prefix: - raise ValueError(_('The reader has no storage card connected.')) - - if not on_card: - path = os.path.join(self._main_prefix, self.EBOOK_DIR_MAIN) - else: - path = os.path.join(self._card_prefix, self.EBOOK_DIR_CARD) - - def get_size(obj): - if hasattr(obj, 'seek'): - obj.seek(0, os.SEEK_END) - size = obj.tell() - obj.seek(0) - return size - return os.path.getsize(obj) - - sizes = 
[get_size(f) for f in files] - size = sum(sizes) - - if on_card and size > self.free_space()[2] - 1024*1024: - raise FreeSpaceError(_("There is insufficient free space on the storage card")) - if not on_card and size > self.free_space()[0] - 2*1024*1024: - raise FreeSpaceError(_("There is insufficient free space in main memory")) + path = self._sanity_check(on_card, files) paths = [] names = iter(names) metadata = iter(metadata) - for infile in files: + for i, infile in enumerate(files): newpath = path mdata = metadata.next() - if self.SUPPORTS_SUB_DIRS: - if 'tags' in mdata.keys(): - for tag in mdata['tags']: - if tag.startswith('/'): - newpath += tag - newpath = os.path.normpath(newpath) - break + if 'tags' in mdata.keys(): + for tag in mdata['tags']: + if tag.startswith(_('News')): + newpath = os.path.join(newpath, 'news') + newpath = os.path.join(newpath, mdata.get('title', '')) + newpath = os.path.join(newpath, mdata.get('timestamp', '')) + elif tag.startswith('/'): + newpath += tag + newpath = os.path.normpath(newpath) + break + + if newpath == path: + newpath = os.path.join(newpath, mdata.get('authors', _('Unknown'))) + newpath = os.path.join(newpath, mdata.get('title', _('Unknown'))) if not os.path.exists(newpath): os.makedirs(newpath) @@ -103,10 +94,15 @@ class CYBOOKG3(USBMS): t2b.write_t2b(t2bfile, coverdata) t2bfile.close() + self.report_progress(i / float(len(files)), _('Transferring books to device...')) + + self.report_progress(1.0, _('Transferring books to device...')) + return zip(paths, cycle([on_card])) def delete_books(self, paths, end_session=True): - for path in paths: + for i, path in enumerate(paths): + self.report_progress((i+1) / float(len(paths)), _('Removing books from device...')) if os.path.exists(path): os.unlink(path) @@ -115,6 +111,8 @@ class CYBOOKG3(USBMS): # Delete the ebook auxiliary file if os.path.exists(filepath + '.mbp'): os.unlink(filepath + '.mbp') + if os.path.exists(filepath + '.dat'): + os.unlink(filepath + '.dat') # 
Delete the thumbnails file auto generated for the ebook if os.path.exists(filepath + '_6090.t2b'): @@ -124,4 +122,4 @@ class CYBOOKG3(USBMS): os.removedirs(os.path.dirname(path)) except: pass - + self.report_progress(1.0, _('Removing books from device...')) diff --git a/src/calibre/devices/eb600/driver.py b/src/calibre/devices/eb600/driver.py old mode 100755 new mode 100644 index 4b9204ebd0..b42c77f172 --- a/src/calibre/devices/eb600/driver.py +++ b/src/calibre/devices/eb600/driver.py @@ -14,6 +14,11 @@ Windows PNP strings: from calibre.devices.usbms.driver import USBMS class EB600(USBMS): + name = 'Netronix EB600 Device Interface' + description = _('Communicate with the EB600 eBook reader.') + author = _('Kovid Goyal') + supported_platforms = ['windows', 'osx', 'linux'] + # Ordered list of supported formats FORMATS = ['epub', 'prc', 'chm', 'djvu', 'html', 'rtf', 'txt', 'pdf'] DRM_FORMATS = ['prc', 'mobi', 'html', 'pdf', 'txt'] @@ -24,24 +29,24 @@ class EB600(USBMS): VENDOR_NAME = 'NETRONIX' WINDOWS_MAIN_MEM = 'EBOOK' - WINDOWS_CARD_MEM = 'EBOOK' + WINDOWS_CARD_A_MEM = 'EBOOK' OSX_MAIN_MEM = 'EB600 Internal Storage Media' - OSX_CARD_MEM = 'EB600 Card Storage Media' + OSX_CARD_A_MEM = 'EB600 Card Storage Media' MAIN_MEMORY_VOLUME_LABEL = 'EB600 Main Memory' STORAGE_CARD_VOLUME_LABEL = 'EB600 Storage Card' EBOOK_DIR_MAIN = '' - EBOOK_DIR_CARD = '' + EBOOK_DIR_CARD_A = '' SUPPORTS_SUB_DIRS = True def windows_sort_drives(self, drives): main = drives.get('main', None) - card = drives.get('card', None) + card = drives.get('carda', None) if card and main and card < main: drives['main'] = card - drives['card'] = main + drives['carda'] = main return drives diff --git a/src/calibre/devices/interface.py b/src/calibre/devices/interface.py index 21790e3c46..ff9d71f370 100644 --- a/src/calibre/devices/interface.py +++ b/src/calibre/devices/interface.py @@ -6,43 +6,46 @@ the GUI. A device backend must subclass the L{Device} class. 
See prs500.py for a backend that implement the Device interface for the SONY PRS500 Reader. """ +from calibre.customize import Plugin + +class DevicePlugin(Plugin): + """ + Defines the interface that should be implemented by backends that + communicate with an ebook reader. -class Device(object): - """ - Defines the interface that should be implemented by backends that - communicate with an ebook reader. - The C{end_session} variables are used for USB session management. Sometimes - the front-end needs to call several methods one after another, in which case + the front-end needs to call several methods one after another, in which case the USB session should not be closed after each method call. """ + type = _('Device Interface') + # Ordered list of supported formats FORMATS = ["lrf", "rtf", "pdf", "txt"] VENDOR_ID = 0x0000 PRODUCT_ID = 0x0000 - # BCD can be either None to not distinguish between devices based on BCD, or + # BCD can be either None to not distinguish between devices based on BCD, or # it can be a list of the BCD numbers of all devices supported by this driver. BCD = None THUMBNAIL_HEIGHT = 68 # Height for thumbnails on device # Whether the metadata on books can be set via the GUI. 
CAN_SET_METADATA = True - - def __init__(self, key='-1', log_packets=False, report_progress=None) : - """ + + def reset(self, key='-1', log_packets=False, report_progress=None) : + """ @param key: The key to unlock the device - @param log_packets: If true the packet stream to/from the device is logged - @param report_progress: Function that is called with a % progress + @param log_packets: If true the packet stream to/from the device is logged + @param report_progress: Function that is called with a % progress (number between 0 and 100) for various tasks - If it is called with -1 that means that the + If it is called with -1 that means that the task does not have any progress information """ raise NotImplementedError() - + @classmethod def get_fdi(cls): '''Return the FDI description of this device for HAL on linux.''' return '' - + @classmethod def can_handle(cls, device_info): ''' @@ -51,60 +54,66 @@ class Device(object): is only called after the vendor, product ids and the bcd have matched, so it can do some relatively time intensive checks. The default implementation returns True. - - :param device_info: On windows a device ID string. On Unix a tuple of - ``(vendor_id, product_id, bcd)``. + + :param device_info: On windows a device ID string. On Unix a tuple of + ``(vendor_id, product_id, bcd)``. ''' return True - + def open(self): ''' Perform any device specific initialization. Called after the device is detected but before any other functions that communicate with the device. For example: For devices that present themselves as USB Mass storage devices, this method would be responsible for mounting the device or - if the device has been automounted, for finding out where it has been + if the device has been automounted, for finding out where it has been mounted. The driver for the PRS505 has a implementation of this function that should serve as a good example for USB Mass storage devices. 
''' raise NotImplementedError() - + def set_progress_reporter(self, report_progress): ''' - @param report_progress: Function that is called with a % progress + @param report_progress: Function that is called with a % progress (number between 0 and 100) for various tasks - If it is called with -1 that means that the + If it is called with -1 that means that the task does not have any progress information ''' raise NotImplementedError() - + def get_device_information(self, end_session=True): - """ - Ask device for device information. See L{DeviceInfoQuery}. + """ + Ask device for device information. See L{DeviceInfoQuery}. @return: (device name, device version, software version on device, mime type) """ raise NotImplementedError() - + def card_prefix(self, end_session=True): ''' - Return prefix to paths on the card or '' if no cards present. + Return a 2 element list of the prefix to paths on the cards. + If no card is present None is set for the card's prefix. + E.G. + ('/place', '/place2') + (None, 'place2') + ('place', None) + (None, None) ''' raise NotImplementedError() - + def total_space(self, end_session=True): - """ + """ Get total space available on the mountpoints: 1. Main memory - 2. Memory Stick - 3. SD Card + 2. Memory Card A + 3. Memory Card B @return: A 3 element list with total space in bytes of (1, 2, 3). If a particular device doesn't have any of these locations it should return 0. """ raise NotImplementedError() - + def free_space(self, end_session=True): - """ + """ Get free space available on the mountpoints: 1. Main memory 2. Card A @@ -112,48 +121,49 @@ class Device(object): @return: A 3 element list with free space in bytes of (1, 2, 3). If a particular device doesn't have any of these locations it should return -1. - """ + """ raise NotImplementedError() - - def books(self, oncard=False, end_session=True): - """ + + def books(self, oncard=None, end_session=True): + """ Return a list of ebooks on the device. 
- @param oncard: If True return a list of ebooks on the storage card, - otherwise return list of ebooks in main memory of device. - If True and no books on card return empty list. - @return: A BookList. - """ + @param oncard: If 'carda' or 'cardb' return a list of ebooks on the + specific storage card, otherwise return list of ebooks + in main memory of device. If a card is specified and no + books are on the card return empty list. + @return: A BookList. + """ raise NotImplementedError() - - def upload_books(self, files, names, on_card=False, end_session=True, + + def upload_books(self, files, names, on_card=None, end_session=True, metadata=None): ''' Upload a list of books to the device. If a file already exists on the device, it should be replaced. This method should raise a L{FreeSpaceError} if there is not enough free space on the device. The text of the FreeSpaceError must contain the - word "card" if C{on_card} is True otherwise it must contain the word "memory". + word "card" if C{on_card} is not None otherwise it must contain the word "memory". @param files: A list of paths and/or file-like objects. - @param names: A list of file names that the books should have + @param names: A list of file names that the books should have once uploaded to the device. len(names) == len(files) - @return: A list of 3-element tuples. The list is meant to be passed + @return: A list of 3-element tuples. The list is meant to be passed to L{add_books_to_metadata}. - @param metadata: If not None, it is a list of dictionaries. Each dictionary + @param metadata: If not None, it is a list of dictionaries. Each dictionary will have at least the key tags to allow the driver to choose book location based on tags. len(metadata) == len(files). If your device does not support hierarchical ebook folders, you can safely ignore this parameter. 
''' raise NotImplementedError() - + @classmethod def add_books_to_metadata(cls, locations, metadata, booklists): ''' - Add locations to the booklists. This function must not communicate with - the device. + Add locations to the booklists. This function must not communicate with + the device. @param locations: Result of a call to L{upload_books} @param metadata: List of dictionaries. Each dictionary must have the - keys C{title}, C{authors}, C{author_sort}, C{cover}, C{tags}. - The value of the C{cover} + keys C{title}, C{authors}, C{author_sort}, C{cover}, C{tags}. + The value of the C{cover} element can be None or a three element tuple (width, height, data) where data is the image data in JPEG format as a string. C{tags} must be a possibly empty list of strings. C{authors} must be a string. @@ -162,45 +172,72 @@ class Device(object): The dictionary can also have an optional key "tag order" which should be another dictionary that maps tag names to lists of book ids. The ids are ids from the book database. - @param booklists: A tuple containing the result of calls to - (L{books}(oncard=False), L{books}(oncard=True)). + @param booklists: A tuple containing the result of calls to + (L{books}(oncard=None), L{books}(oncard='carda'), + L{books}(oncard='cardb')). ''' raise NotImplementedError - + def delete_books(self, paths, end_session=True): ''' Delete books at paths on device. ''' raise NotImplementedError() - + @classmethod def remove_books_from_metadata(cls, paths, booklists): ''' - Remove books from the metadata list. This function must not communicate + Remove books from the metadata list. This function must not communicate with the device. @param paths: paths to books on the device. - @param booklists: A tuple containing the result of calls to - (L{books}(oncard=False), L{books}(oncard=True)). + @param booklists: A tuple containing the result of calls to + (L{books}(oncard=None), L{books}(oncard='carda'), + L{books}(oncard='cardb')). 
''' raise NotImplementedError() - + def sync_booklists(self, booklists, end_session=True): ''' Update metadata on device. - @param booklists: A tuple containing the result of calls to - (L{books}(oncard=False), L{books}(oncard=True)). + @param booklists: A tuple containing the result of calls to + (L{books}(oncard=None), L{books}(oncard='carda'), + L{books}(oncard='cardb')). ''' raise NotImplementedError() - - def get_file(self, path, outfile, end_session=True): + + def get_file(self, path, outfile, end_session=True): ''' Read the file at C{path} on the device and write it to outfile. @param outfile: file object like C{sys.stdout} or the result of an C{open} call ''' - raise NotImplementedError() + raise NotImplementedError() + + @classmethod + def config_widget(cls): + ''' + Should return a QWidget. The QWidget contains the settings for the device interface + ''' + raise NotImplementedError() + + @classmethod + def save_settings(cls, settings_widget): + ''' + Should save settings to disk. Takes the widget created in config_widget + and saves all settings to disk. + ''' + raise NotImplementedError() + + @classmethod + def settings(cls): + ''' + Should return an opts object. The opts object should have one attribute + `format_map` which is an ordered list of formats for the device. + ''' + raise NotImplementedError() + + - class BookList(list): ''' A list of books. Each Book object must have the fields: @@ -210,21 +247,21 @@ class BookList(list): 4. datetime (a UTC time tuple) 5. path (path on the device to the book) 6. thumbnail (can be None) - 7. tags (a list of strings, can be empty). + 7. tags (a list of strings, can be empty). ''' - + __getslice__ = None __setslice__ = None - + def supports_tags(self): ''' Return True if the the device supports tags (collections) for this book list. ''' raise NotImplementedError() - + def set_tags(self, book, tags): ''' - Set the tags for C{book} to C{tags}. + Set the tags for C{book} to C{tags}. @param tags: A list of strings. 
Can be empty. - @param book: A book object that is in this BookList. + @param book: A book object that is in this BookList. ''' raise NotImplementedError() diff --git a/src/calibre/devices/jetbook/driver.py b/src/calibre/devices/jetbook/driver.py index 3fb9b1cdd1..199566357b 100644 --- a/src/calibre/devices/jetbook/driver.py +++ b/src/calibre/devices/jetbook/driver.py @@ -7,10 +7,16 @@ Device driver for Ectaco Jetbook firmware >= JL04_v030e import os, re, sys, shutil from itertools import cycle -from calibre.devices.usbms.driver import USBMS, metadata_from_formats +from calibre.devices.usbms.driver import USBMS from calibre import sanitize_file_name as sanitize class JETBOOK(USBMS): + name = 'Ectaco JetBook Device Interface' + description = _('Communicate with the JetBook eBook reader.') + author = _('James Ralston') + supported_platforms = ['windows', 'osx', 'linux'] + + # Ordered list of supported formats # Be sure these have an entry in calibre.devices.mime FORMATS = ['epub', 'mobi', 'prc', 'txt', 'rtf', 'pdf'] @@ -46,27 +52,34 @@ class JETBOOK(USBMS): names = iter(names) metadata = iter(metadata) - for infile in files: + for i, infile in enumerate(files): newpath = path - if self.SUPPORTS_SUB_DIRS: - mdata = metadata.next() + mdata = metadata.next() - if 'tags' in mdata.keys(): - for tag in mdata['tags']: - if tag.startswith('/'): - newpath += tag - newpath = os.path.normpath(newpath) - break - - if not os.path.exists(newpath): - os.makedirs(newpath) + if 'tags' in mdata.keys(): + for tag in mdata['tags']: + if tag.startswith(_('News')): + newpath = os.path.join(newpath, 'news') + newpath = os.path.join(newpath, mdata.get('title', '')) + newpath = os.path.join(newpath, mdata.get('timestamp', '')) + break + elif tag.startswith('/'): + newpath += tag + newpath = os.path.normpath(newpath) + break author = sanitize(mdata.get('authors','Unknown')).replace(' ', '_') title = sanitize(mdata.get('title', 'Unknown')).replace(' ', '_') fileext = 
os.path.splitext(os.path.basename(names.next()))[1] fname = '%s#%s%s' % (author, title, fileext) + if newpath == path: + newpath = os.path.join(newpath, author, title) + + if not os.path.exists(newpath): + os.makedirs(newpath) + filepath = os.path.join(newpath, fname) paths.append(filepath) @@ -81,6 +94,10 @@ class JETBOOK(USBMS): else: shutil.copy2(infile, filepath) + self.report_progress((i+1) / float(len(files)), _('Transferring books to device...')) + + self.report_progress(1.0, _('Transferring books to device...')) + return zip(paths, cycle([on_card])) @classmethod @@ -93,6 +110,7 @@ class JETBOOK(USBMS): return txt + from calibre.devices.usbms.driver import metadata_from_formats mi = metadata_from_formats([path]) if (mi.title==_('Unknown') or mi.authors==[_('Unknown')]) \ @@ -108,10 +126,10 @@ class JETBOOK(USBMS): def windows_sort_drives(self, drives): main = drives.get('main', None) - card = drives.get('card', None) + card = drives.get('carda', None) if card and main and card < main: drives['main'] = card - drives['card'] = main + drives['carda'] = main return drives diff --git a/src/calibre/devices/kindle/driver.py b/src/calibre/devices/kindle/driver.py old mode 100755 new mode 100644 index a5775dec8a..de8cf0272c --- a/src/calibre/devices/kindle/driver.py +++ b/src/calibre/devices/kindle/driver.py @@ -6,9 +6,14 @@ Device driver for Amazon's Kindle import os, re, sys -from calibre.devices.usbms.driver import USBMS, metadata_from_formats +from calibre.devices.usbms.driver import USBMS class KINDLE(USBMS): + name = 'Kindle Device Interface' + description = _('Communicate with the Kindle eBook reader.') + author = _('John Schember') + supported_platforms = ['windows', 'osx', 'linux'] + # Ordered list of supported formats FORMATS = ['azw', 'mobi', 'prc', 'azw1', 'tpz', 'txt'] @@ -18,23 +23,24 @@ class KINDLE(USBMS): VENDOR_NAME = 'KINDLE' WINDOWS_MAIN_MEM = 'INTERNAL_STORAGE' - WINDOWS_CARD_MEM = 'CARD_STORAGE' + WINDOWS_CARD_A_MEM = 'CARD_STORAGE' OSX_MAIN_MEM 
= 'Kindle Internal Storage Media' - OSX_CARD_MEM = 'Kindle Card Storage Media' + OSX_CARD_A_MEM = 'Kindle Card Storage Media' MAIN_MEMORY_VOLUME_LABEL = 'Kindle Main Memory' STORAGE_CARD_VOLUME_LABEL = 'Kindle Storage Card' EBOOK_DIR_MAIN = "documents" - EBOOK_DIR_CARD = "documents" + EBOOK_DIR_CARD_A = "documents" SUPPORTS_SUB_DIRS = True WIRELESS_FILE_NAME_PATTERN = re.compile( r'(?P[^-]+)-asin_(?P<asin>[a-zA-Z\d]{10,})-type_(?P<type>\w{4})-v_(?P<index>\d+).*') def delete_books(self, paths, end_session=True): - for path in paths: + for i, path in enumerate(paths): + self.report_progress((i+1) / float(len(paths)), _('Removing books from device...')) if os.path.exists(path): os.unlink(path) @@ -43,9 +49,11 @@ class KINDLE(USBMS): # Delete the ebook auxiliary file if os.path.exists(filepath + '.mbp'): os.unlink(filepath + '.mbp') + self.report_progress(1.0, _('Removing books from device...')) @classmethod def metadata_from_path(cls, path): + from calibre.ebooks.metadata.meta import metadata_from_formats mi = metadata_from_formats([path]) if mi.title == _('Unknown') or ('-asin' in mi.title and '-type' in mi.title): match = cls.WIRELESS_FILE_NAME_PATTERN.match(os.path.basename(path)) @@ -58,6 +66,10 @@ class KINDLE(USBMS): class KINDLE2(KINDLE): + name = 'Kindle 2 Device Interface' + description = _('Communicate with the Kindle 2 eBook reader.') + author = _('John Schember') + supported_platforms = ['windows', 'osx', 'linux'] PRODUCT_ID = [0x0002] BCD = [0x0100] diff --git a/src/calibre/devices/libusb.py b/src/calibre/devices/libusb.py index 226a99f239..09261e10c5 100644 --- a/src/calibre/devices/libusb.py +++ b/src/calibre/devices/libusb.py @@ -116,8 +116,8 @@ class Device(Structure): raise Error("Cannot open device") return handle.contents - @apply - def configurations(): + @dynamic_property + def configurations(self): doc = """ List of device configurations. 
See L{ConfigDescriptor} """ def fget(self): ans = [] @@ -127,8 +127,8 @@ class Device(Structure): return property(doc=doc, fget=fget) class Bus(Structure): - @apply - def device_list(): + @dynamic_property + def device_list(self): doc = \ """ Flat list of devices on this bus. @@ -360,4 +360,4 @@ def get_devices(): for dev in devices: device = (dev.device_descriptor.idVendor, dev.device_descriptor.idProduct, dev.device_descriptor.bcdDevice) ans.append(device) - return ans + return ans \ No newline at end of file diff --git a/src/calibre/devices/prs500/books.py b/src/calibre/devices/prs500/books.py index 6c57920487..5eb8d7f011 100644 --- a/src/calibre/devices/prs500/books.py +++ b/src/calibre/devices/prs500/books.py @@ -1,8 +1,8 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' -""" -This module contains the logic for dealing with XML book lists found -in the reader cache. +""" +This module contains the logic for dealing with XML book lists found +in the reader cache. """ import xml.dom.minidom as dom from base64 import b64decode as decode @@ -25,16 +25,16 @@ def sortable_title(title): class book_metadata_field(object): """ Represents metadata stored as an attribute """ - def __init__(self, attr, formatter=None, setter=None): - self.attr = attr + def __init__(self, attr, formatter=None, setter=None): + self.attr = attr self.formatter = formatter self.setter = setter - + def __get__(self, obj, typ=None): """ Return a string. 
String may be empty if self.attr is absent """ return self.formatter(obj.elem.getAttribute(self.attr)) if \ self.formatter else obj.elem.getAttribute(self.attr).strip() - + def __set__(self, obj, val): """ Set the attribute """ val = self.setter(val) if self.setter else val @@ -44,7 +44,7 @@ class book_metadata_field(object): class Book(object): """ Provides a view onto the XML element that represents a book """ - + title = book_metadata_field("title") authors = book_metadata_field("author", \ formatter=lambda x: x if x and x.strip() else "Unknown") @@ -55,8 +55,8 @@ class Book(object): size = book_metadata_field("size", formatter=int) # When setting this attribute you must use an epoch datetime = book_metadata_field("date", formatter=strptime, setter=strftime) - @apply - def title_sorter(): + @dynamic_property + def title_sorter(self): doc = '''String to sort the title. If absent, title is returned''' def fget(self): src = self.elem.getAttribute('titleSorter').strip() @@ -66,12 +66,12 @@ class Book(object): def fset(self, val): self.elem.setAttribute('titleSorter', sortable_title(unicode(val))) return property(doc=doc, fget=fget, fset=fset) - - @apply - def thumbnail(): + + @dynamic_property + def thumbnail(self): doc = \ - """ - The thumbnail. Should be a height 68 image. + """ + The thumbnail. Should be a height 68 image. Setting is not supported. """ def fget(self): @@ -83,40 +83,40 @@ class Book(object): break rc = "" for node in th.childNodes: - if node.nodeType == node.TEXT_NODE: + if node.nodeType == node.TEXT_NODE: rc += node.data return decode(rc) return property(fget=fget, doc=doc) - - @apply - def path(): + + @dynamic_property + def path(self): doc = """ Absolute path to book on device. Setting not supported. 
""" - def fget(self): + def fget(self): return self.root + self.rpath return property(fget=fget, doc=doc) - - @apply - def db_id(): + + @dynamic_property + def db_id(self): doc = '''The database id in the application database that this file corresponds to''' def fget(self): match = re.search(r'_(\d+)$', self.rpath.rpartition('.')[0]) if match: return int(match.group(1)) return property(fget=fget, doc=doc) - + def __init__(self, node, tags=[], prefix="", root="/Data/media/"): self.elem = node self.prefix = prefix self.root = root self.tags = tags - + def __str__(self): """ Return a utf-8 encoded string with title author and path information """ return self.title.encode('utf-8') + " by " + \ self.authors.encode('utf-8') + " at " + self.path.encode('utf-8') -def fix_ids(media, cache): +def fix_ids(media, cache, *args): ''' Adjust ids in cache to correspond with media. ''' @@ -131,16 +131,16 @@ def fix_ids(media, cache): child.setAttribute("id", str(cid)) cid += 1 media.set_next_id(str(cid)) - - + + class BookList(_BookList): - """ - A list of L{Book}s. Created from an XML file. Can write list + """ + A list of L{Book}s. Created from an XML file. Can write list to an XML file. 
""" __getslice__ = None __setslice__ = None - + def __init__(self, root="/Data/media/", sfile=None): _BookList.__init__(self) self.tag_order = {} @@ -163,25 +163,25 @@ class BookList(_BookList): if records: self.prefix = 'xs1:' self.root = records[0] - self.proot = root - + self.proot = root + for book in self.document.getElementsByTagName(self.prefix + "text"): id = book.getAttribute('id') pl = [i.getAttribute('title') for i in self.get_playlists(id)] self.append(Book(book, root=root, prefix=self.prefix, tags=pl)) - + def supports_tags(self): return bool(self.prefix) - + def playlists(self): return self.root.getElementsByTagName(self.prefix+'playlist') - - def playlist_items(self): + + def playlist_items(self): plitems = [] for pl in self.playlists(): plitems.extend(pl.getElementsByTagName(self.prefix+'item')) return plitems - + def purge_corrupted_files(self): if not self.root: return [] @@ -193,32 +193,32 @@ class BookList(_BookList): c.parentNode.removeChild(c) c.unlink() return paths - + def purge_empty_playlists(self): ''' Remove all playlist entries that have no children. ''' for pl in self.playlists(): if not pl.getElementsByTagName(self.prefix + 'item'): pl.parentNode.removeChild(pl) pl.unlink() - + def _delete_book(self, node): nid = node.getAttribute('id') node.parentNode.removeChild(node) node.unlink() self.remove_from_playlists(nid) - - + + def delete_book(self, cid): - ''' + ''' Remove DOM node corresponding to book with C{id == cid}. Also remove book from any collections it is part of. ''' for book in self: if str(book.id) == str(cid): self.remove(book) - self._delete_book(book.elem) + self._delete_book(book.elem) break - + def remove_book(self, path): ''' Remove DOM node corresponding to book with C{path == path}. 
@@ -227,15 +227,15 @@ class BookList(_BookList): for book in self: if path.endswith(book.rpath): self.remove(book) - self._delete_book(book.elem) + self._delete_book(book.elem) break - + def next_id(self): return self.document.documentElement.getAttribute('nextID') - + def set_next_id(self, id): self.document.documentElement.setAttribute('nextID', str(id)) - + def max_id(self): max = 0 for child in self.root.childNodes: @@ -243,15 +243,15 @@ class BookList(_BookList): nid = int(child.getAttribute('id')) if nid > max: max = nid - return max - + return max + def book_by_path(self, path): for child in self.root.childNodes: if child.nodeType == child.ELEMENT_NODE and child.hasAttribute("path"): if path == child.getAttribute('path'): return child return None - + def add_book(self, info, name, size, ctime): """ Add a node into DOM tree representing a book """ book = self.book_by_path(name) @@ -262,23 +262,23 @@ class BookList(_BookList): cid = self.max_id()+1 sourceid = str(self[0].sourceid) if len(self) else "1" attrs = { - "title" : info["title"], + "title" : info["title"], 'titleSorter' : sortable_title(info['title']), "author" : info["authors"] if info['authors'] else 'Unknown', \ "page":"0", "part":"0", "scale":"0", \ "sourceid":sourceid, "id":str(cid), "date":"", \ "mime":mime, "path":name, "size":str(size) - } + } for attr in attrs.keys(): node.setAttributeNode(self.document.createAttribute(attr)) - node.setAttribute(attr, attrs[attr]) + node.setAttribute(attr, attrs[attr]) try: - w, h, data = info["cover"] + w, h, data = info["cover"] except TypeError: w, h, data = None, None, None - + if data: - th = self.document.createElement(self.prefix + "thumbnail") + th = self.document.createElement(self.prefix + "thumbnail") th.setAttribute("width", str(w)) th.setAttribute("height", str(h)) jpeg = self.document.createElement(self.prefix + "jpeg") @@ -294,15 +294,15 @@ class BookList(_BookList): if info.has_key('tag order'): self.tag_order.update(info['tag order']) 
self.set_playlists(book.id, info['tags']) - - + + def playlist_by_title(self, title): for pl in self.playlists(): if pl.getAttribute('title').lower() == title.lower(): return pl - + def add_playlist(self, title): - cid = self.max_id()+1 + cid = self.max_id()+1 pl = self.document.createElement(self.prefix+'playlist') pl.setAttribute('sourceid', '0') pl.setAttribute('id', str(cid)) @@ -316,18 +316,18 @@ class BookList(_BookList): except AttributeError: continue return pl - - + + def remove_from_playlists(self, id): for pli in self.playlist_items(): if pli.getAttribute('id') == str(id): pli.parentNode.removeChild(pli) pli.unlink() - + def set_tags(self, book, tags): book.tags = tags self.set_playlists(book.id, tags) - + def set_playlists(self, id, collections): self.remove_from_playlists(id) for collection in set(collections): @@ -337,7 +337,7 @@ class BookList(_BookList): item = self.document.createElement(self.prefix+'item') item.setAttribute('id', str(id)) coll.appendChild(item) - + def get_playlists(self, id): ans = [] for pl in self.playlists(): @@ -346,12 +346,12 @@ class BookList(_BookList): ans.append(pl) continue return ans - + def book_by_id(self, id): for book in self: if str(book.id) == str(id): return book - + def reorder_playlists(self): for title in self.tag_order.keys(): pl = self.playlist_by_title(title) @@ -364,7 +364,7 @@ class BookList(_BookList): map[i] = j pl_book_ids = [i for i in pl_book_ids if i is not None] ordered_ids = [i for i in self.tag_order[title] if i in pl_book_ids] - + if len(ordered_ids) < len(pl.childNodes): continue children = [i for i in pl.childNodes if hasattr(i, 'getAttribute')] @@ -374,8 +374,8 @@ class BookList(_BookList): for id in ordered_ids: item = self.document.createElement(self.prefix+'item') item.setAttribute('id', str(map[id])) - pl.appendChild(item) - + pl.appendChild(item) + def write(self, stream): """ Write XML representation of DOM tree to C{stream} """ stream.write(self.document.toxml('utf-8')) diff --git 
a/src/calibre/devices/prs500/cli/main.py b/src/calibre/devices/prs500/cli/main.py index dfd3eb1ed6..9211fcff41 100755 --- a/src/calibre/devices/prs500/cli/main.py +++ b/src/calibre/devices/prs500/cli/main.py @@ -13,7 +13,7 @@ from calibre import __version__, iswindows, __appname__ from calibre.devices.errors import PathError from calibre.utils.terminfo import TerminalController from calibre.devices.errors import ArgumentError, DeviceError, DeviceLocked -from calibre.devices import devices +from calibre.customize.ui import device_plugins from calibre.devices.scanner import DeviceScanner MINIMUM_COL_WIDTH = 12 #: Minimum width of columns in ls output @@ -39,8 +39,8 @@ class FileFormatter(object): self.name = file.name self.path = file.path - @apply - def mode_string(): + @dynamic_property + def mode_string(self): doc=""" The mode string for this file. There are only two modes read-only and read-write """ def fget(self): mode, x = "-", "-" @@ -50,8 +50,8 @@ class FileFormatter(object): return mode return property(doc=doc, fget=fget) - @apply - def isdir_name(): + @dynamic_property + def isdir_name(self): doc='''Return self.name + '/' if self is a directory''' def fget(self): name = self.name @@ -61,8 +61,8 @@ class FileFormatter(object): return property(doc=doc, fget=fget) - @apply - def name_in_color(): + @dynamic_property + def name_in_color(self): doc=""" The name in ANSI text. 
Directories are blue, ebooks are green """ def fget(self): cname = self.name @@ -75,22 +75,22 @@ class FileFormatter(object): return cname return property(doc=doc, fget=fget) - @apply - def human_readable_size(): + @dynamic_property + def human_readable_size(self): doc=""" File size in human readable form """ def fget(self): return human_readable(self.size) return property(doc=doc, fget=fget) - @apply - def modification_time(): + @dynamic_property + def modification_time(self): doc=""" Last modified time in the Linux ls -l format """ def fget(self): return time.strftime("%Y-%m-%d %H:%M", time.localtime(self.wtime)) return property(doc=doc, fget=fget) - @apply - def creation_time(): + @dynamic_property + def creation_time(self): doc=""" Last modified time in the Linux ls -l format """ def fget(self): return time.strftime("%Y-%m-%d %H:%M", time.localtime(self.ctime)) @@ -203,9 +203,10 @@ def main(): _wmi = wmi.WMI() scanner = DeviceScanner(_wmi) scanner.scan() - for d in devices(): + for d in device_plugins(): if scanner.is_device_connected(d): - dev = d(log_packets=options.log_packets) + dev = d + dev.reset(log_packets=options.log_packets) if dev is None: print >>sys.stderr, 'Unable to find a connected ebook reader.' 
diff --git a/src/calibre/devices/prs500/driver.py b/src/calibre/devices/prs500/driver.py old mode 100755 new mode 100644 index a905a314ae..c6cd120283 --- a/src/calibre/devices/prs500/driver.py +++ b/src/calibre/devices/prs500/driver.py @@ -40,13 +40,14 @@ from array import array from functools import wraps from StringIO import StringIO -from calibre.devices.interface import Device +from calibre.devices.interface import DevicePlugin from calibre.devices.libusb import Error as USBError from calibre.devices.libusb import get_device_by_id from calibre.devices.prs500.prstypes import * from calibre.devices.errors import * from calibre.devices.prs500.books import BookList, fix_ids from calibre import __author__, __appname__ +from calibre.devices.usbms.deviceconfig import DeviceConfig # Protocol versions this driver has been tested with KNOWN_USB_PROTOCOL_VERSIONS = [0x3030303030303130L] @@ -76,12 +77,16 @@ class File(object): return self.name -class PRS500(Device): +class PRS500(DeviceConfig, DevicePlugin): """ Implements the backend for communication with the SONY Reader. Each method decorated by C{safe} performs a task. 
""" + name = 'PRS-500 Device Interface' + description = _('Communicate with the Sony PRS-500 eBook reader.') + author = _('Kovid Goyal') + supported_platforms = ['windows', 'osx', 'linux'] VENDOR_ID = 0x054c #: SONY Vendor Id PRODUCT_ID = 0x029b #: Product Id for the PRS-500 @@ -181,7 +186,7 @@ class PRS500(Device): return run_session - def __init__(self, key='-1', log_packets=False, report_progress=None) : + def reset(self, key='-1', log_packets=False, report_progress=None) : """ @param key: The key to unlock the device @param log_packets: If true the packet stream to/from the device is logged @@ -620,6 +625,8 @@ class PRS500(Device): data_type=FreeSpaceAnswer, \ command_number=FreeSpaceQuery.NUMBER)[0] data.append( pkt.free ) + data = [x for x in data if x != 0] + data.append(0) return data def _exists(self, path): diff --git a/src/calibre/devices/prs500/prstypes.py b/src/calibre/devices/prs500/prstypes.py index 4e1294fc1c..3efbfcab31 100755 --- a/src/calibre/devices/prs500/prstypes.py +++ b/src/calibre/devices/prs500/prstypes.py @@ -284,8 +284,8 @@ class Command(TransferBuffer): # Length of the data part of this packet length = field(start=12, fmt=DWORD) - @apply - def data(): + @dynamic_property + def data(self): doc = \ """ The data part of this command. Returned/set as/by a TransferBuffer. @@ -447,8 +447,8 @@ class LongCommand(Command): self.length = 16 self.command = command - @apply - def command(): + @dynamic_property + def command(self): doc = \ """ Usually carries extra information needed for the command @@ -568,8 +568,8 @@ class FileOpen(PathCommand): PathCommand.__init__(self, path, FileOpen.NUMBER, path_len_at_byte=20) self.mode = mode - @apply - def mode(): + @dynamic_property + def mode(self): doc = \ """ The file open mode. 
Is either L{FileOpen.READ} @@ -651,8 +651,8 @@ class Response(Command): raise PacketError("Response packets must have their number set to " \ + hex(0x00001000)) - @apply - def data(): + @dynamic_property + def data(self): doc = \ """ The last 3 DWORDs (12 bytes) of data in this @@ -681,43 +681,43 @@ class ListResponse(Response): PATH_NOT_FOUND = 0xffffffd7 #: Queried path is not found PERMISSION_DENIED = 0xffffffd6 #: Permission denied - @apply - def is_file(): + @dynamic_property + def is_file(self): doc = """ True iff queried path is a file """ def fget(self): return self.code == ListResponse.IS_FILE return property(doc=doc, fget=fget) - @apply - def is_invalid(): + @dynamic_property + def is_invalid(self): doc = """ True iff queried path is invalid """ def fget(self): return self.code == ListResponse.IS_INVALID return property(doc=doc, fget=fget) - @apply - def path_not_found(): + @dynamic_property + def path_not_found(self): doc = """ True iff queried path is not found """ def fget(self): return self.code == ListResponse.PATH_NOT_FOUND return property(doc=doc, fget=fget) - @apply - def permission_denied(): + @dynamic_property + def permission_denied(self): doc = """ True iff permission is denied for path operations """ def fget(self): return self.code == ListResponse.PERMISSION_DENIED return property(doc=doc, fget=fget) - @apply - def is_unmounted(): + @dynamic_property + def is_unmounted(self): doc = """ True iff queried path is unmounted (i.e. 
removed storage card) """ def fget(self): return self.code == ListResponse.IS_UNMOUNTED return property(doc=doc, fget=fget) - @apply - def is_eol(): + @dynamic_property + def is_eol(self): doc = """ True iff there are no more items in the list """ def fget(self): return self.code == ListResponse.IS_EOL @@ -759,8 +759,8 @@ class FileProperties(Answer): # 0 = default permissions, 4 = read only permissions = field(start=36, fmt=DWORD) - @apply - def is_dir(): + @dynamic_property + def is_dir(self): doc = """True if path points to a directory, False if it points to a file.""" def fget(self): @@ -776,8 +776,8 @@ class FileProperties(Answer): return property(doc=doc, fget=fget, fset=fset) - @apply - def is_readonly(): + @dynamic_property + def is_readonly(self): doc = """ Whether this file is readonly.""" def fget(self): @@ -801,8 +801,8 @@ class IdAnswer(Answer): """ Defines the structure of packets that contain identifiers for queries. """ - @apply - def id(): + @dynamic_property + def id(self): doc = \ """ The identifier. C{unsigned int} stored in 4 bytes @@ -841,8 +841,8 @@ class ListAnswer(Answer): name_length = field(start=20, fmt=DWORD) name = stringfield(name_length, start=24) - @apply - def is_dir(): + @dynamic_property + def is_dir(self): doc = \ """ True if list item points to a directory, False if it points to a file. 
@@ -859,4 +859,3 @@ class ListAnswer(Answer): return property(doc=doc, fget=fget, fset=fset) - diff --git a/src/calibre/devices/prs505/books.py b/src/calibre/devices/prs505/books.py index 38b708a312..528770d3c5 100644 --- a/src/calibre/devices/prs505/books.py +++ b/src/calibre/devices/prs505/books.py @@ -15,11 +15,11 @@ from calibre.devices import strptime strftime = functools.partial(_strftime, zone=time.gmtime) -MIME_MAP = { +MIME_MAP = { "lrf" : "application/x-sony-bbeb", - 'lrx' : 'application/x-sony-bbeb', - "rtf" : "application/rtf", - "pdf" : "application/pdf", + 'lrx' : 'application/x-sony-bbeb', + "rtf" : "application/rtf", + "pdf" : "application/pdf", "txt" : "text/plain" , 'epub': 'application/epub+zip', } @@ -32,16 +32,16 @@ def sortable_title(title): class book_metadata_field(object): """ Represents metadata stored as an attribute """ - def __init__(self, attr, formatter=None, setter=None): - self.attr = attr + def __init__(self, attr, formatter=None, setter=None): + self.attr = attr self.formatter = formatter self.setter = setter - + def __get__(self, obj, typ=None): """ Return a string. 
String may be empty if self.attr is absent """ return self.formatter(obj.elem.getAttribute(self.attr)) if \ self.formatter else obj.elem.getAttribute(self.attr).strip() - + def __set__(self, obj, val): """ Set the attribute """ val = self.setter(val) if self.setter else val @@ -52,7 +52,7 @@ class book_metadata_field(object): class Book(object): """ Provides a view onto the XML element that represents a book """ - + title = book_metadata_field("title") authors = book_metadata_field("author", \ formatter=lambda x: x if x and x.strip() else _('Unknown')) @@ -63,9 +63,9 @@ class Book(object): size = book_metadata_field("size", formatter=lambda x : int(float(x))) # When setting this attribute you must use an epoch datetime = book_metadata_field("date", formatter=strptime, setter=strftime) - - @apply - def title_sorter(): + + @dynamic_property + def title_sorter(self): doc = '''String to sort the title. If absent, title is returned''' def fget(self): src = self.elem.getAttribute('titleSorter').strip() @@ -75,12 +75,12 @@ class Book(object): def fset(self, val): self.elem.setAttribute('titleSorter', sortable_title(unicode(val))) return property(doc=doc, fget=fget, fset=fset) - - @apply - def thumbnail(): + + @dynamic_property + def thumbnail(self): doc = \ - """ - The thumbnail. Should be a height 68 image. + """ + The thumbnail. Should be a height 68 image. Setting is not supported. """ def fget(self): @@ -94,33 +94,33 @@ class Book(object): break rc = "" for node in th.childNodes: - if node.nodeType == node.TEXT_NODE: + if node.nodeType == node.TEXT_NODE: rc += node.data return decode(rc) return property(fget=fget, doc=doc) - - @apply - def path(): + + @dynamic_property + def path(self): doc = """ Absolute path to book on device. Setting not supported. 
""" - def fget(self): + def fget(self): return self.mountpath + self.rpath return property(fget=fget, doc=doc) - - @apply - def db_id(): + + @dynamic_property + def db_id(self): doc = '''The database id in the application database that this file corresponds to''' def fget(self): match = re.search(r'_(\d+)$', self.rpath.rpartition('.')[0]) if match: return int(match.group(1)) return property(fget=fget, doc=doc) - + def __init__(self, node, mountpath, tags, prefix=""): self.elem = node self.prefix = prefix self.tags = tags self.mountpath = mountpath - + def __str__(self): """ Return a utf-8 encoded string with title author and path information """ return self.title.encode('utf-8') + " by " + \ @@ -128,8 +128,8 @@ class Book(object): class BookList(_BookList): - - def __init__(self, xml_file, mountpath): + + def __init__(self, xml_file, mountpath, report_progress=None): _BookList.__init__(self) xml_file.seek(0) self.document = dom.parse(xml_file) @@ -143,12 +143,15 @@ class BookList(_BookList): self.root_element = records[0] else: self.prefix = '' - - for book in self.root_element.childNodes: + + nodes = self.root_element.childNodes + for i, book in enumerate(nodes): + if report_progress: + report_progress((i+1) / float(len(nodes)), _('Getting list of books on device...')) if hasattr(book, 'tagName') and book.tagName.endswith('text'): tags = [i.getAttribute('title') for i in self.get_playlists(book.getAttribute('id'))] self.append(Book(book, mountpath, tags, prefix=self.prefix)) - + def max_id(self): max = 0 for child in self.root_element.childNodes: @@ -157,7 +160,7 @@ class BookList(_BookList): if nid > max: max = nid return max - + def is_id_valid(self, id): '''Return True iff there is an element with C{id==id}.''' id = str(id) @@ -166,23 +169,23 @@ class BookList(_BookList): if child.getAttribute('id') == id: return True return False - + def supports_tags(self): return True - + def book_by_path(self, path): for child in self.root_element.childNodes: if 
child.nodeType == child.ELEMENT_NODE and child.hasAttribute("path"): if path == child.getAttribute('path'): return child return None - + def add_book(self, info, name, size, ctime): """ Add a node into the DOM tree, representing a book """ book = self.book_by_path(name) if book is not None: self.remove_book(name) - + node = self.document.createElement(self.prefix + "text") mime = MIME_MAP[name.rpartition('.')[-1].lower()] cid = self.max_id()+1 @@ -191,23 +194,23 @@ class BookList(_BookList): except: sourceid = '1' attrs = { - "title" : info["title"], + "title" : info["title"], 'titleSorter' : sortable_title(info['title']), - "author" : info["authors"] if info['authors'] else _('Unknown'), + "author" : info["authors"] if info['authors'] else _('Unknown'), "page":"0", "part":"0", "scale":"0", \ "sourceid":sourceid, "id":str(cid), "date":"", \ "mime":mime, "path":name, "size":str(size) - } + } for attr in attrs.keys(): node.setAttributeNode(self.document.createAttribute(attr)) - node.setAttribute(attr, attrs[attr]) + node.setAttribute(attr, attrs[attr]) try: - w, h, data = info["cover"] + w, h, data = info["cover"] except TypeError: w, h, data = None, None, None - + if data: - th = self.document.createElement(self.prefix + "thumbnail") + th = self.document.createElement(self.prefix + "thumbnail") th.setAttribute("width", str(w)) th.setAttribute("height", str(h)) jpeg = self.document.createElement(self.prefix + "jpeg") @@ -222,24 +225,24 @@ class BookList(_BookList): if info.has_key('tag order'): self.tag_order.update(info['tag order']) self.set_tags(book, info['tags']) - + def _delete_book(self, node): nid = node.getAttribute('id') self.remove_from_playlists(nid) node.parentNode.removeChild(node) node.unlink() - + def delete_book(self, cid): - ''' + ''' Remove DOM node corresponding to book with C{id == cid}. Also remove book from any collections it is part of. 
''' for book in self: if str(book.id) == str(cid): self.remove(book) - self._delete_book(book.elem) + self._delete_book(book.elem) break - + def remove_book(self, path): ''' Remove DOM node corresponding to book with C{path == path}. @@ -248,24 +251,24 @@ class BookList(_BookList): for book in self: if path.endswith(book.rpath): self.remove(book) - self._delete_book(book.elem) + self._delete_book(book.elem) break - + def playlists(self): ans = [] for c in self.root_element.childNodes: if hasattr(c, 'tagName') and c.tagName.endswith('playlist'): ans.append(c) return ans - - def playlist_items(self): + + def playlist_items(self): plitems = [] for pl in self.playlists(): for c in pl.childNodes: if hasattr(c, 'tagName') and c.tagName.endswith('item'): plitems.append(c) return plitems - + def purge_corrupted_files(self): if not self.root_element: return [] @@ -276,7 +279,7 @@ class BookList(_BookList): c.parentNode.removeChild(c) c.unlink() return paths - + def purge_empty_playlists(self): ''' Remove all playlists that have no children. 
Also removes any invalid playlist items.''' for pli in self.playlist_items(): @@ -295,32 +298,32 @@ class BookList(_BookList): if empty: pl.parentNode.removeChild(pl) pl.unlink() - + def playlist_by_title(self, title): for pl in self.playlists(): if pl.getAttribute('title').lower() == title.lower(): return pl - + def add_playlist(self, title): - cid = self.max_id()+1 + cid = self.max_id()+1 pl = self.document.createElement(self.prefix+'playlist') pl.setAttribute('id', str(cid)) pl.setAttribute('title', title) pl.setAttribute('uuid', uuid()) self.root_element.insertBefore(pl, self.root_element.childNodes[-1]) return pl - + def remove_from_playlists(self, id): for pli in self.playlist_items(): if pli.getAttribute('id') == str(id): pli.parentNode.removeChild(pli) pli.unlink() - + def set_tags(self, book, tags): tags = [t for t in tags if t] book.tags = tags self.set_playlists(book.id, tags) - + def set_playlists(self, id, collections): self.remove_from_playlists(id) for collection in set(collections): @@ -330,7 +333,7 @@ class BookList(_BookList): item = self.document.createElement(self.prefix+'item') item.setAttribute('id', str(id)) coll.appendChild(item) - + def get_playlists(self, bookid): ans = [] for pl in self.playlists(): @@ -339,23 +342,23 @@ class BookList(_BookList): if item.getAttribute('id') == str(bookid): ans.append(pl) return ans - + def next_id(self): return self.document.documentElement.getAttribute('nextID') - + def set_next_id(self, id): self.document.documentElement.setAttribute('nextID', str(id)) - + def write(self, stream): """ Write XML representation of DOM tree to C{stream} """ src = self.document.toxml('utf-8') + '\n' stream.write(src.replace("'", ''')) - + def book_by_id(self, id): for book in self: if str(book.id) == str(id): return book - + def reorder_playlists(self): for title in self.tag_order.keys(): pl = self.playlist_by_title(title) @@ -368,7 +371,7 @@ class BookList(_BookList): map[i] = j pl_book_ids = [i for i in pl_book_ids if i 
is not None] ordered_ids = [i for i in self.tag_order[title] if i in pl_book_ids] - + if len(ordered_ids) < len(pl.childNodes): continue children = [i for i in pl.childNodes if hasattr(i, 'getAttribute')] @@ -379,16 +382,18 @@ class BookList(_BookList): item = self.document.createElement(self.prefix+'item') item.setAttribute('id', str(map[id])) pl.appendChild(item) - -def fix_ids(main, card): + +def fix_ids(main, carda, cardb): ''' Adjust ids the XML databases. ''' if hasattr(main, 'purge_empty_playlists'): main.purge_empty_playlists() - if hasattr(card, 'purge_empty_playlists'): - card.purge_empty_playlists() - + if hasattr(carda, 'purge_empty_playlists'): + carda.purge_empty_playlists() + if hasattr(cardb, 'purge_empty_playlists'): + cardb.purge_empty_playlists() + def regen_ids(db): if not hasattr(db, 'root_element'): return @@ -397,11 +402,11 @@ def fix_ids(main, card): cid = 0 if db == main else 1 for child in db.root_element.childNodes: if child.nodeType == child.ELEMENT_NODE and child.hasAttribute('id'): - id_map[child.getAttribute('id')] = str(cid) + id_map[child.getAttribute('id')] = str(cid) child.setAttribute("sourceid", '1') - child.setAttribute('id', str(cid)) + child.setAttribute('id', str(cid)) cid += 1 - + for item in db.playlist_items(): oid = item.getAttribute('id') try: @@ -409,10 +414,11 @@ def fix_ids(main, card): except KeyError: item.parentNode.removeChild(item) item.unlink() - + db.reorder_playlists() - + regen_ids(main) - regen_ids(card) - + regen_ids(carda) + regen_ids(cardb) + main.set_next_id(str(main.max_id()+1)) diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py index 00cb78b06b..e75f67223a 100644 --- a/src/calibre/devices/prs505/driver.py +++ b/src/calibre/devices/prs505/driver.py @@ -1,399 +1,120 @@ __license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' +__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \ + '2009, John Schember <john at 
nachtimwald.com>' ''' Device driver for the SONY PRS-505 ''' -import sys, os, shutil, time, subprocess, re +import os, time from itertools import cycle -from calibre.devices.interface import Device +from calibre.devices.usbms.cli import CLI +from calibre.devices.usbms.device import Device from calibre.devices.errors import DeviceError, FreeSpaceError from calibre.devices.prs505.books import BookList, fix_ids -from calibre import iswindows, islinux, isosx, __appname__ -from calibre.devices.errors import PathError +from calibre import __appname__ -class File(object): - def __init__(self, path): - stats = os.stat(path) - self.is_dir = os.path.isdir(path) - self.is_readonly = not os.access(path, os.W_OK) - self.ctime = stats.st_ctime - self.wtime = stats.st_mtime - self.size = stats.st_size - if path.endswith(os.sep): - path = path[:-1] - self.path = path - self.name = os.path.basename(path) +class PRS505(CLI, Device): + name = 'PRS-505 Device Interface' + description = _('Communicate with the Sony PRS-505 eBook reader.') + author = _('Kovid Goyal and John Schember') + supported_platforms = ['windows', 'osx', 'linux'] -class PRS505(Device): - VENDOR_ID = 0x054c #: SONY Vendor Id - PRODUCT_ID = 0x031e #: Product Id for the PRS-505 - BCD = [0x229] #: Needed to disambiguate 505 and 700 on linux - PRODUCT_NAME = 'PRS-505' - VENDOR_NAME = 'SONY' FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt'] - MEDIA_XML = 'database/cache/media.xml' - CACHE_XML = 'Sony Reader/database/cache.xml' + VENDOR_ID = [0x054c] #: SONY Vendor Id + PRODUCT_ID = [0x031e] #: Product Id for the PRS-505 + BCD = [0x229] #: Needed to disambiguate 505 and 700 on linux + + VENDOR_NAME = 'SONY' + WINDOWS_MAIN_MEM = 'PRS-505' + WINDOWS_CARD_A_MEM = 'PRS-505/UC:MS' + WINDOWS_CARD_B_MEM = 'PRS-505/UC:SD' + + OSX_MAIN_MEM = 'Sony PRS-505/UC Media' + OSX_CARD_A_MEM = 'Sony PRS-505/UC:MS Media' + OSX_CARD_B_MEM = 'Sony PRS-505/UC:SD' MAIN_MEMORY_VOLUME_LABEL = 'Sony Reader Main Memory' 
STORAGE_CARD_VOLUME_LABEL = 'Sony Reader Storage Card' - OSX_NAME = 'Sony PRS-505' + MEDIA_XML = 'database/cache/media.xml' + CACHE_XML = 'Sony Reader/database/cache.xml' CARD_PATH_PREFIX = __appname__ - FDI_TEMPLATE = \ -''' - <device> - <match key="info.category" string="volume"> - <match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.vendor_id" int="%(vendor_id)s"> - <match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.product_id" int="%(product_id)s"> - <match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.device_revision_bcd" int="%(bcd)s"> - <match key="volume.is_partition" bool="false"> - <merge key="volume.label" type="string">%(main_memory)s</merge> - <merge key="%(app)s.mainvolume" type="string">%(deviceclass)s</merge> - </match> - </match> - </match> - </match> - </match> - </device> - <device> - <match key="info.category" string="volume"> - <match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.vendor_id" int="%(vendor_id)s"> - <match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.product_id" int="%(product_id)s"> - <match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.device_revision_bcd" int="%(bcd)s"> - <match key="volume.is_partition" bool="true"> - <merge key="volume.label" type="string">%(storage_card)s</merge> - <merge key="%(app)s.cardvolume" type="string">%(deviceclass)s</merge> - </match> - </match> - </match> - </match> - </match> - </device> -'''.replace('%(app)s', __appname__) - - - def __init__(self, log_packets=False): - self._main_prefix = self._card_prefix = None - - @classmethod - def get_fdi(cls): - return cls.FDI_TEMPLATE%dict( - deviceclass=cls.__name__, - vendor_id=hex(cls.VENDOR_ID), - product_id=hex(cls.PRODUCT_ID), - bcd=hex(cls.BCD[0]), - main_memory=cls.MAIN_MEMORY_VOLUME_LABEL, - storage_card=cls.STORAGE_CARD_VOLUME_LABEL, - ) - - @classmethod - def is_device(cls, device_id): - device_id = device_id.upper() - if 'VEN_'+cls.VENDOR_NAME in 
device_id and \ - 'PROD_'+cls.PRODUCT_NAME in device_id: - return True - vid, pid = hex(cls.VENDOR_ID)[2:], hex(cls.PRODUCT_ID)[2:] - if len(vid) < 4: vid = '0'+vid - if len(pid) < 4: pid = '0'+pid - if 'VID_'+vid in device_id and \ - 'PID_'+pid in device_id: - return True - return False - - @classmethod - def get_osx_mountpoints(cls, raw=None): - if raw is None: - ioreg = '/usr/sbin/ioreg' - if not os.access(ioreg, os.X_OK): - ioreg = 'ioreg' - raw = subprocess.Popen((ioreg+' -w 0 -S -c IOMedia').split(), - stdout=subprocess.PIPE).communicate()[0] - lines = raw.splitlines() - names = {} - for i, line in enumerate(lines): - if line.strip().endswith('<class IOMedia>') and cls.OSX_NAME in line: - loc = 'stick' if ':MS' in line else 'card' if ':SD' in line else 'main' - for line in lines[i+1:]: - line = line.strip() - if line.endswith('}'): - break - match = re.search(r'"BSD Name"\s+=\s+"(.*?)"', line) - if match is not None: - names[loc] = match.group(1) - break - if len(names.keys()) == 3: - break - return names - - - def open_osx(self): - mount = subprocess.Popen('mount', shell=True, - stdout=subprocess.PIPE).stdout.read() - names = self.get_osx_mountpoints() - dev_pat = r'/dev/%s(\w*)\s+on\s+([^\(]+)\s+' - if 'main' not in names.keys(): - raise DeviceError(_('Unable to detect the %s disk drive. 
Try rebooting.')%self.__class__.__name__) - main_pat = dev_pat%names['main'] - self._main_prefix = re.search(main_pat, mount).group(2) + os.sep - card_pat = names['stick'] if 'stick' in names.keys() else names['card'] if 'card' in names.keys() else None - if card_pat is not None: - card_pat = dev_pat%card_pat - self._card_prefix = re.search(card_pat, mount).group(2) + os.sep - - - def open_windows(self): - time.sleep(6) - drives = [] - wmi = __import__('wmi', globals(), locals(), [], -1) - c = wmi.WMI(find_classes=False) - for drive in c.Win32_DiskDrive(): - if self.__class__.is_device(str(drive.PNPDeviceID)): - if drive.Partitions == 0: - continue - try: - partition = drive.associators("Win32_DiskDriveToDiskPartition")[0] - logical_disk = partition.associators('Win32_LogicalDiskToPartition')[0] - prefix = logical_disk.DeviceID+os.sep - drives.append((drive.Index, prefix)) - except IndexError: - continue - - - if not drives: - raise DeviceError(_('Unable to detect the %s disk drive. Try rebooting.')%self.__class__.__name__) - - drives.sort(cmp=lambda a, b: cmp(a[0], b[0])) - self._main_prefix = drives[0][1] - if len(drives) > 1: - self._card_prefix = drives[1][1] - - - def open_linux(self): - import dbus - bus = dbus.SystemBus() - hm = dbus.Interface(bus.get_object("org.freedesktop.Hal", "/org/freedesktop/Hal/Manager"), "org.freedesktop.Hal.Manager") - - def conditional_mount(dev, main_mem=True): - mmo = bus.get_object("org.freedesktop.Hal", dev) - label = mmo.GetPropertyString('volume.label', dbus_interface='org.freedesktop.Hal.Device') - is_mounted = mmo.GetPropertyString('volume.is_mounted', dbus_interface='org.freedesktop.Hal.Device') - mount_point = mmo.GetPropertyString('volume.mount_point', dbus_interface='org.freedesktop.Hal.Device') - fstype = mmo.GetPropertyString('volume.fstype', dbus_interface='org.freedesktop.Hal.Device') - if is_mounted: - return str(mount_point) - mmo.Mount(label, fstype, ['umask=077', 'uid='+str(os.getuid()), 'sync'], - 
dbus_interface='org.freedesktop.Hal.Device.Volume') - return os.path.normpath('/media/'+label)+'/' - - - mm = hm.FindDeviceStringMatch(__appname__+'.mainvolume', self.__class__.__name__) - if not mm: - raise DeviceError(_('Unable to detect the %s disk drive. Try rebooting.')%(self.__class__.__name__,)) - self._main_prefix = None - for dev in mm: - try: - self._main_prefix = conditional_mount(dev)+os.sep - break - except dbus.exceptions.DBusException: - continue - - - if not self._main_prefix: - raise DeviceError('Could not open device for reading. Try a reboot.') - - self._card_prefix = None - cards = hm.FindDeviceStringMatch(__appname__+'.cardvolume', self.__class__.__name__) - keys = [] - for card in cards: - keys.append(int('UC_SD' in bus.get_object("org.freedesktop.Hal", card).GetPropertyString('info.parent', dbus_interface='org.freedesktop.Hal.Device'))) - - cards = zip(cards, keys) - cards.sort(cmp=lambda x, y: cmp(x[1], y[1])) - cards = [i[0] for i in cards] - - for dev in cards: - try: - self._card_prefix = conditional_mount(dev, False)+os.sep - break - except: - import traceback - print traceback - continue - - def open(self): - time.sleep(5) - self._main_prefix = self._card_prefix = None - if islinux: + Device.open(self) + + def write_cache(prefix): try: - self.open_linux() - except DeviceError: - time.sleep(3) - self.open_linux() - if iswindows: - try: - self.open_windows() - except DeviceError: - time.sleep(3) - self.open_windows() - if isosx: - try: - self.open_osx() - except DeviceError: - time.sleep(3) - self.open_osx() - if self._card_prefix is not None: - try: - cachep = os.path.join(self._card_prefix, self.CACHE_XML) + cachep = os.path.join(prefix, self.CACHE_XML) if not os.path.exists(cachep): try: os.makedirs(os.path.dirname(cachep), mode=0777) except: time.sleep(5) os.makedirs(os.path.dirname(cachep), mode=0777) - f = open(cachep, 'wb') - f.write(u'''<?xml version="1.0" encoding="UTF-8"?> -<cache xmlns="http://www.kinoma.com/FskCache/1"> 
-</cache> -'''.encode('utf8')) - f.close() + with open(cachep, 'wb') as f: + f.write(u'''<?xml version="1.0" encoding="UTF-8"?> + <cache xmlns="http://www.kinoma.com/FskCache/1"> + </cache> + '''.encode('utf8')) + return True except: self._card_prefix = None import traceback traceback.print_exc() + return False - def set_progress_reporter(self, pr): - self.report_progress = pr + if self._card_a_prefix is not None: + if not write_cache(self._card_a_prefix): + self._card_a_prefix = None + if self._card_b_prefix is not None: + if not write_cache(self._card_b_prefix): + self._card_b_prefix = None def get_device_information(self, end_session=True): + self.report_progress(1.0, _('Get device information...')) return (self.__class__.__name__, '', '', '') - def card_prefix(self, end_session=True): - return self._card_prefix - - @classmethod - def _windows_space(cls, prefix): - if prefix is None: - return 0, 0 - win32file = __import__('win32file', globals(), locals(), [], -1) - try: - sectors_per_cluster, bytes_per_sector, free_clusters, total_clusters = \ - win32file.GetDiskFreeSpace(prefix[:-1]) - except Exception, err: - if getattr(err, 'args', [None])[0] == 21: # Disk not ready - time.sleep(3) - sectors_per_cluster, bytes_per_sector, free_clusters, total_clusters = \ - win32file.GetDiskFreeSpace(prefix[:-1]) - else: raise - mult = sectors_per_cluster * bytes_per_sector - return total_clusters * mult, free_clusters * mult - - def total_space(self, end_session=True): - msz = csz = 0 - if not iswindows: - if self._main_prefix is not None: - stats = os.statvfs(self._main_prefix) - msz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree) - if self._card_prefix is not None: - stats = os.statvfs(self._card_prefix) - csz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree) - else: - msz = self._windows_space(self._main_prefix)[0] - csz = self._windows_space(self._card_prefix)[0] - - return (msz, 0, csz) - - def free_space(self, end_session=True): 
- msz = csz = 0 - if not iswindows: - if self._main_prefix is not None: - stats = os.statvfs(self._main_prefix) - msz = stats.f_frsize * stats.f_bavail - if self._card_prefix is not None: - stats = os.statvfs(self._card_prefix) - csz = stats.f_frsize * stats.f_bavail - else: - msz = self._windows_space(self._main_prefix)[1] - csz = self._windows_space(self._card_prefix)[1] - - return (msz, 0, csz) - - def books(self, oncard=False, end_session=True): - if oncard and self._card_prefix is None: + def books(self, oncard=None, end_session=True): + if oncard == 'carda' and not self._card_a_prefix: + self.report_progress(1.0, _('Getting list of books on device...')) return [] + elif oncard == 'cardb' and not self._card_b_prefix: + self.report_progress(1.0, _('Getting list of books on device...')) + return [] + elif oncard and oncard != 'carda' and oncard != 'cardb': + self.report_progress(1.0, _('Getting list of books on device...')) + return [] + db = self.__class__.CACHE_XML if oncard else self.__class__.MEDIA_XML - prefix = self._card_prefix if oncard else self._main_prefix - bl = BookList(open(prefix + db, 'rb'), prefix) + prefix = self._card_a_prefix if oncard == 'carda' else self._card_b_prefix if oncard == 'cardb' else self._main_prefix + bl = BookList(open(prefix + db, 'rb'), prefix, self.report_progress) paths = bl.purge_corrupted_files() for path in paths: - path = os.path.join(self._card_prefix if oncard else self._main_prefix, path) + path = os.path.join(prefix, path) if os.path.exists(path): os.unlink(path) + self.report_progress(1.0, _('Getting list of books on device...')) return bl - def munge_path(self, path): - if path.startswith('/') and not (path.startswith(self._main_prefix) or \ - (self._card_prefix and path.startswith(self._card_prefix))): - path = self._main_prefix + path[1:] - elif path.startswith('card:'): - path = path.replace('card:', self._card_prefix[:-1]) - return path - - def mkdir(self, path, end_session=True): - """ Make directory """ - 
path = self.munge_path(path) - os.mkdir(path) - - def list(self, path, recurse=False, end_session=True, munge=True): - if munge: - path = self.munge_path(path) - if os.path.isfile(path): - return [(os.path.dirname(path), [File(path)])] - entries = [File(os.path.join(path, f)) for f in os.listdir(path)] - dirs = [(path, entries)] - for _file in entries: - if recurse and _file.is_dir: - dirs[len(dirs):] = self.list(_file.path, recurse=True, munge=False) - return dirs - - def get_file(self, path, outfile, end_session=True): - path = self.munge_path(path) - src = open(path, 'rb') - shutil.copyfileobj(src, outfile, 10*1024*1024) - - def put_file(self, infile, path, replace_file=False, end_session=True): - path = self.munge_path(path) - if os.path.isdir(path): - path = os.path.join(path, infile.name) - if not replace_file and os.path.exists(path): - raise PathError('File already exists: '+path) - dest = open(path, 'wb') - shutil.copyfileobj(infile, dest, 10*1024*1024) - dest.flush() - dest.close() - - def rm(self, path, end_session=True): - path = self.munge_path(path) - os.unlink(path) - - def touch(self, path, end_session=True): - path = self.munge_path(path) - if not os.path.exists(path): - open(path, 'w').close() - if not os.path.isdir(path): - os.utime(path, None) - - def upload_books(self, files, names, on_card=False, end_session=True, + def upload_books(self, files, names, on_card=None, end_session=True, metadata=None): - if on_card and not self._card_prefix: - raise ValueError(_('The reader has no storage card connected.')) - path = os.path.join(self._card_prefix, self.CARD_PATH_PREFIX) if on_card \ - else os.path.join(self._main_prefix, 'database', 'media', 'books') + if on_card == 'carda' and not self._card_a_prefix: + raise ValueError(_('The reader has no storage card in this slot.')) + elif on_card == 'cardb' and not self._card_b_prefix: + raise ValueError(_('The reader has no storage card in this slot.')) + elif on_card and on_card not in ('carda', 'cardb'): 
+ raise DeviceError(_('The reader has no storage card in this slot.')) + + if on_card == 'carda': + path = os.path.join(self._card_a_prefix, self.CARD_PATH_PREFIX) + elif on_card == 'cardb': + path = os.path.join(self._card_b_prefix, self.CARD_PATH_PREFIX) + else: + path = os.path.join(self._main_prefix, 'database', 'media', 'books') def get_size(obj): if hasattr(obj, 'seek'): @@ -403,34 +124,61 @@ class PRS505(Device): return size return os.path.getsize(obj) - sizes = map(get_size, files) + sizes = [get_size(f) for f in files] size = sum(sizes) - space = self.free_space() - mspace = space[0] - cspace = space[2] - if on_card and size > cspace - 1024*1024: - raise FreeSpaceError("There is insufficient free space "+\ - "on the storage card") - if not on_card and size > mspace - 2*1024*1024: - raise FreeSpaceError("There is insufficient free space " +\ - "in main memory") + + if not on_card and size > self.free_space()[0] - 2*1024*1024: + raise FreeSpaceError(_("There is insufficient free space in main memory")) + if on_card == 'carda' and size > self.free_space()[1] - 1024*1024: + raise FreeSpaceError(_("There is insufficient free space on the storage card")) + if on_card == 'cardb' and size > self.free_space()[2] - 1024*1024: + raise FreeSpaceError(_("There is insufficient free space on the storage card")) paths, ctimes = [], [] names = iter(names) - for infile in files: + metadata = iter(metadata) + for i, infile in enumerate(files): close = False if not hasattr(infile, 'read'): infile, close = open(infile, 'rb'), True infile.seek(0) - name = names.next() - paths.append(os.path.join(path, name)) - if not os.path.exists(os.path.dirname(paths[-1])): - os.makedirs(os.path.dirname(paths[-1])) + + newpath = path + mdata = metadata.next() + + if 'tags' in mdata.keys(): + for tag in mdata['tags']: + if tag.startswith(_('News')): + newpath = os.path.join(newpath, 'news') + newpath = os.path.join(newpath, mdata.get('title', '')) + newpath = os.path.join(newpath, 
mdata.get('timestamp', '')) + elif tag.startswith('/'): + newpath = path + newpath += tag + newpath = os.path.normpath(newpath) + break + + if newpath == path: + newpath = os.path.join(newpath, mdata.get('authors', _('Unknown'))) + newpath = os.path.join(newpath, mdata.get('title', _('Unknown'))) + + if not os.path.exists(newpath): + os.makedirs(newpath) + + filepath = os.path.join(newpath, names.next()) + paths.append(filepath) + self.put_file(infile, paths[-1], replace_file=True) + if close: infile.close() ctimes.append(os.path.getctime(paths[-1])) + + self.report_progress((i+1) / float(len(files)), _('Transferring books to device...')) + + self.report_progress(1.0, _('Transferring books to device...')) + return zip(paths, sizes, ctimes, cycle([on_card])) @classmethod @@ -439,17 +187,19 @@ class PRS505(Device): for location in locations: info = metadata.next() path = location[0] - on_card = 1 if location[3] else 0 + blist = 2 if location[3] == 'cardb' else 1 if location[3] == 'carda' else 0 name = path.rpartition(os.sep)[2] - name = (cls.CARD_PATH_PREFIX+'/' if on_card else 'database/media/books/') + name + name = (cls.CARD_PATH_PREFIX+'/' if blist else 'database/media/books/') + name name = name.replace('//', '/') - booklists[on_card].add_book(info, name, *location[1:-1]) + booklists[blist].add_book(info, name, *location[1:-1]) fix_ids(*booklists) def delete_books(self, paths, end_session=True): - for path in paths: + for i, path in enumerate(paths): + self.report_progress((i+1) / float(len(paths)), _('Removing books from device...')) if os.path.exists(path): os.unlink(path) + self.report_progress(1.0, _('Removing books from device...')) @classmethod def remove_books_from_metadata(cls, paths, booklists): @@ -466,18 +216,15 @@ class PRS505(Device): f = open(self._main_prefix + self.__class__.MEDIA_XML, 'wb') booklists[0].write(f) f.close() - if self._card_prefix is not None and hasattr(booklists[1], 'write'): - if not os.path.exists(self._card_prefix): - 
os.makedirs(self._card_prefix) - f = open(self._card_prefix + self.__class__.CACHE_XML, 'wb') - booklists[1].write(f) - f.close() - - - -def main(args=sys.argv): - return 0 - -if __name__ == '__main__': - sys.exit(main()) + def write_card_prefix(prefix, listid): + if prefix is not None and hasattr(booklists[listid], 'write'): + if not os.path.exists(prefix): + os.makedirs(prefix) + f = open(prefix + self.__class__.CACHE_XML, 'wb') + booklists[listid].write(f) + f.close() + write_card_prefix(self._card_a_prefix, 1) + write_card_prefix(self._card_b_prefix, 2) + + self.report_progress(1.0, _('Sending metadata to device...')) diff --git a/src/calibre/devices/prs700/driver.py b/src/calibre/devices/prs700/driver.py index 5db60ef506..a79902fe10 100644 --- a/src/calibre/devices/prs700/driver.py +++ b/src/calibre/devices/prs700/driver.py @@ -8,8 +8,19 @@ Device driver for the SONY PRS-700 from calibre.devices.prs505.driver import PRS505 class PRS700(PRS505): + + name = 'PRS-700 Device Interface' + description = _('Communicate with the Sony PRS-700 eBook reader.') + author = _('Kovid Goyal and John Schember') + supported_platforms = ['windows', 'osx', 'linux'] BCD = [0x31a] - PRODUCT_NAME = 'PRS-700' - OSX_NAME = 'Sony PRS-700' - + + WINDOWS_MAIN_MEM = 'PRS-700' + WINDOWS_CARD_A_MEM = 'PRS-700/UC:MS' + WINDOWS_CARD_B_MEM = 'PRS-700/UC:SD' + + OSX_MAIN_MEM = 'Sony PRS-700/UC Media' + OSX_CARD_A_MEM = 'Sony PRS-700/UC:MS Media' + OSX_CARD_B_MEM = 'Sony PRS-700/UC:SD' + diff --git a/src/calibre/devices/usbms/books.py b/src/calibre/devices/usbms/books.py index fffed41549..2875c04b88 100644 --- a/src/calibre/devices/usbms/books.py +++ b/src/calibre/devices/usbms/books.py @@ -21,15 +21,15 @@ class Book(object): def __eq__(self, other): return self.path == other.path - @apply - def title_sorter(): + @dynamic_property + def title_sorter(self): doc = '''String to sort the title. 
If absent, title is returned''' def fget(self): return re.sub('^\s*A\s+|^\s*The\s+|^\s*An\s+', '', self.title).rstrip() return property(doc=doc, fget=fget) - @apply - def thumbnail(): + @dynamic_property + def thumbnail(self): return None def __str__(self): @@ -44,4 +44,3 @@ class BookList(_BookList): def set_tags(self, book, tags): pass - diff --git a/src/calibre/devices/usbms/cli.py b/src/calibre/devices/usbms/cli.py new file mode 100644 index 0000000000..40e2225486 --- /dev/null +++ b/src/calibre/devices/usbms/cli.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +import os, shutil + +from calibre.devices.errors import PathError + +class File(object): + + def __init__(self, path): + stats = os.stat(path) + self.is_dir = os.path.isdir(path) + self.is_readonly = not os.access(path, os.W_OK) + self.ctime = stats.st_ctime + self.wtime = stats.st_mtime + self.size = stats.st_size + if path.endswith(os.sep): + path = path[:-1] + self.path = path + self.name = os.path.basename(path) + + +class CLI(object): + + def get_file(self, path, outfile, end_session=True): + path = self.munge_path(path) + with open(path, 'rb') as src: + shutil.copyfileobj(src, outfile, 10*1024*1024) + + def put_file(self, infile, path, replace_file=False, end_session=True): + path = self.munge_path(path) + if os.path.isdir(path): + path = os.path.join(path, infile.name) + if not replace_file and os.path.exists(path): + raise PathError('File already exists: ' + path) + dest = open(path, 'wb') + shutil.copyfileobj(infile, dest, 10*1024*1024) + dest.flush() + dest.close() + + def munge_path(self, path): + if path.startswith('/') and not (path.startswith(self._main_prefix) or \ + (self._card_a_prefix and path.startswith(self._card_a_prefix)) or \ + (self._card_b_prefix and path.startswith(self._card_b_prefix))): + path = self._main_prefix + 
path[1:] + elif path.startswith('carda:'): + path = path.replace('carda:', self._card_prefix[:-1]) + elif path.startswith('cardb:'): + path = path.replace('cardb:', self._card_prefix[:-1]) + return path + + def list(self, path, recurse=False, end_session=True, munge=True): + if munge: + path = self.munge_path(path) + if os.path.isfile(path): + return [(os.path.dirname(path), [File(path)])] + entries = [File(os.path.join(path, f)) for f in os.listdir(path)] + dirs = [(path, entries)] + for _file in entries: + if recurse and _file.is_dir: + dirs[len(dirs):] = self.list(_file.path, recurse=True, munge=False) + return dirs + + def mkdir(self, path, end_session=True): + if self.SUPPORTS_SUB_DIRS: + path = self.munge_path(path) + os.mkdir(path) + + def rm(self, path, end_session=True): + path = self.munge_path(path) + self.delete_books([path]) + + def touch(self, path, end_session=True): + path = self.munge_path(path) + if not os.path.exists(path): + open(path, 'w').close() + if not os.path.isdir(path): + os.utime(path, None) diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py index 9f6b3cbd34..9b56509351 100644 --- a/src/calibre/devices/usbms/device.py +++ b/src/calibre/devices/usbms/device.py @@ -8,11 +8,12 @@ device. This class handles device detection. import os, subprocess, time, re -from calibre.devices.interface import Device as _Device +from calibre.devices.interface import DevicePlugin from calibre.devices.errors import DeviceError +from calibre.devices.usbms.deviceconfig import DeviceConfig from calibre import iswindows, islinux, isosx, __appname__ -class Device(_Device): +class Device(DeviceConfig, DevicePlugin): ''' This class provides logic common to all drivers for devices that export themselves as USB Mass Storage devices. 
If you are writing such a driver, inherit from this @@ -25,10 +26,12 @@ class Device(_Device): VENDOR_NAME = None WINDOWS_MAIN_MEM = None - WINDOWS_CARD_MEM = None + WINDOWS_CARD_A_MEM = None + WINDOWS_CARD_B_MEM = None OSX_MAIN_MEM = None - OSX_CARD_MEM = None + OSX_CARD_A_MEM = None + OSX_CARD_B_MEM = None MAIN_MEMORY_VOLUME_LABEL = '' STORAGE_CARD_VOLUME_LABEL = '' @@ -63,18 +66,30 @@ class Device(_Device): </match> </match> </device> + <device> + <match key="info.category" string="volume"> + <match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.vendor_id" int="%(vendor_id)s"> + <match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.product_id" int="%(product_id)s"> + %(BCD_start)s + <match key="@info.parent:storage.lun" int="%(lun2)d"> + <merge key="volume.label" type="string">%(storage_card)s</merge> + <merge key="%(app)s.cardvolume" type="string">%(deviceclass)s</merge> + </match> + %(BCD_end)s + </match> + </match> + </match> + </device> ''' - FDI_BCD_TEMPLATE = '<match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.device_revision_bcd" int="%(bcd)s">' FDI_LUNS = {'lun0':0, 'lun1':1, 'lun2':2} + FDI_BCD_TEMPLATE = '<match key="@info.parent:@info.parent:@info.parent:@info.parent:usb.device_revision_bcd" int="%(bcd)s">' - - def __init__(self, key='-1', log_packets=False, report_progress=None) : - self._main_prefix = self._card_prefix = None + def reset(self, key='-1', log_packets=False, report_progress=None) : + self._main_prefix = self._card_a_prefix = self._card_b_prefix = None @classmethod def get_fdi(cls): fdi = '' - for vid in cls.VENDOR_ID: for pid in cls.PRODUCT_ID: fdi_base_values = dict( @@ -85,7 +100,6 @@ class Device(_Device): main_memory=cls.MAIN_MEMORY_VOLUME_LABEL, storage_card=cls.STORAGE_CARD_VOLUME_LABEL, ) - fdi_base_values.update(cls.FDI_LUNS) if cls.BCD is None: @@ -105,7 +119,7 @@ class Device(_Device): self.report_progress = report_progress def card_prefix(self, end_session=True): - return 
self._card_prefix + return (self._card_a_prefix, self._card_b_prefix) @classmethod def _windows_space(cls, prefix): @@ -125,34 +139,41 @@ class Device(_Device): return total_clusters * mult, free_clusters * mult def total_space(self, end_session=True): - msz = csz = 0 + msz = casz = cbsz = 0 if not iswindows: if self._main_prefix is not None: stats = os.statvfs(self._main_prefix) msz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree) - if self._card_prefix is not None: - stats = os.statvfs(self._card_prefix) - csz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree) + if self._card_a_prefix is not None: + stats = os.statvfs(self._card_a_prefix) + casz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree) + if self._card_b_prefix is not None: + stats = os.statvfs(self._card_b_prefix) + cbsz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree) else: msz = self._windows_space(self._main_prefix)[0] - csz = self._windows_space(self._card_prefix)[0] + casz = self._windows_space(self._card_a_prefix)[0] + cbsz = self._windows_space(self._card_b_prefix)[0] - return (msz, 0, csz) + return (msz, casz, cbsz) def free_space(self, end_session=True): - msz = csz = 0 + msz = casz = cbsz = 0 if not iswindows: if self._main_prefix is not None: stats = os.statvfs(self._main_prefix) msz = stats.f_frsize * stats.f_bavail - if self._card_prefix is not None: - stats = os.statvfs(self._card_prefix) - csz = stats.f_frsize * stats.f_bavail + if self._card_a_prefix is not None: + stats = os.statvfs(self._card_a_prefix) + casz = stats.f_frsize * stats.f_bavail + if self._card_b_prefix is not None: + stats = os.statvfs(self._card_b_prefix) + cbsz = stats.f_frsize * stats.f_bavail else: msz = self._windows_space(self._main_prefix)[1] csz = self._windows_space(self._card_prefix)[1] - return (msz, 0, csz) + return (msz, casz, cbsz) def windows_match_device(self, pnp_id, device_id): pnp_id = pnp_id.upper() @@ -193,10 +214,12 @@ class 
Device(_Device): for drive in c.Win32_DiskDrive(): if self.windows_match_device(str(drive.PNPDeviceID), self.WINDOWS_MAIN_MEM): drives['main'] = self.windows_get_drive_prefix(drive) - elif self.windows_match_device(str(drive.PNPDeviceID), self.WINDOWS_CARD_MEM): - drives['card'] = self.windows_get_drive_prefix(drive) + elif self.windows_match_device(str(drive.PNPDeviceID), self.WINDOWS_CARD_A_MEM): + drives['carda'] = self.windows_get_drive_prefix(drive) + elif self.windows_match_device(str(drive.PNPDeviceID), self.WINDOWS_CARD_B_MEM): + drives['cardb'] = self.windows_get_drive_prefix(drive) - if 'main' in drives.keys() and 'card' in drives.keys(): + if 'main' in drives.keys() and 'carda' in drives.keys() and 'cardb' in drives.keys(): break if 'main' not in drives: @@ -206,7 +229,8 @@ class Device(_Device): drives = self.windows_sort_drives(drives) self._main_prefix = drives.get('main') - self._card_prefix = drives.get('card', None) + self._card_a_prefix = drives.get('carda', None) + self._card_b_prefix = drives.get('cardb', None) @classmethod def run_ioreg(cls, raw=None): @@ -237,9 +261,11 @@ class Device(_Device): for i, line in enumerate(lines): if self.OSX_MAIN_MEM is not None and line.strip().endswith('<class IOMedia>') and self.OSX_MAIN_MEM in line: get_dev_node(lines[i+1:], 'main') - if self.OSX_CARD_MEM is not None and line.strip().endswith('<class IOMedia>') and self.OSX_CARD_MEM in line: - get_dev_node(lines[i+1:], 'card') - if len(names.keys()) == 2: + if self.OSX_CARD_A_MEM is not None and line.strip().endswith('<class IOMedia>') and self.OSX_CARD_A_MEM in line: + get_dev_node(lines[i+1:], 'carda') + if self.OSX_CARD_B_MEM is not None and line.strip().endswith('<class IOMedia>') and self.OSX_CARD_B_MEM in line: + get_dev_node(lines[i+1:], 'cardb') + if len(names.keys()) == 3: break return names @@ -251,10 +277,18 @@ class Device(_Device): raise DeviceError(_('Unable to detect the %s disk drive. 
Try rebooting.')%self.__class__.__name__) main_pat = dev_pat % names['main'] self._main_prefix = re.search(main_pat, mount).group(2) + os.sep - card_pat = names['card'] if 'card' in names.keys() else None - if card_pat is not None: - card_pat = dev_pat % card_pat - self._card_prefix = re.search(card_pat, mount).group(2) + os.sep + card_a_pat = names['carda'] if 'carda' in names.keys() else None + card_b_pat = names['cardb'] if 'cardb' in names.keys() else None + + def get_card_prefix(pat): + if pat is not None: + pat = dev_pat % pat + return re.search(pat, mount).group(2) + os.sep + else: + return None + + self._card_a_prefix = get_card_prefix(card_a_pat) + self._card_b_prefix = get_card_prefix(card_b_pat) def open_linux(self): import dbus @@ -287,21 +321,24 @@ class Device(_Device): if not self._main_prefix: raise DeviceError('Could not open device for reading. Try a reboot.') - self._card_prefix = None + self._card_a_prefix = self._card_b_prefix = None cards = hm.FindDeviceStringMatch(__appname__+'.cardvolume', self.__class__.__name__) - for dev in cards: + def mount_card(dev): try: - self._card_prefix = conditional_mount(dev)+os.sep - break + return conditional_mount(dev)+os.sep except: import traceback print traceback - continue + + if len(cards) >= 1: + self._card_a_prefix = mount_card(cards[0]) + if len(cards) >=2: + self._card_b_prefix = mount_card(cards[1]) def open(self): time.sleep(5) - self._main_prefix = self._card_prefix = None + self._main_prefix = self._card_a_prefix = self._card_b_prefix = None if islinux: try: self.open_linux() diff --git a/src/calibre/devices/usbms/deviceconfig.py b/src/calibre/devices/usbms/deviceconfig.py new file mode 100644 index 0000000000..bbe3a13646 --- /dev/null +++ b/src/calibre/devices/usbms/deviceconfig.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +from calibre.utils.config import Config, 
ConfigProxy + +class DeviceConfig(object): + + HELP_MESSAGE = _('Ordered list of formats the device will accept') + + @classmethod + def _config(cls): + klass = cls if isinstance(cls, type) else cls.__class__ + c = Config('device_drivers_%s' % klass.__name__, _('settings for device drivers')) + c.add_opt('format_map', default=cls.FORMATS, help=cls.HELP_MESSAGE) + return c + + @classmethod + def _configProxy(cls): + return ConfigProxy(cls._config()) + + @classmethod + def config_widget(cls): + from calibre.gui2.device_drivers.configwidget import ConfigWidget + cw = ConfigWidget(cls.settings(), cls.FORMATS) + return cw + + @classmethod + def save_settings(cls, config_widget): + cls._configProxy()['format_map'] = config_widget.format_map() + + @classmethod + def settings(cls): + return cls._config().parse() + + def customization_help(cls, gui=False): + return cls.HELP_MESSAGE + diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index fdb553f15b..700a072c5b 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -10,71 +10,89 @@ for a particular device. 
import os, fnmatch, shutil from itertools import cycle -from calibre.ebooks.metadata.meta import metadata_from_formats, path_to_ext from calibre.ebooks.metadata import authors_to_string +from calibre.devices.usbms.cli import CLI from calibre.devices.usbms.device import Device from calibre.devices.usbms.books import BookList, Book -from calibre.devices.errors import FreeSpaceError, PathError +from calibre.devices.errors import DeviceError, FreeSpaceError from calibre.devices.mime import mime_type_ext -class File(object): - def __init__(self, path): - stats = os.stat(path) - self.is_dir = os.path.isdir(path) - self.is_readonly = not os.access(path, os.W_OK) - self.ctime = stats.st_ctime - self.wtime = stats.st_mtime - self.size = stats.st_size - if path.endswith(os.sep): - path = path[:-1] - self.path = path - self.name = os.path.basename(path) +# CLI must come before Device as it implments the CLI functions that +# are inherited from the device interface in Device. +class USBMS(CLI, Device): + + name = 'USBMS Base Device Interface' + description = _('Communicate with an eBook reader.') + author = _('John Schember') + supported_platforms = ['windows', 'osx', 'linux'] -class USBMS(Device): FORMATS = [] EBOOK_DIR_MAIN = '' - EBOOK_DIR_CARD = '' + EBOOK_DIR_CARD_A = '' + EBOOK_DIR_CARD_B = '' SUPPORTS_SUB_DIRS = False CAN_SET_METADATA = False - def __init__(self, key='-1', log_packets=False, report_progress=None): - Device.__init__(self, key=key, log_packets=log_packets, + def reset(self, key='-1', log_packets=False, report_progress=None): + Device.reset(self, key=key, log_packets=log_packets, report_progress=report_progress) def get_device_information(self, end_session=True): + self.report_progress(1.0, _('Get device information...')) return (self.__class__.__name__, '', '', '') - def books(self, oncard=False, end_session=True): + def books(self, oncard=None, end_session=True): + from calibre.ebooks.metadata.meta import path_to_ext bl = BookList() - if oncard and 
self._card_prefix is None: + if oncard == 'carda' and not self._card_a_prefix: + self.report_progress(1.0, _('Getting list of books on device...')) + return bl + elif oncard == 'cardb' and not self._card_b_prefix: + self.report_progress(1.0, _('Getting list of books on device...')) + return bl + elif oncard and oncard != 'carda' and oncard != 'cardb': + self.report_progress(1.0, _('Getting list of books on device...')) return bl - prefix = self._card_prefix if oncard else self._main_prefix - ebook_dir = self.EBOOK_DIR_CARD if oncard else self.EBOOK_DIR_MAIN + prefix = self._card_a_prefix if oncard == 'carda' else self._card_b_prefix if oncard == 'cardb' else self._main_prefix + ebook_dir = self.EBOOK_DIR_CARD_A if oncard == 'carda' else self.EBOOK_DIR_CARD_B if oncard == 'cardb' else self.EBOOK_DIR_MAIN # Get all books in the ebook_dir directory if self.SUPPORTS_SUB_DIRS: for path, dirs, files in os.walk(os.path.join(prefix, ebook_dir)): # Filter out anything that isn't in the list of supported ebook types for book_type in self.FORMATS: - for filename in fnmatch.filter(files, '*.%s' % (book_type)): + match = fnmatch.filter(files, '*.%s' % (book_type)) + for i, filename in enumerate(match): + self.report_progress((i+1) / float(len(match)), _('Getting list of books on device...')) bl.append(self.__class__.book_from_path(os.path.join(path, filename))) else: path = os.path.join(prefix, ebook_dir) - for filename in os.listdir(path): + paths = os.listdir(path) + for i, filename in enumerate(paths): + self.report_progress((i+1) / float(len(paths)), _('Getting list of books on device...')) if path_to_ext(filename) in self.FORMATS: bl.append(self.__class__.book_from_path(os.path.join(path, filename))) + + self.report_progress(1.0, _('Getting list of books on device...')) + return bl def _sanity_check(self, on_card, files): - if on_card and not self._card_prefix: - raise ValueError(_('The reader has no storage card connected.')) + if on_card == 'carda' and not 
self._card_a_prefix: + raise ValueError(_('The reader has no storage card in this slot.')) + elif on_card == 'cardb' and not self._card_b_prefix: + raise ValueError(_('The reader has no storage card in this slot.')) + elif on_card and on_card not in ('carda', 'cardb'): + raise DeviceError(_('The reader has no storage card in this slot.')) - if not on_card: - path = os.path.join(self._main_prefix, self.EBOOK_DIR_MAIN) + if on_card == 'carda': + path = os.path.join(self._card_a_prefix, self.EBOOK_DIR_CARD_A) + elif on_card == 'cardb': + path = os.path.join(self._card_b_prefix, self.EBOOK_DIR_CARD_B) else: - path = os.path.join(self._card_prefix, self.EBOOK_DIR_CARD) + path = os.path.join(self._main_prefix, self.EBOOK_DIR_MAIN) def get_size(obj): if hasattr(obj, 'seek'): @@ -87,13 +105,15 @@ class USBMS(Device): sizes = [get_size(f) for f in files] size = sum(sizes) - if on_card and size > self.free_space()[2] - 1024*1024: - raise FreeSpaceError(_("There is insufficient free space on the storage card")) if not on_card and size > self.free_space()[0] - 2*1024*1024: raise FreeSpaceError(_("There is insufficient free space in main memory")) + if on_card == 'carda' and size > self.free_space()[1] - 1024*1024: + raise FreeSpaceError(_("There is insufficient free space on the storage card")) + if on_card == 'cardb' and size > self.free_space()[2] - 1024*1024: + raise FreeSpaceError(_("There is insufficient free space on the storage card")) return path - def upload_books(self, files, names, on_card=False, end_session=True, + def upload_books(self, files, names, on_card=None, end_session=True, metadata=None): path = self._sanity_check(on_card, files) @@ -102,7 +122,7 @@ class USBMS(Device): names = iter(names) metadata = iter(metadata) - for infile in files: + for i, infile in enumerate(files): newpath = path if self.SUPPORTS_SUB_DIRS: @@ -110,11 +130,21 @@ class USBMS(Device): if 'tags' in mdata.keys(): for tag in mdata['tags']: - if tag.startswith('/'): + if 
tag.startswith(_('News')): + newpath = os.path.join(newpath, 'news') + newpath = os.path.join(newpath, mdata.get('title', '')) + newpath = os.path.join(newpath, mdata.get('timestamp', '')) + break + elif tag.startswith('/'): newpath += tag newpath = os.path.normpath(newpath) break + if newpath == path: + newpath = os.path.join(newpath, + mdata.get('authors', _('Unknown')), + mdata.get('title', _('Unknown'))) + if not os.path.exists(newpath): os.makedirs(newpath) @@ -132,22 +162,28 @@ class USBMS(Device): else: shutil.copy2(infile, filepath) + self.report_progress((i+1) / float(len(files)), _('Transferring books to device...')) + + self.report_progress(1.0, _('Transferring books to device...')) + return zip(paths, cycle([on_card])) - @classmethod - def add_books_to_metadata(cls, locations, metadata, booklists): - for location in locations: + def add_books_to_metadata(self, locations, metadata, booklists): + for i, location in enumerate(locations): + self.report_progress((i+1) / float(len(locations)), _('Adding books to device metadata listing...')) path = location[0] - on_card = 1 if location[1] else 0 + blist = 2 if location[1] == 'cardb' else 1 if location[1] == 'carda' else 0 - book = cls.book_from_path(path) + book = self.book_from_path(path) - if not book in booklists[on_card]: - booklists[on_card].append(book) + if not book in booklists[blist]: + booklists[blist].append(book) + self.report_progress(1.0, _('Adding books to device metadata listing...')) def delete_books(self, paths, end_session=True): - for path in paths: + for i, path in enumerate(paths): + self.report_progress((i+1) / float(len(paths)), _('Removing books from device...')) if os.path.exists(path): # Delete the ebook os.unlink(path) @@ -156,79 +192,31 @@ class USBMS(Device): os.removedirs(os.path.dirname(path)) except: pass + self.report_progress(1.0, _('Removing books from device...')) - @classmethod - def remove_books_from_metadata(cls, paths, booklists): - for path in paths: + def 
remove_books_from_metadata(self, paths, booklists): + for i, path in enumerate(paths): + self.report_progress((i+1) / float(len(paths)), _('Removing books from device metadata listing...')) for bl in booklists: for book in bl: if path.endswith(book.path): bl.remove(book) + self.report_progress(1.0, _('Removing books from device metadata listing...')) def sync_booklists(self, booklists, end_session=True): # There is no meta data on the device to update. The device is treated # as a mass storage device and does not use a meta data xml file like # the Sony Readers. - pass - - def get_file(self, path, outfile, end_session=True): - path = self.munge_path(path) - with open(path, 'rb') as src: - shutil.copyfileobj(src, outfile, 10*1024*1024) - - def put_file(self, infile, path, replace_file=False, end_session=True): - path = self.munge_path(path) - if os.path.isdir(path): - path = os.path.join(path, infile.name) - if not replace_file and os.path.exists(path): - raise PathError('File already exists: ' + path) - dest = open(path, 'wb') - shutil.copyfileobj(infile, dest, 10*1024*1024) - dest.flush() - dest.close() - - def munge_path(self, path): - if path.startswith('/') and not (path.startswith(self._main_prefix) or \ - (self._card_prefix and path.startswith(self._card_prefix))): - path = self._main_prefix + path[1:] - elif path.startswith('card:'): - path = path.replace('card:', self._card_prefix[:-1]) - return path - - def list(self, path, recurse=False, end_session=True, munge=True): - if munge: - path = self.munge_path(path) - if os.path.isfile(path): - return [(os.path.dirname(path), [File(path)])] - entries = [File(os.path.join(path, f)) for f in os.listdir(path)] - dirs = [(path, entries)] - for _file in entries: - if recurse and _file.is_dir: - dirs[len(dirs):] = self.list(_file.path, recurse=True, munge=False) - return dirs - - def mkdir(self, path, end_session=True): - if self.SUPPORTS_SUB_DIRS: - path = self.munge_path(path) - os.mkdir(path) - - def rm(self, 
path, end_session=True): - path = self.munge_path(path) - self.delete_books([path]) - - def touch(self, path, end_session=True): - path = self.munge_path(path) - if not os.path.exists(path): - open(path, 'w').close() - if not os.path.isdir(path): - os.utime(path, None) + self.report_progress(1.0, _('Sending metadata to device...')) @classmethod def metadata_from_path(cls, path): + from calibre.ebooks.metadata.meta import metadata_from_formats return metadata_from_formats([path]) @classmethod def book_from_path(cls, path): + from calibre.ebooks.metadata.meta import path_to_ext fileext = path_to_ext(path) mi = cls.metadata_from_path(path) mime = mime_type_ext(fileext) diff --git a/src/calibre/ebooks/__init__.py b/src/calibre/ebooks/__init__.py index 26d2394818..416fe61789 100644 --- a/src/calibre/ebooks/__init__.py +++ b/src/calibre/ebooks/__init__.py @@ -60,6 +60,8 @@ class HTMLRenderer(object): def render_html(path_to_html, width=590, height=750): from PyQt4.QtWebKit import QWebPage from PyQt4.Qt import QEventLoop, QPalette, Qt, SIGNAL, QUrl, QSize + from calibre.gui2 import is_ok_to_use_qt + if not is_ok_to_use_qt(): return None path_to_html = os.path.abspath(path_to_html) with CurrentDir(os.path.dirname(path_to_html)): page = QWebPage() diff --git a/src/calibre/ebooks/lrf/comic/__init__.py b/src/calibre/ebooks/comic/__init__.py similarity index 100% rename from src/calibre/ebooks/lrf/comic/__init__.py rename to src/calibre/ebooks/comic/__init__.py diff --git a/src/calibre/ebooks/comic/input.py b/src/calibre/ebooks/comic/input.py new file mode 100755 index 0000000000..bf2aac1162 --- /dev/null +++ b/src/calibre/ebooks/comic/input.py @@ -0,0 +1,473 @@ +from __future__ import with_statement +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Based on ideas from comiclrf created by FangornUK. 
+''' + +import os, shutil, traceback, textwrap, time +from Queue import Empty + +from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre import extract, CurrentDir, prints +from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.utils.ipc.server import Server +from calibre.utils.ipc.job import ParallelJob + +def extract_comic(path_to_comic_file): + ''' + Un-archive the comic file. + ''' + tdir = PersistentTemporaryDirectory(suffix='_comic_extract') + extract(path_to_comic_file, tdir) + return tdir + +def find_pages(dir, sort_on_mtime=False, verbose=False): + ''' + Find valid comic pages in a previously un-archived comic. + + :param dir: Directory in which extracted comic lives + :param sort_on_mtime: If True sort pages based on their last modified time. + Otherwise, sort alphabetically. + ''' + extensions = ['jpeg', 'jpg', 'gif', 'png'] + pages = [] + for datum in os.walk(dir): + for name in datum[-1]: + path = os.path.join(datum[0], name) + if '__MACOSX' in path: continue + for ext in extensions: + if path.lower().endswith('.'+ext): + pages.append(path) + break + if sort_on_mtime: + comparator = lambda x, y : cmp(os.stat(x).st_mtime, os.stat(y).st_mtime) + else: + comparator = lambda x, y : cmp(os.path.basename(x), os.path.basename(y)) + + pages.sort(cmp=comparator) + if verbose: + prints('Found comic pages...') + prints('\t'+'\n\t'.join([os.path.basename(p) for p in pages])) + return pages + +class PageProcessor(list): + ''' + Contains the actual image rendering logic. See :method:`render` and + :method:`process_pages`. 
+ ''' + + def __init__(self, path_to_page, dest, opts, num): + list.__init__(self) + self.path_to_page = path_to_page + self.opts = opts + self.num = num + self.dest = dest + self.rotate = False + self.render() + + + def render(self): + import calibre.utils.PythonMagickWand as pw + img = pw.NewMagickWand() + if img < 0: + raise RuntimeError('Cannot create wand.') + if not pw.MagickReadImage(img, self.path_to_page): + raise IOError('Failed to read image from: %'%self.path_to_page) + width = pw.MagickGetImageWidth(img) + height = pw.MagickGetImageHeight(img) + if self.num == 0: # First image so create a thumbnail from it + thumb = pw.CloneMagickWand(img) + if thumb < 0: + raise RuntimeError('Cannot create wand.') + pw.MagickThumbnailImage(thumb, 60, 80) + pw.MagickWriteImage(thumb, os.path.join(self.dest, 'thumbnail.png')) + pw.DestroyMagickWand(thumb) + self.pages = [img] + if width > height: + if self.opts.landscape: + self.rotate = True + else: + split1, split2 = map(pw.CloneMagickWand, (img, img)) + pw.DestroyMagickWand(img) + if split1 < 0 or split2 < 0: + raise RuntimeError('Cannot create wand.') + pw.MagickCropImage(split1, (width/2)-1, height, 0, 0) + pw.MagickCropImage(split2, (width/2)-1, height, width/2, 0 ) + self.pages = [split2, split1] if self.opts.right2left else [split1, split2] + self.process_pages() + + def process_pages(self): + import calibre.utils.PythonMagickWand as p + for i, wand in enumerate(self.pages): + pw = p.NewPixelWand() + try: + if pw < 0: + raise RuntimeError('Cannot create wand.') + p.PixelSetColor(pw, 'white') + + p.MagickSetImageBorderColor(wand, pw) + if self.rotate: + p.MagickRotateImage(wand, pw, -90) + + # 25 percent fuzzy trim? 
+ if not self.opts.disable_trim: + p.MagickTrimImage(wand, 25*65535/100) + p.MagickSetImagePage(wand, 0,0,0,0) #Clear page after trim, like a "+repage" + # Do the Photoshop "Auto Levels" equivalent + if not self.opts.dont_normalize: + p.MagickNormalizeImage(wand) + sizex = p.MagickGetImageWidth(wand) + sizey = p.MagickGetImageHeight(wand) + + SCRWIDTH, SCRHEIGHT = self.opts.output_profile.comic_screen_size + + if self.opts.keep_aspect_ratio: + # Preserve the aspect ratio by adding border + aspect = float(sizex) / float(sizey) + if aspect <= (float(SCRWIDTH) / float(SCRHEIGHT)): + newsizey = SCRHEIGHT + newsizex = int(newsizey * aspect) + deltax = (SCRWIDTH - newsizex) / 2 + deltay = 0 + else: + newsizex = SCRWIDTH + newsizey = int(newsizex / aspect) + deltax = 0 + deltay = (SCRHEIGHT - newsizey) / 2 + p.MagickResizeImage(wand, newsizex, newsizey, p.CatromFilter, 1.0) + p.MagickSetImageBorderColor(wand, pw) + p.MagickBorderImage(wand, pw, deltax, deltay) + elif self.opts.wide: + # Keep aspect and Use device height as scaled image width so landscape mode is clean + aspect = float(sizex) / float(sizey) + screen_aspect = float(SCRWIDTH) / float(SCRHEIGHT) + # Get dimensions of the landscape mode screen + # Add 25px back to height for the battery bar. 
+ wscreenx = SCRHEIGHT + 25 + wscreeny = int(wscreenx / screen_aspect) + if aspect <= screen_aspect: + newsizey = wscreeny + newsizex = int(newsizey * aspect) + deltax = (wscreenx - newsizex) / 2 + deltay = 0 + else: + newsizex = wscreenx + newsizey = int(newsizex / aspect) + deltax = 0 + deltay = (wscreeny - newsizey) / 2 + p.MagickResizeImage(wand, newsizex, newsizey, p.CatromFilter, 1.0) + p.MagickSetImageBorderColor(wand, pw) + p.MagickBorderImage(wand, pw, deltax, deltay) + else: + p.MagickResizeImage(wand, SCRWIDTH, SCRHEIGHT, p.CatromFilter, 1.0) + + if not self.opts.dont_sharpen: + p.MagickSharpenImage(wand, 0.0, 1.0) + + p.MagickSetImageType(wand, p.GrayscaleType) + + if self.opts.despeckle: + p.MagickDespeckleImage(wand) + + p.MagickQuantizeImage(wand, self.opts.colors, p.RGBColorspace, 0, 1, 0) + dest = '%d_%d.png'%(self.num, i) + dest = os.path.join(self.dest, dest) + p.MagickWriteImage(wand, dest+'8') + os.rename(dest+'8', dest) + self.append(dest) + finally: + if pw > 0: + p.DestroyPixelWand(pw) + p.DestroyMagickWand(wand) + +def render_pages(tasks, dest, opts, notification=lambda x, y: x): + ''' + Entry point for the job server. + ''' + failures, pages = [], [] + from calibre.utils.PythonMagickWand import ImageMagick + with ImageMagick(): + for num, path in tasks: + try: + pages.extend(PageProcessor(path, dest, opts, num)) + msg = _('Rendered %s')%path + except: + failures.append(path) + msg = _('Failed %s')%path + if opts.verbose: + msg += '\n' + traceback.format_exc() + prints(msg) + notification(0.5, msg) + + return pages, failures + + +class Progress(object): + + def __init__(self, total, update): + self.total = total + self.update = update + self.done = 0 + + def __call__(self, percent, msg=''): + self.done += 1 + #msg = msg%os.path.basename(job.args[0]) + self.update(float(self.done)/self.total, msg) + +def process_pages(pages, opts, update, tdir): + ''' + Render all identified comic pages. 
+ ''' + from calibre.utils.PythonMagickWand import ImageMagick + ImageMagick + + progress = Progress(len(pages), update) + server = Server() + jobs = [] + tasks = [(p, os.path.join(tdir, os.path.basename(p))) for p in pages] + tasks = server.split(pages) + for task in tasks: + jobs.append(ParallelJob('render_pages', '', progress, + args=[task, tdir, opts])) + server.add_job(jobs[-1]) + while True: + time.sleep(1) + running = False + for job in jobs: + while True: + try: + x = job.notifications.get_nowait() + progress(*x) + except Empty: + break + job.update() + if not job.is_finished: + running = True + if not running: + break + server.close() + ans, failures = [], [] + + for job in jobs: + if job.failed: + raw_input() + raise Exception(_('Failed to process comic: \n\n%s')% + job.log_file.read()) + pages, failures_ = job.result + ans += pages + failures += failures_ + return ans, failures + + +class ComicInput(InputFormatPlugin): + + name = 'Comic Input' + author = 'Kovid Goyal' + description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices' + file_types = set(['cbz', 'cbr', 'cbc']) + is_image_collection = True + + options = set([ + OptionRecommendation(name='colors', recommended_value=64, + help=_('Number of colors for grayscale image conversion. Default: %default')), + OptionRecommendation(name='dont_normalize', recommended_value=False, + help=_('Disable normalize (improve contrast) color range ' + 'for pictures. Default: False')), + OptionRecommendation(name='keep_aspect_ratio', recommended_value=False, + help=_('Maintain picture aspect ratio. Default is to fill the screen.')), + OptionRecommendation(name='dont_sharpen', recommended_value=False, + help=_('Disable sharpening.')), + OptionRecommendation(name='disable_trim', recommended_value=False, + help=_('Disable trimming of comic pages. 
For some comics, ' + 'trimming might remove content as well as borders.')), + OptionRecommendation(name='landspace', recommended_value=False, + help=_("Don't split landscape images into two portrait images")), + OptionRecommendation(name='wide', recommended_value=False, + help=_("Keep aspect ratio and scale image using screen height as " + "image width for viewing in landscape mode.")), + OptionRecommendation(name='right2left', recommended_value=False, + help=_('Used for right-to-left publications like manga. ' + 'Causes landscape pages to be split into portrait pages ' + 'from right to left.')), + OptionRecommendation(name='despeckle', recommended_value=False, + help=_('Enable Despeckle. Reduces speckle noise. ' + 'May greatly increase processing time.')), + OptionRecommendation(name='no_sort', recommended_value=False, + help=_("Don't sort the files found in the comic " + "alphabetically by name. Instead use the order they were " + "added to the comic.")), + OptionRecommendation(name='no_process', recommended_value=False, + help=_("Apply no processing to the image")), + ]) + + recommendations = set([ + ('margin_left', 0, OptionRecommendation.HIGH), + ('margin_top', 0, OptionRecommendation.HIGH), + ('margin_right', 0, OptionRecommendation.HIGH), + ('margin_bottom', 0, OptionRecommendation.HIGH), + ('insert_blank_line', False, OptionRecommendation.HIGH), + ('remove_paragraph_spacing', False, OptionRecommendation.HIGH), + ('dont_justify', True, OptionRecommendation.HIGH), + ('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH), + ('chapter', None, OptionRecommendation.HIGH), + ('page_breaks_brefore', None, OptionRecommendation.HIGH), + ('use_auto_toc', False, OptionRecommendation.HIGH), + ('page_breaks_before', None, OptionRecommendation.HIGH), + ('disable_font_rescaling', True, OptionRecommendation.HIGH), + ('linearize_tables', False, OptionRecommendation.HIGH), + ]) + + def get_comics_from_collection(self, stream): + from calibre.libunzip import extract as 
zipextract + tdir = PersistentTemporaryDirectory('_comic_collection') + zipextract(stream, tdir) + comics = [] + with CurrentDir(tdir): + if not os.path.exists('comics.txt'): + raise ValueError('%s is not a valid comic collection' + %stream.name) + for line in open('comics.txt', + 'rb').read().decode('utf-8').splitlines(): + fname, title = line.partition(':')[0], line.partition(':')[-1] + fname = os.path.join(tdir, *fname.split('/')) + if not title: + title = os.path.basename(fname).rpartition('.')[0] + if os.access(fname, os.R_OK): + comics.append([title, fname]) + if not comics: + raise ValueError('%s has no comics'%stream.name) + return comics + + def get_pages(self, comic, tdir2): + tdir = extract_comic(comic) + new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort, + verbose=self.opts.verbose) + thumbnail = None + if not new_pages: + raise ValueError('Could not find any pages in the comic: %s' + %comic) + if self.opts.no_process: + n2 = [] + for page in new_pages: + n2.append(os.path.join(tdir2, os.path.basename(page))) + shutil.copyfile(page, n2[-1]) + new_pages = n2 + else: + new_pages, failures = process_pages(new_pages, self.opts, + self.report_progress, tdir2) + if not new_pages: + raise ValueError('Could not find any valid pages in comic: %s' + % comic) + if failures: + self.log.warning('Could not process the following pages ' + '(run with --verbose to see why):') + for f in failures: + self.log.warning('\t', f) + thumbnail = os.path.join(tdir2, 'thumbnail.png') + if not os.access(thumbnail, os.R_OK): + thumbnail = None + return new_pages + + def get_images(self): + return self._images + + def convert(self, stream, opts, file_ext, log, accelerators): + from calibre.ebooks.metadata import MetaInformation + from calibre.ebooks.metadata.opf2 import OPFCreator + from calibre.ebooks.metadata.toc import TOC + + self.opts, self.log= opts, log + if file_ext == 'cbc': + comics_ = self.get_comics_from_collection(stream) + else: + comics_ = [['Comic', 
os.path.abspath(stream.name)]] + stream.close() + comics = [] + for i, x in enumerate(comics_): + title, fname = x + cdir = 'comic_%d'%(i+1) if len(comics_) > 1 else '.' + cdir = os.path.abspath(cdir) + if not os.path.exists(cdir): + os.makedirs(cdir) + pages = self.get_pages(fname, cdir) + if not pages: continue + wrappers = self.create_wrappers(pages) + comics.append((title, pages, wrappers)) + + if not comics: + raise ValueError('No comic pages found in %s'%stream.name) + + mi = MetaInformation(os.path.basename(stream.name).rpartition('.')[0], + [_('Unknown')]) + opf = OPFCreator(os.path.abspath('.'), mi) + entries = [] + + def href(x): + if len(comics) == 1: return os.path.basename(x) + return '/'.join(x.split(os.sep)[-2:]) + + for comic in comics: + pages, wrappers = comic[1:] + entries += [(w, None) for w in map(href, wrappers)] + \ + [(x, None) for x in map(href, pages)] + opf.create_manifest(entries) + spine = [] + for comic in comics: + spine.extend(map(href, comic[2])) + self._images = [] + for comic in comics: + self._images.extend(comic[1]) + opf.create_spine(spine) + toc = TOC() + if len(comics) == 1: + wrappers = comics[0][2] + for i, x in enumerate(wrappers): + toc.add_item(href(x), None, _('Page')+' %d'%(i+1), + play_order=i) + else: + po = 0 + for comic in comics: + po += 1 + wrappers = comic[2] + stoc = toc.add_item(href(wrappers[0]), + None, comic[0], play_order=po) + for i, x in enumerate(wrappers): + stoc.add_item(href(x), None, + _('Page')+' %d'%(i+1), play_order=po) + po += 1 + opf.set_toc(toc) + m, n = open('metadata.opf', 'wb'), open('toc.ncx', 'wb') + opf.render(m, n, 'toc.ncx') + return os.path.abspath('metadata.opf') + + def create_wrappers(self, pages): + from calibre.ebooks.oeb.base import XHTML_NS + wrappers = [] + WRAPPER = textwrap.dedent('''\ + <html xmlns="%s"> + <head> + <title>Page #%d + + + +
+ comic page #%d +
+ + + ''') + dir = os.path.dirname(pages[0]) + for i, page in enumerate(pages): + wrapper = WRAPPER%(XHTML_NS, i+1, os.path.basename(page), i+1) + page = os.path.join(dir, 'page_%d.xhtml'%(i+1)) + open(page, 'wb').write(wrapper) + wrappers.append(page) + return wrappers + diff --git a/src/calibre/ebooks/compression/__init__.py b/src/calibre/ebooks/compression/__init__.py new file mode 100644 index 0000000000..9e2aad729c --- /dev/null +++ b/src/calibre/ebooks/compression/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' diff --git a/src/calibre/ebooks/compression/palmdoc.c b/src/calibre/ebooks/compression/palmdoc.c new file mode 100644 index 0000000000..29e9579140 --- /dev/null +++ b/src/calibre/ebooks/compression/palmdoc.c @@ -0,0 +1,204 @@ +/* +:mod:`cPalmdoc` -- Palmdoc compression/decompression +===================================================== + +.. module:: cPalmdoc + :platform: All + :synopsis: Compression decompression of Palmdoc implemented in C for speed + +.. moduleauthor:: Kovid Goyal Copyright 2009 + +*/ + +#define PY_SSIZE_T_CLEAN +#include +#include + +#define DELTA sizeof(Byte)*4096 + +#define BUFFER 6000 + +#define MIN(x, y) ( ((x) < (y)) ? (x) : (y) ) + +typedef unsigned short int Byte; +typedef struct { + Byte *data; + Py_ssize_t len; +} buffer; + +#ifdef bool +#undef bool +#endif +#define bool int + +#ifdef false +#undef false +#endif +#define false 0 + +#ifdef true +#undef true +#endif +#define true 1 + +#define CHAR(x) (( (x) > 127 ) ? 
(x)-256 : (x)) + +static PyObject * +cpalmdoc_decompress(PyObject *self, PyObject *args) { + const char *_input = NULL; Py_ssize_t input_len = 0; + Py_ssize_t i = 0, o = 0, j = 0, di, n; + if (!PyArg_ParseTuple(args, "t#", &_input, &input_len)) + return NULL; + Byte *input = (Byte *)PyMem_Malloc(sizeof(Byte)*input_len); + if (input == NULL) return PyErr_NoMemory(); + // Map chars to bytes + for (j = 0; j < input_len; j++) + input[j] = (_input[j] < 0) ? _input[j]+256 : _input[j]; + char *output = (char *)PyMem_Malloc(sizeof(char)*BUFFER); + Byte c; + PyObject *ans; + if (output == NULL) return PyErr_NoMemory(); + + while (i < input_len) { + c = input[i++]; + if (c >= 1 && c <= 8) // copy 'c' bytes + while (c--) output[o++] = input[i++]; + + else if (c <= 0x7F) // 0, 09-7F = self + output[o++] = c; + + else if (c >= 0xC0) { // space + ASCII char + output[o++] = ' '; + output[o++] = c ^ 0x80; + } + else { // 80-BF repeat sequences + c = (c << 8) + input[i++]; + di = (c & 0x3FFF) >> 3; + for ( n = (c & 7) + 3; n--; ++o ) + output[o] = output[o - di]; + } + } + ans = Py_BuildValue("s#", output, o); + if (output != NULL) PyMem_Free(output); + if (input != NULL) PyMem_Free(input); + return ans; +} + +static bool +cpalmdoc_memcmp( Byte *a, Byte *b, Py_ssize_t len) { + Py_ssize_t i; + for (i = 0; i < len; i++) if (a[i] != b[i]) return false; + return true; +} + +static Py_ssize_t +cpalmdoc_rfind(Byte *data, Py_ssize_t pos, Py_ssize_t chunk_length) { + Py_ssize_t i; + for (i = pos - chunk_length; i > -1; i--) + if (cpalmdoc_memcmp(data+i, data+pos, chunk_length)) return i; + return pos; +} + + +static Py_ssize_t +cpalmdoc_do_compress(buffer *b, char *output) { + Py_ssize_t i = 0, j, chunk_len, dist; + unsigned compound; + Byte c, n; + bool found; + char *head; + head = output; + buffer temp; + temp.data = (Byte *)PyMem_Malloc(sizeof(Byte)*8); temp.len = 0; + if (temp.data == NULL) return 0; + while (i < b->len) { + c = b->data[i]; + //do repeats + if ( i > 10 && (b->len - i) 
> 10) { + found = false; + for (chunk_len = 10; chunk_len > 2; chunk_len--) { + j = cpalmdoc_rfind(b->data, i, chunk_len); + dist = i - j; + if (j < i && dist <= 2047) { + found = true; + compound = (dist << 3) + chunk_len-3; + *(output++) = CHAR(0x80 + (compound >> 8 )); + *(output++) = CHAR(compound & 0xFF); + i += chunk_len; + break; + } + } + if (found) continue; + } + + //write single character + i++; + if (c == 32 && i < b->len) { + n = b->data[i]; + if ( n >= 0x40 && n <= 0x7F) { + *(output++) = CHAR(n^0x80); i++; continue; + } + } + if (c == 0 || (c > 8 && c < 0x80)) + *(output++) = CHAR(c); + else { // Write binary data + j = i; + temp.data[0] = c; temp.len = 1; + while (j < b->len && temp.len < 8) { + c = b->data[j]; + if (c == 0 || (c > 8 && c < 0x80)) break; + temp.data[temp.len++] = c; j++; + } + i += temp.len - 1; + *(output++) = temp.len; + for (j=0; j < temp.len; j++) *(output++) = temp.data[j]; + } + } + return output - head; +} + +static PyObject * +cpalmdoc_compress(PyObject *self, PyObject *args) { + const char *_input = NULL; Py_ssize_t input_len = 0; + Py_ssize_t j = 0; + buffer b; + if (!PyArg_ParseTuple(args, "t#", &_input, &input_len)) + return NULL; + b.data = (Byte *)PyMem_Malloc(sizeof(Byte)*input_len); + if (b.data == NULL) return PyErr_NoMemory(); + // Map chars to bytes + for (j = 0; j < input_len; j++) + b.data[j] = (_input[j] < 0) ? _input[j]+256 : _input[j]; + b.len = input_len; + char *output = (char *)PyMem_Malloc(sizeof(char) * b.len); + if (output == NULL) return PyErr_NoMemory(); + j = cpalmdoc_do_compress(&b, output); + if ( j == 0) return PyErr_NoMemory(); + PyObject *ans = Py_BuildValue("s#", output, j); + PyMem_Free(output); + PyMem_Free(b.data); + return ans; +} + +static PyMethodDef cPalmdocMethods[] = { + {"decompress", cpalmdoc_decompress, METH_VARARGS, + "decompress(bytestring) -> decompressed bytestring\n\n" + "Decompress a palmdoc compressed byte string. 
" + }, + + {"compress", cpalmdoc_compress, METH_VARARGS, + "compress(bytestring) -> compressed bytestring\n\n" + "Palmdoc compress a byte string. " + }, + {NULL, NULL, 0, NULL} +}; + +PyMODINIT_FUNC +initcPalmdoc(void) { + PyObject *m; + m = Py_InitModule3("cPalmdoc", cPalmdocMethods, + "Compress and decompress palmdoc strings." + ); + if (m == NULL) return; +} + diff --git a/src/calibre/ebooks/mobi/palmdoc.py b/src/calibre/ebooks/compression/palmdoc.py similarity index 59% rename from src/calibre/ebooks/mobi/palmdoc.py rename to src/calibre/ebooks/compression/palmdoc.py index ad65967d13..90dabcb5a8 100644 --- a/src/calibre/ebooks/mobi/palmdoc.py +++ b/src/calibre/ebooks/compression/palmdoc.py @@ -2,41 +2,46 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai __license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' \ - 'and Marshall T. Vandegrift ' +__copyright__ = '2008, Kovid Goyal ' from cStringIO import StringIO from struct import pack -COUNT_BITS = 3 +from calibre.constants import plugins +cPalmdoc = plugins['cPalmdoc'][0] +if not cPalmdoc: + raise RuntimeError(('Failed to load required cPalmdoc module: ' + '%s')%plugins['cPalmdoc'][1]) def decompress_doc(data): - buffer = [ord(i) for i in data] - res = [] - i = 0 - while i < len(buffer): - c = buffer[i] - i += 1 - if c >= 1 and c <= 8: - res.extend(buffer[i:i+c]) - i += c - elif c <= 0x7f: - res.append(c) - elif c >= 0xc0: - res.extend( (ord(' '), c^0x80) ) - else: - c = (c << 8) + buffer[i] - i += 1 - di = (c & 0x3fff) >> COUNT_BITS - j = len(res) - num = (c & ((1 << COUNT_BITS) - 1)) + 3 - - for k in range( num ): - res.append(res[j - di+k]) - - return ''.join([chr(i) for i in res]) + return cPalmdoc.decompress(data) def compress_doc(data): + return cPalmdoc.compress(data) + +def test(): + TESTS = [ + 'abc\x03\x04\x05\x06ms', # Test binary writing + 'a b c \xfed ', # Test encoding of spaces + '0123456789axyz2bxyz2cdfgfo9iuyerh', + '0123456789asd0123456789asd|yyzzxxffhhjjkk', + ('ciewacnaq eiu743 r787q 
0w% ; sa fd\xef\ffdxosac wocjp acoiecowei ' + 'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ') + ] + for test in TESTS: + print 'Test:', repr(test) + print '\tTesting compression...' + good = py_compress_doc(test) + x = compress_doc(test) + print '\t\tgood:', repr(good) + print '\t\tx :', repr(x) + assert x == good + print '\tTesting decompression...' + print '\t\t', repr(decompress_doc(x)) + assert decompress_doc(x) == test + print + +def py_compress_doc(data): out = StringIO() i = 0 ldata = len(data) diff --git a/src/calibre/ebooks/conversion/__init__.py b/src/calibre/ebooks/conversion/__init__.py new file mode 100644 index 0000000000..384ccfb79c --- /dev/null +++ b/src/calibre/ebooks/conversion/__init__.py @@ -0,0 +1,4 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py new file mode 100644 index 0000000000..73e1a1e523 --- /dev/null +++ b/src/calibre/ebooks/conversion/cli.py @@ -0,0 +1,224 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +''' +Command line interface to conversion sub-system +''' + +USAGE = '%prog ' + _('''\ +input_file output_file [options] + +Convert an ebook from one format to another. + +input_file is the input and output_file is the output. Both must be \ +specified as the first two arguments to the command. + +The output ebook format is guessed from the file extension of \ +output_file. output_file can also be of the special format .EXT where \ +EXT is the output file extension. In this case, the name of the output \ +file is derived the name of the input file. Note that the filenames must \ +not start with a hyphen. 
Finally, if output_file has no extension, then \ +it is treated as a directory and an "open ebook" (OEB) consisting of HTML \ +files is written to that directory. These files are the files that would \ +normally have been passed to the output plugin. + +After specifying the input \ +and output file you can customize the conversion by specifying various \ +options. the available options depend on the input and output file types. \ +To get help on them specify the input and output file and then use the -h \ +option. + +For full documentation of the conversion system see +''') + 'http://calibre.kovidgoyal.net/user_manual/conversion.html' + +import sys, os +from optparse import OptionGroup, Option + +from calibre.utils.config import OptionParser +from calibre.utils.logging import Log +from calibre.constants import preferred_encoding +from calibre.customize.conversion import OptionRecommendation + +def print_help(parser, log): + help = parser.format_help().encode(preferred_encoding, 'replace') + log(help) + +def check_command_line_options(parser, args, log): + if len(args) < 3 or args[1].startswith('-') or args[2].startswith('-'): + print_help(parser, log) + log.error('\n\nYou must specify the input AND output files') + raise SystemExit(1) + + input = os.path.abspath(args[1]) + if not input.endswith('.recipe') and not os.access(input, os.R_OK): + log.error('Cannot read from', input) + raise SystemExit(1) + + output = args[2] + if output.startswith('.') and output != '.': + output = os.path.splitext(os.path.basename(input))[0]+output + output = os.path.abspath(output) + + return input, output + +def option_recommendation_to_cli_option(add_option, rec): + opt = rec.option + switches = ['-'+opt.short_switch] if opt.short_switch else [] + switches.append('--'+opt.long_switch) + attrs = dict(dest=opt.name, help=opt.help, + choices=opt.choices, default=rec.recommended_value) + if isinstance(rec.recommended_value, type(True)): + attrs['action'] = 'store_false' if 
rec.recommended_value else \ + 'store_true' + add_option(Option(*switches, **attrs)) + +def add_input_output_options(parser, plumber): + input_options, output_options = \ + plumber.input_options, plumber.output_options + + def add_options(group, options): + for opt in options: + option_recommendation_to_cli_option(group, opt) + + if input_options: + title = _('INPUT OPTIONS') + io = OptionGroup(parser, title, _('Options to control the processing' + ' of the input %s file')%plumber.input_fmt) + add_options(io.add_option, input_options) + parser.add_option_group(io) + + if output_options: + title = _('OUTPUT OPTIONS') + oo = OptionGroup(parser, title, _('Options to control the processing' + ' of the output %s')%plumber.output_fmt) + add_options(oo.add_option, output_options) + parser.add_option_group(oo) + +def add_pipeline_options(parser, plumber): + groups = { + '' : ('', + [ + 'input_profile', + 'output_profile', + ] + ), + 'LOOK AND FEEL' : ( + _('Options to control the look and feel of the output'), + [ + 'base_font_size', 'disable_font_rescaling', + 'font_size_mapping', + 'line_height', + 'linearize_tables', + 'extra_css', + 'margin_top', 'margin_left', 'margin_right', + 'margin_bottom', 'dont_justify', + 'insert_blank_line', 'remove_paragraph_spacing', + ] + ), + + 'STRUCTURE DETECTION' : ( + _('Control auto-detection of document structure.'), + [ + 'chapter', 'chapter_mark', + 'prefer_metadata_cover', 'remove_first_image', + 'insert_metadata', 'page_breaks_before', + 'preprocess_html', + ] + ), + + 'TABLE OF CONTENTS' : ( + _('Control the automatic generation of a Table of Contents. 
By ' + 'default, if the source file has a Table of Contents, it will ' + 'be used in preference to the automatically generated one.'), + [ + 'level1_toc', 'level2_toc', 'level3_toc', + 'toc_threshold', 'max_toc_links', 'no_chapters_in_toc', + 'use_auto_toc', 'toc_filter', + ] + ), + + 'METADATA' : (_('Options to set metadata in the output'), + plumber.metadata_option_names, + ), + 'DEBUG': (_('Options to help with debugging the conversion'), + [ + 'verbose', + ]), + + + } + + group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION', + 'TABLE OF CONTENTS', 'METADATA', 'DEBUG'] + + for group in group_order: + desc, options = groups[group] + if group: + group = OptionGroup(parser, group, desc) + parser.add_option_group(group) + add_option = group.add_option if group != '' else parser.add_option + + for name in options: + rec = plumber.get_option_by_name(name) + if rec.level < rec.HIGH: + option_recommendation_to_cli_option(add_option, rec) + + option_recommendation_to_cli_option(parser.add_option, + plumber.get_option_by_name('list_recipes')) + +def option_parser(): + return OptionParser(usage=USAGE) + + +class ProgressBar(object): + + def __init__(self, log): + self.log = log + + def __call__(self, frac, msg=''): + if msg: + percent = int(frac*100) + self.log('%d%% %s'%(percent, msg)) + +def create_option_parser(args, log): + parser = option_parser() + if len(args) < 3: + print_help(parser, log) + raise SystemExit(1) + + input, output = check_command_line_options(parser, args, log) + + from calibre.ebooks.conversion.plumber import Plumber + + reporter = ProgressBar(log) + plumber = Plumber(input, output, log, reporter) + add_input_output_options(parser, plumber) + add_pipeline_options(parser, plumber) + + return parser, plumber + +def main(args=sys.argv): + log = Log() + parser, plumber = create_option_parser(args, log) + opts = parser.parse_args(args)[0] + y = lambda q : os.path.abspath(os.path.expanduser(q)) + for x in ('read_metadata_from_opf', 'cover'): + if 
getattr(opts, x, None) is not None: + setattr(opts, x, y(getattr(opts, x))) + recommendations = [(n.dest, getattr(opts, n.dest), + OptionRecommendation.HIGH) \ + for n in parser.options_iter() + if n.dest] + plumber.merge_ui_recommendations(recommendations) + + plumber.run() + + if plumber.opts.debug_input is None: + log(_('Output saved to'), ' ', plumber.output) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/conversion/config.py b/src/calibre/ebooks/conversion/config.py new file mode 100644 index 0000000000..e8b923a1d7 --- /dev/null +++ b/src/calibre/ebooks/conversion/config.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.utils.config import config_dir +from calibre.utils.lock import ExclusiveFile +from calibre import sanitize_file_name +from calibre.customize.conversion import OptionRecommendation + + +config_dir = os.path.join(config_dir, 'conversion') +if not os.path.exists(config_dir): + os.makedirs(config_dir) + +def name_to_path(name): + return os.path.join(config_dir, sanitize_file_name(name)+'.py') + +def save_defaults(name, recs): + path = name_to_path(name) + raw = str(recs) + with open(path, 'wb'): + pass + with ExclusiveFile(path) as f: + f.write(raw) + +def load_defaults(name): + path = name_to_path(name) + if not os.path.exists(path): + open(path, 'wb').close() + with ExclusiveFile(path) as f: + raw = f.read() + r = GuiRecommendations() + if raw: + r.from_string(raw) + return r + +def save_specifics(db, book_id, recs): + raw = str(recs) + db.set_conversion_options(book_id, 'PIPE', raw) + +def load_specifics(db, book_id): + raw = db.conversion_options(book_id, 'PIPE') + r = GuiRecommendations() + if raw: + r.from_string(raw) + return r + +class GuiRecommendations(dict): + + def __new__(cls, 
*args): + dict.__new__(cls) + obj = super(GuiRecommendations, cls).__new__(cls, *args) + obj.disabled_options = set([]) + return obj + + def to_recommendations(self, level=OptionRecommendation.LOW): + ans = [] + for key, val in self.items(): + ans.append((key, val, level)) + return ans + + def __str__(self): + ans = ['{'] + for key, val in self.items(): + ans.append('\t'+repr(key)+' : '+repr(val)+',') + ans.append('}') + return '\n'.join(ans) + + def from_string(self, raw): + try: + d = eval(raw) + except SyntaxError: + d = None + if d: + self.update(d) + + def merge_recommendations(self, get_option, level, options, + only_existing=False): + for name in options: + if only_existing and name not in self: + continue + opt = get_option(name) + if opt is None: continue + if opt.level == OptionRecommendation.HIGH: + self[name] = opt.recommended_value + self.disabled_options.add(name) + elif opt.level > level or name not in self: + self[name] = opt.recommended_value + + diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py new file mode 100644 index 0000000000..9bab5d6701 --- /dev/null +++ b/src/calibre/ebooks/conversion/plumber.py @@ -0,0 +1,690 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os, re + +from calibre.customize.conversion import OptionRecommendation, DummyReporter +from calibre.customize.ui import input_profiles, output_profiles, \ + plugin_for_input_format, plugin_for_output_format +from calibre.ebooks.conversion.preprocess import HTMLPreProcessor +from calibre.ptempfile import PersistentTemporaryDirectory +from calibre import extract, walk + +def supported_input_formats(): + from calibre.customize.ui import available_input_formats + fmts = available_input_formats() + for x in ('zip', 'rar', 'oebzip'): + fmts.add(x) + return fmts + +INPUT_FORMAT_PREFERENCES = ['cbr', 'cbz', 'cbc', 'lit', 'mobi', 'prc', 'azw', 
'fb2', 'html', + 'rtf', 'pdf', 'txt', 'pdb'] +OUTPUT_FORMAT_PREFERENCES = ['epub', 'mobi', 'lit', 'pdf', 'pdb', 'txt'] + +class OptionValues(object): + pass + +class CompositeProgressReporter(object): + + def __init__(self, global_min, global_max, global_reporter): + self.global_min, self.global_max = global_min, global_max + self.global_reporter = global_reporter + + def __call__(self, fraction, msg=''): + global_frac = self.global_min + fraction * \ + (self.global_max - self.global_min) + self.global_reporter(global_frac, msg) + +class Plumber(object): + ''' + The `Plumber` manages the conversion pipeline. An UI should call the methods + :method:`merge_ui_recommendations` and then :method:`run`. The plumber will + take care of the rest. + ''' + + metadata_option_names = [ + 'title', 'authors', 'title_sort', 'author_sort', 'cover', 'comments', + 'publisher', 'series', 'series_index', 'rating', 'isbn', + 'tags', 'book_producer', 'language' + ] + + def __init__(self, input, output, log, report_progress=DummyReporter()): + ''' + :param input: Path to input file. + :param output: Path to output file/directory + ''' + self.input = os.path.abspath(input) + self.output = os.path.abspath(output) + self.log = log + self.ui_reporter = report_progress + + # Initialize the conversion options that are independent of input and + # output formats. The input and output plugins can still disable these + # options via recommendations. + self.pipeline_options = [ + +OptionRecommendation(name='verbose', + recommended_value=0, level=OptionRecommendation.LOW, + short_switch='v', + help=_('Level of verbosity. Specify multiple times for greater ' + 'verbosity.') + ), + +OptionRecommendation(name='input_profile', + recommended_value='default', level=OptionRecommendation.LOW, + choices=[x.short_name for x in input_profiles()], + help=_('Specify the input profile. 
The input profile gives the ' + 'conversion system information on how to interpret ' + 'various information in the input document. For ' + 'example resolution dependent lengths (i.e. lengths in ' + 'pixels). Choices are:')+\ + ', '.join([x.short_name for x in input_profiles()]) + ), + +OptionRecommendation(name='output_profile', + recommended_value='default', level=OptionRecommendation.LOW, + choices=[x.short_name for x in output_profiles()], + help=_('Specify the output profile. The output profile ' + 'tells the conversion system how to optimize the ' + 'created document for the specified device. In some cases, ' + 'an output profile is required to produce documents that ' + 'will work on a device. For example EPUB on the SONY reader. ' + 'Choices are:') + \ + ', '.join([x.short_name for x in output_profiles()]) + ), + +OptionRecommendation(name='base_font_size', + recommended_value=0, level=OptionRecommendation.LOW, + help=_('The base font size in pts. All font sizes in the produced book ' + 'will be rescaled based on this size. By choosing a larger ' + 'size you can make the fonts in the output bigger and vice ' + 'versa. By default, the base font size is chosen based on ' + 'the output profile you chose.' + ) + ), + +OptionRecommendation(name='font_size_mapping', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Mapping from CSS font names to font sizes in pts. ' + 'An example setting is 12,12,14,16,18,20,22,24. ' + 'These are the mappings for the sizes xx-small to xx-large, ' + 'with the final size being for huge fonts. The font ' + 'rescaling algorithm uses these sizes to intelligently ' + 'rescale fonts. The default is to use a mapping based on ' + 'the output profile you chose.' + ) + ), + +OptionRecommendation(name='disable_font_rescaling', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Disable all rescaling of font sizes.' 
+ ) + ), + + +OptionRecommendation(name='line_height', + recommended_value=0, level=OptionRecommendation.LOW, + help=_('The line height in pts. Controls spacing between consecutive ' + 'lines of text. By default no line height manipulation is ' + 'performed.' + ) + ), + +OptionRecommendation(name='linearize_tables', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Some badly designed documents use tables to control the ' + 'layout of text on the page. When converted these documents ' + 'often have text that runs off the page and other artifacts. ' + 'This option will extract the content from the tables and ' + 'present it in a linear fashion.' + ) + ), + +OptionRecommendation(name='level1_toc', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('XPath expression that specifies all tags that ' + 'should be added to the Table of Contents at level one. If ' + 'this is specified, it takes precedence over other forms ' + 'of auto-detection.' + ) + ), + +OptionRecommendation(name='level2_toc', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('XPath expression that specifies all tags that should be ' + 'added to the Table of Contents at level two. Each entry is added ' + 'under the previous level one entry.' + ) + ), + +OptionRecommendation(name='level3_toc', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('XPath expression that specifies all tags that should be ' + 'added to the Table of Contents at level three. Each entry ' + 'is added under the previous level two entry.' + ) + ), + +OptionRecommendation(name='use_auto_toc', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Normally, if the source file already has a Table of ' + 'Contents, it is used in preference to the auto-generated one. ' + 'With this option, the auto-generated one is always used.' 
+ ) + ), + +OptionRecommendation(name='no_chapters_in_toc', + recommended_value=False, level=OptionRecommendation.LOW, + help=_("Don't add auto-detected chapters to the Table of " + 'Contents.' + ) + ), + +OptionRecommendation(name='toc_threshold', + recommended_value=6, level=OptionRecommendation.LOW, + help=_( + 'If fewer than this number of chapters is detected, then links ' + 'are added to the Table of Contents. Default: %default') + ), + +OptionRecommendation(name='max_toc_links', + recommended_value=50, level=OptionRecommendation.LOW, + help=_('Maximum number of links to insert into the TOC. Set to 0 ' + 'to disable. Default is: %default. Links are only added to the ' + 'TOC if less than the threshold number of chapters were detected.' + ) + ), + +OptionRecommendation(name='toc_filter', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Remove entries from the Table of Contents whose titles ' + 'match the specified regular expression. Matching entries and all ' + 'their children are removed.' + ) + ), + + +OptionRecommendation(name='chapter', + recommended_value="//*[((name()='h1' or name()='h2') and " + r"re:test(., 'chapter|book|section|part\s+', 'i')) or @class " + "= 'chapter']", level=OptionRecommendation.LOW, + help=_('An XPath expression to detect chapter titles. The default ' + 'is to consider

or

tags that contain the words ' + '"chapter","book","section" or "part" as chapter titles as ' + 'well as any tags that have class="chapter". The expression ' + 'used must evaluate to a list of elements. To disable chapter ' + 'detection, use the expression "/". See the XPath Tutorial ' + 'in the calibre User Manual for further help on using this ' + 'feature.' + ) + ), + +OptionRecommendation(name='chapter_mark', + recommended_value='pagebreak', level=OptionRecommendation.LOW, + choices=['pagebreak', 'rule', 'both', 'none'], + help=_('Specify how to mark detected chapters. A value of ' + '"pagebreak" will insert page breaks before chapters. ' + 'A value of "rule" will insert a line before chapters. ' + 'A value of "none" will disable chapter marking and a ' + 'value of "both" will use both page breaks and lines ' + 'to mark chapters.') + ), + +OptionRecommendation(name='extra_css', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Either the path to a CSS stylesheet or raw CSS. ' + 'This CSS will be appended to the style rules from ' + 'the source file, so it can be used to override those ' + 'rules.') + ), + +OptionRecommendation(name='page_breaks_before', + recommended_value="//*[name()='h1' or name()='h2']", + level=OptionRecommendation.LOW, + help=_('An XPath expression. Page breaks are inserted ' + 'before the specified elements.') + ), + +OptionRecommendation(name='margin_top', + recommended_value=5.0, level=OptionRecommendation.LOW, + help=_('Set the top margin in pts. Default is %default. ' + 'Note: 72 pts equals 1 inch')), + +OptionRecommendation(name='margin_bottom', + recommended_value=5.0, level=OptionRecommendation.LOW, + help=_('Set the bottom margin in pts. Default is %default. ' + 'Note: 72 pts equals 1 inch')), + +OptionRecommendation(name='margin_left', + recommended_value=5.0, level=OptionRecommendation.LOW, + help=_('Set the left margin in pts. Default is %default. 
' + 'Note: 72 pts equals 1 inch')), + +OptionRecommendation(name='margin_right', + recommended_value=5.0, level=OptionRecommendation.LOW, + help=_('Set the right margin in pts. Default is %default. ' + 'Note: 72 pts equals 1 inch')), + +OptionRecommendation(name='dont_justify', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Do not force text to be justified in output. Whether text ' + 'is actually displayed justified or not depends on whether ' + 'the ebook format and reading device support justification.') + ), + +OptionRecommendation(name='remove_paragraph_spacing', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Remove spacing between paragraphs. Also sets an indent on ' + 'paragraphs of 1.5em. Spacing removal will not work ' + 'if the source file does not use paragraphs (

or

tags).') + ), + +OptionRecommendation(name='prefer_metadata_cover', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Use the cover detected from the source file in preference ' + 'to the specified cover.') + ), + +OptionRecommendation(name='insert_blank_line', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Insert a blank line between paragraphs. Will not work ' + 'if the source file does not use paragraphs (

or

tags).' + ) + ), + +OptionRecommendation(name='remove_first_image', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Remove the first image from the input ebook. Useful if the ' + 'first image in the source file is a cover and you are specifying ' + 'an external cover.' + ) + ), + +OptionRecommendation(name='insert_metadata', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Insert the book metadata at the start of ' + 'the book. This is useful if your ebook reader does not support ' + 'displaying/searching metadata directly.' + ) + ), + +OptionRecommendation(name='preprocess_html', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Attempt to detect and correct hard line breaks and other ' + 'problems in the source file. This may make things worse, so use ' + 'with care.' + ) + ), + + +OptionRecommendation(name='read_metadata_from_opf', + recommended_value=None, level=OptionRecommendation.LOW, + short_switch='m', + help=_('Read metadata from the specified OPF file. Metadata read ' + 'from this file will override any metadata in the source ' + 'file.') + ), + + +OptionRecommendation(name='title', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the title.')), + +OptionRecommendation(name='authors', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the authors. Multiple authors should be separated by ' + 'ampersands.')), + +OptionRecommendation(name='title_sort', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('The version of the title to be used for sorting. ')), + +OptionRecommendation(name='author_sort', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('String to be used when sorting by author. 
')), + +OptionRecommendation(name='cover', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the cover to the specified file.')), + +OptionRecommendation(name='comments', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the ebook description.')), + +OptionRecommendation(name='publisher', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the ebook publisher.')), + +OptionRecommendation(name='series', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the series this ebook belongs to.')), + +OptionRecommendation(name='series_index', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the index of the book in this series.')), + +OptionRecommendation(name='rating', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the rating. Should be a number between 1 and 5.')), + +OptionRecommendation(name='isbn', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the ISBN of the book.')), + +OptionRecommendation(name='tags', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the tags for the book. 
Should be a comma separated list.')), + +OptionRecommendation(name='book_producer', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the book producer.')), + +OptionRecommendation(name='language', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the language.')), + +OptionRecommendation(name='list_recipes', + recommended_value=False, help=_('List available recipes.')), + +] + + input_fmt = os.path.splitext(self.input)[1] + if not input_fmt: + raise ValueError('Input file must have an extension') + input_fmt = input_fmt[1:].lower() + if input_fmt in ('zip', 'rar', 'oebzip'): + self.log('Processing archive...') + tdir = PersistentTemporaryDirectory('_plumber') + self.input, input_fmt = self.unarchive(self.input, tdir) + + if os.path.exists(self.output) and os.path.isdir(self.output): + output_fmt = 'oeb' + else: + output_fmt = os.path.splitext(self.output)[1] + if not output_fmt: + output_fmt = '.oeb' + output_fmt = output_fmt[1:].lower() + + self.input_plugin = plugin_for_input_format(input_fmt) + self.output_plugin = plugin_for_output_format(output_fmt) + + if self.input_plugin is None: + raise ValueError('No plugin to handle input format: '+input_fmt) + + if self.output_plugin is None: + raise ValueError('No plugin to handle output format: '+output_fmt) + + self.input_fmt = input_fmt + self.output_fmt = output_fmt + + # Build set of all possible options. Two options are equal if their + # names are the same. + self.input_options = self.input_plugin.options.union( + self.input_plugin.common_options) + self.output_options = self.output_plugin.options.union( + self.output_plugin.common_options) + + # Remove the options that have been disabled by recommendations from the + # plugins. 
+ self.merge_plugin_recommendations() + + @classmethod + def unarchive(self, path, tdir): + extract(path, tdir) + files = list(walk(tdir)) + from calibre.customize.ui import available_input_formats + fmts = available_input_formats() + for x in ('htm', 'html', 'xhtm', 'xhtml'): fmts.remove(x) + + for ext in fmts: + for f in files: + if f.lower().endswith('.'+ext): + if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048: + continue + return f, ext + return self.find_html_index(files) + + @classmethod + def find_html_index(self, files): + ''' + Given a list of files, find the most likely root HTML file in the + list. + ''' + html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE) + html_files = [f for f in files if html_pat.search(f) is not None] + if not html_files: + raise ValueError(_('Could not find an ebook inside the archive')) + html_files = [(f, os.stat(f).st_size) for f in html_files] + html_files.sort(cmp = lambda x, y: cmp(x[1], y[1])) + html_files = [f[0] for f in html_files] + for q in ('toc', 'index'): + for f in html_files: + if os.path.splitext(os.path.basename(f))[0].lower() == q: + return f, os.path.splitext(f)[1].lower()[1:] + return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:] + + + + def get_option_by_name(self, name): + for group in (self.input_options, self.pipeline_options, + self.output_options): + for rec in group: + if rec.option == name: + return rec + + def get_option_help(self, name): + rec = self.get_option_by_name(name) + help = getattr(rec, 'help', None) + if help is not None: + return help.replace('%default', str(rec.recommended_value)) + + def merge_plugin_recommendations(self): + for source in (self.input_plugin, self.output_plugin): + for name, val, level in source.recommendations: + rec = self.get_option_by_name(name) + if rec is not None and rec.level <= level: + rec.recommended_value = val + rec.level = level + + def merge_ui_recommendations(self, recommendations): + ''' + Merge recommendations from 
the UI. As long as the UI recommendation + level is >= the baseline recommended level, the UI value is used, + *except* if the baseline has a recommendation level of `HIGH`. + ''' + for name, val, level in recommendations: + rec = self.get_option_by_name(name) + if rec is not None and rec.level <= level and rec.level < rec.HIGH: + rec.recommended_value = val + rec.level = level + + def read_user_metadata(self): + ''' + Read all metadata specified by the user. Command line options override + metadata from a specified OPF file. + ''' + from calibre.ebooks.metadata import MetaInformation, string_to_authors + from calibre.ebooks.metadata.opf2 import OPF + mi = MetaInformation(None, []) + if self.opts.read_metadata_from_opf is not None: + self.opts.read_metadata_from_opf = os.path.abspath( + self.opts.read_metadata_from_opf) + opf = OPF(open(self.opts.read_metadata_from_opf, 'rb'), + os.path.dirname(self.opts.read_metadata_from_opf)) + mi = MetaInformation(opf) + for x in self.metadata_option_names: + val = getattr(self.opts, x, None) + if val is not None: + if x == 'authors': + val = string_to_authors(val) + elif x == 'tags': + val = [i.strip() for i in val.split(',')] + elif x in ('rating', 'series_index'): + val = float(val) + setattr(mi, x, val) + if mi.cover: + mi.cover_data = ('', open(mi.cover, 'rb').read()) + mi.cover = None + self.user_metadata = mi + + def setup_options(self): + ''' + Setup the `self.opts` object. 
+ ''' + self.opts = OptionValues() + for group in (self.input_options, self.pipeline_options, + self.output_options): + for rec in group: + setattr(self.opts, rec.option.name, rec.recommended_value) + + for x in input_profiles(): + if x.short_name == self.opts.input_profile: + self.opts.input_profile = x + break + + for x in output_profiles(): + if x.short_name == self.opts.output_profile: + self.opts.output_profile = x + break + + self.read_user_metadata() + + def run(self): + ''' + Run the conversion pipeline + ''' + # Setup baseline option values + self.setup_options() + if self.opts.verbose: + self.log.filter_level = self.log.DEBUG + if self.opts.list_recipes: + from calibre.web.feeds.recipes import titles + self.log('Available recipes:') + for title in sorted(titles): + self.log('\t'+title) + self.log('%d recipes available'%len(titles)) + raise SystemExit(0) + + # Run any preprocess plugins + from calibre.customize.ui import run_plugins_on_preprocess + self.input = run_plugins_on_preprocess(self.input) + + # Create an OEBBook from the input file. The input plugin does all the + # heavy lifting. 
+ accelerators = {} + + tdir = PersistentTemporaryDirectory('_plumber') + stream = self.input if self.input_fmt == 'recipe' else \ + open(self.input, 'rb') + + if hasattr(self.opts, 'lrf') and self.output_plugin.file_type == 'lrf': + self.opts.lrf = True + + self.ui_reporter(0.01, _('Converting input to HTML...')) + ir = CompositeProgressReporter(0.01, 0.34, self.ui_reporter) + self.input_plugin.report_progress = ir + self.oeb = self.input_plugin(stream, self.opts, + self.input_fmt, self.log, + accelerators, tdir) + if self.opts.debug_input is not None: + self.log('Debug input called, aborting the rest of the pipeline.') + return + if not hasattr(self.oeb, 'manifest'): + self.oeb = create_oebbook(self.log, self.oeb, self.opts, + self.input_plugin) + pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter) + pr(0., _('Running transforms on ebook...')) + + from calibre.ebooks.oeb.transforms.guide import Clean + Clean()(self.oeb, self.opts) + pr(0.1) + + self.opts.source = self.opts.input_profile + self.opts.dest = self.opts.output_profile + + from calibre.ebooks.oeb.transforms.metadata import MergeMetadata + MergeMetadata()(self.oeb, self.user_metadata, + self.opts.prefer_metadata_cover) + pr(0.2) + + from calibre.ebooks.oeb.transforms.structure import DetectStructure + DetectStructure()(self.oeb, self.opts) + pr(0.35) + + from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener + fbase = self.opts.base_font_size + if fbase < 1e-4: + fbase = float(self.opts.dest.fbase) + fkey = self.opts.font_size_mapping + if fkey is None: + fkey = self.opts.dest.fkey + else: + fkey = map(float, fkey.split(',')) + + from calibre.ebooks.oeb.transforms.jacket import Jacket + Jacket()(self.oeb, self.opts, self.user_metadata) + pr(0.4) + + if self.opts.extra_css and os.path.exists(self.opts.extra_css): + self.opts.extra_css = open(self.opts.extra_css, 'rb').read() + + oibl = self.opts.insert_blank_line + orps = self.opts.remove_paragraph_spacing + if 
self.output_plugin.file_type == 'lrf': + self.opts.insert_blank_line = False + self.opts.remove_paragraph_spacing = False + line_height = self.opts.line_height + if line_height < 1e-4: + line_height = None + flattener = CSSFlattener(fbase=fbase, fkey=fkey, + lineh=line_height, + untable=self.output_plugin.file_type in ('mobi','lit'), + unfloat=self.output_plugin.file_type in ('mobi', 'lit')) + flattener(self.oeb, self.opts) + self.opts.insert_blank_line = oibl + self.opts.remove_paragraph_spacing = orps + + if self.opts.linearize_tables and \ + self.output_plugin.file_type not in ('mobi', 'lrf'): + from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables + LinearizeTables()(self.oeb, self.opts) + pr(0.9) + + from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer + + self.log.info('Cleaning up manifest...') + trimmer = ManifestTrimmer() + trimmer(self.oeb, self.opts) + + self.oeb.toc.rationalize_play_orders() + pr(1.) + + self.log.info('Creating %s...'%self.output_plugin.name) + our = CompositeProgressReporter(0.67, 1., self.ui_reporter) + self.output_plugin.report_progress = our + our(0., _('Creating')+' %s'%self.output_plugin.name) + self.output_plugin.convert(self.oeb, self.output, self.input_plugin, + self.opts, self.log) + self.ui_reporter(1.) + self.log(self.output_fmt.upper(), 'output written to', self.output) + +def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None, + encoding='utf-8'): + ''' + Create an OEBBook. 
+ ''' + from calibre.ebooks.oeb.base import OEBBook + html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, + opts.preprocess_html) + oeb = OEBBook(log, html_preprocessor, + pretty_print=opts.pretty_print, input_encoding=encoding) + # Read OEB Book into OEBBook + log('Parsing all content...') + if reader is None: + from calibre.ebooks.oeb.reader import OEBReader + reader = OEBReader + + reader()(oeb, path_or_stream) + return oeb diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py new file mode 100644 index 0000000000..2dc404e586 --- /dev/null +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import re, functools + +from calibre import entity_to_unicode + +XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') +SVG_NS = 'http://www.w3.org/2000/svg' +XLINK_NS = 'http://www.w3.org/1999/xlink' + +convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp']) +_span_pat = re.compile('', re.DOTALL|re.IGNORECASE) + + +def sanitize_head(match): + x = match.group(1) + x = _span_pat.sub('', x) + return '\n'+x+'\n' + +def chap_head(match): + chap = match.group('chap') + title = match.group('title') + if not title: + return '

'+chap+'


\n' + else: + return '

'+chap+'
\n'+title+'


\n' + +def wrap_lines(match): + ital = match.group('ital') + if not ital: + return ' ' + else: + return ital+' ' + +def line_length(raw, percent): + ''' + raw is the raw text to find the line length to use for wrapping. + percentage is a decimal number, 0 - 1 which is used to determine + how far in the list of line lengths to use. + ''' + raw = raw.replace(' ', ' ') + linere = re.compile('(?<=
).*?(?=
)', re.DOTALL) + lines = linere.findall(raw) + + lengths = [] + for line in lines: + if len(line) > 0: + lengths.append(len(line)) + total = sum(lengths) + avg = total / len(lengths) + max_line = avg * 2 + + lengths = sorted(lengths) + for i in range(len(lengths) - 1, -1, -1): + if lengths[i] > max_line: + del lengths[i] + + if percent > 1: + percent = 1 + if percent < 0: + percent = 0 + + index = int(len(lengths) * percent) - 1 + + return lengths[index] + + +class CSSPreProcessor(object): + + PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}') + + def __call__(self, data): + data = self.PAGE_PAT.sub('', data) + return data + +class HTMLPreProcessor(object): + + PREPROCESS = [ + # Some idiotic HTML generators (Frontpage I'm looking at you) + # Put all sorts of crap into . This messes up lxml + (re.compile(r']*>(.*?)', re.IGNORECASE|re.DOTALL), + sanitize_head), + # Convert all entities, since lxml doesn't handle them well + (re.compile(r'&(\S+?);'), convert_entities), + # Remove the ', re.IGNORECASE), + lambda match: ''), + ] + + # Fix pdftohtml markup + PDFTOHTML = [ + # Fix umlauts + (re.compile(u'¨\s*()*\s*o', re.UNICODE), lambda match: u'ö'), + (re.compile(u'¨\s*()*\s*O', re.UNICODE), lambda match: u'Ö'), + (re.compile(u'¨\s*()*\s*u', re.UNICODE), lambda match: u'ü'), + (re.compile(u'¨\s*()*\s*U', re.UNICODE), lambda match: u'Ü'), + (re.compile(u'¨\s*()*\s*e', re.UNICODE), lambda match: u'ë'), + (re.compile(u'¨\s*()*\s*E', re.UNICODE), lambda match: u'Ë'), + (re.compile(u'¨\s*()*\s*i', re.UNICODE), lambda match: u'ï'), + (re.compile(u'¨\s*()*\s*I', re.UNICODE), lambda match: u'Ï'), + (re.compile(u'¨\s*()*\s*a', re.UNICODE), lambda match: u'ä'), + (re.compile(u'¨\s*()*\s*A', re.UNICODE), lambda match: u'Ä'), + + # Remove page links + (re.compile(r'', re.IGNORECASE), lambda match: ''), + # Remove
tags + (re.compile(r'', re.IGNORECASE), lambda match: '
'), + # Replace

with

+ (re.compile(r'\s*', re.IGNORECASE), lambda match: '

'), + + # Remove hyphenation + (re.compile(r'-\n\r?'), lambda match: ''), + + # Remove gray background + (re.compile(r']+>'), lambda match : ''), + + # Remove non breaking spaces + (re.compile(ur'\u00a0'), lambda match : ' '), + + # Detect Chapters to match default XPATH in GUI + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(||)?(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?(||)?)(]*>|]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(]*>|]*>))((?P(<i>)?\s*\w+(\s+\w+)?(</i>)?)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + + # Have paragraphs show better + (re.compile(r'<br.*?>'), lambda match : '<p>'), + # Clean up spaces + (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), + # Add space before and after italics + (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'), + (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), + ] + + # Fix Book Designer markup + BOOK_DESIGNER = [ + # HR + (re.compile('<hr>', re.IGNORECASE), + lambda match : '<span style="page-break-after:always"> </span>'), + # Create header tags + (re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE), + lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))), + (re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE), + lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))), + (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), + lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)), + (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), + lambda match : '<h3 
class="subtitle">%s</h3>'%(match.group(1),)), + ] + def __init__(self, input_plugin_preprocess, plugin_preprocess): + self.input_plugin_preprocess = input_plugin_preprocess + self.plugin_preprocess = plugin_preprocess + + def is_baen(self, src): + return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"', + re.IGNORECASE).search(src) is not None + + def is_book_designer(self, raw): + return re.search('<H2[^><]*id=BookTitle', raw) is not None + + def is_pdftohtml(self, src): + return '<!-- created by calibre\'s pdftohtml -->' in src[:1000] + + def __call__(self, html, remove_special_chars=None): + if remove_special_chars is not None: + html = remove_special_chars.sub('', html) + html = html.replace('\0', '') + if self.is_baen(html): + rules = [] + elif self.is_book_designer(html): + rules = self.BOOK_DESIGNER + elif self.is_pdftohtml(html): + line_length_rules = [ + # Un wrap using punctuation + (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .3), re.UNICODE), wrap_lines), + ] + + rules = self.PDFTOHTML + line_length_rules + else: + rules = [] + for rule in self.PREPROCESS + rules: + html = rule[0].sub(rule[1], html) + + # Handle broken XHTML w/ SVG (ugh) + if 'svg:' in html and SVG_NS not in html: + html = html.replace( + '<html', '<html xmlns:svg="%s"' % SVG_NS, 1) + if 'xlink:' in html and XLINK_NS not in html: + html = html.replace( + '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1) + + html = XMLDECL_RE.sub('', html) + + if self.plugin_preprocess: + html = self.input_plugin_preprocess(html) + + return html + diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index ecea8d98f6..f5de8421e0 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -6,32 +6,7 @@ __docformat__ = 'restructuredtext en' ''' Conversion to EPUB. 
''' -import sys, textwrap, re, os, uuid -from itertools import cycle -from calibre.utils.config import Config, StringConfig from calibre.utils.zipfile import ZipFile, ZIP_STORED -from calibre.ebooks.html import config as common_config, tostring -from lxml import etree - -class DefaultProfile(object): - - flow_size = sys.maxint - screen_size = None - remove_special_chars = False - remove_object_tags = False - -class PRS505(DefaultProfile): - - flow_size = 270000 - screen_size = (590, 765) - remove_special_chars = re.compile(u'[\u200b\u00ad]') - remove_object_tags = True - - -PROFILES = { - 'PRS505' : PRS505, - 'None' : DefaultProfile, - } def rules(stylesheets): for s in stylesheets: @@ -40,38 +15,6 @@ def rules(stylesheets): if r.type == r.STYLE_RULE: yield r -def decrypt_font(key, path): - raw = open(path, 'rb').read() - crypt = raw[:1024] - key = cycle(iter(key)) - decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt]) - with open(path, 'wb') as f: - f.write(decrypt) - f.write(raw[1024:]) - -def process_encryption(encfile, opf): - key = None - m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', open(opf, 'rb').read()) - if m: - key = m.group(1) - key = list(map(ord, uuid.UUID(key).bytes)) - try: - root = etree.parse(encfile) - for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'): - algorithm = em.get('Algorithm', '') - if algorithm != 'http://ns.adobe.com/pdf/enc#RC': - return False - cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0] - uri = cr.get('URI') - path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/'))) - if os.path.exists(path): - decrypt_font(key, path) - return True - except: - import traceback - traceback.print_exc() - return False - def initialize_container(path_to_container, opf_name='metadata.opf'): ''' Create an empty EPUB document, with a default skeleton. 
@@ -90,152 +33,4 @@ def initialize_container(path_to_container, opf_name='metadata.opf'): zf.writestr('META-INF/container.xml', CONTAINER) return zf -def config(defaults=None, name='epub'): - desc = _('Options to control the conversion to EPUB') - if defaults is None: - c = Config(name, desc) - else: - c = StringConfig(defaults, desc) - - c.update(common_config()) - c.remove_opt('output') - c.remove_opt('zip') - - c.add_opt('output', ['-o', '--output'], default=None, - help=_('The output EPUB file. If not specified, it is ' - 'derived from the input file name.')) - c.add_opt('profile', ['--profile'], default='PRS505', choices=list(PROFILES.keys()), - help=_('Profile of the target device this EPUB is meant for. ' - 'Set to None to create a device independent EPUB. ' - 'The profile is used for device specific restrictions ' - 'on the EPUB. Choices are: ')+str(list(PROFILES.keys()))) - c.add_opt('override_css', ['--override-css'], default=None, - help=_('Either the path to a CSS stylesheet or raw CSS. ' - 'This CSS will override any existing CSS ' - 'declarations in the source files.')) - structure = c.add_group('structure detection', - _('Control auto-detection of document structure.')) - structure('chapter', ['--chapter'], - default="//*[re:match(name(), 'h[1-2]') and " - "re:test(., 'chapter|book|section|part', 'i')] | " - "//*[@class = 'chapter']", - help=_('''\ -An XPath expression to detect chapter titles. The default is to consider <h1> or -<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles as -well as any tags that have class="chapter". -The expression used must evaluate to a list of elements. To disable chapter detection, -use the expression "/". See the XPath Tutorial in the calibre User Manual for further -help on using this feature. 
-''').replace('\n', ' ')) - structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both', 'none'], - default='pagebreak', - help=_('Specify how to mark detected chapters. A value of ' - '"pagebreak" will insert page breaks before chapters. ' - 'A value of "rule" will insert a line before chapters. ' - 'A value of "none" will disable chapter marking and a ' - 'value of "both" will use both page breaks and lines ' - 'to mark chapters.')) - structure('cover', ['--cover'], default=None, - help=_('Path to the cover to be used for this book')) - structure('prefer_metadata_cover', ['--prefer-metadata-cover'], default=False, - action='store_true', - help=_('Use the cover detected from the source file in preference ' - 'to the specified cover.')) - structure('remove_first_image', ['--remove-first-image'], default=False, - help=_('Remove the first image from the input ebook. Useful if ' - 'the first image in the source file is a cover and you ' - 'are specifying an external cover.')) - structure('dont_split_on_page_breaks', ['--dont-split-on-page-breaks'], default=False, - help=_('Turn off splitting at page breaks. Normally, input files ' - 'are automatically split at every page break into ' - 'two files. This gives an output ebook that can be parsed ' - 'faster and with less resources. However, splitting is ' - 'slow and if your source file contains a very large ' - 'number of page breaks, you should turn off splitting ' - 'on page breaks.')) - structure('page', ['--page'], default=None, - help=_('XPath expression to detect page boundaries for building ' - 'a custom pagination map, as used by AdobeDE. Default is ' - 'not to build an explicit pagination map.')) - structure('page_names', ['--page-names'], default=None, - help=_('XPath expression to find the name of each page in the ' - 'pagination map relative to its boundary element. 
' - 'Default is to number all pages staring with 1.')) - toc = c.add_group('toc', - _('''\ -Control the automatic generation of a Table of Contents. If an OPF file is detected -and it specifies a Table of Contents, then that will be used rather than trying -to auto-generate a Table of Contents. -''').replace('\n', ' ')) - toc('max_toc_links', ['--max-toc-links'], default=50, - help=_('Maximum number of links to insert into the TOC. Set to 0 ' - 'to disable. Default is: %default. Links are only added to the ' - 'TOC if less than the --toc-threshold number of chapters were detected.')) - toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False, - help=_("Don't add auto-detected chapters to the Table of Contents.")) - toc('toc_threshold', ['--toc-threshold'], default=6, - help=_('If fewer than this number of chapters is detected, then links ' - 'are added to the Table of Contents. Default: %default')) - toc('level1_toc', ['--level1-toc'], default=None, - help=_('XPath expression that specifies all tags that should be added ' - 'to the Table of Contents at level one. If this is specified, ' - 'it takes precedence over other forms of auto-detection.')) - toc('level2_toc', ['--level2-toc'], default=None, - help=_('XPath expression that specifies all tags that should be added ' - 'to the Table of Contents at level two. Each entry is added ' - 'under the previous level one entry.')) - toc('level3_toc', ['--level3-toc'], default=None, - help=_('XPath expression that specifies all tags that should be added ' - 'to the Table of Contents at level three. Each entry is added ' - 'under the previous level two entry.')) - toc('from_ncx', ['--from-ncx'], default=None, - help=_('Path to a .ncx file that contains the table of contents to use ' - 'for this ebook. The NCX file should contain links relative to ' - 'the directory it is placed in. 
See ' - 'http://www.niso.org/workrooms/daisy/Z39-86-2005.html#NCX for ' - 'an overview of the NCX format.')) - toc('use_auto_toc', ['--use-auto-toc'], default=False, - help=_('Normally, if the source file already has a Table of Contents, ' - 'it is used in preference to the auto-generated one. ' - 'With this option, the auto-generated one is always used.')) - - layout = c.add_group('page layout', _('Control page layout')) - layout('margin_top', ['--margin-top'], default=5.0, - help=_('Set the top margin in pts. Default is %default')) - layout('margin_bottom', ['--margin-bottom'], default=5.0, - help=_('Set the bottom margin in pts. Default is %default')) - layout('margin_left', ['--margin-left'], default=5.0, - help=_('Set the left margin in pts. Default is %default')) - layout('margin_right', ['--margin-right'], default=5.0, - help=_('Set the right margin in pts. Default is %default')) - layout('base_font_size2', ['--base-font-size'], default=12.0, - help=_('The base font size in pts. Default is %defaultpt. ' - 'Set to 0 to disable rescaling of fonts.')) - layout('remove_paragraph_spacing', ['--remove-paragraph-spacing'], default=False, - help=_('Remove spacing between paragraphs. ' - 'Also sets a indent on paragraphs of 1.5em. ' - 'You can override this by adding p {text-indent: 0cm} to ' - '--override-css. Spacing removal will not work if the source ' - 'file forces inter-paragraph spacing.')) - layout('no_justification', ['--no-justification'], default=False, - help=_('Do not force text to be justified in output.')) - layout('linearize_tables', ['--linearize-tables'], default=False, - help=_('Remove table markup, converting it into paragraphs. ' - 'This is useful if your source file uses a table to manage layout.')) - layout('preserve_tag_structure', ['--preserve-tag-structure'], default=False, - help=_('Preserve the HTML tag structure while splitting large HTML files. ' - 'This is only neccessary if the HTML files contain CSS that ' - 'uses sibling selectors. 
Enabling this greatly slows down ' - 'processing of large HTML files.')) - - c.add_opt('show_opf', ['--show-opf'], default=False, group='debug', - help=_('Print generated OPF file to stdout')) - c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug', - help=_('Print generated NCX file to stdout')) - c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug', - default=False, - help=_('Keep intermediate files during processing by html2epub')) - c.add_opt('extract_to', ['--extract-to'], group='debug', default=None, - help=_('Extract the contents of the produced EPUB file to the ' - 'specified directory.')) - return c + diff --git a/src/calibre/ebooks/epub/fonts.py b/src/calibre/ebooks/epub/fonts.py deleted file mode 100644 index 5d0887f2d0..0000000000 --- a/src/calibre/ebooks/epub/fonts.py +++ /dev/null @@ -1,300 +0,0 @@ -#!/usr/bin/env python -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Font size rationalization. See :function:`relativize`. 
-''' - -import logging, re, operator, functools, collections, unittest, copy, sys -from xml.dom import SyntaxErr - -from lxml.cssselect import CSSSelector -from lxml import etree -from lxml.html import HtmlElement - -from calibre.ebooks.html import fromstring -from calibre.ebooks.epub import rules -from cssutils import CSSParser - -num = r'[-]?\d+|[-]?\d*\.\d+' -length = r'(?P<zero>0)|(?P<num>{num})(?P<unit>%|em|ex|px|in|cm|mm|pt|pc)'.replace('{num}', num) -absolute_size = r'(?P<abs>(x?x-)?(small|large)|medium)' -relative_size = r'(?P<rel>smaller|larger)' - -font_size_pat = re.compile('|'.join((relative_size, absolute_size, length)), re.I) -line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num)) - -PTU = { - 'in' : 72., - 'cm' : 72/2.54, - 'mm' : 72/25.4, - 'pt' : 1.0, - 'pc' : 1/12., - } - -DEFAULT_FONT_SIZE = 12 - -class Rationalizer(object): - - @classmethod - def specificity(cls, s): - '''Map CSS specificity tuple to a single integer''' - return sum([10**(4-i) + x for i,x in enumerate(s)]) - - @classmethod - def compute_font_size(cls, elem): - ''' - Calculate the effective font size of an element traversing its ancestors as far as - neccessary. - ''' - cfs = elem.computed_font_size - if cfs is not None: - return - sfs = elem.specified_font_size - if callable(sfs): - parent = elem.getparent() - cls.compute_font_size(parent) - elem.computed_font_size = sfs(parent.computed_font_size) - else: - elem.computed_font_size = sfs - - @classmethod - def calculate_font_size(cls, style): - 'Return font size in pts from style object. 
For relative units returns a callable' - match = font_size_pat.search(style.font) - fs = '' - if match: - fs = match.group() - if style.fontSize: - fs = style.fontSize - - match = font_size_pat.search(fs) - if match is None: - return None - match = match.groupdict() - unit = match.get('unit', '') - if unit: unit = unit.lower() - if unit in PTU.keys(): - return PTU[unit] * float(match['num']) - if unit in ('em', 'ex'): - return functools.partial(operator.mul, float(match['num'])) - if unit == '%': - return functools.partial(operator.mul, float(match['num'])/100.) - abs = match.get('abs', '') - if abs: abs = abs.lower() - if abs: - x = (1.2)**(abs.count('x') * (-1 if 'small' in abs else 1)) - return 12 * x - if match.get('zero', False): - return 0. - return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8) - - @classmethod - def resolve_rules(cls, stylesheets): - for sheet in stylesheets: - if hasattr(sheet, 'fs_rules'): - continue - sheet.fs_rules = [] - sheet.lh_rules = [] - for r in sheet: - if r.type == r.STYLE_RULE: - font_size = cls.calculate_font_size(r.style) - if font_size is not None: - for s in r.selectorList: - sheet.fs_rules.append([CSSSelector(s.selectorText), font_size]) - orig = line_height_pat.search(r.style.lineHeight) - if orig is not None: - for s in r.selectorList: - sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]]) - - - @classmethod - def apply_font_size_rules(cls, stylesheets, root): - 'Add a ``specified_font_size`` attribute to every element that has a specified font size' - cls.resolve_rules(stylesheets) - for sheet in stylesheets: - for selector, font_size in sheet.fs_rules: - elems = selector(root) - for elem in elems: - elem.specified_font_size = font_size - - @classmethod - def remove_font_size_information(cls, stylesheets): - for r in rules(stylesheets): - r.style.removeProperty('font-size') - try: - new = font_size_pat.sub('', 
r.style.font).strip() - if new: - r.style.font = new - else: - r.style.removeProperty('font') - except SyntaxErr: - r.style.removeProperty('font') - if line_height_pat.search(r.style.lineHeight) is not None: - r.style.removeProperty('line-height') - - @classmethod - def compute_font_sizes(cls, root, stylesheets, base=12): - stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')] - cls.apply_font_size_rules(stylesheets, root) - - # Compute the effective font size of all tags - root.computed_font_size = DEFAULT_FONT_SIZE - for elem in root.iter(etree.Element): - cls.compute_font_size(elem) - - extra_css = {} - if base > 0: - # Calculate the "base" (i.e. most common) font size - font_sizes = collections.defaultdict(lambda : 0) - body = root.xpath('//body')[0] - IGNORE = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6') - for elem in body.iter(etree.Element): - if elem.tag not in IGNORE: - t = getattr(elem, 'text', '') - if t: t = t.strip() - if t: - font_sizes[elem.computed_font_size] += len(t) - - t = getattr(elem, 'tail', '') - if t: t = t.strip() - if t: - parent = elem.getparent() - if parent.tag not in IGNORE: - font_sizes[parent.computed_font_size] += len(t) - - try: - most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0] - scale = base/most_common if most_common > 0 else 1. - except ValueError: - scale = 1. 
- - # rescale absolute line-heights - counter = 0 - for sheet in stylesheets: - for selector, lh in sheet.lh_rules: - for elem in selector(root): - elem.set('id', elem.get('id', 'cfs_%d'%counter)) - counter += 1 - if not extra_css.has_key(elem.get('id')): - extra_css[elem.get('id')] = [] - extra_css[elem.get('id')].append('line-height:%fpt'%(lh*scale)) - - - - # Rescale all computed font sizes - for elem in body.iter(etree.Element): - if isinstance(elem, HtmlElement): - elem.computed_font_size *= scale - - # Remove all font size specifications from the last stylesheet - cls.remove_font_size_information(stylesheets[-1:]) - - # Create the CSS to implement the rescaled font sizes - for elem in body.iter(etree.Element): - cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent())) - if abs(cfs-pcfs) > 1/12. and abs(pcfs) > 1/12.: - elem.set('id', elem.get('id', 'cfs_%d'%counter)) - counter += 1 - if not extra_css.has_key(elem.get('id')): - extra_css[elem.get('id')] = [] - extra_css[elem.get('id')].append('font-size: %f%%'%(100*(cfs/pcfs))) - - css = CSSParser(loglevel=logging.ERROR).parseString('') - for id, r in extra_css.items(): - css.add('#%s {%s}'%(id, ';'.join(r))) - return css - - @classmethod - def rationalize(cls, stylesheets, root, opts): - logger = logging.getLogger('html2epub') - logger.info('\t\tRationalizing fonts...') - extra_css = None - if opts.base_font_size2 > 0: - try: - extra_css = cls.compute_font_sizes(root, stylesheets, base=opts.base_font_size2) - except: - logger.warning('Failed to rationalize font sizes.') - if opts.verbose > 1: - logger.exception('') - finally: - root.remove_font_size_information() - logger.debug('\t\tDone rationalizing') - return extra_css - -################################################################################ -############## Testing -################################################################################ - -class FontTest(unittest.TestCase): - - def setUp(self): - from 
calibre.ebooks.epub import config - self.opts = config(defaults='').parse() - self.html = ''' - <html> - <head> - <title>Test document - - -

- -

Some text

-
-

Some other text.

-

The longest piece of single font size text in this entire file. Used to test resizing.

- - - ''' - self.root = fromstring(self.html) - - def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1): - root1 = copy.deepcopy(self.root) - root1.computed_font_size = DEFAULT_FONT_SIZE - stylesheet = CSSParser(loglevel=logging.ERROR).parseString(css) - stylesheet2 = Rationalizer.compute_font_sizes(root1, [stylesheet], base) - root2 = copy.deepcopy(root1) - root2.remove_font_size_information() - root2.computed_font_size = DEFAULT_FONT_SIZE - Rationalizer.apply_font_size_rules([stylesheet2], root2) - for elem in root2.iter(etree.Element): - Rationalizer.compute_font_size(elem) - for e1, e2 in zip(root1.xpath('//body')[0].iter(etree.Element), root2.xpath('//body')[0].iter(etree.Element)): - self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size, - msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\ - (root1.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size)) - return stylesheet2.cssText - - def testStripping(self): - 'Test that any original entries are removed from the CSS' - css = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }' - css = CSSParser(loglevel=logging.ERROR).parseString(css) - Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [css]) - self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''), - 'p{font:bolditalic}') - - def testIdentity(self): - 'Test that no unnecessary font size changes are made' - extra_css = self.do_test('div {font-size:12pt} \nspan {font-size:100%}') - self.assertEqual(extra_css.strip(), '') - - def testRelativization(self): - 'Test conversion of absolute to relative sizes' - self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}') - - def testResizing(self): - 'Test resizing of fonts' - self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}') - - -def suite(): - return unittest.TestLoader().loadTestsFromTestCase(FontTest) - -def test(): - unittest.TextTestRunner(verbosity=2).run(suite()) - 
-if __name__ == '__main__': - sys.exit(test()) - \ No newline at end of file diff --git a/src/calibre/ebooks/epub/from_any.py b/src/calibre/ebooks/epub/from_any.py deleted file mode 100644 index a3e266991f..0000000000 --- a/src/calibre/ebooks/epub/from_any.py +++ /dev/null @@ -1,207 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Convert any ebook format to epub. -''' - -import sys, os, re -from contextlib import nested - -from calibre import extract, walk -from calibre.ebooks import DRMError -from calibre.ebooks.epub import config as common_config, process_encryption -from calibre.ebooks.epub.from_html import convert as html2epub, find_html_index -from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.metadata import MetaInformation -from calibre.ebooks.metadata.opf2 import OPFCreator, OPF -from calibre.utils.zipfile import ZipFile -from calibre.customize.ui import run_plugins_on_preprocess - -def lit2opf(path, tdir, opts): - from calibre.ebooks.lit.reader import LitReader - print 'Exploding LIT file:', path - reader = LitReader(path) - reader.extract_content(tdir, False) - opf = None - for opf in walk(tdir): - if opf.lower().endswith('.opf'): - break - if not opf.endswith('.opf'): - opf = None - if opf is not None: # Check for url-quoted filenames - _opf = OPF(opf, os.path.dirname(opf)) - replacements = [] - for item in _opf.itermanifest(): - href = item.get('href', '') - path = os.path.join(os.path.dirname(opf), *(href.split('/'))) - if not os.path.exists(path) and os.path.exists(path.replace('&', '%26')): - npath = path - path = path.replace('&', '%26') - replacements.append((path, npath)) - if replacements: - print 'Fixing quoted filenames...' 
- for path, npath in replacements: - if os.path.exists(path): - os.rename(path, npath) - for f in walk(tdir): - with open(f, 'r+b') as f: - raw = f.read() - for path, npath in replacements: - raw = raw.replace(os.path.basename(path), os.path.basename(npath)) - f.seek(0) - f.truncate() - f.write(raw) - return opf - -def mobi2opf(path, tdir, opts): - from calibre.ebooks.mobi.reader import MobiReader - print 'Exploding MOBI file:', path.encode('utf-8') if isinstance(path, unicode) else path - reader = MobiReader(path) - reader.extract_content(tdir) - files = list(walk(tdir)) - opts.encoding = 'utf-8' - for f in files: - if f.lower().endswith('.opf'): - return f - html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}', re.IGNORECASE) - hf = [f for f in files if html_pat.match(os.path.splitext(f)[1]) is not None] - mi = MetaInformation(os.path.splitext(os.path.basename(path))[0], [_('Unknown')]) - opf = OPFCreator(tdir, mi) - opf.create_manifest([(hf[0], None)]) - opf.create_spine([hf[0]]) - ans = os.path.join(tdir, 'metadata.opf') - opf.render(open(ans, 'wb')) - return ans - -def fb22opf(path, tdir, opts): - from calibre.ebooks.lrf.fb2.convert_from import to_html - print 'Converting FB2 to HTML...' 
- return to_html(path, tdir) - -def rtf2opf(path, tdir, opts): - from calibre.ebooks.lrf.rtf.convert_from import generate_html - generate_html(path, tdir) - return os.path.join(tdir, 'metadata.opf') - -def txt2opf(path, tdir, opts): - from calibre.ebooks.lrf.txt.convert_from import generate_html - generate_html(path, opts.encoding, tdir) - opts.encoding = 'utf-8' - return os.path.join(tdir, 'metadata.opf') - -def pdf2opf(path, tdir, opts): - from calibre.ebooks.lrf.pdf.convert_from import generate_html - generate_html(path, tdir) - opts.dont_split_on_page_breaks = True - return os.path.join(tdir, 'metadata.opf') - -def epub2opf(path, tdir, opts): - zf = ZipFile(path) - zf.extractall(tdir) - opts.chapter_mark = 'none' - encfile = os.path.join(tdir, 'META-INF', 'encryption.xml') - opf = None - for f in walk(tdir): - if f.lower().endswith('.opf'): - opf = f - break - if opf and os.path.exists(encfile): - if not process_encryption(encfile, opf): - raise DRMError(os.path.basename(path)) - - if opf is None: - raise ValueError('%s is not a valid EPUB file'%path) - return opf - -def odt2epub(path, tdir, opts): - from calibre.ebooks.odt.to_oeb import Extract - opts.encoding = 'utf-8' - return Extract()(path, tdir) - -MAP = { - 'lit' : lit2opf, - 'mobi' : mobi2opf, - 'prc' : mobi2opf, - 'azw' : mobi2opf, - 'fb2' : fb22opf, - 'rtf' : rtf2opf, - 'txt' : txt2opf, - 'pdf' : pdf2opf, - 'epub' : epub2opf, - 'odt' : odt2epub, - } -SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf', - 'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub'] - -def unarchive(path, tdir): - extract(path, tdir) - files = list(walk(tdir)) - - for ext in ['opf'] + list(MAP.keys()): - for f in files: - if f.lower().endswith('.'+ext): - if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048: - continue - return f, ext - return find_html_index(files) - -def any2epub(opts, path, notification=None, create_epub=True, - oeb_cover=False, extract_to=None): - path = 
run_plugins_on_preprocess(path) - ext = os.path.splitext(path)[1] - if not ext: - raise ValueError('Unknown file type: '+path) - ext = ext.lower()[1:] - - if opts.output is None: - opts.output = os.path.splitext(os.path.basename(path))[0]+'.epub' - - with nested(TemporaryDirectory('_any2epub1'), TemporaryDirectory('_any2epub2')) as (tdir1, tdir2): - if ext in ['rar', 'zip', 'oebzip']: - path, ext = unarchive(path, tdir1) - print 'Found %s file in archive'%(ext.upper()) - - if ext in MAP.keys(): - path = MAP[ext](path, tdir2, opts) - ext = 'opf' - - - if re.match(r'((x){0,1}htm(l){0,1})|opf', ext) is None: - raise ValueError('Conversion from %s is not supported'%ext.upper()) - - print 'Creating EPUB file...' - html2epub(path, opts, notification=notification, - create_epub=create_epub, oeb_cover=oeb_cover, - extract_to=extract_to) - -def config(defaults=None): - return common_config(defaults=defaults) - - -def formats(): - return ['html', 'rar', 'zip', 'oebzip']+list(MAP.keys()) - -USAGE = _('''\ -%%prog [options] filename - -Convert any of a large number of ebook formats to a %s file. Supported formats are: %s -''') - -def option_parser(usage=USAGE): - return config().option_parser(usage=usage%('EPUB', formats())) - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) < 2: - parser.print_help() - print 'No input file specified.' 
- return 1 - any2epub(opts, args[1]) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/epub/from_comic.py b/src/calibre/ebooks/epub/from_comic.py deleted file mode 100644 index c6dff349da..0000000000 --- a/src/calibre/ebooks/epub/from_comic.py +++ /dev/null @@ -1,21 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -'Convert a comic in CBR/CBZ format to epub' - -import sys -from functools import partial -from calibre.ebooks.lrf.comic.convert_from import do_convert, option_parser, config, main as _main - -convert = partial(do_convert, output_format='epub') -main = partial(_main, output_format='epub') - -if __name__ == '__main__': - sys.exit(main()) - -if False: - option_parser - config - \ No newline at end of file diff --git a/src/calibre/ebooks/epub/from_feeds.py b/src/calibre/ebooks/epub/from_feeds.py deleted file mode 100644 index 6a12353f50..0000000000 --- a/src/calibre/ebooks/epub/from_feeds.py +++ /dev/null @@ -1,71 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Convert periodical content into EPUB ebooks. 
-''' -import sys, glob, os -from calibre.web.feeds.main import config as feeds2disk_config, USAGE, run_recipe -from calibre.ebooks.epub.from_html import config as html2epub_config -from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.epub.from_html import convert as html2epub -from calibre import strftime, sanitize_file_name - -def config(defaults=None): - c = feeds2disk_config(defaults=defaults) - c.remove('lrf') - c.remove('epub') - c.remove('output_dir') - c.update(html2epub_config(defaults=defaults)) - c.remove('chapter_mark') - return c - -def option_parser(): - c = config() - return c.option_parser(usage=USAGE) - -def convert(opts, recipe_arg, notification=None): - opts.lrf = False - opts.epub = True - if opts.debug: - opts.verbose = 2 - parser = option_parser() - with TemporaryDirectory('_feeds2epub') as tdir: - opts.output_dir = tdir - recipe = run_recipe(opts, recipe_arg, parser, notification=notification) - c = config() - recipe_opts = c.parse_string(recipe.html2epub_options) - c.smart_update(recipe_opts, opts) - opts = recipe_opts - opts.chapter_mark = 'none' - opts.dont_split_on_page_breaks = True - opf = glob.glob(os.path.join(tdir, '*.opf')) - if not opf: - raise Exception('Downloading of recipe: %s failed'%recipe_arg) - opf = opf[0] - - if opts.output is None: - fname = recipe.title + strftime(recipe.timefmt) + '.epub' - opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname)) - - print 'Generating epub...' 
- opts.encoding = 'utf-8' - opts.remove_paragraph_spacing = True - html2epub(opf, opts, notification=notification) - - -def main(args=sys.argv, notification=None, handler=None): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) != 2 and opts.feeds is None: - parser.print_help() - return 1 - recipe_arg = args[1] if len(args) > 1 else None - convert(opts, recipe_arg, notification=notification) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py deleted file mode 100644 index 3fd7b082f9..0000000000 --- a/src/calibre/ebooks/epub/from_html.py +++ /dev/null @@ -1,547 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Conversion of HTML/OPF files follows several stages: - - * All links in the HTML files or in the OPF manifest are - followed to build up a list of HTML files to be converted. - This stage is implemented by - :function:`calibre.ebooks.html.traverse` and - :class:`calibre.ebooks.html.HTMLFile`. - - * The HTML is pre-processed to make it more semantic. - All links in the HTML files to other resources like images, - stylesheets, etc. are relativized. The resources are copied - into the `resources` sub directory. This is accomplished by - :class:`calibre.ebooks.html.PreProcessor` and - :class:`calibre.ebooks.html.Parser`. - - * The HTML is processed. Various operations are performed. - All style declarations are extracted and consolidated into - a single style sheet. Chapters are auto-detected and marked. - Various font related manipulations are performed. See - :class:`HTMLProcessor`. - - * The processed HTML is saved and the - :module:`calibre.ebooks.epub.split` module is used to split up - large HTML files into smaller chunks. - - * The EPUB container is created. 
-''' - -import os, sys, cStringIO, logging, re, functools, shutil - -from lxml.etree import XPath -from lxml import html, etree -from PyQt4.Qt import QApplication, QPixmap, Qt - -from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\ - opf_traverse, create_metadata, rebase_toc, Link, parser -from calibre.ebooks.epub import config as common_config, tostring -from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.metadata.opf2 import OPF -from calibre.ebooks.epub import initialize_container, PROFILES -from calibre.ebooks.epub.split import split -from calibre.ebooks.epub.pages import add_page_map -from calibre.ebooks.epub.fonts import Rationalizer -from calibre.constants import preferred_encoding -from calibre.customize.ui import run_plugins_on_postprocess -from calibre import walk, CurrentDir, to_unicode, fit_image - -content = functools.partial(os.path.join, u'content') - -def remove_bad_link(element, attribute, link, pos): - if attribute is not None: - if element.tag in ['link']: - element.getparent().remove(element) - else: - element.set(attribute, '') - del element.attrib[attribute] - -def check_links(opf_path, pretty_print): - ''' - Find and remove all invalid links in the HTML files - ''' - logger = logging.getLogger('html2epub') - logger.info('\tChecking files for bad links...') - pathtoopf = os.path.abspath(opf_path) - with CurrentDir(os.path.dirname(pathtoopf)): - opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf)) - html_files = [] - for item in opf.itermanifest(): - if 'html' in item.get('media-type', '').lower(): - f = item.get('href').split('/')[-1] - if isinstance(f, str): - f = f.decode('utf-8') - html_files.append(os.path.abspath(content(f))) - - for path in html_files: - if not os.access(path, os.R_OK): - continue - base = os.path.dirname(path) - root = html.fromstring(open(content(path), 'rb').read(), parser=parser) - for element, attribute, link, pos in 
list(root.iterlinks()): - link = to_unicode(link) - plink = Link(link, base) - bad = False - if plink.path is not None and not os.path.exists(plink.path): - bad = True - if bad: - remove_bad_link(element, attribute, link, pos) - open(content(path), 'wb').write(tostring(root, pretty_print)) - -def find_html_index(files): - ''' - Given a list of files, find the most likely root HTML file in the - list. - ''' - html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE) - html_files = [f for f in files if html_pat.search(f) is not None] - if not html_files: - raise ValueError(_('Could not find an ebook inside the archive')) - html_files = [(f, os.stat(f).st_size) for f in html_files] - html_files.sort(cmp = lambda x, y: cmp(x[1], y[1])) - html_files = [f[0] for f in html_files] - for q in ('toc', 'index'): - for f in html_files: - if os.path.splitext(os.path.basename(f))[0].lower() == q: - return f, os.path.splitext(f)[1].lower()[1:] - return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:] - -def rescale_images(imgdir, screen_size, log): - pwidth, pheight = screen_size - if QApplication.instance() is None: - QApplication([]) - for f in os.listdir(imgdir): - path = os.path.join(imgdir, f) - if os.path.splitext(f)[1] in ('.css', '.js'): - continue - - p = QPixmap() - p.load(path) - if p.isNull(): - continue - width, height = p.width(), p.height() - scaled, new_width, new_height = fit_image(width, height, pwidth, - pheight) - if scaled: - log.info('Rescaling image: '+f) - p.scaled(new_width, new_height, Qt.IgnoreAspectRatio, - Qt.SmoothTransformation).save(path, 'JPEG') - - - - - -class HTMLProcessor(Processor, Rationalizer): - - def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, stylesheets): - Processor.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, - name='html2epub') - if opts.verbose > 2: - self.debug_tree('parsed') - self.detect_chapters() - - self.extract_css(stylesheets) - if self.opts.base_font_size2 > 0: - 
self.font_css = self.rationalize(self.external_stylesheets+[self.stylesheet], - self.root, self.opts) - if opts.verbose > 2: - self.debug_tree('nocss') - - if hasattr(self.body, 'xpath'): - for script in list(self.body.xpath('descendant::script')): - script.getparent().remove(script) - - self.fix_markup() - - def convert_image(self, img): - rpath = img.get('src', '') - path = os.path.join(os.path.dirname(self.save_path()), *rpath.split('/')) - if os.path.exists(path) and os.path.isfile(path): - if QApplication.instance() is None: - app = QApplication([]) - app - p = QPixmap() - p.load(path) - if not p.isNull(): - p.save(path + '_calibre_converted.jpg') - os.remove(path) - for key, val in self.resource_map.items(): - if val == rpath: - self.resource_map[key] = rpath+'_calibre_converted.jpg' - img.set('src', rpath+'_calibre_converted.jpg') - - def fix_markup(self): - ''' - Perform various markup transforms to get the output to render correctly - in the quirky ADE. - ''' - # Replace
that are children of as ADE doesn't handle them - if hasattr(self.body, 'xpath'): - for br in self.body.xpath('./br'): - if br.getparent() is None: - continue - try: - sibling = br.itersiblings().next() - except: - sibling = None - br.tag = 'p' - br.text = u'\u00a0' - if (br.tail and br.tail.strip()) or sibling is None or \ - getattr(sibling, 'tag', '') != 'br': - style = br.get('style', '').split(';') - style = filter(None, map(lambda x: x.strip(), style)) - style.append('margin: 0pt; border:0pt; height:0pt') - br.set('style', '; '.join(style)) - else: - sibling.getparent().remove(sibling) - if sibling.tail: - if not br.tail: - br.tail = '' - br.tail += sibling.tail - - - if self.opts.profile.remove_object_tags: - for tag in self.root.xpath('//embed'): - tag.getparent().remove(tag) - for tag in self.root.xpath('//object'): - if tag.get('type', '').lower().strip() in ('image/svg+xml',): - continue - tag.getparent().remove(tag) - - - for tag in self.root.xpath('//title|//style'): - if not tag.text: - tag.getparent().remove(tag) - for tag in self.root.xpath('//script'): - if not tag.text and not tag.get('src', False): - tag.getparent().remove(tag) - - for tag in self.root.xpath('//form'): - tag.getparent().remove(tag) - - for tag in self.root.xpath('//center'): - tag.tag = 'div' - tag.set('style', 'text-align:center') - - if self.opts.linearize_tables: - for tag in self.root.xpath('//table | //tr | //th | //td'): - tag.tag = 'div' - - # ADE can't handle & in an img url - for tag in self.root.xpath('//img[@src]'): - tag.set('src', tag.get('src', '').replace('&', '')) - - - def save(self): - for meta in list(self.root.xpath('//meta')): - meta.getparent().remove(meta) - # Strip all comments since Adobe DE is petrified of them - Processor.save(self, strip_comments=True) - - def remove_first_image(self): - images = self.root.xpath('//img') - if images: - images[0].getparent().remove(images[0]) - return True - return False - - - - -def config(defaults=None): - return 
common_config(defaults=defaults) - -def option_parser(): - c = config() - return c.option_parser(usage=_('''\ -%prog [options] file.html|opf - -Convert a HTML file to an EPUB ebook. Recursively follows links in the HTML file. -If you specify an OPF file instead of an HTML file, the list of links is takes from -the element of the OPF file. -''')) - -def parse_content(filelist, opts, tdir): - os.makedirs(os.path.join(tdir, 'content', 'resources')) - resource_map, stylesheets = {}, {} - toc = TOC(base_path=tdir, type='root') - stylesheet_map = {} - first_image_removed = False - for htmlfile in filelist: - logging.getLogger('html2epub').debug('Processing %s...'%htmlfile) - hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'), - resource_map, filelist, stylesheets) - if not first_image_removed and opts.remove_first_image: - first_image_removed = hp.remove_first_image() - hp.populate_toc(toc) - hp.save() - stylesheet_map[os.path.basename(hp.save_path())] = \ - [s for s in hp.external_stylesheets + [hp.stylesheet, hp.font_css, hp.override_css] if s is not None] - - logging.getLogger('html2epub').debug('Saving stylesheets...') - if opts.base_font_size2 > 0: - Rationalizer.remove_font_size_information(stylesheets.values()) - for path, css in stylesheets.items(): - raw = getattr(css, 'cssText', css) - if isinstance(raw, unicode): - raw = raw.encode('utf-8') - open(path, 'wb').write(raw) - if toc.count('chapter') > opts.toc_threshold: - toc.purge(['file', 'link', 'unknown']) - if toc.count('chapter') + toc.count('file') > opts.toc_threshold: - toc.purge(['link', 'unknown']) - toc.purge(['link'], max=opts.max_toc_links) - - return resource_map, hp.htmlfile_map, toc, stylesheet_map - -TITLEPAGE = '''\ - - - Cover - - - -
- cover -
- - -''' - -def create_cover_image(src, dest, screen_size, rescale_cover=True): - try: - from PyQt4.Qt import QImage, Qt - if QApplication.instance() is None: - QApplication([]) - im = QImage() - im.load(src) - if im.isNull(): - raise ValueError('Invalid cover image') - if rescale_cover and screen_size is not None: - width, height = im.width(), im.height() - dw, dh = (screen_size[0]-width)/float(width), (screen_size[1]-height)/float(height) - delta = min(dw, dh) - if delta > 0: - nwidth = int(width + delta*(width)) - nheight = int(height + delta*(height)) - im = im.scaled(int(nwidth), int(nheight), Qt.IgnoreAspectRatio, Qt.SmoothTransformation) - im.save(dest) - except: - import traceback - traceback.print_exc() - return False - return True - -def process_title_page(mi, filelist, htmlfilemap, opts, tdir): - old_title_page = None - f = lambda x : os.path.normcase(os.path.normpath(x)) - if not isinstance(mi.cover, basestring): - mi.cover = None - if mi.cover: - if f(filelist[0].path) == f(mi.cover): - old_title_page = htmlfilemap[filelist[0].path] - #logger = logging.getLogger('html2epub') - metadata_cover = mi.cover - if metadata_cover and not os.path.exists(metadata_cover): - metadata_cover = None - - cpath = '/'.join(('resources', '_cover_.jpg')) - cover_dest = os.path.join(tdir, 'content', *cpath.split('/')) - if metadata_cover is not None: - if not create_cover_image(metadata_cover, cover_dest, - opts.profile.screen_size): - metadata_cover = None - specified_cover = opts.cover - if specified_cover and not os.path.exists(specified_cover): - specified_cover = None - if specified_cover is not None: - if not create_cover_image(specified_cover, cover_dest, - opts.profile.screen_size): - specified_cover = None - - cover = metadata_cover if specified_cover is None or (opts.prefer_metadata_cover and metadata_cover is not None) else specified_cover - - if cover is not None: - titlepage = TITLEPAGE%cpath - tp = 'calibre_title_page.html' if old_title_page is None else 
old_title_page - tppath = os.path.join(tdir, 'content', tp) - with open(tppath, 'wb') as f: - f.write(titlepage) - return tp if old_title_page is None else None, True - elif os.path.exists(cover_dest): - os.remove(cover_dest) - return None, old_title_page is not None - -def find_oeb_cover(htmlfile): - if os.stat(htmlfile).st_size > 2048: - return None - match = re.search(r'(?i)]+src\s*=\s*[\'"](.+?)[\'"]', open(htmlfile, 'rb').read()) - if match: - return match.group(1) - -def condense_ncx(ncx_path): - tree = etree.parse(ncx_path) - for tag in tree.getroot().iter(tag=etree.Element): - if tag.text: - tag.text = tag.text.strip() - if tag.tail: - tag.tail = tag.tail.strip() - compressed = etree.tostring(tree.getroot(), encoding='utf-8') - open(ncx_path, 'wb').write(compressed) - -def convert(htmlfile, opts, notification=None, create_epub=True, - oeb_cover=False, extract_to=None): - htmlfile = os.path.abspath(htmlfile) - if opts.output is None: - opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub' - opts.profile = PROFILES[opts.profile] - opts.output = os.path.abspath(opts.output) - if opts.override_css is not None: - try: - opts.override_css = open(opts.override_css, 'rb').read().decode(preferred_encoding, 'replace') - except: - opts.override_css = opts.override_css.decode(preferred_encoding, 'replace') - if opts.from_opf: - opts.from_opf = os.path.abspath(opts.from_opf) - if opts.from_ncx: - opts.from_ncx = os.path.abspath(opts.from_ncx) - if htmlfile.lower().endswith('.opf'): - opf = OPF(htmlfile, os.path.dirname(os.path.abspath(htmlfile))) - filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding) - if not filelist: - # Bad OPF look for a HTML file instead - htmlfile = find_html_index(walk(os.path.dirname(htmlfile)))[0] - if htmlfile is None: - raise ValueError('Could not find suitable file to convert.') - filelist = get_filelist(htmlfile, opts)[1] - mi = merge_metadata(None, opf, opts) - else: - opf, filelist = 
get_filelist(htmlfile, opts) - mi = merge_metadata(htmlfile, opf, opts) - opts.chapter = XPath(opts.chapter, - namespaces={'re':'http://exslt.org/regular-expressions'}) - for x in (1, 2, 3): - attr = 'level%d_toc'%x - if getattr(opts, attr): - setattr(opts, attr, XPath(getattr(opts, attr), - namespaces={'re':'http://exslt.org/regular-expressions'})) - else: - setattr(opts, attr, None) - - with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir: - if opts.keep_intermediate: - print 'Intermediate files in', tdir - resource_map, htmlfile_map, generated_toc, stylesheet_map = \ - parse_content(filelist, opts, tdir) - logger = logging.getLogger('html2epub') - resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()] - - - title_page, has_title_page = process_title_page(mi, filelist, htmlfile_map, opts, tdir) - spine = [htmlfile_map[f.path] for f in filelist] - if not oeb_cover and title_page is not None: - spine = [title_page] + spine - mi.cover = None - mi.cover_data = (None, None) - - - mi = create_metadata(tdir, mi, spine, resources) - buf = cStringIO.StringIO() - if mi.toc: - rebase_toc(mi.toc, htmlfile_map, tdir) - if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2: - mi.toc = generated_toc - if opts.from_ncx: - toc = TOC() - toc.read_ncx_toc(opts.from_ncx) - mi.toc = toc - for item in mi.manifest: - if getattr(item, 'mime_type', None) == 'text/html': - item.mime_type = 'application/xhtml+xml' - opf_path = os.path.join(tdir, 'metadata.opf') - with open(opf_path, 'wb') as f: - mi.render(f, buf, 'toc.ncx') - toc = buf.getvalue() - if toc: - with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f: - f.write(toc) - if opts.show_ncx: - print toc - split(opf_path, opts, stylesheet_map) - if opts.page: - logger.info('\tBuilding page map...') - add_page_map(opf_path, opts) - check_links(opf_path, opts.pretty_print) - - opf = OPF(opf_path, tdir) - opf.remove_guide() - oeb_cover_file = None - if oeb_cover and 
title_page is not None: - oeb_cover_file = find_oeb_cover(os.path.join(tdir, 'content', title_page)) - if has_title_page or (oeb_cover and oeb_cover_file): - opf.create_guide_element() - if has_title_page and not oeb_cover: - opf.add_guide_item('cover', 'Cover', 'content/'+spine[0]) - if oeb_cover and oeb_cover_file: - opf.add_guide_item('cover', 'Cover', 'content/'+oeb_cover_file) - - cpath = os.path.join(tdir, 'content', 'resources', '_cover_.jpg') - if os.path.exists(cpath): - opf.add_path_to_manifest(cpath, 'image/jpeg') - with open(opf_path, 'wb') as f: - f.write(opf.render()) - ncx_path = os.path.join(os.path.dirname(opf_path), 'toc.ncx') - if os.path.exists(ncx_path) and os.stat(ncx_path).st_size > opts.profile.flow_size: - logger.info('Condensing NCX from %d bytes...'%os.stat(ncx_path).st_size) - condense_ncx(ncx_path) - if os.stat(ncx_path).st_size > opts.profile.flow_size: - logger.warn('NCX still larger than allowed size at %d bytes. Menu based Table of Contents may not work on device.'%os.stat(ncx_path).st_size) - - if opts.profile.screen_size is not None: - rescale_images(os.path.join(tdir, 'content', 'resources'), - opts.profile.screen_size, logger) - - if create_epub: - epub = initialize_container(opts.output) - epub.add_dir(tdir) - epub.close() - run_plugins_on_postprocess(opts.output, 'epub') - logger.info(_('Output written to ')+opts.output) - - if opts.show_opf: - print open(opf_path, 'rb').read() - - if opts.extract_to is not None: - if os.path.exists(opts.extract_to): - shutil.rmtree(opts.extract_to) - shutil.copytree(tdir, opts.extract_to) - - if extract_to is not None: - if os.path.exists(extract_to): - shutil.rmtree(extract_to) - shutil.copytree(tdir, extract_to) - - - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) < 2: - parser.print_help() - print _('You must specify an input HTML file') - return 1 - convert(args[1], opts) - return 0 - -if __name__ == '__main__': - 
sys.exit(main()) diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py new file mode 100644 index 0000000000..b748429725 --- /dev/null +++ b/src/calibre/ebooks/epub/input.py @@ -0,0 +1,127 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os, re, uuid +from itertools import cycle + +from lxml import etree + +from calibre.customize.conversion import InputFormatPlugin + +class EPUBInput(InputFormatPlugin): + + name = 'EPUB Input' + author = 'Kovid Goyal' + description = 'Convert EPUB files (.epub) to HTML' + file_types = set(['epub']) + + @classmethod + def decrypt_font(cls, key, path): + raw = open(path, 'rb').read() + crypt = raw[:1024] + key = cycle(iter(key)) + decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt]) + with open(path, 'wb') as f: + f.write(decrypt) + f.write(raw[1024:]) + + @classmethod + def process_ecryption(cls, encfile, opf, log): + key = None + m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', open(opf, 'rb').read()) + if m: + key = m.group(1) + key = list(map(ord, uuid.UUID(key).bytes)) + try: + root = etree.parse(encfile) + for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'): + algorithm = em.get('Algorithm', '') + if algorithm != 'http://ns.adobe.com/pdf/enc#RC': + return False + cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0] + uri = cr.get('URI') + path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/'))) + if os.path.exists(path): + cls.decrypt_font(key, path) + return True + except: + import traceback + traceback.print_exc() + return False + + @classmethod + def rationalize_cover(self, opf): + guide_cover, guide_elem = None, None + for guide_elem in opf.iterguide(): + if guide_elem.get('type', '').lower() == 'cover': + guide_cover = guide_elem.get('href', '') + break + if not guide_cover: + return + spine = 
list(opf.iterspine()) + if not spine: + return + idref = spine[0].get('idref', '') + manifest = list(opf.itermanifest()) + if not manifest: + return + if manifest[0].get('id', False) != idref: + return + spine[0].getparent().remove(spine[0]) + guide_elem.set('href', 'calibre_raster_cover.jpg') + for elem in list(opf.iterguide()): + if elem.get('type', '').lower() == 'titlepage': + elem.getparent().remove(elem) + from calibre.ebooks.oeb.base import OPF + t = etree.SubElement(guide_elem.getparent(), OPF('reference')) + t.set('type', 'titlepage') + t.set('href', guide_cover) + t.set('title', 'Title Page') + from calibre.ebooks import render_html + renderer = render_html(guide_cover) + if renderer is not None: + open('calibre_raster_cover.jpg', 'wb').write( + renderer.data) + + + def convert(self, stream, options, file_ext, log, accelerators): + from calibre.utils.zipfile import ZipFile + from calibre import walk + from calibre.ebooks import DRMError + from calibre.ebooks.metadata.opf2 import OPF + zf = ZipFile(stream) + zf.extractall(os.getcwd()) + encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml')) + opf = None + for f in walk(u'.'): + if f.lower().endswith('.opf'): + opf = os.path.abspath(f) + break + path = getattr(stream, 'name', 'stream') + + if opf is None: + raise ValueError('%s is not a valid EPUB file'%path) + + if os.path.exists(encfile): + if not self.process_encryption(encfile, opf, log): + raise DRMError(os.path.basename(path)) + + opf = os.path.relpath(opf, os.getcwdu()) + parts = os.path.split(opf) + opf = OPF(opf, os.path.dirname(os.path.abspath(opf))) + + if len(parts) > 1 and parts[0]: + delta = '/'.join(parts[:-1])+'/' + for elem in opf.itermanifest(): + elem.set('href', delta+elem.get('href')) + for elem in opf.iterguide(): + elem.set('href', delta+elem.get('href')) + + self.rationalize_cover(opf) + + with open('content.opf', 'wb') as nopf: + nopf.write(opf.render()) + + return os.path.abspath('content.opf') diff --git 
a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py new file mode 100644 index 0000000000..3256e1168a --- /dev/null +++ b/src/calibre/ebooks/epub/output.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os, shutil, re +from urllib import unquote + +from calibre.customize.conversion import OutputFormatPlugin +from calibre.ptempfile import TemporaryDirectory +from calibre.constants import __appname__, __version__ +from calibre import strftime, guess_type +from calibre.customize.conversion import OptionRecommendation + +from lxml import etree + + +class EPUBOutput(OutputFormatPlugin): + + name = 'EPUB Output' + author = 'Kovid Goyal' + file_type = 'epub' + + options = set([ + OptionRecommendation(name='extract_to', + help=_('Extract the contents of the generated EPUB file to the ' + 'specified directory. The contents of the directory are first ' + 'deleted, so be careful.')), + + OptionRecommendation(name='dont_split_on_page_breaks', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Turn off splitting at page breaks. Normally, input ' + 'files are automatically split at every page break into ' + 'two files. This gives an output ebook that can be ' + 'parsed faster and with less resources. However, ' + 'splitting is slow and if your source file contains a ' + 'very large number of page breaks, you should turn off ' + 'splitting on page breaks.' + ) + ), + + OptionRecommendation(name='flow_size', recommended_value=260, + help=_('Split all HTML files larger than this size (in KB). ' + 'This is necessary as most EPUB readers cannot handle large ' + 'file sizes. The default of %defaultKB is the size required ' + 'for Adobe Digital Editions.') + ), + + + ]) + + + TITLEPAGE_COVER = '''\ + + + Cover + + + +
+ cover +
+ + +''' + + TITLEPAGE = '''\ + + + %(title)s + + + +

%(title)s

+

+
+
+ calibre +
+
+

%(date)s

+




+

%(author)s

+








+

Produced by %(app)s

+
+
+ + +''' + + def convert(self, oeb, output_path, input_plugin, opts, log): + self.log, self.opts, self.oeb = log, opts, oeb + + from calibre.ebooks.oeb.transforms.split import Split + split = Split(not self.opts.dont_split_on_page_breaks, + max_flow_size=self.opts.flow_size*1024 + ) + split(self.oeb, self.opts) + + + self.workaround_ade_quirks() + + from calibre.ebooks.oeb.transforms.rescale import RescaleImages + RescaleImages()(oeb, opts) + self.insert_cover() + + with TemporaryDirectory('_epub_output') as tdir: + from calibre.customize.ui import plugin_for_output_format + oeb_output = plugin_for_output_format('oeb') + oeb_output.convert(oeb, tdir, input_plugin, opts, log) + opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0] + self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)\ + if x.endswith('.ncx')][0]) + + from calibre.ebooks.epub import initialize_container + epub = initialize_container(output_path, os.path.basename(opf)) + epub.add_dir(tdir) + if opts.extract_to is not None: + if os.path.exists(opts.extract_to): + shutil.rmtree(opts.extract_to) + os.mkdir(opts.extract_to) + epub.extractall(path=opts.extract_to) + self.log.info('EPUB extracted to', opts.extract_to) + epub.close() + + def default_cover(self): + ''' + Create a generic cover for books that dont have a cover + ''' + try: + from calibre.gui2 import images_rc # Needed for access to logo + from PyQt4.Qt import QApplication, QFile, QIODevice + except: + return None + from calibre.ebooks.metadata import authors_to_string + images_rc + m = self.oeb.metadata + title = unicode(m.title[0]) + a = [unicode(x) for x in m.creators if m.role == 'aut'] + author = authors_to_string(a) + if QApplication.instance() is None: QApplication([]) + f = QFile(':/library') + f.open(QIODevice.ReadOnly) + img_data = str(f.readAll()) + id, href = self.oeb.manifest.generate('calibre-logo', + 'calibre-logo.png') + self.oeb.manifest.add(id, href, 'image/png', data=img_data) + html = 
self.TITLEPAGE%dict(title=title, author=author, + date=strftime('%d %b, %Y'), + app=__appname__ +' '+__version__, + img=href) + id, href = self.oeb.manifest.generate('calibre-titlepage', + 'calibre-titlepage.xhtml') + return self.oeb.manifest.add(id, href, guess_type('t.xhtml')[0], + data=etree.fromstring(html)) + + + def insert_cover(self): + from calibre.ebooks.oeb.base import urldefrag + from calibre import guess_type + g, m = self.oeb.guide, self.oeb.manifest + if 'titlepage' not in g: + if 'cover' in g: + tp = self.TITLEPAGE_COVER%unquote(g['cover'].href) + id, href = m.generate('titlepage', 'titlepage.xhtml') + item = m.add(id, href, guess_type('t.xhtml')[0], + data=etree.fromstring(tp)) + else: + item = self.default_cover() + else: + item = self.oeb.manifest.hrefs[ + urldefrag(self.oeb.guide['titlepage'].href)[0]] + if item is not None: + self.oeb.spine.insert(0, item, True) + if 'cover' not in self.oeb.guide.refs: + self.oeb.guide.add('cover', 'Title Page', 'a') + self.oeb.guide.refs['cover'].href = item.href + if 'titlepage' in self.oeb.guide.refs: + self.oeb.guide.refs['titlepage'].href = item.href + + + + def condense_ncx(self, ncx_path): + if not self.opts.pretty_print: + tree = etree.parse(ncx_path) + for tag in tree.getroot().iter(tag=etree.Element): + if tag.text: + tag.text = tag.text.strip() + if tag.tail: + tag.tail = tag.tail.strip() + compressed = etree.tostring(tree.getroot(), encoding='utf-8') + open(ncx_path, 'wb').write(compressed) + + + + def workaround_ade_quirks(self): + ''' + Perform various markup transforms to get the output to render correctly + in the quirky ADE. + ''' + from calibre.ebooks.oeb.base import XPNSMAP, XHTML + from lxml.etree import XPath as _XPath + from functools import partial + XPath = partial(_XPath, namespaces=XPNSMAP) + + for x in self.oeb.spine: + root = x.data + body = XPath('//h:body')(root) + if body: + body = body[0] + # Replace
that are children of as ADE doesn't handle them + if hasattr(body, 'xpath'): + for br in XPath('./h:br')(body): + if br.getparent() is None: + continue + try: + sibling = br.itersiblings().next() + except: + sibling = None + br.tag = XHTML('p') + br.text = u'\u00a0' + if (br.tail and br.tail.strip()) or sibling is None or \ + getattr(sibling, 'tag', '') != XHTML('br'): + style = br.get('style', '').split(';') + style = filter(None, map(lambda x: x.strip(), style)) + style.append('margin: 0pt; border:0pt; height:0pt') + br.set('style', '; '.join(style)) + else: + sibling.getparent().remove(sibling) + if sibling.tail: + if not br.tail: + br.tail = '' + br.tail += sibling.tail + + for tag in XPath('//h:embed')(root): + tag.getparent().remove(tag) + for tag in XPath('//h:object')(root): + if tag.get('type', '').lower().strip() in ('image/svg+xml',): + continue + tag.getparent().remove(tag) + + for tag in XPath('//h:title|//h:style')(root): + if not tag.text: + tag.getparent().remove(tag) + for tag in XPath('//h:script')(root): + if not tag.text and not tag.get('src', False): + tag.getparent().remove(tag) + + for tag in XPath('//h:form')(root): + tag.getparent().remove(tag) + + for tag in XPath('//h:center')(root): + tag.tag = XHTML('div') + tag.set('style', 'text-align:center') + + # ADE can't handle & in an img url + for tag in XPath('//h:img[@src]')(root): + tag.set('src', tag.get('src', '').replace('&', '')) + + stylesheet = self.oeb.manifest.hrefs['stylesheet.css'] + stylesheet.data.add('a { color: inherit; text-decoration: inherit; ' + 'cursor: default; }') + stylesheet.data.add('a[href] { color: blue; ' + 'text-decoration: underline; cursor:pointer; }') + + special_chars = re.compile(u'[\u200b\u00ad]') + for elem in root.iterdescendants(): + if getattr(elem, 'text', False): + elem.text = special_chars.sub('', elem.text) + elem.text = elem.text.replace(u'\u2011', '-') + if getattr(elem, 'tail', False): + elem.tail = special_chars.sub('', elem.tail) + elem.tail = 
elem.tail.replace(u'\u2011', '-') + + + diff --git a/src/calibre/ebooks/epub/pages.py b/src/calibre/ebooks/epub/pages.py index 1ab5edde86..4737107a6c 100644 --- a/src/calibre/ebooks/epub/pages.py +++ b/src/calibre/ebooks/epub/pages.py @@ -11,7 +11,7 @@ __docformat__ = 'restructuredtext en' import os, re from itertools import count, chain from calibre.ebooks.oeb.base import XHTML, XHTML_NS -from calibre.ebooks.oeb.base import OEBBook, DirWriter +from calibre.ebooks.oeb.base import OEBBook from lxml import etree, html from lxml.etree import XPath diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py deleted file mode 100644 index a5cc6dfc7d..0000000000 --- a/src/calibre/ebooks/epub/split.py +++ /dev/null @@ -1,509 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Split the flows in an epub file to conform to size limitations. -''' - -import os, math, logging, functools, collections, re, copy, sys - -from lxml.etree import XPath as _XPath -from lxml import etree, html -from lxml.cssselect import CSSSelector - -from calibre.ebooks.metadata.opf2 import OPF -from calibre.ebooks.epub import tostring, rules -from calibre import CurrentDir, LoggingInterface - -XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'}) -content = functools.partial(os.path.join, 'content') - -SPLIT_ATTR = 'cs' -SPLIT_POINT_ATTR = 'csp' - -class SplitError(ValueError): - - def __init__(self, path, root): - size = len(tostring(root))/1024. 
- ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')% - (os.path.basename(path), size)) - - - -class Splitter(LoggingInterface): - - def __init__(self, path, opts, stylesheet_map, opf): - LoggingInterface.__init__(self, logging.getLogger('htmlsplit')) - self.setup_cli_handler(opts.verbose) - self.path = path - self.always_remove = not opts.preserve_tag_structure or \ - os.stat(content(path)).st_size > 5*opts.profile.flow_size - self.base = (os.path.splitext(path)[0].replace('%', '%%') + '_split_%d.html') - self.opts = opts - self.orig_size = os.stat(content(path)).st_size - self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.) - root = html.fromstring(open(content(path)).read()) - - self.page_breaks, self.trees = [], [] - self.split_size = 0 - - # Split on page breaks - self.splitting_on_page_breaks = True - if not opts.dont_split_on_page_breaks: - self.log_info('\tSplitting on page breaks...') - if self.path in stylesheet_map: - self.find_page_breaks(stylesheet_map[self.path], root) - self.split_on_page_breaks(root.getroottree()) - trees = list(self.trees) - else: - self.trees = [root.getroottree()] - trees = list(self.trees) - - # Split any remaining over-sized trees - self.splitting_on_page_breaks = False - if self.opts.profile.flow_size < sys.maxint: - lt_found = False - self.log_info('\tLooking for large trees...') - self.tree_map = {} - for i, tree in enumerate(list(trees)): - self.split_trees = [] - size = len(tostring(tree.getroot())) - if size > self.opts.profile.flow_size: - lt_found = True - try: - self.split_to_size(tree) - self.tree_map[tree] = self.split_trees - except (SplitError, RuntimeError): # Splitting fails - if not self.always_remove: - self.always_remove = True - self.split_trees = [] - self.split_to_size(tree) - self.tree_map[tree] = self.split_trees - else: - raise - t = [] - for x in trees: - t.extend(self.tree_map.get(x, [x])) - trees = t - if not lt_found: - 
self.log_info('\tNo large trees found') - - self.trees = trees - self.was_split = len(self.trees) > 1 - if self.was_split: - self.commit() - self.log_info('\t\tSplit into %d parts.', len(self.trees)) - if self.opts.verbose: - for f in self.files: - self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.) - self.fix_opf(opf) - - self.trees = None - - - def split_text(self, text, root, size): - self.log_debug('\t\t\tSplitting text of length: %d'%len(text)) - rest = text.replace('\r', '') - parts = re.split('\n\n', rest) - self.log_debug('\t\t\t\tFound %d parts'%len(parts)) - if max(map(len, parts)) > size: - raise SplitError('Cannot split as file contains a
 tag with a very large paragraph', root)
-        ans = []
-        buf = ''
-        for part in parts:
-            if len(buf) + len(part) < size:
-                buf += '\n\n'+part
-            else:
-                ans.append(buf)
-                buf = part
-        return ans
-
-
def split_to_size(self, tree):
    """Recursively split ``tree`` until every resulting sub-tree serializes
    to at most ``self.opts.profile.flow_size`` bytes.

    Oversized <pre> blocks are first flattened to plain text and divided
    into several smaller <pre> elements, since a single huge preformatted
    run cannot be split structurally. Committed sub-trees are accumulated
    in ``self.split_trees`` and their total size in ``self.split_size``.

    Raises SplitError when no reasonable split point exists or when the
    accumulated output has grown past six times the original size.
    """
    self.log_debug('\t\tSplitting...')
    root = tree.getroot()
    # Flatten and, if necessary, subdivide large preformatted blocks.
    for pre in list(root.xpath('//pre')):
        flattened = u''.join(pre.xpath('descendant::text()'))
        pre.text = flattened
        for child in list(pre.iterchildren()):
            pre.remove(child)
        if len(pre.text) > self.opts.profile.flow_size * 0.5:
            fragments = self.split_text(pre.text, root,
                    int(0.2 * self.opts.profile.flow_size))
            clones = []
            for fragment in fragments:
                clone = copy.copy(pre)
                clone.text = fragment
                clone.tail = u''
                clones.append(clone)
            # Only the last clone keeps the original trailing text.
            clones[-1].tail = pre.tail
            parent = pre.getparent()
            idx = parent.index(pre)
            parent[idx:idx + 1] = clones

    split_point, before = self.find_split_point(root)
    if split_point is None or self.split_size > 6 * self.orig_size:
        if not self.always_remove:
            self.log_warn(_('\t\tToo much markup. Re-splitting without '
                            'structure preservation. This may cause '
                            'incorrect rendering.'))
        raise SplitError(self.path, root)

    for subtree in self.do_split(tree, split_point, before):
        subroot = subtree.getroot()
        if self.is_page_empty(subroot):
            continue
        nbytes = len(tostring(subroot))
        if nbytes > self.opts.profile.flow_size:
            # Still too big: recurse on this half.
            self.split_to_size(subtree)
            continue
        self.split_trees.append(subtree)
        self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
                       len(self.split_trees), nbytes / 1024.)
        self.split_size += nbytes
-
def is_page_empty(self, root):
    """Return True if the page under ``root`` carries no visible content.

    A page counts as empty when its body holds at most 4 non-whitespace
    characters of text and every image on it is explicitly hidden with
    ``display:none``. A document without a <body> is never empty.
    """
    body = root.find('body')
    if body is None:
        return False
    # Strip all whitespace before measuring the amount of real text.
    text = re.sub(r'\s+', '',
                  html.tostring(body, method='text', encoding=unicode))
    if len(text) > 4:
        return False
    # Any image that is not explicitly hidden counts as content.
    return all(img.get('style', '') == 'display:none'
               for img in root.xpath('//img'))
-
-    def do_split(self, tree, split_point, before):
-        '''
-        Split ``tree`` into a *before* and *after* tree at ``split_point``,
-        preserving tag structure, but not duplicating any text.
-        All tags that have had their text and tail
-        removed have the attribute ``calibre_split`` set to 1.
-
-        :param before: If True tree is split before split_point, otherwise after split_point
-        :return: before_tree, after_tree
-        '''
-        path         = tree.getpath(split_point)
-        tree, tree2  = copy.deepcopy(tree), copy.deepcopy(tree)
-        root         = tree.getroot()
-        root2        = tree2.getroot()
-        body, body2  = root.body, root2.body
-        split_point  = root.xpath(path)[0]
-        split_point2 = root2.xpath(path)[0]
-
-        def nix_element(elem, top=True):
-            if self.always_remove:
-                parent = elem.getparent()
-                index = parent.index(elem)
-                if top:
-                    parent.remove(elem)
-                else:
-                    index = parent.index(elem)
-                    parent[index:index+1] = list(elem.iterchildren())
-
-            else:
-                elem.text = u''
-                elem.tail = u''
-                elem.set(SPLIT_ATTR, '1')
-                if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']:
-                    elem.set('style', 'display:none')
-
-        def fix_split_point(sp):
-            if not self.splitting_on_page_breaks:
-                sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')
-
-        # Tree 1
-        hit_split_point = False
-        for elem in list(body.iterdescendants(etree.Element)):
-            if elem.get(SPLIT_ATTR, '0') == '1':
-                continue
-            if elem is split_point:
-                hit_split_point = True
-                if before:
-                    nix_element(elem)
-                fix_split_point(elem)
-                continue
-            if hit_split_point:
-                nix_element(elem)
-
-
-        # Tree 2
-        hit_split_point = False
-        for elem in list(body2.iterdescendants(etree.Element)):
-            if elem.get(SPLIT_ATTR, '0') == '1':
-                continue
-            if elem is split_point2:
-                hit_split_point = True
-                if not before:
-                    nix_element(elem, top=False)
-                fix_split_point(elem)
-                continue
-            if not hit_split_point:
-                nix_element(elem, top=False)
-
-        return tree, tree2
-
-
-    def split_on_page_breaks(self, orig_tree):
-        ordered_ids = []
-        for elem in orig_tree.xpath('//*[@id]'):
-            id = elem.get('id')
-            if id in self.page_break_ids:
-                ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
-
-        self.trees = []
-        tree = orig_tree
-        for pattern, before in ordered_ids:
-            self.log_info('\t\tSplitting on page-break')
-            elem = pattern(tree)
-            if elem:
-                before, after = self.do_split(tree, elem[0], before)
-                self.trees.append(before)
-                tree = after
-        self.trees.append(tree)
-        self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
-
-
-
-    def find_page_breaks(self, stylesheets, root):
-        '''
-        Find all elements that have either page-break-before or page-break-after set.
-        Populates `self.page_breaks` with id based XPath selectors (for elements that don't
-        have ids, an id is created).
-        '''
-        page_break_selectors = set([])
-        for rule in rules(stylesheets):
-            before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
-            after  = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
-            try:
-                if before and before != 'avoid':
-                    page_break_selectors.add((CSSSelector(rule.selectorText), True))
-            except:
-                pass
-            try:
-                if after and after != 'avoid':
-                    page_break_selectors.add((CSSSelector(rule.selectorText), False))
-            except:
-                pass
-
-        page_breaks = set([])
-        for selector, before in page_break_selectors:
-            for elem in selector(root):
-                elem.pb_before = before
-                page_breaks.add(elem)
-
-        for i, elem in enumerate(root.iter()):
-            elem.pb_order = i
-
-        page_breaks = list(page_breaks)
-        page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
-        self.page_break_ids = []
-        for i, x in enumerate(page_breaks):
-            x.set('id', x.get('id', 'calibre_pb_%d'%i))
-            id = x.get('id')
-            self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
-            self.page_break_ids.append(id)
-
-
-    def find_split_point(self, root):
-        '''
-        Find the tag at which to split the tree rooted at `root`.
-        Search order is:
-            * Heading tags
-            * 
tags - *
 tags
-            * 
tags - *

tags - *
tags - *

  • tags - - We try to split in the "middle" of the file (as defined by tag counts. - ''' - def pick_elem(elems): - if elems: - elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') != '1'\ - and i.get(SPLIT_ATTR, '0') != '1'] - if elems: - i = int(math.floor(len(elems)/2.)) - elems[i].set(SPLIT_POINT_ATTR, '1') - return elems[i] - - for path in ( - '//*[re:match(name(), "h[1-6]", "i")]', - '/html/body/div', - '//pre', - '//hr', - '//p', - '//div', - '//br', - '//li', - ): - elems = root.xpath(path, - namespaces={'re':'http://exslt.org/regular-expressions'}) - elem = pick_elem(elems) - if elem is not None: - try: - XPath(elem.getroottree().getpath(elem)) - except: - continue - return elem, True - - return None, True - - def commit(self): - ''' - Commit all changes caused by the split. This removes the previously - introduced ``calibre_split`` attribute and calculates an *anchor_map* for - all anchors in the original tree. Internal links are re-directed. The - original file is deleted and the split files are saved. - ''' - self.anchor_map = collections.defaultdict(lambda :self.base%0) - self.files = [] - - for i, tree in enumerate(self.trees): - root = tree.getroot() - self.files.append(self.base%i) - for elem in root.xpath('//*[@id]'): - if elem.get(SPLIT_ATTR, '0') == '0': - self.anchor_map[elem.get('id')] = self.files[-1] - for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)): - elem.attrib.pop(SPLIT_ATTR, None) - elem.attrib.pop(SPLIT_POINT_ATTR, '0') - - for current, tree in zip(self.files, self.trees): - for a in tree.getroot().xpath('//a[@href]'): - href = a.get('href').strip() - if href.startswith('#'): - anchor = href[1:] - file = self.anchor_map[anchor] - if file != current: - a.set('href', file+href) - open(content(current), 'wb').\ - write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print)) - - os.remove(content(self.path)) - - - def fix_opf(self, opf): - ''' - Fix references to the split file in the OPF. 
- ''' - items = [item for item in opf.itermanifest() if item.get('href') == 'content/'+self.path] - new_items = [('content/'+f, None) for f in self.files] - id_map = {} - for item in items: - id_map[item.get('id')] = opf.replace_manifest_item(item, new_items) - - for id in id_map.keys(): - opf.replace_spine_items_by_idref(id, id_map[id]) - - for ref in opf.iterguide(): - href = ref.get('href', '') - if href.startswith('content/'+self.path): - href = href.split('#') - frag = None - if len(href) > 1: - frag = href[1] - if frag not in self.anchor_map: - self.log_warning('\t\tUnable to re-map OPF link', href) - continue - new_file = self.anchor_map[frag] - ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag))) - - - -def fix_content_links(html_files, changes, opts): - split_files = [f.path for f in changes] - anchor_maps = [f.anchor_map for f in changes] - files = list(html_files) - for j, f in enumerate(split_files): - try: - i = files.index(f) - files[i:i+1] = changes[j].files - except ValueError: - continue - - for htmlfile in files: - changed = False - root = html.fromstring(open(content(htmlfile), 'rb').read()) - for a in root.xpath('//a[@href]'): - href = a.get('href') - if not href.startswith('#'): - href = href.split('#') - anchor = href[1] if len(href) > 1 else None - href = href[0] - if href in split_files: - try: - newf = anchor_maps[split_files.index(href)][anchor] - except: - print '\t\tUnable to remap HTML link:', href, anchor - continue - frag = ('#'+anchor) if anchor else '' - a.set('href', newf+frag) - changed = True - - if changed: - open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print)) - -def fix_ncx(path, changes): - split_files = [f.path for f in changes] - anchor_maps = [f.anchor_map for f in changes] - tree = etree.parse(path) - changed = False - for content in tree.getroot().xpath('//x:content[@src]', - namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}): - href = content.get('src') - if not 
href.startswith('#'): - href = href.split('#') - anchor = href[1] if len(href) > 1 else None - href = href[0].split('/')[-1] - if href in split_files: - try: - newf = anchor_maps[split_files.index(href)][anchor] - except: - print 'Unable to remap NCX link:', href, anchor - frag = ('#'+anchor) if anchor else '' - content.set('src', 'content/'+newf+frag) - changed = True - if changed: - open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True)) - -def find_html_files(opf): - ''' - Find all HTML files referenced by `opf`. - ''' - html_files = [] - for item in opf.itermanifest(): - if 'html' in item.get('media-type', '').lower(): - f = item.get('href').split('/')[-1] - f2 = f.replace('&', '%26') - if not os.path.exists(content(f)) and os.path.exists(content(f2)): - f = f2 - item.set('href', item.get('href').replace('&', '%26')) - if os.path.exists(content(f)): - html_files.append(f) - return html_files - - -def split(pathtoopf, opts, stylesheet_map): - pathtoopf = os.path.abspath(pathtoopf) - opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf)) - - with CurrentDir(os.path.dirname(pathtoopf)): - html_files = find_html_files(opf) - changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files] - changes = [c for c in changes if c.was_split] - - fix_content_links(html_files, changes, opts) - for item in opf.itermanifest(): - if item.get('media-type', '') == 'application/x-dtbncx+xml': - fix_ncx(item.get('href'), changes) - break - - open(pathtoopf, 'wb').write(opf.render()) diff --git a/src/calibre/ebooks/lrf/fb2/__init__.py b/src/calibre/ebooks/fb2/__init__.py similarity index 100% rename from src/calibre/ebooks/lrf/fb2/__init__.py rename to src/calibre/ebooks/fb2/__init__.py diff --git a/src/calibre/ebooks/lrf/fb2/fb2.xsl b/src/calibre/ebooks/fb2/fb2.xsl similarity index 100% rename from src/calibre/ebooks/lrf/fb2/fb2.xsl rename to src/calibre/ebooks/fb2/fb2.xsl diff --git a/src/calibre/ebooks/fb2/fb2ml.py 
b/src/calibre/ebooks/fb2/fb2ml.py new file mode 100644 index 0000000000..3a5806b143 --- /dev/null +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -0,0 +1,154 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into FB2 markup +''' + +import os +import re +from base64 import b64encode + +from calibre import entity_to_unicode +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace +from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ebooks.oeb.base import OEB_IMAGES +from calibre.constants import __appname__, __version__ + +from BeautifulSoup import BeautifulSoup + +TAG_MAP = { + 'b' : 'strong', + 'i' : 'emphasis', + 'p' : 'p', + 'div' : 'p', +} + +STYLES = [ + ('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}), + ('font-style', {'italic' : 'emphasis'}), +] + +class FB2MLizer(object): + def __init__(self, ignore_tables=False): + self.ignore_tables = ignore_tables + + def extract_content(self, oeb_book, opts): + oeb_book.logger.info('Converting XHTML to FB2 markup...') + self.oeb_book = oeb_book + self.opts = opts + return self.fb2mlize_spine() + + def fb2mlize_spine(self): + output = self.fb2_header() + if 'titlepage' in self.oeb_book.guide: + href = self.oeb_book.guide['titlepage'].href + item = self.oeb_book.manifest.hrefs[href] + if item.spine_position is None: + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) + for item in self.oeb_book.spine: + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output += self.fb2_body_footer() + output += self.fb2mlize_images() + output += self.fb2_footer() + output = self.clean_text(output) + return BeautifulSoup(output.encode('utf-8')).prettify() + + def fb2_header(self): + return u' ' \ + 
' ' \ + '%s ' \ + ' ' \ + '%s - %s ' \ + '
    ' % (self.oeb_book.metadata.title[0].value, __appname__, __version__) + + def fb2_body_footer(self): + return u'
    ' + + def fb2_footer(self): + return u'
    ' + + def fb2mlize_images(self): + images = u'' + for item in self.oeb_book.manifest: + if item.media_type in OEB_IMAGES: + data = b64encode(item.data) + images += '%s' % (os.path.basename(item.href), item.media_type, data) + return images + + def clean_text(self, text): + for entity in set(re.findall('&.+?;', text)): + mo = re.search('(%s)' % entity[1:-1], text) + text = text.replace(entity, entity_to_unicode(mo)) + + text = text.replace('&', '') + + return text + + def dump_text(self, elem, stylizer, tag_stack=[]): + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + return u'' + + fb2_text = u'' + style = stylizer.style(elem) + + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return u'' + + tag = barename(elem.tag) + tag_count = 0 + + if tag == 'img': + fb2_text += '' % os.path.basename(elem.attrib['src']) + + + fb2_tag = TAG_MAP.get(tag, 'p') + if fb2_tag and fb2_tag not in tag_stack: + tag_count += 1 + fb2_text += '<%s>' % fb2_tag + tag_stack.append(fb2_tag) + + # Processes style information + for s in STYLES: + style_tag = s[1].get(style[s[0]], None) + if style_tag: + tag_count += 1 + fb2_text += '<%s>' % style_tag + tag_stack.append(style_tag) + + if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + fb2_text += elem.text + + for item in elem: + fb2_text += self.dump_text(item, stylizer, tag_stack) + + close_tag_list = [] + for i in range(0, tag_count): + close_tag_list.insert(0, tag_stack.pop()) + + fb2_text += self.close_tags(close_tag_list) + + if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': + if 'p' not in tag_stack: + fb2_text += '

    %s

    ' % elem.tail + else: + fb2_text += elem.tail + + return fb2_text + + def close_tags(self, tags): + fb2_text = u'' + for i in range(0, len(tags)): + fb2_tag = tags.pop() + fb2_text += '' % fb2_tag + + return fb2_text + diff --git a/src/calibre/ebooks/fb2/input.py b/src/calibre/ebooks/fb2/input.py new file mode 100644 index 0000000000..d96758a4bd --- /dev/null +++ b/src/calibre/ebooks/fb2/input.py @@ -0,0 +1,74 @@ +from __future__ import with_statement +__license__ = 'GPL v3' +__copyright__ = '2008, Anatoly Shipitsin ' +""" +Convert .fb2 files to .lrf +""" +import os +from base64 import b64decode +from lxml import etree + +from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre import guess_type + +FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0' + +class FB2Input(InputFormatPlugin): + + name = 'FB2 Input' + author = 'Anatoly Shipitsin' + description = 'Convert FB2 files to HTML' + file_types = set(['fb2']) + + recommendations = set([ + ('level1_toc', '//h:h1', OptionRecommendation.MED), + ('level2_toc', '//h:h2', OptionRecommendation.MED), + ('level3_toc', '//h:h3', OptionRecommendation.MED), + ]) + + def convert(self, stream, options, file_ext, log, + accelerators): + from calibre.resources import fb2_xsl + from calibre.ebooks.metadata.opf2 import OPFCreator + from calibre.ebooks.metadata.meta import get_metadata + from calibre.ebooks.oeb.base import XLINK_NS + NAMESPACES = {'f':FB2NS, 'l':XLINK_NS} + + log.debug('Parsing XML...') + parser = etree.XMLParser(recover=True, no_network=True) + doc = etree.parse(stream, parser) + self.extract_embedded_content(doc) + log.debug('Converting XML to HTML...') + styledoc = etree.fromstring(fb2_xsl) + + transform = etree.XSLT(styledoc) + result = transform(doc) + open('index.xhtml', 'wb').write(transform.tostring(result)) + stream.seek(0) + mi = get_metadata(stream, 'fb2') + if not mi.title: + mi.title = _('Unknown') + if not mi.authors: + mi.authors = [_('Unknown')] + opf = 
OPFCreator(os.getcwdu(), mi) + entries = [(f, guess_type(f)[0]) for f in os.listdir('.')] + opf.create_manifest(entries) + opf.create_spine(['index.xhtml']) + + for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES): + href = img.get('{%s}href'%XLINK_NS, img.get('href', None)) + if href is not None: + if href.startswith('#'): + href = href[1:] + opf.guide.set_cover(os.path.abspath(href)) + + opf.render(open('metadata.opf', 'wb')) + return os.path.join(os.getcwd(), 'metadata.opf') + + def extract_embedded_content(self, doc): + for elem in doc.xpath('./*'): + if 'binary' in elem.tag and elem.attrib.has_key('id'): + fname = elem.attrib['id'] + data = b64decode(elem.text.strip()) + open(fname, 'wb').write(data) + diff --git a/src/calibre/ebooks/fb2/output.py b/src/calibre/ebooks/fb2/output.py new file mode 100644 index 0000000000..67ee9f468e --- /dev/null +++ b/src/calibre/ebooks/fb2/output.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import OutputFormatPlugin +from calibre.ebooks.fb2.fb2ml import FB2MLizer + +class FB2Output(OutputFormatPlugin): + + name = 'FB2 Output' + author = 'John Schember' + file_type = 'fb2' + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + fb2mlizer = FB2MLizer(ignore_tables=opts.linearize_tables) + fb2_content = fb2mlizer.extract_content(oeb_book, opts) + + close = False + if not hasattr(output_path, 'write'): + close = True + if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': + os.makedirs(os.path.dirname(output_path)) + out_stream = open(output_path, 'wb') + else: + out_stream = output_path + + out_stream.seek(0) + out_stream.truncate() + out_stream.write(fb2_content) + + if close: + out_stream.close() + diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py deleted file mode 100644 index 
3365df576e..0000000000 --- a/src/calibre/ebooks/html.py +++ /dev/null @@ -1,1192 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Code to recursively parse HTML files and create an open ebook in a specified -directory or zip file. All the action starts in :function:`create_dir`. -''' - -import sys, re, os, shutil, logging, tempfile, cStringIO, operator, functools -from urlparse import urlparse, urlunparse -from urllib import unquote - -from lxml import etree -from lxml.html import HtmlElementClassLookup, HTMLParser as _HTMLParser, \ - fromstring as _fromstring, tostring as _tostring, \ - soupparser, HtmlElement -from lxml.etree import XPath -get_text = XPath("//text()") - -from calibre import LoggingInterface, unicode_path, entity_to_unicode -from calibre.ebooks.chardet import xml_to_unicode, ENCODING_PATS -from calibre.utils.config import Config, StringConfig -from calibre.ebooks.metadata import MetaInformation -from calibre.ebooks.metadata.meta import get_metadata -from calibre.ebooks.metadata.opf2 import OPF, OPFCreator -from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile -from calibre.utils.zipfile import ZipFile -from cssutils import CSSParser - -class HTMLElement(HtmlElement): - - @apply - def specified_font_size(): - - def fget(self): - ans = self.get('specified_font_size', '') - if not ans: - return lambda x: x - if ans.startswith('f'): - return functools.partial(operator.mul, float(ans[1:])) - return float(ans) - - def fset(self, val): - self.set('specified_font_size', ('f'+repr(val(1))) if callable(val) else repr(val)) - - return property(fget=fget, fset=fset) - - @apply - def computed_font_size(): - def fget(self): - ans = self.get('computed_font_size', '') - if ans == '': - return None - return float(ans) - - def fset(self, val): - self.set('computed_font_size', repr(val)) - - return 
property(fget=fget, fset=fset) - - def remove_font_size_information(self): - for elem in self.iter(): - for p in ('computed', 'specified'): - elem.attrib.pop(p+'_font_size', None) - - def getpath(self): - return self.getroottree().getpath(self) - -class Lookup(HtmlElementClassLookup): - - def lookup(self, node_type, document, namespace, name): - if node_type == 'element': - return HTMLElement - return HtmlElementClassLookup.lookup(self, node_type, document, namespace, name) - -class HTMLParser(_HTMLParser): - - def __init__(self, **kwargs): - super(HTMLParser, self).__init__(**kwargs) - self.set_element_class_lookup(Lookup()) - -parser = HTMLParser() - -def fromstring(raw, **kw): - return _fromstring(raw, parser=parser, **kw) - -def tostring(root, pretty_print=False): - return _tostring(root, encoding='utf-8', method='xml', - include_meta_content_type=True, - pretty_print=pretty_print) - -class Link(object): - ''' - Represents a link in a HTML file. - ''' - - @classmethod - def url_to_local_path(cls, url, base): - path = urlunparse(('', '', url.path, url.params, url.query, '')) - path = unquote(path) - if os.path.isabs(path): - return path - return os.path.abspath(os.path.join(base, path)) - - def __init__(self, url, base): - ''' - :param url: The url this link points to. Must be an unquoted unicode string. - :param base: The base directory that relative URLs are with respect to. - Must be a unicode string. 
- ''' - assert isinstance(url, unicode) and isinstance(base, unicode) - self.url = url - self.parsed_url = urlparse(self.url) - self.is_local = self.parsed_url.scheme in ('', 'file') - self.is_internal = self.is_local and not bool(self.parsed_url.path) - self.path = None - self.fragment = unquote(self.parsed_url.fragment) - if self.is_local and not self.is_internal: - self.path = self.url_to_local_path(self.parsed_url, base) - - def __hash__(self): - if self.path is None: - return hash(self.url) - return hash(self.path) - - def __eq__(self, other): - return self.path == getattr(other, 'path', other) - - def __str__(self): - return u'Link: %s --> %s'%(self.url, self.path) - - -class IgnoreFile(Exception): - - def __init__(self, msg, errno): - Exception.__init__(self, msg) - self.doesnt_exist = errno == 2 - self.errno = errno - -class HTMLFile(object): - ''' - Contains basic information about an HTML file. This - includes a list of links to other files as well as - the encoding of each file. Also tries to detect if the file is not a HTML - file in which case :member:`is_binary` is set to True. - - The encoding of the file is available as :member:`encoding`. - ''' - - HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE) - TITLE_PAT = re.compile('([^<>]+)', re.IGNORECASE) - LINK_PAT = re.compile( - r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P[^"]+)")|(?:\'(?P[^\']+)\')|(?P[^\s>]+))', - re.DOTALL|re.IGNORECASE) - - def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None): - ''' - :param level: The level of this file. Should be 0 for the root file. - :param encoding: Use `encoding` to decode HTML. - :param referrer: The :class:`HTMLFile` that first refers to this file. 
- ''' - self.path = unicode_path(path_to_html_file, abs=True) - self.title = os.path.splitext(os.path.basename(self.path))[0] - self.base = os.path.dirname(self.path) - self.level = level - self.referrer = referrer - self.links = [] - - try: - with open(self.path, 'rb') as f: - src = f.read() - except IOError, err: - msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err)) - if level == 0: - raise IOError(msg) - raise IgnoreFile(msg, err.errno) - - self.is_binary = not bool(self.HTML_PAT.search(src[:1024])) - if not self.is_binary: - if encoding is None: - encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1] - self.encoding = encoding - else: - self.encoding = encoding - - src = src.decode(encoding, 'replace') - match = self.TITLE_PAT.search(src) - self.title = match.group(1) if match is not None else self.title - self.find_links(src) - - - - def __eq__(self, other): - return self.path == getattr(other, 'path', other) - - def __str__(self): - return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path) - - def __repr__(self): - return str(self) - - - def find_links(self, src): - for match in self.LINK_PAT.finditer(src): - url = None - for i in ('url1', 'url2', 'url3'): - url = match.group(i) - if url: - break - link = self.resolve(url) - if link not in self.links: - self.links.append(link) - - def resolve(self, url): - return Link(url, self.base) - - -def depth_first(root, flat, visited=set([])): - yield root - visited.add(root) - for link in root.links: - if link.path is not None and link not in visited: - try: - index = flat.index(link) - except ValueError: # Can happen if max_levels is used - continue - hf = flat[index] - if hf not in visited: - yield hf - visited.add(hf) - for hf in depth_first(hf, flat, visited): - if hf not in visited: - yield hf - visited.add(hf) - - -def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None): - ''' - Recursively traverse all links in the HTML file. 
- - :param max_levels: Maximum levels of recursion. Must be non-negative. 0 - implies that no links in the root HTML file are followed. - :param encoding: Specify character encoding of HTML files. If `None` it is - auto-detected. - :return: A pair of lists (breadth_first, depth_first). Each list contains - :class:`HTMLFile` objects. - ''' - assert max_levels >= 0 - level = 0 - flat = [HTMLFile(path_to_html_file, level, encoding, verbose)] - next_level = list(flat) - while level < max_levels and len(next_level) > 0: - level += 1 - nl = [] - for hf in next_level: - rejects = [] - for link in hf.links: - if link.path is None or link.path in flat: - continue - try: - nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf) - if nf.is_binary: - raise IgnoreFile('%s is a binary file'%nf.path, -1) - nl.append(nf) - flat.append(nf) - except IgnoreFile, err: - rejects.append(link) - if not err.doesnt_exist or verbose > 1: - print repr(err) - for link in rejects: - hf.links.remove(link) - - next_level = list(nl) - orec = sys.getrecursionlimit() - sys.setrecursionlimit(500000) - try: - return flat, list(depth_first(flat[0], flat)) - finally: - sys.setrecursionlimit(orec) - - -def opf_traverse(opf_reader, verbose=0, encoding=None): - ''' - Return a list of :class:`HTMLFile` objects in the order specified by the - `` element of the OPF. - - :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance. - :param encoding: Specify character encoding of HTML files. If `None` it is - auto-detected. 
- ''' - if not opf_reader.spine: - raise ValueError('OPF does not have a spine') - flat = [] - for path in opf_reader.spine.items(): - path = os.path.abspath(path) - if path not in flat: - flat.append(os.path.abspath(path)) - for item in opf_reader.manifest: - if 'html' in item.mime_type: - path = os.path.abspath(item.path) - if path not in flat: - flat.append(path) - for i, path in enumerate(flat): - if not os.path.exists(path): - path = path.replace('&', '%26') - if os.path.exists(path): - flat[i] = path - for item in opf_reader.itermanifest(): - item.set('href', item.get('href').replace('&', '%26')) - ans = [] - for path in flat: - if os.path.exists(path): - ans.append(HTMLFile(path, 0, encoding, verbose)) - else: - print 'WARNING: OPF spine item %s does not exist'%path - ans = [f for f in ans if not f.is_binary] - return ans - - -convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp']) -_span_pat = re.compile('', re.DOTALL|re.IGNORECASE) - -def sanitize_head(match): - x = match.group(1) - x = _span_pat.sub('', x) - return '\n'+x+'\n' - -class PreProcessor(object): - PREPROCESS = [ - # Some idiotic HTML generators (Frontpage I'm looking at you) - # Put all sorts of crap into . This messes up lxml - (re.compile(r']*>(.*?)', re.IGNORECASE|re.DOTALL), - sanitize_head), - # Convert all entities, since lxml doesn't handle them well - (re.compile(r'&(\S+?);'), convert_entities), - # Remove the ', re.IGNORECASE), - lambda match: ''), - ] - - # Fix pdftohtml markup - PDFTOHTML = [ - # Remove
    tags - (re.compile(r'', re.IGNORECASE), lambda match: '
    '), - # Remove page numbers - (re.compile(r'\d+
    ', re.IGNORECASE), lambda match: ''), - # Remove
    and replace

    with

    - (re.compile(r'\s*', re.IGNORECASE), lambda match: '

    '), - (re.compile(r'(.*)', re.IGNORECASE), - lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 - else match.group(1)), - # Remove hyphenation - (re.compile(r'-\n\r?'), lambda match: ''), - - # Remove gray background - (re.compile(r']+>'), lambda match : ''), - - # Remove non breaking spaces - (re.compile(ur'\u00a0'), lambda match : ' '), - - ] - - # Fix Book Designer markup - BOOK_DESIGNER = [ - # HR - (re.compile('


    ', re.IGNORECASE), - lambda match : ' '), - # Create header tags - (re.compile('<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?
  • ', re.IGNORECASE), - lambda match : '

    %s

    '%(match.group(2) if match.group(2) else 'center', match.group(3))), - (re.compile('<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), - lambda match : '

    %s

    '%(match.group(2) if match.group(2) else 'center', match.group(3))), - (re.compile('<]*?id=title[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), - lambda match : '

    %s

    '%(match.group(1),)), - (re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), - lambda match : '

    %s

    '%(match.group(1),)), - ] - - def is_baen(self, src): - return re.compile(r'<]*id=BookTitle', raw) is not None - - def is_pdftohtml(self, src): - return '' in src[:1000] - - def preprocess(self, html): - opts = getattr(self, 'opts', False) - if opts and hasattr(opts, 'profile') and getattr(opts.profile, 'remove_special_chars', False): - html = opts.profile.remove_special_chars.sub('', html) - html = html.replace(u'\u2011', '-') - if self.is_baen(html): - rules = [] - elif self.is_book_designer(html): - rules = self.BOOK_DESIGNER - elif self.is_pdftohtml(html): - rules = self.PDFTOHTML - else: - rules = [] - for rule in self.PREPROCESS + rules: - html = rule[0].sub(rule[1], html) - return html - -class Parser(PreProcessor, LoggingInterface): -# SELF_CLOSING_TAGS = 'hr|br|link|img|meta|input|area|base|basefont' -# SELF_CLOSING_RULES = [re.compile(p[0]%SELF_CLOSING_TAGS, re.IGNORECASE) for p in -# [ -# (r'<(?P%s)(?P(\s+[^<>]*){0,1})(?', -# '<\g\g />'), -# (), -# ] -# ] - - def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'): - LoggingInterface.__init__(self, logging.getLogger(name)) - self.setup_cli_handler(opts.verbose) - self.htmlfile = htmlfile - self.opts = opts - self.tdir = tdir - self.resource_map = resource_map - self.htmlfiles = htmlfiles - self.resource_dir = os.path.join(tdir, 'resources') - save_counter = 1 - self.htmlfile_map = {} - self.level = self.htmlfile.level - for f in self.htmlfiles: - name = os.path.basename(f.path) - name = os.path.splitext(name)[0] + '.xhtml' - if name in self.htmlfile_map.values(): - name = os.path.splitext(name)[0] + '_cr_%d'%save_counter + os.path.splitext(name)[1] - save_counter += 1 - self.htmlfile_map[f.path] = name - - self.parse_html() - # Handle tags inside embedded - # At least one source of EPUB files (Penguin) uses xlink:href - # without declaring the xlink namespace - for image in self.root.xpath('//image'): - for attr in image.attrib.keys(): - if attr.endswith(':href'): - nhref 
= self.rewrite_links(image.get(attr)) - image.set(attr, nhref) - - self.root.rewrite_links(self.rewrite_links, resolve_base_href=False) - for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates - if self.root.get(bad, None) is not None: - self.root.attrib.pop(bad) - - - - def save_path(self): - return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]) - - def save(self, strip_comments=False): - ''' - Save processed HTML into the content directory. - Should be called after all HTML processing is finished. - ''' - self.root.set('xmlns', 'http://www.w3.org/1999/xhtml') - self.root.set('xmlns:xlink', 'http://www.w3.org/1999/xlink') - for svg in self.root.xpath('//svg'): - svg.set('xmlns', 'http://www.w3.org/2000/svg') - - ans = tostring(self.root, pretty_print=self.opts.pretty_print) - ans = re.compile(r'', re.IGNORECASE).sub( - '\n\t\n', ans[:1000])+ans[1000:] - if strip_comments: - ans = re.compile(r'', re.DOTALL).sub('', ans) - with open(self.save_path(), 'wb') as f: - f.write(ans) - return f.name - - - def parse_html(self): - ''' Create lxml ElementTree from HTML ''' - self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:])) - if self.htmlfile.is_binary: - raise ValueError('Not a valid HTML file: '+self.htmlfile.path) - src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip() - src = src.replace('\x00', '').replace('\r', ' ') - src = self.preprocess(src) - # lxml chokes on unicode input when it contains encoding declarations - for pat in ENCODING_PATS: - src = pat.sub('', src) - src = src[src.find('<'):] - # Remove unclosed - - -
    - comic page #%d -
    - - - ''') - dir = os.path.dirname(pages[0]) - for i, page in enumerate(pages): - wrapper = WRAPPER%(i+1, os.path.basename(page), i+1) - page = os.path.join(dir, 'page_%d.html'%(i+1)) - open(page, 'wb').write(wrapper) - wrappers.append(page) - - mi = MetaInformation(opts.title, [opts.author]) - opf = OPFCreator(dir, mi) - opf.create_manifest([(w, None) for w in wrappers]) - opf.create_spine(wrappers) - metadata = os.path.join(dir, 'metadata.opf') - opf.render(open(metadata, 'wb')) - opts2 = html2epub_config('margin_left=0\nmargin_right=0\nmargin_top=0\nmargin_bottom=0').parse() - opts2.output = opts.output - html2epub(metadata, opts2) - -def create_lrf(pages, profile, opts, thumbnail=None): - width, height = PROFILES[profile] - ps = {} - ps['topmargin'] = 0 - ps['evensidemargin'] = 0 - ps['oddsidemargin'] = 0 - ps['textwidth'] = width - ps['textheight'] = height - book = Book(title=opts.title, author=opts.author, - bookid=uuid4().hex, - publisher='%s %s'%(__appname__, __version__), thumbnail=thumbnail, - category='Comic', pagestyledefault=ps, - booksetting=BookSetting(screenwidth=width, screenheight=height)) - for page in pages: - imageStream = ImageStream(page) - _page = book.create_page() - _page.append(ImageBlock(refstream=imageStream, - blockwidth=width, blockheight=height, xsize=width, - ysize=height, x1=width, y1=height)) - book.append(_page) - - book.renderLrf(open(opts.output, 'wb')) - print _('Output written to'), opts.output - - -def create_pdf(pages, profile, opts, thumbnail=None,toc=None): - width, height = PROFILES[profile] - - from reportlab.pdfgen import canvas - - cur_page=0 - heading = [] - if toc != None: - if len(toc) == 1: - toc = None - else: - toc_index = 0 - base_cur = 0 - rem = 0 - breaker = False - while True: - letter=toc[0][0][base_cur] - for i in range(len(toc)): - if letter != toc[i][0][base_cur]: - breaker = True - if breaker: - break - if letter == os.sep: - rem=base_cur - base_cur += 1 - toc.append(("Not seen",-1)) - - - pdf = 
canvas.Canvas(filename=opts.output, pagesize=(width,height+15)) - pdf.setAuthor(opts.author) - pdf.setTitle(opts.title) - - - for page in pages: - if opts.keep_aspect_ratio: - img = NewMagickWand() - if img < 0: - raise RuntimeError('Cannot create wand.') - if not MagickReadImage(img, page): - raise IOError('Failed to read image from: %'%page) - sizex = MagickGetImageWidth(img) - sizey = MagickGetImageHeight(img) - if opts.keep_aspect_ratio: - # Preserve the aspect ratio by adding border - aspect = float(sizex) / float(sizey) - if aspect <= (float(width) / float(height)): - newsizey = height - newsizex = int(newsizey * aspect) - deltax = (width - newsizex) / 2 - deltay = 0 - else: - newsizex = width - newsizey = int(newsizex / aspect) - deltax = 0 - deltay = (height - newsizey) / 2 - pdf.drawImage(page, x=deltax,y=deltay,width=newsizex, height=newsizey) - else: - pdf.drawImage(page, x=0,y=0,width=width, height=height) - if toc != None: - if toc[toc_index][1] == cur_page: - tmp=toc[toc_index][0] - toc_current=tmp[rem:len(tmp)-4] - index=0 - while True: - key = 'page%d-%d' % (cur_page, index) - pdf.bookmarkPage(key) - (head,dummy,list)=toc_current.partition(os.sep) - try: - if heading[index] != head: - heading[index] = head - pdf.addOutlineEntry(title=head,key=key,level=index) - except: - heading.append(head) - pdf.addOutlineEntry(title=head,key=key,level=index) - index += 1 - toc_current=list - if dummy == "": - break - toc_index += 1 - cur_page += 1 - pdf.showPage() - # Write the document to disk - pdf.save() - - -def do_convert(path_to_file, opts, notification=lambda m, p: p, output_format='lrf'): - path_to_file = run_plugins_on_preprocess(path_to_file) - source = path_to_file - to_delete = [] - toc = [] - list = [] - pages = [] - - - if not opts.title: - opts.title = os.path.splitext(os.path.basename(source))[0] - if not opts.output: - opts.output = os.path.abspath(os.path.splitext(os.path.basename(source))[0]+'.'+output_format) - if os.path.isdir(source): - for 
path in all_files( source , '*.cbr|*.cbz' ): - list.append( path ) - else: - list= [ os.path.abspath(source) ] - - for source in list: - tdir = extract_comic(source) - new_pages = find_pages(tdir, sort_on_mtime=opts.no_sort, verbose=opts.verbose) - thumbnail = None - if not new_pages: - raise ValueError('Could not find any pages in the comic: %s'%source) - if not getattr(opts, 'no_process', False): - new_pages, failures, tdir2 = process_pages(new_pages, opts, notification) - if not new_pages: - raise ValueError('Could not find any valid pages in the comic: %s'%source) - if failures: - print 'Could not process the following pages (run with --verbose to see why):' - for f in failures: - print '\t', f - thumbnail = os.path.join(tdir2, 'thumbnail.png') - if not os.access(thumbnail, os.R_OK): - thumbnail = None - toc.append((source,len(pages))) - pages.extend(new_pages) - to_delete.append(tdir) - - - if output_format == 'lrf': - create_lrf(pages, opts.profile, opts, thumbnail=thumbnail) - if output_format == 'epub': - create_epub(pages, opts.profile, opts, thumbnail=thumbnail) - if output_format == 'pdf': - create_pdf(pages, opts.profile, opts, thumbnail=thumbnail,toc=toc) - for tdir in to_delete: - shutil.rmtree(tdir) - - -def all_files(root, patterns='*'): - # Expand patterns from semicolon-separated string to list - patterns = patterns.split('|') - for path, subdirs, files in os.walk(root): - files.sort( ) - for name in files: - for pattern in patterns: - if fnmatch.fnmatch(name, pattern): - yield os.path.join(path, name) - break - - -def main(args=sys.argv, notification=None, output_format='lrf'): - parser = option_parser(output_format=output_format) - opts, args = parser.parse_args(args) - if len(args) < 2: - parser.print_help() - print '\nYou must specify a file to convert' - return 1 - - if not callable(notification): - pb = ProgressBar(terminal_controller, _('Rendering comic pages...'), - no_progress_bar=opts.no_progress_bar or getattr(opts, 'no_process', 
False)) - notification = pb.update - - source = os.path.abspath(args[1]) - do_convert(source, opts, notification, output_format=output_format) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/lrf/epub/__init__.py b/src/calibre/ebooks/lrf/epub/__init__.py deleted file mode 100644 index ab32bc9c41..0000000000 --- a/src/calibre/ebooks/lrf/epub/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' - diff --git a/src/calibre/ebooks/lrf/epub/convert_from.py b/src/calibre/ebooks/lrf/epub/convert_from.py deleted file mode 100644 index c564930ea5..0000000000 --- a/src/calibre/ebooks/lrf/epub/convert_from.py +++ /dev/null @@ -1,75 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' - -import os, sys, shutil, logging -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks import ConversionError, DRMError -from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file -from calibre.ebooks.metadata.opf import OPF -from calibre.ebooks.metadata.epub import OCFDirReader -from calibre.utils.zipfile import ZipFile -from calibre import setup_cli_handlers -from calibre.ptempfile import PersistentTemporaryDirectory - - -def option_parser(): - return lrf_option_parser( -_('''Usage: %prog [options] mybook.epub - - -%prog converts mybook.epub to mybook.lrf''') - ) - -def generate_html(pathtoepub, logger): - if not os.access(pathtoepub, os.R_OK): - raise ConversionError('Cannot read from ' + pathtoepub) - tdir = PersistentTemporaryDirectory('_epub2lrf') - #os.rmdir(tdir) - try: - ZipFile(pathtoepub).extractall(tdir) - except: - raise ConversionError, '.epub extraction failed' - if os.path.exists(os.path.join(tdir, 'META-INF', 'encryption.xml')): - raise DRMError(os.path.basename(pathtoepub)) - - return tdir - -def process_file(path, options, logger=None): - if logger is None: - level = logging.DEBUG if options.verbose else 
logging.INFO - logger = logging.getLogger('epub2lrf') - setup_cli_handlers(logger, level) - epub = os.path.abspath(os.path.expanduser(path)) - tdir = generate_html(epub, logger) - try: - ocf = OCFDirReader(tdir) - htmlfile = ocf.opf.spine[0].path - options.opf = os.path.join(tdir, ocf.container[OPF.MIMETYPE]) - if not options.output: - ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) - options.output = os.path.abspath(os.path.expanduser(options.output)) - options.use_spine = True - - html_process_file(htmlfile, options, logger=logger) - finally: - try: - shutil.rmtree(tdir) - except: - logger.warning('Failed to delete temporary directory '+tdir) - - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print 'No epub file specified' - return 1 - process_file(args[1], options, logger) - return 0 - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/lrf/fb2/convert_from.py b/src/calibre/ebooks/lrf/fb2/convert_from.py deleted file mode 100644 index 24562e708c..0000000000 --- a/src/calibre/ebooks/lrf/fb2/convert_from.py +++ /dev/null @@ -1,125 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Anatoly Shipitsin ' -""" -Convert .fb2 files to .lrf -""" -import os, sys, shutil, logging -from base64 import b64decode -from lxml import etree - -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks.metadata.meta import get_metadata -from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file -from calibre import setup_cli_handlers -from calibre.resources import fb2_xsl -from calibre.ptempfile import PersistentTemporaryDirectory -from calibre.ebooks.metadata.opf import OPFCreator -from calibre.ebooks.metadata import MetaInformation - - -def option_parser(): - parser = 
lrf_option_parser( -_('''%prog [options] mybook.fb2 - - -%prog converts mybook.fb2 to mybook.lrf''')) - parser.add_option('--debug-html-generation', action='store_true', default=False, - dest='debug_html_generation', help=_('Print generated HTML to stdout and quit.')) - parser.add_option('--keep-intermediate-files', action='store_true', default=False, - help=_('Keep generated HTML files after completing conversion to LRF.')) - return parser - -def extract_embedded_content(doc): - for elem in doc.xpath('./*'): - if 'binary' in elem.tag and elem.attrib.has_key('id'): - fname = elem.attrib['id'] - data = b64decode(elem.text.strip()) - open(fname, 'wb').write(data) - -def to_html(fb2file, tdir): - fb2file = os.path.abspath(fb2file) - cwd = os.getcwd() - try: - os.chdir(tdir) - print 'Parsing XML...' - parser = etree.XMLParser(recover=True, no_network=True) - doc = etree.parse(fb2file, parser) - extract_embedded_content(doc) - print 'Converting XML to HTML...' - styledoc = etree.fromstring(fb2_xsl) - - transform = etree.XSLT(styledoc) - result = transform(doc) - open('index.html', 'wb').write(transform.tostring(result)) - try: - mi = get_metadata(open(fb2file, 'rb'), 'fb2') - except: - mi = MetaInformation(None, None) - if not mi.title: - mi.title = os.path.splitext(os.path.basename(fb2file))[0] - if not mi.authors: - mi.authors = [_('Unknown')] - opf = OPFCreator(tdir, mi) - opf.create_manifest([('index.html', None)]) - opf.create_spine(['index.html']) - opf.render(open('metadata.opf', 'wb')) - return os.path.join(tdir, 'metadata.opf') - finally: - os.chdir(cwd) - - -def generate_html(fb2file, encoding, logger): - tdir = PersistentTemporaryDirectory('_fb22lrf') - to_html(fb2file, tdir) - return os.path.join(tdir, 'index.html') - -def process_file(path, options, logger=None): - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('fb22lrf') - setup_cli_handlers(logger, level) - fb2 = 
os.path.abspath(os.path.expanduser(path)) - f = open(fb2, 'rb') - mi = get_metadata(f, 'fb2') - f.close() - htmlfile = generate_html(fb2, options.encoding, logger) - tdir = os.path.dirname(htmlfile) - cwd = os.getcwdu() - try: - if not options.output: - ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) - options.output = os.path.abspath(os.path.expanduser(options.output)) - if not mi.title: - mi.title = os.path.splitext(os.path.basename(fb2))[0] - if (not options.title or options.title == _('Unknown')): - options.title = mi.title - if (not options.author or options.author == _('Unknown')) and mi.authors: - options.author = mi.authors.pop() - if (not options.category or options.category == _('Unknown')) and mi.category: - options.category = mi.category - if (not options.freetext or options.freetext == _('Unknown')) and mi.comments: - options.freetext = mi.comments - os.chdir(tdir) - html_process_file(htmlfile, options, logger) - finally: - os.chdir(cwd) - if getattr(options, 'keep_intermediate_files', False): - logger.debug('Intermediate files in '+ tdir) - else: - shutil.rmtree(tdir) - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print 'No fb2 file specified' - return 1 - process_file(args[1], options, logger) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/lrf/feeds/__init__.py b/src/calibre/ebooks/lrf/feeds/__init__.py deleted file mode 100644 index ec763fbda7..0000000000 --- a/src/calibre/ebooks/lrf/feeds/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env python - -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' diff --git a/src/calibre/ebooks/lrf/feeds/convert_from.py b/src/calibre/ebooks/lrf/feeds/convert_from.py deleted file mode 100644 index 6965ea7bf3..0000000000 --- 
a/src/calibre/ebooks/lrf/feeds/convert_from.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -''' -Convert web feeds to LRF files. -''' -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks.lrf.html.convert_from import process_file -from calibre.web.feeds.main import option_parser as feeds_option_parser -from calibre.web.feeds.main import run_recipe -from calibre.ptempfile import TemporaryDirectory -from calibre import sanitize_file_name, strftime - -import sys, os - -def option_parser(): - parser = feeds_option_parser() - parser.remove_option('--output-dir') - parser.remove_option('--lrf') - parser.subsume('FEEDS2DISK OPTIONS', _('Options to control the behavior of feeds2disk')) - lrf_parser = lrf_option_parser('') - lrf_parser.subsume('HTML2LRF OPTIONS', _('Options to control the behavior of html2lrf')) - parser.merge(lrf_parser) - return parser - -def main(args=sys.argv, notification=None, handler=None): - parser = option_parser() - opts, args = parser.parse_args(args) - opts.lrf = True - - if len(args) != 2 and opts.feeds is None: - parser.print_help() - return 1 - - recipe_arg = args[1] if len(args) > 1 else None - - with TemporaryDirectory('_feeds2lrf') as tdir: - opts.output_dir = tdir - - recipe = run_recipe(opts, recipe_arg, parser, notification=notification, handler=handler) - - htmlfile = os.path.join(tdir, 'index.html') - if not os.access(htmlfile, os.R_OK): - raise RuntimeError(_('Fetching of recipe failed: ')+recipe_arg) - - lparser = lrf_option_parser('') - ropts = lparser.parse_args(['html2lrf']+recipe.html2lrf_options)[0] - parser.merge_options(ropts, opts) - - if not opts.output: - ext = '.lrs' if opts.lrs else '.lrf' - fname = recipe.title + strftime(recipe.timefmt)+ext - opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname)) - print 'Generating LRF...' 
- process_file(htmlfile, opts) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/lrf/html/convert_from.py b/src/calibre/ebooks/lrf/html/convert_from.py index c72bcfbfe5..515ec4182d 100644 --- a/src/calibre/ebooks/lrf/html/convert_from.py +++ b/src/calibre/ebooks/lrf/html/convert_from.py @@ -1,12 +1,12 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -""" +""" Code to convert HTML ebooks into LRF ebooks. I am indebted to esperanc for the initial CSS->Xylog Style conversion code and to Falstaff for pylrs. """ -import os, re, sys, copy, glob, logging, tempfile +import os, re, sys, copy, glob, tempfile from collections import deque from urllib import unquote from urlparse import urlparse @@ -16,6 +16,7 @@ from calibre.customize.ui import run_plugins_on_postprocess try: from PIL import Image as PILImage + PILImage except ImportError: import Image as PILImage @@ -25,15 +26,14 @@ from calibre.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \ TextBlock, ImageBlock, JumpButton, CharButton, \ Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \ LrsError, Sup, Sub, EmpLine -from calibre.ebooks.lrf.pylrs.pylrs import Span -from calibre.ebooks.lrf import Book, entity_to_unicode -from calibre.ebooks.lrf import option_parser as lrf_option_parser +from calibre.ebooks.lrf.pylrs.pylrs import Span +from calibre.ebooks.lrf import Book from calibre.ebooks import ConversionError -from calibre.ebooks.lrf.html.table import Table -from calibre import filename_to_utf8, setup_cli_handlers, __appname__, \ - fit_image, LoggingInterface, preferred_encoding +from calibre.ebooks.lrf.html.table import Table +from calibre import filename_to_utf8, __appname__, \ + fit_image, preferred_encoding, entity_to_unicode from calibre.ptempfile import PersistentTemporaryFile -from calibre.devices.interface import Device +from calibre.devices.interface import DevicePlugin as Device from calibre.ebooks.lrf.html.color_map 
import lrs_color from calibre.ebooks.chardet import xml_to_unicode @@ -43,7 +43,7 @@ def update_css(ncss, ocss): ocss[key].update(ncss[key]) else: ocss[key] = ncss[key] - + def munge_paths(basepath, url): purl = urlparse(unquote(url),) path, fragment = purl[2], purl[5] @@ -74,57 +74,57 @@ def strip_style_comments(match): return src def tag_regex(tagname): - '''Return non-grouping regular expressions that match the opening and closing tags for tagname''' + '''Return non-grouping regular expressions that match the opening and closing tags for tagname''' return dict(open=r'(?:<\s*%(t)s\s+[^<>]*?>|<\s*%(t)s\s*>)'%dict(t=tagname), \ close=r''%dict(t=tagname)) -class HTMLConverter(object, LoggingInterface): +class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) - - + + MARKUP_MASSAGE = [ # Close tags - (re.compile(r']*)?/>', re.IGNORECASE), + (re.compile(r']*)?/>', re.IGNORECASE), lambda match: ''), - # Strip comments from )', re.IGNORECASE|re.DOTALL), strip_style_comments), - + # Remove self closing script tags as they also mess up BeautifulSoup (re.compile(r'(?i)]+?/>'), lambda match: ''), - + # BeautifulSoup treats self closing
    tags as open
    tags - (re.compile(r'(?i)<\s*div([^>]*)/\s*>'), + (re.compile(r'(?i)<\s*div([^>]*)/\s*>'), lambda match: '
    '%match.group(1)) - + ] # Fix Baen markup - BAEN = [ - (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE), + BAEN = [ + (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE), lambda match: match.group(1)), - (re.compile(r'

    \s*(\s*)\s*

    ', re.IGNORECASE), + (re.compile(r'

    \s*(\s*)\s*

    ', re.IGNORECASE), lambda match: match.group(1)), - (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*', re.IGNORECASE), + (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*', re.IGNORECASE), lambda match: ''), ] # Fix pdftohtml markup @@ -135,14 +135,14 @@ class HTMLConverter(object, LoggingInterface): (re.compile(r'\d+
    ', re.IGNORECASE), lambda match: ''), # Remove
    and replace

    with

    (re.compile(r'\s*', re.IGNORECASE), lambda match: '

    '), - (re.compile(r'(.*)', re.IGNORECASE), - lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 + (re.compile(r'(.*)', re.IGNORECASE), + lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 else match.group(1)), # Remove hyphenation (re.compile(r'-\n\r?'), lambda match: ''), - + ] - + # Fix Book Designer markup BOOK_DESIGNER = [ # HR @@ -161,23 +161,23 @@ class HTMLConverter(object, LoggingInterface): (re.compile('<]*?>( ){4}

    ', re.IGNORECASE), lambda match : '

    '), ] - + def __hasattr__(self, attr): if hasattr(self.options, attr): return True return object.__hasattr__(self, attr) - + def __getattr__(self, attr): if hasattr(self.options, attr): return getattr(self.options, attr) return object.__getattribute__(self, attr) - + def __setattr__(self, attr, val): if hasattr(self.options, attr): setattr(self.options, attr, val) else: object.__setattr__(self, attr, val) - + CSS = { 'h1' : {"font-size" : "xx-large", "font-weight":"bold", 'text-indent':'0pt'}, 'h2' : {"font-size" : "x-large", "font-weight":"bold", 'text-indent':'0pt'}, @@ -201,28 +201,28 @@ class HTMLConverter(object, LoggingInterface): 'sup' : {'vertical-align': 'super', 'font-size': '60%'}, 'sub' : {'vertical-align': 'sub', 'font-size': '60%'}, } - + def __init__(self, book, fonts, options, logger, paths): ''' Convert HTML files at C{paths} and add to C{book}. After creating the object, you must call L{self.writeto} to output the LRF/S file. - - @param book: The LRF book + + @param book: The LRF book @type book: L{lrf.pylrs.Book} @param fonts: dict specifying the font families to use ''' - # Defaults for various formatting tags + # Defaults for various formatting tags object.__setattr__(self, 'options', options) - LoggingInterface.__init__(self, logger) + self.log = logger self.fonts = fonts #: dict specifying font families to use - # Memory - self.scaled_images = {} #: Temporary files with scaled version of images - self.rotated_images = {} #: Temporary files with rotated version of images + # Memory + self.scaled_images = {} #: Temporary files with scaled version of images + self.rotated_images = {} #: Temporary files with rotated version of images self.text_styles = [] #: Keep track of already used textstyles self.block_styles = [] #: Keep track of already used blockstyles self.images = {} #: Images referenced in the HTML document self.targets = {} #: and id elements - self.links = deque() #: elements + self.links = deque() #: elements self.processed_files 
= [] self.extra_toc_entries = [] #: TOC entries gleaned from semantic information self.image_memory = [] @@ -236,30 +236,30 @@ class HTMLConverter(object, LoggingInterface): self.preserve_block_style = False #: Used so that

    tags in

    elements are handled properly self.avoid_page_break = False self.current_page = book.create_page() - - # Styles - self.blockquote_style = book.create_block_style(sidemargin=60, + + # Styles + self.blockquote_style = book.create_block_style(sidemargin=60, topskip=20, footskip=20) self.unindented_style = book.create_text_style(parindent=0) - - + + self.in_table = False # List processing self.list_level = 0 self.list_indent = 20 self.list_counter = 1 - + self.book = book #: The Book object representing a BBeB book - + self.override_css = {} self.override_pcss = {} - + if self._override_css is not None: if os.access(self._override_css, os.R_OK): src = open(self._override_css, 'rb').read() else: src = self._override_css - match = self.PAGE_BREAK_PAT.search(src) + match = self.PAGE_BREAK_PAT.search(src) if match and not re.match('avoid', match.group(1), re.IGNORECASE): self.page_break_found = True ncss, npcss = self.parse_css(src) @@ -267,12 +267,12 @@ class HTMLConverter(object, LoggingInterface): update_css(ncss, self.override_css) if npcss: update_css(npcss, self.override_pcss) - - - + + + paths = [os.path.abspath(path) for path in paths] paths = [path.decode(sys.getfilesystemencoding()) if not isinstance(path, unicode) else path for path in paths] - + while len(paths) > 0 and self.link_level <= self.link_levels: for path in paths: if path in self.processed_files: @@ -288,62 +288,62 @@ class HTMLConverter(object, LoggingInterface): if link['path'] == path: self.links.remove(link) break - self.log_warn('Could not process '+path) + self.log.warn('Could not process '+path) if self.verbose: - self.log_exception(' ') + self.log.exception(' ') self.links = self.process_links() self.link_level += 1 paths = [link['path'] for link in self.links] - + if self.current_page is not None and self.current_page.has_text(): self.book.append(self.current_page) - + for text, tb in self.extra_toc_entries: self.book.addTocEntry(text, tb) - + if self.base_font_size > 0: - 
self.log_info('\tRationalizing font sizes...') + self.log.info('\tRationalizing font sizes...') self.book.rationalize_font_sizes(self.base_font_size) - + def is_baen(self, soup): - return bool(soup.find('meta', attrs={'name':'Publisher', + return bool(soup.find('meta', attrs={'name':'Publisher', 'content':re.compile('Baen', re.IGNORECASE)})) - + def is_book_designer(self, raw): return bool(re.search('<]*id=BookTitle', raw)) - + def preprocess(self, raw): nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) nmassage.extend(HTMLConverter.MARKUP_MASSAGE) - + if not self.book_designer and self.is_book_designer(raw): self.book_designer = True - self.log_info(_('\tBook Designer file detected.')) - - self.log_info(_('\tParsing HTML...')) - + self.log.info(_('\tBook Designer file detected.')) + + self.log.info(_('\tParsing HTML...')) + if self.baen: nmassage.extend(HTMLConverter.BAEN) - + if self.pdftohtml: nmassage.extend(HTMLConverter.PDFTOHTML) if self.book_designer: nmassage.extend(HTMLConverter.BOOK_DESIGNER) try: - soup = BeautifulSoup(raw, + soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.XHTML_ENTITIES, markupMassage=nmassage) except ConversionError, err: if 'Failed to coerce to unicode' in str(err): raw = unicode(raw, 'utf8', 'replace') - soup = BeautifulSoup(raw, + soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.XHTML_ENTITIES, markupMassage=nmassage) else: raise if not self.baen and self.is_baen(soup): self.baen = True - self.log_info(_('\tBaen file detected. Re-parsing...')) + self.log.info(_('\tBaen file detected. 
Re-parsing...')) return self.preprocess(raw) if self.book_designer: t = soup.find(id='BookTitle') @@ -359,13 +359,13 @@ class HTMLConverter(object, LoggingInterface): try: dump = open(os.path.join(tdir, 'html2lrf-verbose.html'), 'wb') dump.write(unicode(soup).encode('utf-8')) - self.log_info(_('Written preprocessed HTML to ')+dump.name) + self.log.info(_('Written preprocessed HTML to ')+dump.name) dump.close() except: pass - + return soup - + def add_file(self, path): self.css = HTMLConverter.CSS.copy() self.pseudo_css = self.override_pcss.copy() @@ -374,13 +374,13 @@ class HTMLConverter(object, LoggingInterface): self.css[selector].update(self.override_css[selector]) else: self.css[selector] = self.override_css[selector] - + upath = path.encode(sys.getfilesystemencoding()) if isinstance(path, unicode) else path self.file_name = os.path.basename(upath.decode(sys.getfilesystemencoding())) - self.log_info(_('Processing %s'), repr(upath) if self.verbose else repr(self.file_name)) - + self.log.info(_('Processing %s')%( repr(upath) if self.verbose else repr(self.file_name))) + if not os.path.exists(upath): - upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names + upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names f = open(upath, 'rb') raw = f.read() if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files @@ -391,7 +391,7 @@ class HTMLConverter(object, LoggingInterface): raw = xml_to_unicode(raw, self.verbose)[0] f.close() soup = self.preprocess(raw) - self.log_info(_('\tConverting to BBeB...')) + self.log.info(_('\tConverting to BBeB...')) self.current_style = {} self.page_break_found = False if not isinstance(path, unicode): @@ -400,9 +400,9 @@ class HTMLConverter(object, LoggingInterface): self.previous_text = '\n' self.tops[path] = self.parse_file(soup) self.processed_files.append(path) - - - + + + def parse_css(self, style): """ Parse the contents of a - - -%(body)s - - -''' - res = [] 
- para = [] - styles = [] - for page in self.pages: - res.append(u''%page.id) - for group in page.groups: - if group.is_header or group.is_footer: - continue - if group.style is not None: - styles.append(u'.%s %s\n'%(group.id, group.style.to_css())) - for line in group.lines: - if line.is_para_start: - indent = group.left_margin - line.left - if para: - res.append(u'

    %s

    '%(indent, ''.join(para))) - para = [] - para.append(line.to_xhtml(group.id)) - if page.page_break_after: - res.append(u'
    ') - if para: - res.append(u'

    %s

    '%(''.join(para))) - para = [] - - return (header%dict(style='\n'.join(styles), body='\n'.join(res))).encode('utf-8') - -class PDFConverter(object): - - @classmethod - def generate_xml(cls, pathtopdf, logger): - pathtopdf = os.path.abspath(pathtopdf) - tdir = tempfile.mkdtemp('pdf2xml', __appname__) - atexit.register(shutil.rmtree, tdir) - xmlfile = os.path.basename(pathtopdf)+'.xml' - os.chdir(tdir) - cmd = PDFTOXML + ' -outline "%s" "%s"'%(pathtopdf, xmlfile) - p = subprocess.Popen(cmd, shell=True, stderr=subprocess.STDOUT, - stdout=subprocess.PIPE) - log = p.stdout.read() - ret = p.wait() - if ret != 0: - raise ConversionError, log - xmlfile = os.path.join(tdir, xmlfile) - if os.stat(xmlfile).st_size < 20: - raise ConversionError(os.path.basename(pathtopdf) + ' does not allow copying of text.') - return xmlfile - - - def __init__(self, pathtopdf, logger, opts): - self.cwd = os.getcwdu() - self.logger = logger - self.opts = opts - try: - self.logger.info('Converting PDF to XML') - self.xmlfile = self.generate_xml(pathtopdf, self.logger) - self.tdir = os.path.dirname(self.xmlfile) - self.data_dir = self.xmlfile + '_data' - outline_file = self.xmlfile.rpartition('.')[0]+'_outline.xml' - self.logger.info('Parsing XML') - self.document = PDFDocument(self.xmlfile) - self.outline = parse(outline_file) - finally: - os.chdir(self.cwd) - - def convert(self, output_dir): - doc = self.document.to_xhtml() - open(os.path.join(output_dir, 'document.html'), 'wb').write(doc) - - - -def option_parser(): - parser = OptionParser(usage=\ -''' -%prog [options] myfile.pdf - -Convert a PDF file to a HTML file. -''') - parser.add_option('-o', '--output-dir', default='.', - help=_('Path to output directory in which to create the HTML file. 
Defaults to current directory.')) - parser.add_option('--verbose', default=False, action='store_true', - help=_('Be more verbose.')) - return parser - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args() - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('pdf2html') - setup_cli_handlers(logger, level) - if len(args) != 1: - parser.print_help() - print _('You must specify a single PDF file.') - return 1 - options.output_dir = os.path.abspath(options.output_dir) - converter = PDFConverter(os.path.abspath(args[0]), logger, options) - converter.convert(options.output_dir) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/rtf/convert_from.py b/src/calibre/ebooks/lrf/rtf/convert_from.py deleted file mode 100644 index e4dd153d2a..0000000000 --- a/src/calibre/ebooks/lrf/rtf/convert_from.py +++ /dev/null @@ -1,190 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -import os, sys, shutil, logging, glob - -from lxml import etree - -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks.metadata.meta import get_metadata -from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file -from calibre import setup_cli_handlers -from calibre.libwand import convert, WandException -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup -from calibre.ebooks.lrf.rtf.xsl import xhtml -from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException -from calibre.ptempfile import PersistentTemporaryDirectory -from calibre.ebooks.metadata import MetaInformation -from calibre.ebooks.metadata.opf import OPFCreator - -def option_parser(): - parser = lrf_option_parser( -_('''%prog [options] mybook.rtf - - -%prog converts mybook.rtf to mybook.lrf''') - ) - parser.add_option('--keep-intermediate-files', action='store_true', 
default=False) - return parser - -def convert_images(html, logger): - wmfs = glob.glob('*.wmf') + glob.glob('*.WMF') - for wmf in wmfs: - target = os.path.join(os.path.dirname(wmf), os.path.splitext(os.path.basename(wmf))[0]+'.jpg') - try: - convert(wmf, target) - html = html.replace(os.path.basename(wmf), os.path.basename(target)) - except WandException, err: - logger.warning(u'Unable to convert image %s with error: %s'%(wmf, unicode(err))) - continue - return html - -def process_file(path, options, logger=None): - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('rtf2lrf') - setup_cli_handlers(logger, level) - rtf = os.path.abspath(os.path.expanduser(path)) - f = open(rtf, 'rb') - mi = get_metadata(f, 'rtf') - f.close() - tdir = PersistentTemporaryDirectory('_rtf2lrf') - html = generate_html(rtf, tdir) - cwd = os.getcwdu() - try: - if not options.output: - ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) - options.output = os.path.abspath(os.path.expanduser(options.output)) - if not mi.title: - mi.title = os.path.splitext(os.path.basename(rtf))[0] - if (not options.title or options.title == 'Unknown'): - options.title = mi.title - if (not options.author or options.author == 'Unknown') and mi.author: - options.author = mi.author - if (not options.category or options.category == 'Unknown') and mi.category: - options.category = mi.category - if (not options.freetext or options.freetext == 'Unknown') and mi.comments: - options.freetext = mi.comments - os.chdir(tdir) - html_process_file(html, options, logger) - finally: - os.chdir(cwd) - if hasattr(options, 'keep_intermediate_files') and options.keep_intermediate_files: - logger.debug('Intermediate files in '+ tdir) - else: - shutil.rmtree(tdir) - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) != 2: - 
parser.print_help() - print - print 'No rtf file specified' - return 1 - process_file(args[1], options, logger) - return 0 - - -def generate_xml(rtfpath, tdir): - from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf - ofile = os.path.join(tdir, 'index.xml') - cwd = os.getcwdu() - os.chdir(tdir) - rtfpath = os.path.abspath(rtfpath) - try: - parser = ParseRtf( - in_file = rtfpath, - out_file = ofile, - # Convert symbol fonts to unicode equivelents. Default - # is 1 - convert_symbol = 1, - - # Convert Zapf fonts to unicode equivelents. Default - # is 1. - convert_zapf = 1, - - # Convert Wingding fonts to unicode equivelents. - # Default is 1. - convert_wingdings = 1, - - # Convert RTF caps to real caps. - # Default is 1. - convert_caps = 1, - - # Indent resulting XML. - # Default is 0 (no indent). - indent = 1, - - # Form lists from RTF. Default is 1. - form_lists = 1, - - # Convert headings to sections. Default is 0. - headings_to_sections = 1, - - # Group paragraphs with the same style name. Default is 1. - group_styles = 1, - - # Group borders. Default is 1. - group_borders = 1, - - # Write or do not write paragraphs. Default is 0. - empty_paragraphs = 0, - ) - parser.parse_rtf() - finally: - os.chdir(cwd) - return ofile - - -def generate_html(rtfpath, tdir): - print 'Converting RTF to XML...' - rtfpath = os.path.abspath(rtfpath) - try: - xml = generate_xml(rtfpath, tdir) - except RtfInvalidCodeException: - raise Exception(_('This RTF file has a feature calibre does not support. Convert it to HTML and then convert it.')) - tdir = os.path.dirname(xml) - cwd = os.getcwdu() - os.chdir(tdir) - try: - print 'Parsing XML...' - parser = etree.XMLParser(recover=True, no_network=True) - try: - doc = etree.parse(xml, parser) - except: - raise - print 'Parsing failed. Trying to clean up XML...' - soup = BeautifulStoneSoup(open(xml, 'rb').read()) - doc = etree.fromstring(str(soup)) - print 'Converting XML to HTML...' 
- styledoc = etree.fromstring(xhtml) - - transform = etree.XSLT(styledoc) - result = transform(doc) - tdir = os.path.dirname(xml) - html = os.path.join(tdir, 'index.html') - f = open(html, 'wb') - res = transform.tostring(result) - res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] - f.write(res) - f.close() - try: - mi = get_metadata(open(rtfpath, 'rb'), 'rtf') - except: - mi = MetaInformation(None, None) - if not mi.title: - mi.title = os.path.splitext(os.path.basename(rtfpath))[0] - if not mi.authors: - mi.authors = [_('Unknown')] - opf = OPFCreator(tdir, mi) - opf.create_manifest([('index.html', None)]) - opf.create_spine(['index.html']) - opf.render(open('metadata.opf', 'wb')) - finally: - os.chdir(cwd) - return html - -if __name__ == '__main__': - sys.exit(main()) - \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/tags.py b/src/calibre/ebooks/lrf/tags.py index c8ef312ae3..17db193e1a 100644 --- a/src/calibre/ebooks/lrf/tags.py +++ b/src/calibre/ebooks/lrf/tags.py @@ -207,32 +207,32 @@ class Tag(object): s += " at %08X, contents: %s" % (self.offset, repr(self.contents)) return s - @apply - def byte(): + @dynamic_property + def byte(self): def fget(self): if len(self.contents) != 1: raise LRFParseError("Bad parameter for tag ID: %04X" % self.id) return struct.unpack("' - for match in re.finditer(r']*src="([^"]+)"', html): - fix_image_includes(os.path.dirname(txtfile), tdir, match) - p = os.path.join(tdir, 'index.html') - open(p, 'wb').write(html.encode('utf-8')) - mi = MetaInformation(os.path.splitext(os.path.basename(txtfile))[0], [_('Unknown')]) - opf = OPFCreator(tdir, mi) - opf.create_manifest([(os.path.join(tdir, 'index.html'), None)]) - opf.create_spine([os.path.join(tdir, 'index.html')]) - opf.render(open(os.path.join(tdir, 'metadata.opf'), 'wb')) - return p - -def process_file(path, options, logger=None): - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('txt2lrf') - 
setup_cli_handlers(logger, level) - txt = os.path.abspath(os.path.expanduser(path)) - if not hasattr(options, 'debug_html_generation'): - options.debug_html_generation = False - tdir = PersistentTemporaryDirectory('_txt2lrf') - htmlfile = generate_html(txt, options.encoding, tdir) - options.encoding = 'utf-8' - if not options.debug_html_generation: - options.force_page_break = 'h2' - if not options.output: - ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) - options.output = os.path.abspath(os.path.expanduser(options.output)) - if not options.title: - options.title = os.path.splitext(os.path.basename(path))[0] - html_process_file(htmlfile, options, logger) - else: - print open(htmlfile, 'rb').read() - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print 'No txt file specified' - return 1 - process_file(args[1], options, logger) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/lrf/txt/demo/demo.txt b/src/calibre/ebooks/lrf/txt/demo/demo.txt deleted file mode 100644 index af4139241b..0000000000 --- a/src/calibre/ebooks/lrf/txt/demo/demo.txt +++ /dev/null @@ -1,89 +0,0 @@ -Demonstration of `txt2lrf` -========================== - -`txt2lrf` provides a convenient way to create LRF files with good formatting. -`txt2lrf` recognizes a simple markup language called *markdown*. - -The idea is to provide a lightweight markup that can be used to create -TXT files that can be read by themselves or automatically converted to LRF. -[{@name=toc}]() - -

    - -///Table of Contents/// - - -Text formatting ---------------- -**Bold** and *italic* text is easily specified. - -> Blockquotes are also very simple to specify. -> This is a basic blockquote paragraph. I absolutely -> love block quotes don't you? - - This is a preformatted code block. No formatting rules are applied to text in this block and it is rendered in a monospaced font. - - -For details on the text formatting syntax visit - - http://daringfireball.net/projects/markdown/syntax -___ -[Table of Contents](#toc) - -Lists ------ -Both ordered and unordered lists are supported. - - -### Unordered lists - -+ What a -+ *nice* -+ list - - - -### Ordered lists - -1. One -2. Two -3. Three - -**Note:** Nested lists are not supported - -___ -[Table of Contents](#toc) - -Tables ------- - -Simple tables are easily generated - -| |* Col 1 *|* Col 2 *| -|* Row 1 *| (1, 1) | (1, 2) | -|* Row 2 *| (2, 1) | (2, 2) | - -**Note:** Nested tables are not supported - -___ -[Table of Contents](#toc) - -Images ------- - -`txt2lrf` also has support for inline images like -![this one](small.jpg) this one. - -___ -[Table of Contents](#toc) - -Automatic TOC Creation ----------------------- - -By inserting `///Table of Contents///` into the text at some point -a table of contents is automatically generated with links that point -to all headings underlined with `-------`. 
- -___ -[Table of Contents](#toc) - diff --git a/src/calibre/ebooks/lrf/txt/demo/small.jpg b/src/calibre/ebooks/lrf/txt/demo/small.jpg deleted file mode 100644 index 6dae5fde42..0000000000 Binary files a/src/calibre/ebooks/lrf/txt/demo/small.jpg and /dev/null differ diff --git a/src/calibre/ebooks/lrf/web/__init__.py b/src/calibre/ebooks/lrf/web/__init__.py deleted file mode 100644 index c25b6259a8..0000000000 --- a/src/calibre/ebooks/lrf/web/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' - - -builtin_profiles = [] -available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles] diff --git a/src/calibre/ebooks/lrf/web/convert_from.py b/src/calibre/ebooks/lrf/web/convert_from.py deleted file mode 100644 index ca523e869b..0000000000 --- a/src/calibre/ebooks/lrf/web/convert_from.py +++ /dev/null @@ -1,183 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -'''Convert websites into LRF files.''' - -import sys, tempfile, shutil, os, logging, imp, inspect, re -from urlparse import urlsplit - -from calibre import __appname__, setup_cli_handlers, CommandLineError, strftime -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks.lrf.html.convert_from import process_file - -from calibre.web.fetch.simple import create_fetcher - -from calibre.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile, create_class -from calibre.ebooks.lrf.web import builtin_profiles, available_profiles - - -def option_parser(): - parser = lrf_option_parser(usage='''%prog [options] website_profile\n\n''' - '''%prog downloads a site from the web and converts it ''' - '''into a LRF file for use with the SONY Reader. 
''' - '''website_profile is one of '''+str(available_profiles)+\ - ''' If you specify a website_profile of default or do not specify ''' - '''it, you must specify the --url option.''' - ) - - parser.add_option('-u', '--url', dest='url', default=None, - help='The URL to download. You only need to specify this if you are not specifying a website_profile.') - parser.add_option('--user-profile', default=None, - help='Path to a python file containing a user created profile. For help visit http://%s.kovidgoyal.net/wiki/UserProfiles'%__appname__) - parser.add_option('--username', dest='username', default=None, - help='Specify the username to be used while downloading. Only used if the profile supports it.') - parser.add_option('--password', dest='password', default=None, - help='Specify the password to be used while downloading. Only used if the profile supports it.') - parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %d s'%DefaultProfile.timeout, - default=None, type='int', dest='timeout') - parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %d'%DefaultProfile.timeout, - default=None, type='int', dest='max_recursions') - parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files', - help='The maximum number of files to download. This only applies to files from
    tags. Default is %d'%DefaultProfile.timeout) - parser.add_option('--delay', default=None, dest='delay', type='int', - help='Minimum interval in seconds between consecutive fetches. Default is %d s'%DefaultProfile.timeout) - parser.add_option('--dont-download-stylesheets', action='store_true', default=None, - help='Do not download CSS stylesheets.', dest='no_stylesheets') - parser.add_option('--match-regexp', dest='match_regexps', default=[], action='append', - help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.') - parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps', - help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. 
If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.') - parser.add_option('--keep-downloaded-files', default=False, action='store_true', - help='''Do not delete the downloaded files after creating the LRF''') - return parser - -def fetch_website(options, logger): - tdir = tempfile.mkdtemp(prefix=__appname__+'_', suffix='_web2lrf') - options.dir = tdir - fetcher = create_fetcher(options, logger) - fetcher.preprocess_regexps = options.preprocess_regexps - return fetcher.start_fetch(options.url), tdir - -def create_lrf(htmlfile, options, logger): - if not options.author or options.author.lower() == 'unknown': - options.author = __appname__ - options.header = True - if options.output: - options.output = os.path.abspath(os.path.expanduser(options.output)) - else: - options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf'))) - - process_file(htmlfile, options, logger) - -def process_profile(args, options, logger=None): - tdir = None - try: - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('web2lrf') - setup_cli_handlers(logger, level) - index = -1 - - if len(args) == 2 and re.search(r'class\s+\S+\(\S+\)\s*\:', args[1]): - profile = create_class(args[1]) - else: - if options.user_profile is not None: - path = os.path.abspath(options.user_profile) - name = os.path.splitext(os.path.basename(path))[0] - res = imp.find_module(name, [os.path.dirname(path)]) - module = imp.load_module(name, *res) - classes = inspect.getmembers(module, - lambda x : inspect.isclass(x) and issubclass(x, DefaultProfile)\ - and x is not DefaultProfile and x is not FullContentProfile) - if not classes: - raise CommandLineError('Invalid user profile '+path) - builtin_profiles.append(classes[0][1]) - available_profiles.append(name) - if len(args) < 2: - args.append(name) - args[1] = name - index = -1 - if len(args) == 2: - try: - if isinstance(args[1], 
basestring): - if args[1] != 'default': - index = available_profiles.index(args[1]) - except ValueError: - raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], available_profiles)) - else: - raise CommandLineError('Only one profile at a time is allowed.') - profile = DefaultProfile if index == -1 else builtin_profiles[index] - - - - profile = profile(logger, options.verbose, options.username, options.password) - if profile.browser is not None: - options.browser = profile.browser - - for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'): - val = getattr(options, opt) - if val is None: - setattr(options, opt, getattr(profile, opt)) - - if not options.url: - options.url = profile.url - - if not options.url: - raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,)) - - if not options.title: - title = profile.title - if not title: - title = urlsplit(options.url).netloc - options.title = title + strftime(profile.timefmt) - - options.match_regexps += profile.match_regexps - options.preprocess_regexps = profile.preprocess_regexps - options.filter_regexps += profile.filter_regexps - - options.encoding = profile.encoding if options.encoding is None else options.encoding - - if len(args) == 2 and args[1] != 'default': - options.anchor_ids = False - - htmlfile, tdir = fetch_website(options, logger) - options.encoding = 'utf-8' - cwd = os.getcwd() - if not options.output: - title = options.title.encode(sys.getfilesystemencoding()) if isinstance(options.title, unicode) else options.title - options.output = os.path.join(cwd, options.title+('.lrs' if options.lrs else '.lrf')) - if not os.path.isabs(options.output): - options.output = os.path.join(cwd, options.output) - - option_parser().parse_args(profile.html2lrf_options, options) - - try: - os.chdir(os.path.dirname(htmlfile)) - create_lrf(os.path.basename(htmlfile), options, logger) - finally: - os.chdir(cwd) - finally: 
- try: - profile.cleanup() - except: - pass - if tdir and os.path.isdir(tdir): - if options.keep_downloaded_files: - print 'Downloaded files in ', tdir - else: - shutil.rmtree(tdir) - - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) > 2 or (len(args) == 1 and not options.user_profile): - parser.print_help() - return 1 - try: - process_profile(args, options, logger=logger) - except CommandLineError, err: - print >>sys.stderr, err - return 0 - -if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/web/profiles/__init__.py b/src/calibre/ebooks/lrf/web/profiles/__init__.py deleted file mode 100644 index 9544cad7c3..0000000000 --- a/src/calibre/ebooks/lrf/web/profiles/__init__.py +++ /dev/null @@ -1,572 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -''' -Contains the Base Profiles that can be used to easily create profiles to download -particular websites. -''' - -import tempfile, time, calendar, re, operator, atexit, shutil, os -from htmlentitydefs import name2codepoint -from email.utils import formatdate - -from calibre import __appname__, iswindows, browser, strftime -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, CData, Tag - - -class DefaultProfile(object): - - #: The title to use for the LRF file - #: @type: string - title = 'Default Profile' - - #: Maximum number of articles to download from each feed - #: @type: integer - max_articles_per_feed = 10 - - #: If True process the element of the feed as HTML - #: @type: boolean - html_description = True - - #: How many days old should the oldest article downloaded from the feeds be - #: @type: integer - oldest_article = 7 - - #: Recommend frequency at which to download this profile. In days. 
- recommended_frequency = 7 - - #: Number of levels of links to follow - #: @type: integer - max_recursions = 1 - - #: Maximum number of files to download - #: @type: integer - max_files = 3000 - - #: Delay between consecutive downloads in seconds - #: @type: integer - delay = 0 - - #: Timeout for fetching files from server in seconds - #: @type: integer - timeout = 10 - - #: The format string for the date shown on the first page - #: @type: string - timefmt = ' [%a %d %b %Y]' - - #: The order of elements to search for a URL when parsing the RSS feed. You - #: can replace these elements by completely arbitrary elements to customize - #: feed processing. - #: @type: list of strings - url_search_order = ['guid', 'link'] - - #: The format string used to parse the publication date in the RSS feed. - #: If set to None some default heuristics are used, these may fail, - #: in which case set this to the correct string or re-implement - #: L{DefaultProfile.strptime} in your subclass. - #: @type: string or None - pubdate_fmt = None - - #: If True will look for a publication date for each article. - #: If False assumes the publication date is the current time. - #: @type: boolean - use_pubdate = True, - - #: Max number of characters in the short description. 
- #: Used by L{FullContentProfile} - #: @type: integer - summary_length = 500 - - #: If True stylesheets are not downloaded and processed - #: Convenient flag to disable loading of stylesheets for websites - #: that have overly complex stylesheets unsuitable for conversion - #: to ebooks formats - #: @type: boolean - no_stylesheets = False - - #: If False articles with the same title in the same feed - #: are not downloaded multiple times - #: @type: boolean - allow_duplicates = False - - #: If True the GUI will ask the user for a username and password - #: to use while downloading - #: @type: boolean - needs_subscription = False - - #: Specify an override encoding for sites that have an incorrect - #: charset specification. THe most common being specifying latin1 and - #: using cp1252 - encoding = None - - #: List of regular expressions that determines which links to follow - #: If empty, it is ignored. - #: Only one of L{match_regexps} or L{filter_regexps} should be defined - #: @type: list of strings - match_regexps = [] - - #: List of regular expressions that determines which links to ignore - #: If empty it is ignored - #: Only one of L{match_regexps} or L{filter_regexps} should be defined - #: @type: list of strings - filter_regexps = [] - - #: List of options to pass to html2lrf, to customize conversion - #: to LRF - #: @type: list of strings - html2lrf_options = [] - - #: List of regexp substitution rules to run on the downloaded HTML. Each element of the - #: list should be a two element tuple. The first element of the tuple should - #: be a compiled regular expression and the second a callable that takes - #: a single match object and returns a string to replace the match. - #: @type: list of tuples - preprocess_regexps = [] - - # See the built-in profiles for examples of these settings. 
- - #: The URL of the website - #: @type: string - url = '' - - feeds = [] - CDATA_PAT = re.compile(r'<\!\[CDATA\[(.*?)\]\]>', re.DOTALL) - - def get_feeds(self): - ''' - Return a list of RSS feeds to fetch for this profile. Each element of the list - must be a 2-element tuple of the form (title, url). - ''' - if not self.feeds: - raise NotImplementedError - return self.feeds - - @classmethod - def print_version(cls, url): - ''' - Take a URL pointing to an article and returns the URL pointing to the - print version of the article. - ''' - return url - - @classmethod - def get_browser(cls): - ''' - Return a browser instance used to fetch documents from the web. - - If your profile requires that you login first, override this method - in your subclass. See for example the nytimes profile. - ''' - return browser() - - - - - def __init__(self, logger, verbose=False, username=None, password=None, lrf=True): - self.logger = logger - self.username = username - self.password = password - self.verbose = verbose - self.lrf = lrf - self.temp_dir = tempfile.mkdtemp(prefix=__appname__+'_') - self.browser = self.get_browser() - try: - self.url = 'file:'+ ('' if iswindows else '//') + self.build_index() - except NotImplementedError: - self.url = None - atexit.register(cleanup, self.temp_dir) - - def build_index(self): - '''Build an RSS based index.html''' - articles = self.parse_feeds() - encoding = 'utf-8' if self.encoding is None else self.encoding - def build_sub_index(title, items): - ilist = '' - li = u'
  • %(title)s [%(date)s]
    \n'+\ - u'
    %(description)s
  • \n' - for item in items: - if not item.has_key('date'): - item['date'] = time.strftime('%a, %d %b', time.localtime()) - ilist += li%item - return u'''\ - - -

    %(title)s

    -
      - %(items)s -
    - - - '''%dict(title=title, items=ilist.rstrip()) - - cnum = 0 - clist = '' - categories = articles.keys() - categories.sort() - for category in categories: - cnum += 1 - cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html') - prefix = 'file:' if iswindows else '' - clist += u'
  • %s
  • \n'%(prefix+cfile, category) - src = build_sub_index(category, articles[category]) - open(cfile, 'wb').write(src.encode(encoding)) - - title = self.title - if not isinstance(title, unicode): - title = unicode(title, 'utf-8', 'replace') - src = u'''\ - - -

    %(title)s

    -
    %(date)s
    -
      - %(categories)s -
    - - - '''%dict(date=strftime('%a, %d %B, %Y'), - categories=clist, title=title) - index = os.path.join(self.temp_dir, 'index.html') - open(index, 'wb').write(src.encode(encoding)) - - return index - - - @classmethod - def tag_to_string(cls, tag, use_alt=True): - ''' - Convenience method to take a BeautifulSoup Tag and extract the text from it - recursively, including any CDATA sections and alt tag attributes. - @param use_alt: If True try to use the alt attribute for tags that don't have any textual content - @type use_alt: boolean - @return: A unicode (possibly empty) object - @rtype: unicode string - ''' - if not tag: - return '' - if isinstance(tag, basestring): - return tag - strings = [] - for item in tag.contents: - if isinstance(item, (NavigableString, CData)): - strings.append(item.string) - elif isinstance(item, Tag): - res = cls.tag_to_string(item) - if res: - strings.append(res) - elif use_alt and item.has_key('alt'): - strings.append(item['alt']) - return u''.join(strings) - - def get_article_url(self, item): - ''' - Return the article URL given an item Tag from a feed, or None if no valid URL is found - @type item: BeatifulSoup.Tag - @param item: A BeautifulSoup Tag instance corresponding to the tag from a feed. - @rtype: string or None - ''' - url = None - for element in self.url_search_order: - url = item.find(element.lower()) - if url: - break - return url - - - def parse_feeds(self, require_url=True): - ''' - Create list of articles from a list of feeds. - @param require_url: If True skip articles that don't have a link to a HTML page with the full article contents. - @type require_url: boolean - @rtype: dictionary - @return: A dictionary whose keys are feed titles and whose values are each - a list of dictionaries. 
Each list contains dictionaries of the form:: - { - 'title' : article title, - 'url' : URL of print version, - 'date' : The publication date of the article as a string, - 'description' : A summary of the article - 'content' : The full article (can be an empty string). This is used by FullContentProfile - } - ''' - added_articles = {} - feeds = self.get_feeds() - articles = {} - for title, url in feeds: - try: - src = self.browser.open(url).read() - except Exception, err: - self.logger.error('Could not fetch feed: %s\nError: %s'%(url, err)) - if self.verbose: - self.logger.exception(' ') - continue - - articles[title] = [] - added_articles[title] = [] - soup = BeautifulStoneSoup(src) - for item in soup.findAll('item'): - try: - atitle = item.find('title') - if not atitle: - continue - - atitle = self.tag_to_string(atitle) - if self.use_pubdate: - pubdate = item.find('pubdate') - if not pubdate: - pubdate = item.find('dc:date') - if not pubdate or not pubdate.string: - pubdate = formatdate() - pubdate = self.tag_to_string(pubdate) - pubdate = pubdate.replace('+0000', 'GMT') - - - url = self.get_article_url(item) - url = self.tag_to_string(url) - if require_url and not url: - self.logger.debug('Skipping article %s as it does not have a link url'%atitle) - continue - purl = url - try: - purl = self.print_version(url) - except Exception, err: - self.logger.debug('Skipping %s as could not find URL for print version. 
Error:\n%s'%(url, err)) - continue - - content = item.find('content:encoded') - if not content: - content = item.find('description') - if content: - content = self.process_html_description(content, strip_links=False) - else: - content = '' - - d = { - 'title' : atitle, - 'url' : purl, - 'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(), - 'date' : pubdate if self.use_pubdate else formatdate(), - 'content' : content, - } - delta = time.time() - d['timestamp'] - if not self.allow_duplicates: - if d['title'] in added_articles[title]: - continue - added_articles[title].append(d['title']) - if delta > self.oldest_article*3600*24: - continue - - except Exception, err: - if self.verbose: - self.logger.exception('Error parsing article:\n%s'%(item,)) - continue - try: - desc = '' - for c in item.findAll('description'): - desc = self.tag_to_string(c) - if desc: - break - d['description'] = self.process_html_description(desc) if self.html_description else desc.string - except: - d['description'] = '' - articles[title].append(d) - articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True) - articles[title] = articles[title][:self.max_articles_per_feed+1] - #for item in articles[title]: - # item.pop('timestamp') - if not articles[title]: - articles.pop(title) - return articles - - - def cleanup(self): - ''' - Called after LRF file has been generated. Use it to do any cleanup like - logging out of subscription sites, etc. - ''' - pass - - @classmethod - def process_html_description(cls, tag, strip_links=True): - ''' - Process a tag that contains HTML markup, either - entity encoded or escaped in a CDATA section. 
- @return: HTML - @rtype: string - ''' - src = '\n'.join(tag.contents) if hasattr(tag, 'contents') else tag - match = cls.CDATA_PAT.match(src.lstrip()) - if match: - src = match.group(1) - else: - replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ] - for e in replaced_entities: - ent = '&'+e+';' - src = src.replace(ent, unichr(name2codepoint[e])) - if strip_links: - src = re.compile(r'(.*?)', re.IGNORECASE|re.DOTALL).sub(r'\1', src) - - return src - - - DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6) - FULL_DAY_MAP = dict(Sunday=0, Monday=1, Tueday=2, Wednesday=3, Thursday=4, Friday=5, Saturday=6) - MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12) - FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6, - July=7, August=8, September=9, October=10, - November=11, December=12) - - @classmethod - def strptime(cls, src): - ''' - Take a string and return the date that string represents, in UTC as - an epoch (i.e. number of seconds since Jan 1, 1970). This function uses - a bunch of heuristics and is a prime candidate for being overridden in a - subclass. 
- @param src: Timestamp as a string - @type src: string - @return: time ans a epoch - @rtype: number - ''' - delta = 0 - zone = re.search(r'\s*(\+\d\d\:{0,1}\d\d)', src) - if zone: - delta = zone.group(1) - hrs, mins = int(delta[1:3]), int(delta[-2:].rstrip()) - delta = 60*(hrs*60 + mins) * (-1 if delta.startswith('-') else 1) - src = src.replace(zone.group(), '') - if cls.pubdate_fmt is None: - src = src.strip().split() - try: - src[0] = str(cls.DAY_MAP[src[0][:-1]])+',' - except KeyError: - src[0] = str(cls.FULL_DAY_MAP[src[0][:-1]])+',' - try: - src[2] = str(cls.MONTH_MAP[src[2]]) - except KeyError: - src[2] = str(cls.FULL_MONTH_MAP[src[2]]) - fmt = '%w, %d %m %Y %H:%M:%S' - src = src[:5] # Discard extra information - try: - time_t = time.strptime(' '.join(src), fmt) - except ValueError: - time_t = time.strptime(' '.join(src), fmt.replace('%Y', '%y')) - return calendar.timegm(time_t)-delta - else: - return calendar.timegm(time.strptime(src, cls.pubdate_fmt)) - - def command_line_options(self): - args = [] - args.append('--max-recursions='+str(self.max_recursions)) - args.append('--delay='+str(self.delay)) - args.append('--max-files='+str(self.max_files)) - for i in self.match_regexps: - args.append('--match-regexp="'+i+'"') - for i in self.filter_regexps: - args.append('--filter-regexp="'+i+'"') - return args - - -class FullContentProfile(DefaultProfile): - ''' - This profile is designed for feeds that embed the full article content in the RSS file. - ''' - - max_recursions = 0 - article_counter = 0 - - - def build_index(self): - '''Build an RSS based index.html. ''' - articles = self.parse_feeds(require_url=False) - - def build_sub_index(title, items): - ilist = '' - li = u'
  • %(title)s [%(date)s]
    \n'+\ - u'
    %(description)s
  • \n' - for item in items: - content = item['content'] - if not content: - self.logger.debug('Skipping article as it has no content:%s'%item['title']) - continue - item['description'] = cutoff(item['description'], self.summary_length)+'…' - self.article_counter = self.article_counter + 1 - url = os.path.join(self.temp_dir, 'article%d.html'%self.article_counter) - item['url'] = url - open(url, 'wb').write((u'''\ - - -

    %s

    -
    - %s -
    - - '''%(item['title'], content)).encode('utf-8') - ) - ilist += li%item - return u'''\ - - -

    %(title)s

    -
      - %(items)s -
    - - - '''%dict(title=title, items=ilist.rstrip()) - - cnum = 0 - clist = '' - categories = articles.keys() - categories.sort() - for category in categories: - cnum += 1 - cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html') - prefix = 'file:' if iswindows else '' - clist += u'
  • %s
  • \n'%(prefix+cfile, category) - src = build_sub_index(category, articles[category]) - open(cfile, 'wb').write(src.encode('utf-8')) - - src = '''\ - - -

    %(title)s

    -
    %(date)s
    -
      - %(categories)s -
    - - - '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()), - categories=clist, title=self.title) - index = os.path.join(self.temp_dir, 'index.html') - open(index, 'wb').write(src.encode('utf-8')) - return index - -def cutoff(src, pos, fuzz=50): - si = src.find(';', pos) - if si > 0 and si-pos > fuzz: - si = -1 - gi = src.find('>', pos) - if gi > 0 and gi-pos > fuzz: - gi = -1 - npos = max(si, gi) - if npos < 0: - npos = pos - return src[:npos+1] - -def create_class(src): - environment = {'FullContentProfile':FullContentProfile, 'DefaultProfile':DefaultProfile} - exec src in environment - for item in environment.values(): - if hasattr(item, 'build_index'): - if item.__name__ not in ['DefaultProfile', 'FullContentProfile']: - return item - -def cleanup(tdir): - try: - if os.path.isdir(tdir): - shutil.rmtree(tdir) - except: - pass - \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/web/profiles/ap.py b/src/calibre/ebooks/lrf/web/profiles/ap.py deleted file mode 100644 index 161699941a..0000000000 --- a/src/calibre/ebooks/lrf/web/profiles/ap.py +++ /dev/null @@ -1,38 +0,0 @@ -import re -from calibre.ebooks.lrf.web.profiles import DefaultProfile - - -class AssociatedPress(DefaultProfile): - - title = 'Associated Press' - max_recursions = 2 - max_articles_per_feed = 15 - html2lrf_options = ['--force-page-break-before-tag="chapter"'] - - - preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in -[ - (r'.*?' , lambda match : ''), - (r'.*?', lambda match : ''), - (r'.*?', lambda match : ''), - (r'.*?', lambda match : ''), - (r'.*?', lambda match : ''), - (r'
    .*?

    ', lambda match : '

    '), - (r'

    ', lambda match : '

    '), - (r'Learn more about our Privacy Policy.*?', lambda match : ''), - ] - ] - - - - def get_feeds(self): - return [ ('AP Headlines', 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=ORAST&SECTION=HOME'), - ('AP US News', 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=CAVIC&SECTION=HOME'), - ('AP World News', 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml?SITE=SCAND&SECTION=HOME'), - ('AP Political News', 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml?SITE=ORMED&SECTION=HOME'), - ('AP Washington State News', 'http://hosted.ap.org/lineups/WASHINGTONHEADS-rss_2.0.xml?SITE=NYPLA&SECTION=HOME'), - ('AP Technology News', 'http://hosted.ap.org/lineups/TECHHEADS-rss_2.0.xml?SITE=CTNHR&SECTION=HOME'), - ('AP Health News', 'http://hosted.ap.org/lineups/HEALTHHEADS-rss_2.0.xml?SITE=FLDAY&SECTION=HOME'), - ('AP Science News', 'http://hosted.ap.org/lineups/SCIENCEHEADS-rss_2.0.xml?SITE=OHCIN&SECTION=HOME'), - ('AP Strange News', 'http://hosted.ap.org/lineups/STRANGEHEADS-rss_2.0.xml?SITE=WCNC&SECTION=HOME'), - ] \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/web/profiles/atlantic.py b/src/calibre/ebooks/lrf/web/profiles/atlantic.py deleted file mode 100644 index eebbe84d96..0000000000 --- a/src/calibre/ebooks/lrf/web/profiles/atlantic.py +++ /dev/null @@ -1,47 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -import re -from calibre.ebooks.lrf.web.profiles import DefaultProfile -from calibre.ebooks.BeautifulSoup import BeautifulSoup - -class Atlantic(DefaultProfile): - - title = 'The Atlantic' - max_recursions = 2 - INDEX = 'http://www.theatlantic.com/doc/current' - - preprocess_regexps = [ - (re.compile(r'

    .*?<\!--\s+INVISIBLE SKIP .*?\s+-->', - lambda match : ''), - (r'', lambda match: ''), - ] - ] - - def __init__(self, logger, verbose=False, username=None, password=None): - DefaultProfile.__init__(self, username, password) - self.browser = None # Needed as otherwise there are timeouts while fetching actual articles - - def print_version(self, url): - return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '') - - def get_feeds(self): - src = self.browser.open('http://economist.com/rss/').read() - soup = BeautifulSoup(src) - feeds = [] - for ul in soup.findAll('ul'): - lis = ul.findAll('li') - try: - title, link = lis[0], lis[1] - except IndexError: - continue - title = title.string - if title: - title = title.strip() - if title not in self.__class__.TITLES: - continue - a = link.find('a') - feeds.append((title, a['href'].strip())) - - return feeds diff --git a/src/calibre/ebooks/lrf/web/profiles/faznet.py b/src/calibre/ebooks/lrf/web/profiles/faznet.py deleted file mode 100644 index 53f2cde752..0000000000 --- a/src/calibre/ebooks/lrf/web/profiles/faznet.py +++ /dev/null @@ -1,28 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -''' -Profile to download FAZ.net -''' -import re - -from calibre.ebooks.lrf.web.profiles import DefaultProfile - -class FazNet(DefaultProfile): - - title = 'FAZ NET' - max_recursions = 2 - html_description = True - max_articles_per_feed = 30 - - preprocess_regexps = [ - (re.compile(r'Zum Thema.*?', re.IGNORECASE | re.DOTALL), - lambda match : ''), - ] - - - def get_feeds(self): - return [ ('FAZ.NET', 'http://www.faz.net/s/Rub/Tpl~Epartner~SRss_.xml') ] - - def print_version(self, url): - return url.replace('.html?rss_aktuell', '~Afor~Eprint.html') - diff --git a/src/calibre/ebooks/lrf/web/profiles/jpost.py b/src/calibre/ebooks/lrf/web/profiles/jpost.py deleted file mode 100644 index ddc2a00e35..0000000000 --- a/src/calibre/ebooks/lrf/web/profiles/jpost.py +++ /dev/null @@ -1,36 +0,0 @@ -import re 
-from calibre.ebooks.lrf.web.profiles import DefaultProfile - -class JerusalemPost(DefaultProfile): - - title = 'Jerusalem Post' - max_recursions = 2 - max_articles_per_feed = 10 - - - - preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in -[ - (r'.*?' , lambda match : ''), - (r'.*?', lambda match : ''), - (r'.*?', lambda match : ''), - (r'.*?', lambda match : ''), - (r'
    ', lambda match : ''), - (r'\'NWAnews.com', lambda match : ''), - (r'', lambda match : ''), - (r'

    .*?', lambda match : ''), - - ] - ] - - def get_feeds(self): - return [ ('Front Page', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333346'), - ('Israel News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463156'), - ('Middle East News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333498'), - ('International News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463144'), - ('Editorials', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333211'), - ] - - def print_version(self, url): - return ('http://www.jpost.com/servlet/Satellite?cid=' + url.rpartition('&')[2] + '&pagename=JPost%2FJPArticle%2FPrinter') - diff --git a/src/calibre/ebooks/lrf/web/profiles/jutarnji.py b/src/calibre/ebooks/lrf/web/profiles/jutarnji.py deleted file mode 100644 index 93da341edd..0000000000 --- a/src/calibre/ebooks/lrf/web/profiles/jutarnji.py +++ /dev/null @@ -1,44 +0,0 @@ -''' - Profile to download Jutarnji.hr by Valloric -''' - -import re - -from calibre.ebooks.lrf.web.profiles import DefaultProfile - -class Jutarnji(DefaultProfile): - - title = 'Jutarnji' - max_recursions = 2 - timefmt = ' [%d %b %Y]' - max_articles_per_feed = 80 - html_description = True - no_stylesheets = True - - preprocess_regexps = [ - (re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match : ''), - (re.compile(r'

    .*?', re.IGNORECASE | re.DOTALL), lambda match : '
    '), - (re.compile(r')|(
    )|(
    )|(

    )|())', lambda match: '

    '), - - ## Remove any links/ads/comments/cruft from the end of the body of the article. - (r'(()|(
    )|(

    ©)|(