diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index 93d5813fba..c8d3e4dd3a 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -90,6 +90,8 @@ to auto-generate a Table of Contents. help=_('Maximum number of links from each HTML file to insert into the TOC. Set to 0 to disable. Default is: %default.')) toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False, help=_("Don't add auto-detected chapters to the Table of Contents.")) + toc('use_auto_toc', ['--use-auto-toc'], default=False, + help=_('Normally, if the source file already has a Table of Contents, it is used in preference to the autodetected one. With this option, the autodetected one is always used.')) layout = c.add_group('page layout', _('Control page layout')) layout('margin_top', ['--margin-top'], default=5.0, diff --git a/src/calibre/ebooks/epub/from_any.py b/src/calibre/ebooks/epub/from_any.py index 0246ca9b3a..4f372b85a0 100644 --- a/src/calibre/ebooks/epub/from_any.py +++ b/src/calibre/ebooks/epub/from_any.py @@ -15,7 +15,7 @@ from calibre.ebooks.epub import config as common_config from calibre.ebooks.epub.from_html import convert as html2epub from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.metadata import MetaInformation -from calibre.ebooks.metadata.opf import OPFCreator +from calibre.ebooks.metadata.opf2 import OPFCreator def lit2opf(path, tdir, opts): from calibre.ebooks.lit.reader import LitReader @@ -74,7 +74,7 @@ MAP = { 'txt' : txt2opf, 'pdf' : pdf2opf, } -SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf'] +SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'rar', 'zip'] def unarchive(path, tdir): extract(path, tdir) diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index 12bec12734..c3e29f761a 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -1,5 +1,5 @@ from 
__future__ import with_statement -from calibre.ebooks.metadata.opf import OPFReader + __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' @@ -17,8 +17,10 @@ from calibre.ebooks.epub import config as common_config from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.toc import TOC +from calibre.ebooks.metadata.opf2 import OPF from calibre.ebooks.epub import initialize_container, PROFILES from calibre.ebooks.epub.split import split +from calibre.constants import preferred_encoding class HTMLProcessor(Processor): @@ -84,11 +86,11 @@ def convert(htmlfile, opts, notification=None): opts.output = os.path.abspath(opts.output) if opts.override_css is not None: try: - opts.override_css = open(opts.override_css, 'rb').read().decode('utf-8', 'replace') + opts.override_css = open(opts.override_css, 'rb').read().decode(preferred_encoding, 'replace') except: - opts.override_css = opts.override_css.decode('utf-8', 'replace') + opts.override_css = opts.override_css.decode(preferred_encoding, 'replace') if htmlfile.lower().endswith('.opf'): - opf = OPFReader(htmlfile, os.path.dirname(os.path.abspath(htmlfile))) + opf = OPF(htmlfile, os.path.dirname(os.path.abspath(htmlfile))) filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding) mi = MetaInformation(opf) else: @@ -141,7 +143,7 @@ def convert(htmlfile, opts, notification=None): buf = cStringIO.StringIO() if mi.toc: rebase_toc(mi.toc, htmlfile_map, tdir) - if mi.toc is None or len(mi.toc) < 2: + if opts.use_auto_toc or mi.toc is None or len(mi.toc) < 2: mi.toc = generated_toc for item in mi.manifest: if getattr(item, 'mime_type', None) == 'text/html': diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py index c567080c8d..8ab0fec437 100644 --- a/src/calibre/ebooks/epub/split.py +++ b/src/calibre/ebooks/epub/split.py @@ -8,7 +8,6 @@ Split the 
flows in an epub file to conform to size limitations. ''' import os, math, copy, logging, functools -from urllib import unquote from lxml.etree import XPath as _XPath from lxml import etree, html @@ -57,7 +56,8 @@ class Splitter(LoggingInterface): if stylesheet is not None: self.find_page_breaks(stylesheet, root) - self.trees = self.split(root.getroottree()) + self.trees = [] + self.split(root.getroottree()) self.commit() self.log_info('\t\tSplit into %d parts.', len(self.trees)) if self.opts.verbose: @@ -81,7 +81,6 @@ class Splitter(LoggingInterface): tree2 = copy.deepcopy(tree) root2 = tree2.getroot() body, body2 = root.body, root2.body - trees = [] path = tree.getpath(split_point) split_point2 = root2.xpath(path)[0] @@ -137,13 +136,12 @@ class Splitter(LoggingInterface): for t, r in [(tree, root), (tree2, root2)]: size = len(tostring(r)) if size <= self.opts.profile.flow_size: - trees.append(t) - self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', len(trees), size/1024.) + self.trees.append(t) + self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', len(self.trees), size/1024.) else: - trees.extend(self.split(t)) + self.split(t) + - return trees - def find_page_breaks(self, stylesheet, root): ''' Find all elements that have either page-break-before or page-break-after set. 
@@ -334,7 +332,7 @@ def split(pathtoopf, opts): html_files = [] for item in opf.itermanifest(): if 'html' in item.get('media-type', '').lower(): - html_files.append(unquote(item.get('href')).split('/')[-1]) + html_files.append(item.get('href').split('/')[-1]) changes = [] for f in html_files: if os.stat(content(f)).st_size > opts.profile.flow_size: diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index c8ad45de8c..65d73fe503 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -20,10 +20,9 @@ get_text = XPath("//text()") from calibre import LoggingInterface, unicode_path from calibre.ebooks.chardet import xml_to_unicode, ENCODING_PATS from calibre.utils.config import Config, StringConfig -from calibre.ebooks.metadata.opf import OPFReader, OPFCreator from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.meta import get_metadata -from calibre.ebooks.metadata.opf2 import OPF +from calibre.ebooks.metadata.opf2 import OPF, OPFCreator from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile from calibre.utils.zipfile import ZipFile @@ -429,6 +428,8 @@ class Processor(Parser): def detect_chapters(self): self.detected_chapters = self.opts.chapter(self.root) for elem in self.detected_chapters: + text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) + self.log_info('\tDetected chapter: %s', text[:50]) if self.opts.chapter_mark in ('both', 'pagebreak'): style = elem.get('style', '').strip() if style and not style.endswith(';'): @@ -503,12 +504,16 @@ class Processor(Parser): # Add chapters to TOC if not self.opts.no_chapters_in_toc: + counter = 0 for elem in getattr(self, 'detected_chapters', []): text = (u''.join(elem.xpath('string()'))).strip() if text: name = self.htmlfile_map[self.htmlfile.path] href = 'content/'+name - add_item(href, None, text, target) + counter += 1 + id = elem.get('id', 'calibre_chapter_%d'%counter) + elem.set('id', id) + add_item(href, 
class Resource(object):
    '''
    Represents a resource (usually a file on the filesystem or a URL pointing
    to the web). Such resources are commonly referred to in OPF files.

    They have the interface:

    :member:`path`
    :member:`mime_type`
    :method:`href`

    '''

    def __init__(self, href_or_path, basedir=None, is_path=True):
        '''
        `href_or_path`: A filesystem path (if `is_path` is True) or a URL.
        `basedir`: Directory against which relative paths are resolved. If
                   None, the working directory at the time of the call is
                   used.
        `is_path`: If True, `href_or_path` is treated as a filesystem path,
                   otherwise it is parsed as a URL.
        '''
        # Resolve the default here rather than in the signature: a default of
        # os.getcwd() would be evaluated only once, when the module is imported.
        if basedir is None:
            basedir = os.getcwd()
        self._href = None
        self._basedir = basedir
        self.path = None
        self.fragment = ''
        try:
            self.mime_type = mimetypes.guess_type(href_or_path)[0]
        except Exception:
            self.mime_type = None
        if self.mime_type is None:
            self.mime_type = 'application/octet-stream'
        if is_path:
            path = href_or_path
            if not os.path.isabs(path):
                path = os.path.abspath(os.path.join(basedir, path))
            if isinstance(path, str):
                path = path.decode(sys.getfilesystemencoding())
            self.path = path
        else:
            url = urlparse(href_or_path)
            if url[0] not in ('', 'file'):
                # Not a local file: keep the URL as is, path stays None
                self._href = href_or_path
            else:
                pc = url[2]
                # Normalize the path component to unicode
                if isinstance(pc, unicode):
                    pc = pc.encode('utf-8')
                pc = pc.decode('utf-8')
                self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
                self.fragment = url[-1]

    def href(self, basedir=None):
        '''
        Return a URL pointing to this resource. If it is a file on the filesystem
        the URL is relative to `basedir`.

        `basedir`: If None, the basedir of this resource is used (see :method:`set_basedir`).
        If this resource has no basedir, then the current working directory is used as the basedir.
        '''
        if basedir is None:
            if self._basedir:
                basedir = self._basedir
            else:
                basedir = os.getcwd()
        if self.path is None:
            return self._href
        f = self.fragment.encode('utf-8') if isinstance(self.fragment, unicode) else self.fragment
        frag = '#'+f if self.fragment else ''
        if self.path == basedir:
            return ''+frag
        try:
            rpath = relpath(self.path, basedir)
        except OSError: # On windows path and basedir could be on different drives
            rpath = self.path
        if isinstance(rpath, unicode):
            rpath = rpath.encode('utf-8')
        return rpath.replace(os.sep, '/')+frag

    def set_basedir(self, path):
        'Set the directory that :method:`href` uses by default.'
        self._basedir = path

    def basedir(self):
        'Return the directory that :method:`href` uses by default.'
        return self._basedir

    def __repr__(self):
        return 'Resource(%s, %s)'%(repr(self.path), repr(self.href()))


class ResourceCollection(object):
    '''
    An ordered collection of :class:`Resource` objects.
    '''

    def __init__(self):
        self._resources = []

    def __iter__(self):
        return iter(self._resources)

    def __len__(self):
        return len(self._resources)

    def __getitem__(self, index):
        return self._resources[index]

    def __bool__(self):
        return len(self._resources) > 0

    # Python 2 looks up __nonzero__, not __bool__, for truth testing
    __nonzero__ = __bool__

    def __str__(self):
        resources = map(repr, self)
        return '[%s]'%', '.join(resources)

    def __repr__(self):
        return str(self)

    def append(self, resource):
        'Add `resource`. Raises ValueError if it is not a :class:`Resource`.'
        if not isinstance(resource, Resource):
            raise ValueError('Can only append objects of type Resource')
        self._resources.append(resource)

    def remove(self, resource):
        self._resources.remove(resource)

    def replace(self, start, end, items):
        'Same as list[start:end] = items'
        self._resources[start:end] = items

    @staticmethod
    def from_directory_contents(top, topdown=True):
        '''
        Return a collection containing one :class:`Resource` for every file
        found in the directory tree rooted at `top`. The basedir of each
        resource is set to `top`.

        `topdown`: Passed on to os.walk, controls the traversal order.
        '''
        collection = ResourceCollection()
        # os.walk yields (dirpath, dirnames, filenames); only the file names
        # are joined to dirpath to build resources.
        for dirpath, dirnames, filenames in os.walk(top, topdown=topdown):
            for name in filenames:
                path = os.path.abspath(os.path.join(dirpath, name))
                collection.append(Resource(path, basedir=top))
        return collection

    def set_basedir(self, path):
        'Set the basedir of every resource in this collection to `path`.'
        for res in self:
            res.set_basedir(path)
class OPFCreator(MetaInformation):
    '''
    Metadata container that can build a manifest, spine and guide and
    render itself as an OPF file (and optionally its TOC as an NCX file).
    '''

    def __init__(self, base_path, *args, **kwargs):
        '''
        Initialize.
        @param base_path: An absolute path to the directory in which this OPF file
        will eventually be. This is used by the L{create_manifest} method
        to convert paths to files into relative paths.
        '''
        MetaInformation.__init__(self, *args, **kwargs)
        self.base_path = os.path.abspath(base_path)
        # Fill in defaults for anything the supplied metadata left unset
        if self.application_id is None:
            self.application_id = str(uuid.uuid4())
        if not isinstance(self.toc, TOC):
            self.toc = None
        if not self.authors:
            self.authors = [_('Unknown')]
        if self.guide is None:
            self.guide = Guide()
        if self.cover:
            self.guide.set_cover(self.cover)

    def create_manifest(self, entries):
        '''
        Create the manifest from `entries`. Relative paths are resolved
        against the base path of this object.

        `entries`: List of (path, mime-type) If mime-type is None it is autodetected
        '''
        resolved = []
        for entry in entries:
            if not os.path.isabs(entry[0]):
                entry = (os.path.abspath(os.path.join(self.base_path, entry[0])),
                         entry[1])
            resolved.append(entry)
        self.manifest = Manifest.from_paths(resolved)
        self.manifest.set_basedir(self.base_path)

    def create_manifest_from_files_in(self, files_and_dirs):
        '''
        Create the manifest from a mixed list of files and directories.
        Directories are walked recursively and every file found in them is
        added. The mime-type of every entry is left to be autodetected.
        '''
        collected = []
        for candidate in files_and_dirs:
            if not os.path.isdir(candidate):
                collected.append((candidate, None))
                continue
            for walked in os.walk(candidate):
                parent, names = walked[0], walked[-1]
                for fname in names:
                    full = os.path.join(parent, fname)
                    if os.path.isfile(full):
                        collected.append((full, None))

        self.create_manifest(collected)

    def create_spine(self, entries):
        '''
        Create the spine. Must first call :method:`create_manifest`.

        `entries`: List of paths
        '''
        resolved = []
        for path in entries:
            if not os.path.isabs(path):
                path = os.path.abspath(os.path.join(self.base_path, path))
            resolved.append(path)
        self.spine = Spine.from_paths(resolved, self.manifest)

    def set_toc(self, toc):
        '''
        Set the toc. You must call :method:`create_spine` before calling this
        method.

        :param toc: A :class:`TOC` object
        '''
        self.toc = toc

    def create_guide(self, guide_element):
        'Create the guide from `guide_element`, relative to the base path.'
        guide = Guide.from_opf_guide(guide_element, self.base_path)
        self.guide = guide
        self.guide.set_basedir(self.base_path)

    def render(self, opf_stream, ncx_stream=None, ncx_manifest_entry=None):
        '''
        Write the OPF to `opf_stream`. If this object has a TOC and
        `ncx_stream` is not None, the TOC is rendered to `ncx_stream`.

        `ncx_manifest_entry`: If not None, path of the NCX file. Any manifest
        item with the id 'ncx' is replaced by an entry for this path.
        '''
        # NOTE(review): statement nesting in this method was reconstructed
        # from whitespace-collapsed source -- confirm against the original.
        from calibre.resources import opf_template
        from calibre.utils.genshi.template import MarkupTemplate
        template = MarkupTemplate(opf_template)
        if self.manifest:
            self.manifest.set_basedir(self.base_path)
            if ncx_manifest_entry is not None:
                ncx_path = ncx_manifest_entry
                if not os.path.isabs(ncx_path):
                    ncx_path = os.path.join(self.base_path, ncx_path)
                # Collect first, then remove, so the manifest is not
                # modified while it is being iterated
                for stale in [m for m in self.manifest if m.id == 'ncx']:
                    self.manifest.remove(stale)
                self.manifest.append(ManifestItem(ncx_path, self.base_path))
                ncx_item = self.manifest[-1]
                ncx_item.id = 'ncx'
                ncx_item.mime_type = 'application/x-dtbncx+xml'
        if not self.guide:
            self.guide = Guide()
        if self.cover:
            cover_path = self.cover
            if not os.path.isabs(cover_path):
                cover_path = os.path.abspath(os.path.join(self.base_path, cover_path))
            self.guide.set_cover(cover_path)
        self.guide.set_basedir(self.base_path)

        rendered = template.generate(__appname__=__appname__, mi=self).render('xml')
        opf_stream.write(rendered)
        opf_stream.flush()
        toc = getattr(self, 'toc', None)
        if toc is not None and ncx_stream is not None:
            toc.render(ncx_stream, self.application_id)
            ncx_stream.flush()
+ + Qt::AlignCenter + + + + + opt_prefer_metadata_cover @@ -590,12 +590,6 @@ p, li { white-space: pre-wrap; } - label_17 - opt_chapter - label_8 - opt_chapter_mark - label_9 - verticalSpacer @@ -604,10 +598,10 @@ p, li { white-space: pre-wrap; } Automatic &Table of Contents - + - + Number of &links to add to Table of Contents @@ -617,17 +611,17 @@ p, li { white-space: pre-wrap; } - + - Do not add &detected chapters ot the Table of Contents + Do not add &detected chapters to the Table of Contents - + - + Table of Contents &recursion @@ -637,6 +631,13 @@ p, li { white-space: pre-wrap; } + + + + &Force use of auto-generated Table of Contents + + +