Conversion pipeline is now a superset of any2epub :)

2025-11-13 10:06:59 -05:00 · 2009-04-22 14:35:32 -07:00 · 2009-04-22 14:35:32 -07:00 · 0b6dc7f8ed
commit 0b6dc7f8ed
parent 14636efa24
12 changed files with 374 additions and 25 deletions
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@ -117,6 +117,9 @@ def add_pipeline_options(parser, plumber):
                      'line_height',
                      'linearize_tables',
                      'extra_css',
+                      'margin_top', 'margin_left', 'margin_right',
+                      'margin_bottom', 'dont_justify',
+                      'insert_blank_line', 'remove_paragraph_spacing',
                  ]
                  ),

@ -124,6 +127,8 @@ def add_pipeline_options(parser, plumber):
                  _('Control auto-detection of document structure.'),
                  [
                      'dont_split_on_page_breaks', 'chapter', 'chapter_mark',
+                      'prefer_metadata_cover', 'remove_first_image',
+                      'insert_comments',
                  ]
                  ),

--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -195,7 +195,7 @@ OptionRecommendation(name='toc_filter',

 OptionRecommendation(name='chapter',
        recommended_value="//*[((name()='h1' or name()='h2') and "
-              "re:test(., 'chapter|book|section|part', 'i')) or @class "
+              r"re:test(., 'chapter|book|section|part\s+', 'i')) or @class "
              "= 'chapter']", level=OptionRecommendation.LOW,
            help=_('An XPath expression to detect chapter titles. The default '
                'is to consider <h1> or <h2> tags that contain the words '
@ -227,6 +227,64 @@ OptionRecommendation(name='extra_css',
                'rules.')
        ),

+OptionRecommendation(name='margin_top',
+        recommended_value=5.0, level=OptionRecommendation.LOW,
+        help=_('Set the top margin in pts. Default is %default')),
+
+OptionRecommendation(name='margin_bottom',
+        recommended_value=5.0, level=OptionRecommendation.LOW,
+        help=_('Set the bottom margin in pts. Default is %default')),
+
+OptionRecommendation(name='margin_left',
+        recommended_value=5.0, level=OptionRecommendation.LOW,
+        help=_('Set the left margin in pts. Default is %default')),
+
+OptionRecommendation(name='margin_right',
+        recommended_value=5.0, level=OptionRecommendation.LOW,
+        help=_('Set the right margin in pts. Default is %default')),
+
+OptionRecommendation(name='dont_justify',
+        recommended_value=False, level=OptionRecommendation.LOW,
+        help=_('Do not force text to be justified in output. Whether text '
+            'is actually displayed justified or not depends on whether '
+            'the ebook format and reading device support justification.')
+        ),
+
+OptionRecommendation(name='remove_paragraph_spacing',
+        recommended_value=False, level=OptionRecommendation.LOW,
+        help=_('Remove spacing between paragraphs. Also sets an indent on '
+        'paragraphs of 1.5em. Spacing removal will not work '
+        'if the source file does not use paragraphs (<p> or <div> tags).')
+        ),
+
+OptionRecommendation(name='prefer_metadata_cover',
+        recommended_value=False, level=OptionRecommendation.LOW,
+        help=_('Use the cover detected from the source file in preference '
+        'to the specified cover.')
+        ),
+
+OptionRecommendation(name='insert_blank_line',
+        recommended_value=False, level=OptionRecommendation.LOW,
+        help=_('Insert a blank line between paragraphs. Will not work '
+            'if the source file does not use paragraphs (<p> or <div> tags).'
+            )
+        ),
+
+OptionRecommendation(name='remove_first_image',
+        recommended_value=False, level=OptionRecommendation.LOW,
+        help=_('Remove the first image from the input ebook. Useful if the '
+        'first image in the source file is a cover and you are specifying '
+        'an external cover.'
+            )
+        ),
+
+OptionRecommendation(name='insert_comments',
+        recommended_value=False, level=OptionRecommendation.LOW,
+        help=_('Insert the comments/summary from the book metadata at the start of '
+            'the book. This is useful if your ebook reader does not support '
+            'displaying the comments from the metadata.'
+            )
+        ),


 OptionRecommendation(name='read_metadata_from_opf',
@ -244,7 +302,8 @@ OptionRecommendation(name='title',

 OptionRecommendation(name='authors',
    recommended_value=None, level=OptionRecommendation.LOW,
-    help=_('Set the authors. Multiple authors should be separated ')),
+    help=_('Set the authors. Multiple authors should be separated by '
+    'ampersands.')),

 OptionRecommendation(name='title_sort',
    recommended_value=None, level=OptionRecommendation.LOW,
@ -428,7 +487,6 @@ OptionRecommendation(name='language',
            mi.cover = None
        self.user_metadata = mi

-
    def setup_options(self):
        '''
        Setup the `self.opts` object.
@ -479,9 +537,16 @@ OptionRecommendation(name='language',
        if not hasattr(self.oeb, 'manifest'):
            self.oeb = create_oebbook(self.log, self.oeb, self.opts)

+        from calibre.ebooks.oeb.transforms.guide import Clean
+        Clean()(self.oeb, self.opts)
+
        self.opts.source = self.opts.input_profile
        self.opts.dest = self.opts.output_profile

+        from calibre.ebooks.oeb.transforms.metadata import MergeMetadata
+        MergeMetadata()(self.oeb, self.user_metadata,
+                self.opts.prefer_metadata_cover)
+
        from calibre.ebooks.oeb.transforms.structure import DetectStructure
        DetectStructure()(self.oeb, self.opts)

@ -495,6 +560,9 @@ OptionRecommendation(name='language',
        else:
            fkey = map(float, fkey.split(','))

+        from calibre.ebooks.oeb.transforms.jacket import Jacket
+        Jacket()(self.oeb, self.opts)
+
        if self.opts.extra_css and os.path.exists(self.opts.extra_css):
            self.opts.extra_css = open(self.opts.extra_css, 'rb').read()

--- a/src/calibre/ebooks/epub/output.py
+++ b/src/calibre/ebooks/epub/output.py
@ -0,0 +1,22 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+from calibre.customize.conversion import OutputFormatPlugin
+from calibre import CurrentDir
+
+class EPUBOutput(OutputFormatPlugin):
+    '''Output format plugin declared for the ``epub`` file type.'''
+
+    name = 'EPUB Output'
+    author = 'Kovid Goyal'
+    file_type = 'epub'
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        # As written, this only records the logger and the options on the
+        # plugin instance; nothing is written to output_path here.
+        self.log, self.opts = log, opts
+
+
--- a/src/calibre/ebooks/metadata/init.py
+++ b/src/calibre/ebooks/metadata/init.py
@ -260,6 +260,9 @@ class MetaInformation(object):
            x = 1.0
        return '%d'%x if int(x) == x else '%.2f'%x

+    def authors_from_string(self, raw):
+        '''Set self.authors to the result of string_to_authors(raw).'''
+        self.authors = string_to_authors(raw)
+
    def __unicode__(self):
        ans = []
        def fmt(x, y):
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -514,7 +514,8 @@ class Metadata(object):
        scheme  = Attribute(lambda term: 'scheme' if \
                                term == OPF('meta') else OPF('scheme'),
                            [DC('identifier'), OPF('meta')])
-        file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor')])
+        file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor'),
+                                             DC('title')])
        role    = Attribute(OPF('role'), [DC('creator'), DC('contributor')])
        event   = Attribute(OPF('event'), [DC('date')])
        id      = Attribute('id')
@ -593,6 +594,19 @@ class Metadata(object):
            yield key
    __iter__ = iterkeys

+    def clear(self, key):
+        '''Remove every entry stored under `key` in self.items.'''
+        l = self.items[key]
+        # Iterate over a snapshot so that removing from `l` is safe.
+        for x in list(l):
+            l.remove(x)
+
+    def filter(self, key, predicate):
+        '''
+        Remove the entries stored under `key` for which predicate(entry) is
+        true. Note that matching entries are dropped, not kept.
+        '''
+        l = self.items[key]
+        # Iterate over a snapshot so that removing from `l` is safe.
+        for x in list(l):
+            if predicate(x):
+                l.remove(x)
+
+
+
    def __getitem__(self, key):
        return self.items[key]

@ -1011,7 +1025,7 @@ class Manifest(object):
                media_type = OEB_DOC_MIME
            elif media_type in OEB_STYLES:
                media_type = OEB_CSS_MIME
-            attrib = {'id': item.id, 'href': item.href,
+            attrib = {'id': item.id, 'href': urlunquote(item.href),
                      'media-type': media_type}
            if item.fallback:
                attrib['fallback'] = item.fallback
@ -1202,6 +1216,9 @@ class Guide(object):
        self.refs[type] = ref
        return ref

+    def remove(self, type):
+        '''
+        Remove the reference registered under `type` from self.refs and
+        return it. Returns None when there is no such reference.
+        '''
+        return self.refs.pop(type, None)
+
    def iterkeys(self):
        for type in self.refs:
            yield type
@ -1229,7 +1246,7 @@ class Guide(object):
    def to_opf1(self, parent=None):
        elem = element(parent, 'guide')
        for ref in self.refs.values():
-            attrib = {'type': ref.type, 'href': ref.href}
+            attrib = {'type': ref.type, 'href': urlunquote(ref.href)}
            if ref.title:
                attrib['title'] = ref.title
            element(elem, 'reference', attrib=attrib)
@ -1345,7 +1362,7 @@ class TOC(object):
    def to_opf1(self, tour):
        for node in self.nodes:
            element(tour, 'site', attrib={
-                'title': node.title, 'href': node.href})
+                'title': node.title, 'href': urlunquote(node.href)})
            node.to_opf1(tour)
        return tour

@ -1358,7 +1375,7 @@ class TOC(object):
            point = element(parent, NCX('navPoint'), attrib=attrib)
            label = etree.SubElement(point, NCX('navLabel'))
            element(label, NCX('text')).text = node.title
-            element(point, NCX('content'), src=node.href)
+            element(point, NCX('content'), src=urlunquote(node.href))
            node.to_ncx(point)
        return parent

--- a/src/calibre/ebooks/oeb/output.py
+++ b/src/calibre/ebooks/oeb/output.py
@ -9,6 +9,7 @@ from lxml import etree

 from calibre.customize.conversion import OutputFormatPlugin
 from calibre import CurrentDir
+from urllib import unquote

 class OEBOutput(OutputFormatPlugin):

@ -32,7 +33,7 @@ class OEBOutput(OutputFormatPlugin):
                        f.write(raw)

            for item in oeb_book.manifest:
-                path = os.path.abspath(item.href)
+                path = os.path.abspath(unquote(item.href))
                dir = os.path.dirname(path)
                if not os.path.exists(dir):
                    os.makedirs(dir)
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@ -11,6 +11,7 @@ __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
 import os
 import itertools
 import re
+import logging
 import copy
 from weakref import WeakKeyDictionary
 from xml.dom import SyntaxErr as CSSSyntaxError
@ -106,7 +107,8 @@ class CSSSelector(etree.XPath):
 class Stylizer(object):
    STYLESHEETS = WeakKeyDictionary()

-    def __init__(self, tree, path, oeb, profile=PROFILES['PRS505'], extra_css=''):
+    def __init__(self, tree, path, oeb, profile=PROFILES['PRS505'],
+            extra_css='', user_css=''):
        self.oeb = oeb
        self.profile = profile
        self.logger = oeb.logger
@ -115,7 +117,8 @@ class Stylizer(object):
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [HTML_CSS_STYLESHEET]
        head = xpath(tree, '/h:html/h:head')[0]
-        parser = cssutils.CSSParser(fetcher=self._fetch_css_file)
+        parser = cssutils.CSSParser(fetcher=self._fetch_css_file,
+                log=logging.getLogger('calibre.css'))
        for elem in head:
            if elem.tag == XHTML('style') and elem.text \
               and elem.get('type', CSS_MIME) in OEB_STYLES:
@ -135,8 +138,9 @@ class Stylizer(object):
                        (path, item.href))
                    continue
                stylesheets.append(sitem.data)
-        if extra_css:
-            text = XHTML_CSS_NAMESPACE + extra_css
+        for x in (extra_css, user_css):
+            if x:
+                text = XHTML_CSS_NAMESPACE + x
                stylesheet = parser.parseString(text, href=cssname)
                stylesheet.namespaces['h'] = XHTML_NS
                stylesheets.append(stylesheet)
@ -288,6 +292,9 @@ class Style(object):
        self._lineHeight = None
        stylizer._styles[element] = self

+    def set(self, prop, val):
+        '''Store `val` under the property name `prop` in self._style.'''
+        self._style[prop] = val
+
    def _update_cssdict(self, cssdict):
        self._style.update(cssdict)

--- a/src/calibre/ebooks/oeb/transforms/flatcss.py
+++ b/src/calibre/ebooks/oeb/transforms/flatcss.py
@ -114,12 +114,27 @@ class CSSFlattener(object):
    def stylize_spine(self):
        self.stylizers = {}
        profile = self.context.source
+        css = ''
        for item in self.oeb.spine:
            html = item.data
+            body = html.find(XHTML('body'))
+            bs = body.get('style', '').split(';')
+            bs.append('margin-top: 0pt')
+            bs.append('margin-bottom: 0pt')
+            bs.append('margin-left : %fpt'%\
+                    float(self.context.margin_left))
+            bs.append('margin-right : %fpt'%\
+                    float(self.context.margin_right))
+            bs.append('text-align: '+ \
+                    ('left' if self.context.dont_justify else 'justify'))
+            body.set('style', '; '.join(bs))
+
            stylizer = Stylizer(html, item.href, self.oeb, profile,
-                    extra_css=self.context.extra_css)
+                    user_css=self.context.extra_css,
+                    extra_css=css)
            self.stylizers[item] = stylizer

+
    def baseline_node(self, node, stylizer, sizes, csize):
        csize = stylizer.style(node)['font-size']
        if node.text:
@ -219,6 +234,15 @@ class CSSFlattener(object):
        if self.lineh and 'line-height' not in cssdict:
            lineh = self.lineh / psize
            cssdict['line-height'] = "%0.5fem" % lineh
+        if (self.context.remove_paragraph_spacing or
+                self.context.insert_blank_line) and tag in ('p', 'div'):
+            for prop in ('margin', 'padding', 'border'):
+                for edge in ('top', 'bottom'):
+                    cssdict['%s-%s'%(prop, edge)] = '0pt'
+            if self.context.insert_blank_line:
+                cssdict['margin-top'] = cssdict['margin-bottom'] = '0.5em'
+            if self.context.remove_paragraph_spacing:
+                cssdict['text-indent'] = '1.5em'
        if cssdict:
            items = cssdict.items()
            items.sort()
@ -253,7 +277,11 @@ class CSSFlattener(object):
        href = item.relhref(href)
        etree.SubElement(head, XHTML('link'),
            rel='stylesheet', type=CSS_MIME, href=href)
-        if stylizer.page_rule:
+        stylizer.page_rule['margin-top'] = '%fpt'%\
+                float(self.context.margin_top)
+        stylizer.page_rule['margin-bottom'] = '%fpt'%\
+                float(self.context.margin_bottom)
+
        items = stylizer.page_rule.items()
        items.sort()
        css = '; '.join("%s: %s" % (key, val) for key, val in items)
@ -285,3 +313,4 @@ class CSSFlattener(object):
        for item in self.oeb.spine:
            stylizer = self.stylizers[item]
            self.flatten_head(item, stylizer, href)
+
--- a/src/calibre/ebooks/oeb/transforms/guide.py
+++ b/src/calibre/ebooks/oeb/transforms/guide.py
@ -0,0 +1,47 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+class Clean(object):
+    '''Clean up guide, leaving only a pointer to the cover'''
+
+    def __call__(self, oeb, opts):
+        '''
+        Reduce oeb.guide to its 'cover' reference.
+
+        If the guide has no 'cover' entry, the candidate with the largest
+        len(item.data) among the Microsoft style cover/title/thumb image
+        references is promoted to 'cover'. Every other guide entry is then
+        removed, together with the manifest item it points to, unless that
+        item is the cover itself.
+        '''
+        from calibre.ebooks.oeb.base import urldefrag
+        self.oeb, self.log, self.opts = oeb, oeb.log, opts
+
+        cover_href = ''
+        if 'cover' in self.oeb.guide:
+            # Remember the file of the existing cover, so that the cleanup
+            # loop below does not drop it from the manifest when another
+            # guide entry points to the same file.
+            cover_href = urldefrag(self.oeb.guide['cover'].href)[0]
+        else:
+            covers = []
+            for x in ('other.ms-coverimage-standard',
+                    'other.ms-titleimage-standard', 'other.ms-titleimage',
+                    'other.ms-coverimage', 'other.ms-thumbimage-standard',
+                    'other.ms-thumbimage'):
+                if x in self.oeb.guide:
+                    candidate = self.oeb.guide[x]
+                    try:
+                        item = self.oeb.manifest.hrefs[candidate.href]
+                    except KeyError:
+                        # Dangling reference, it cannot serve as the cover
+                        continue
+                    covers.append([candidate, len(item.data)])
+            # Largest candidate first; ties keep their probing order
+            covers.sort(key=lambda x: x[1], reverse=True)
+            if covers:
+                ref = covers[0][0]
+                if len(covers) > 1:
+                    self.log('Choosing %s:%s as the cover'%(ref.type, ref.href))
+                ref.type = 'cover'
+                self.oeb.guide.refs['cover'] = ref
+                cover_href = urldefrag(ref.href)[0]
+
+        for x in list(self.oeb.guide):
+            href = urldefrag(self.oeb.guide[x].href)[0]
+            if x.lower() != 'cover':
+                try:
+                    if href != cover_href:
+                        self.oeb.manifest.remove(self.oeb.manifest.hrefs[href])
+                except KeyError:
+                    # The reference points to nothing in the manifest
+                    pass
+                self.oeb.guide.remove(x)
+
+
--- a/src/calibre/ebooks/oeb/transforms/jacket.py
+++ b/src/calibre/ebooks/oeb/transforms/jacket.py
@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import textwrap
+
+from lxml import etree
+
+from calibre.ebooks.oeb.base import XPNSMAP
+from calibre import guess_type
+
+class Jacket(object):
+    '''
+    Book jacket manipulation. Remove first image and insert comments at start of
+    book.
+    '''
+
+    JACKET_TEMPLATE = textwrap.dedent(u'''\
+    <html xmlns="%(xmlns)s">
+        <head>
+            <title>%(title)s</title>
+        </head>
+        <body>
+            <h1 style="text-align: center">%(title)s</h1>
+            <h2 style="text-align: center">%(jacket)s</h2>
+            <div>
+                %(comments)s
+            </div>
+        </body>
+    </html>
+    ''')
+
+    def remove_first_image(self):
+        '''
+        Remove the first <img> that resolves to a manifest item, looking only
+        at the first three spine items. Both the manifest item and the <img>
+        element are removed.
+        '''
+        for i, item in enumerate(self.oeb.spine):
+            if i > 2: break
+            # lxml takes the prefix map via the `namespaces` keyword; any
+            # other keyword is treated as an XPath variable.
+            for img in item.data.xpath('//h:img[@src]', namespaces=XPNSMAP):
+                href = item.abshref(img.get('src'))
+                image = self.oeb.manifest.hrefs.get(href, None)
+                if image is not None:
+                    self.log('Removing first image', img.get('src'))
+                    self.oeb.manifest.remove(image)
+                    img.getparent().remove(img)
+                    return
+
+    def insert_comments(self, comments):
+        '''
+        Build a jacket page from JACKET_TEMPLATE using self.opts.title and
+        `comments`, add it to the manifest and insert it at the start of the
+        spine.
+        '''
+        self.log('Inserting metadata comments into book...')
+        comments = comments.replace('\r\n', '\n').replace('\n\n', '<br/><br/>')
+        # NOTE(review): title and comments are interpolated unescaped, so a
+        # bare '&' or '<' in either makes etree.fromstring() fail. Confirm
+        # whether comments may carry markup before escaping them.
+        html = self.JACKET_TEMPLATE%dict(xmlns=XPNSMAP['h'],
+                title=self.opts.title, comments=comments,
+                jacket=_('Book Jacket'))
+        id_, href = self.oeb.manifest.generate('jacket', 'jacket.xhtml')
+        root = etree.fromstring(html)
+        item = self.oeb.manifest.add(id_, href, guess_type(href)[0], data=root)
+        self.oeb.spine.insert(0, item, True)
+
+    def __call__(self, oeb, opts):
+        '''
+        Apply the jacket options: remove the first image when
+        opts.remove_first_image is set, and insert the comments page when
+        opts.insert_comments is set and opts.comments is non-empty.
+        '''
+        self.oeb, self.opts, self.log = oeb, opts, oeb.log
+        if opts.remove_first_image:
+            self.remove_first_image()
+        if opts.insert_comments and opts.comments:
+            self.insert_comments(opts.comments)
--- a/src/calibre/ebooks/oeb/transforms/metadata.py
+++ b/src/calibre/ebooks/oeb/transforms/metadata.py
@ -0,0 +1,84 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+class MergeMetadata(object):
+    'Merge in user metadata, including cover'
+
+    def __call__(self, oeb, mi, prefer_metadata_cover=False):
+        '''
+        Copy every field that is set on `mi` into oeb.metadata, replacing the
+        values already there, then merge the cover via set_cover().
+        '''
+        from calibre.ebooks.oeb.base import DC
+        self.oeb, self.log = oeb, oeb.log
+        m = self.oeb.metadata
+        self.log('Merging user specified metadata...')
+        if mi.title:
+            m.clear('title')
+            m.add('title', mi.title)
+        if mi.title_sort:
+            if not m.title:
+                m.add(DC('title'), mi.title_sort)
+            m.title[0].file_as = mi.title_sort
+        if mi.authors:
+            # Drop the existing author entries; `or ''` guards creators
+            # that carry no role attribute.
+            m.filter('creator', lambda x : (x.role or '').lower() == 'aut')
+            for a in mi.authors:
+                attrib = {'role':'aut'}
+                if mi.author_sort:
+                    attrib['file_as'] = mi.author_sort
+                m.add('creator', a, attrib=attrib)
+        if mi.comments:
+            m.clear('description')
+            m.add('description', mi.comments)
+        if mi.publisher:
+            m.clear('publisher')
+            m.add('publisher', mi.publisher)
+        if mi.series:
+            m.clear('series')
+            m.add('series', mi.series)
+        if mi.isbn:
+            has = False
+            for x in m.identifier:
+                if (x.scheme or '').lower() == 'isbn':
+                    x.content = mi.isbn
+                    has = True
+            if not has:
+                m.add('identifier', mi.isbn, scheme='ISBN')
+        if mi.language:
+            m.clear('language')
+            m.add('language', mi.language)
+        if mi.book_producer:
+            m.filter('creator', lambda x : (x.role or '').lower() == 'bkp')
+            m.add('creator', mi.book_producer, role='bkp')
+        if mi.series_index is not None:
+            m.clear('series_index')
+            m.add('series_index', '%.2f'%mi.series_index)
+        if mi.rating is not None:
+            m.clear('rating')
+            m.add('rating', '%.2f'%mi.rating)
+        if mi.tags:
+            m.clear('subject')
+            for t in mi.tags:
+                m.add('subject', t)
+
+        self.set_cover(mi, prefer_metadata_cover)
+
+    def set_cover(self, mi, prefer_metadata_cover):
+        '''
+        Install the user supplied cover. The data comes from the readable
+        file mi.cover, else from mi.cover_data[1]. An existing guide cover is
+        overwritten unless `prefer_metadata_cover` is true; without a guide
+        cover a new manifest item and guide reference are created.
+        '''
+        cdata = ''
+        if mi.cover and os.access(mi.cover, os.R_OK):
+            # Close the file deterministically instead of leaking the handle
+            with open(mi.cover, 'rb') as f:
+                cdata = f.read()
+        elif mi.cover_data and mi.cover_data[1]:
+            cdata = mi.cover_data[1]
+        if not cdata: return
+        if 'cover' in self.oeb.guide:
+            if not prefer_metadata_cover:
+                href = self.oeb.guide['cover'].href
+                self.oeb.manifest.hrefs[href]._data = cdata
+        else:
+            id_, href = self.oeb.manifest.generate('cover', 'cover.jpg')
+            self.oeb.manifest.add(id_, href, 'image/jpeg', data=cdata)
+            self.oeb.guide.add('cover', 'Cover', href)
+
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@ -16,7 +16,7 @@ from lxml import etree
 from lxml.cssselect import CSSSelector

 from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \
-        urldefrag, rewrite_links
+        urldefrag, rewrite_links, urlunquote
 from calibre.ebooks.epub import tostring, rules


@ -142,7 +142,7 @@ class Split(object):
            nhref = anchor_map[frag if frag else None]
            nhref = self.current_item.relhref(nhref)
            if frag:
-                nhref = '#'.join((nhref, frag))
+                nhref = '#'.join((urlunquote(nhref), frag))

            return nhref
        return url