Conversion pipeline is now a superset of any2epub :)

This commit is contained in:
Kovid Goyal 2009-04-22 14:35:32 -07:00
parent 14636efa24
commit 0b6dc7f8ed
12 changed files with 374 additions and 25 deletions

View File

@ -117,6 +117,9 @@ def add_pipeline_options(parser, plumber):
'line_height',
'linearize_tables',
'extra_css',
'margin_top', 'margin_left', 'margin_right',
'margin_bottom', 'dont_justify',
'insert_blank_line', 'remove_paragraph_spacing',
]
),
@ -124,6 +127,8 @@ def add_pipeline_options(parser, plumber):
_('Control auto-detection of document structure.'),
[
'dont_split_on_page_breaks', 'chapter', 'chapter_mark',
'prefer_metadata_cover', 'remove_first_image',
'insert_comments',
]
),

View File

@ -195,7 +195,7 @@ OptionRecommendation(name='toc_filter',
OptionRecommendation(name='chapter',
recommended_value="//*[((name()='h1' or name()='h2') and "
"re:test(., 'chapter|book|section|part', 'i')) or @class "
r"re:test(., 'chapter|book|section|part\s+', 'i')) or @class "
"= 'chapter']", level=OptionRecommendation.LOW,
help=_('An XPath expression to detect chapter titles. The default '
'is to consider <h1> or <h2> tags that contain the words '
@ -227,6 +227,64 @@ OptionRecommendation(name='extra_css',
'rules.')
),
OptionRecommendation(name='margin_top',
recommended_value=5.0, level=OptionRecommendation.LOW,
help=_('Set the top margin in pts. Default is %default')),
OptionRecommendation(name='margin_bottom',
recommended_value=5.0, level=OptionRecommendation.LOW,
help=_('Set the bottom margin in pts. Default is %default')),
OptionRecommendation(name='margin_left',
recommended_value=5.0, level=OptionRecommendation.LOW,
help=_('Set the left margin in pts. Default is %default')),
OptionRecommendation(name='margin_right',
recommended_value=5.0, level=OptionRecommendation.LOW,
help=_('Set the right margin in pts. Default is %default')),
OptionRecommendation(name='dont_justify',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Do not force text to be justified in output. Whether text '
'is actually displayed justified or not depends on whether '
'the ebook format and reading device support justification.')
),
OptionRecommendation(name='remove_paragraph_spacing',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Remove spacing between paragraphs. Also sets an indent on '
'paragraphs of 1.5em. Spacing removal will not work '
'if the source file does not use paragraphs (<p> or <div> tags).')
),
OptionRecommendation(name='prefer_metadata_cover',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Use the cover detected from the source file in preference '
'to the specified cover.')
),
OptionRecommendation(name='insert_blank_line',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Insert a blank line between paragraphs. Will not work '
'if the source file does not use paragraphs (<p> or <div> tags).'
)
),
OptionRecommendation(name='remove_first_image',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Remove the first image from the input ebook. Useful if the '
'first image in the source file is a cover and you are specifying '
'an external cover.'
)
),
OptionRecommendation(name='insert_comments',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Insert the comments/summary from the book metadata at the start of '
'the book. This is useful if your ebook reader does not support '
'displaying the comments from the metadata.'
)
),
OptionRecommendation(name='read_metadata_from_opf',
@ -244,7 +302,8 @@ OptionRecommendation(name='title',
OptionRecommendation(name='authors',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the authors. Multiple authors should be separated ')),
help=_('Set the authors. Multiple authors should be separated by '
'ampersands.')),
OptionRecommendation(name='title_sort',
recommended_value=None, level=OptionRecommendation.LOW,
@ -428,7 +487,6 @@ OptionRecommendation(name='language',
mi.cover = None
self.user_metadata = mi
def setup_options(self):
'''
Setup the `self.opts` object.
@ -479,9 +537,16 @@ OptionRecommendation(name='language',
if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts)
from calibre.ebooks.oeb.transforms.guide import Clean
Clean()(self.oeb, self.opts)
self.opts.source = self.opts.input_profile
self.opts.dest = self.opts.output_profile
from calibre.ebooks.oeb.transforms.metadata import MergeMetadata
MergeMetadata()(self.oeb, self.user_metadata,
self.opts.prefer_metadata_cover)
from calibre.ebooks.oeb.transforms.structure import DetectStructure
DetectStructure()(self.oeb, self.opts)
@ -495,6 +560,9 @@ OptionRecommendation(name='language',
else:
fkey = map(float, fkey.split(','))
from calibre.ebooks.oeb.transforms.jacket import Jacket
Jacket()(self.oeb, self.opts)
if self.opts.extra_css and os.path.exists(self.opts.extra_css):
self.opts.extra_css = open(self.opts.extra_css, 'rb').read()

View File

@ -0,0 +1,22 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import OutputFormatPlugin
from calibre import CurrentDir
class EPUBOutput(OutputFormatPlugin):
    '''Output format plugin declared for the ``epub`` file type.'''

    name = 'EPUB Output'
    author = 'Kovid Goyal'
    file_type = 'epub'

    def convert(self, oeb, output_path, input_plugin, opts, log):
        '''
        Store the logger and the conversion options on the plugin instance.

        As written, this only records `log` and `opts`; `oeb`,
        `output_path` and `input_plugin` are accepted but not used.
        '''
        self.log = log
        self.opts = opts

View File

@ -260,6 +260,9 @@ class MetaInformation(object):
x = 1.0
return '%d'%x if int(x) == x else '%.2f'%x
def authors_from_string(self, raw):
    '''Parse `raw` with string_to_authors() and store the result in `self.authors`.'''
    parsed = string_to_authors(raw)
    self.authors = parsed
def __unicode__(self):
ans = []
def fmt(x, y):

View File

@ -514,7 +514,8 @@ class Metadata(object):
scheme = Attribute(lambda term: 'scheme' if \
term == OPF('meta') else OPF('scheme'),
[DC('identifier'), OPF('meta')])
file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor')])
file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor'),
DC('title')])
role = Attribute(OPF('role'), [DC('creator'), DC('contributor')])
event = Attribute(OPF('event'), [DC('date')])
id = Attribute('id')
@ -593,6 +594,19 @@ class Metadata(object):
yield key
__iter__ = iterkeys
def clear(self, key):
    '''Remove, in place, every entry stored under `key` in `self.items`.'''
    entries = self.items[key]
    # Walk a snapshot so that removing from `entries` does not disturb iteration
    for entry in tuple(entries):
        entries.remove(entry)
def filter(self, key, predicate):
    '''
    Remove, in place, each entry under `key` for which `predicate(entry)`
    is true. Entries for which the predicate is false are kept.
    '''
    entries = self.items[key]
    # Walk a snapshot so that removing from `entries` does not disturb iteration
    for entry in tuple(entries):
        if not predicate(entry):
            continue
        entries.remove(entry)
def __getitem__(self, key):
return self.items[key]
@ -1011,7 +1025,7 @@ class Manifest(object):
media_type = OEB_DOC_MIME
elif media_type in OEB_STYLES:
media_type = OEB_CSS_MIME
attrib = {'id': item.id, 'href': item.href,
attrib = {'id': item.id, 'href': urlunquote(item.href),
'media-type': media_type}
if item.fallback:
attrib['fallback'] = item.fallback
@ -1202,6 +1216,9 @@ class Guide(object):
self.refs[type] = ref
return ref
def remove(self, type):
    '''
    Drop the entry registered under `type` from `self.refs`.

    Returns the removed entry, or None when `type` was not present.
    '''
    dropped = self.refs.pop(type, None)
    return dropped
def iterkeys(self):
for type in self.refs:
yield type
@ -1229,7 +1246,7 @@ class Guide(object):
def to_opf1(self, parent=None):
elem = element(parent, 'guide')
for ref in self.refs.values():
attrib = {'type': ref.type, 'href': ref.href}
attrib = {'type': ref.type, 'href': urlunquote(ref.href)}
if ref.title:
attrib['title'] = ref.title
element(elem, 'reference', attrib=attrib)
@ -1345,7 +1362,7 @@ class TOC(object):
def to_opf1(self, tour):
for node in self.nodes:
element(tour, 'site', attrib={
'title': node.title, 'href': node.href})
'title': node.title, 'href': urlunquote(node.href)})
node.to_opf1(tour)
return tour
@ -1358,7 +1375,7 @@ class TOC(object):
point = element(parent, NCX('navPoint'), attrib=attrib)
label = etree.SubElement(point, NCX('navLabel'))
element(label, NCX('text')).text = node.title
element(point, NCX('content'), src=node.href)
element(point, NCX('content'), src=urlunquote(node.href))
node.to_ncx(point)
return parent

View File

@ -9,6 +9,7 @@ from lxml import etree
from calibre.customize.conversion import OutputFormatPlugin
from calibre import CurrentDir
from urllib import unquote
class OEBOutput(OutputFormatPlugin):
@ -32,7 +33,7 @@ class OEBOutput(OutputFormatPlugin):
f.write(raw)
for item in oeb_book.manifest:
path = os.path.abspath(item.href)
path = os.path.abspath(unquote(item.href))
dir = os.path.dirname(path)
if not os.path.exists(dir):
os.makedirs(dir)

View File

@ -11,6 +11,7 @@ __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import os
import itertools
import re
import logging
import copy
from weakref import WeakKeyDictionary
from xml.dom import SyntaxErr as CSSSyntaxError
@ -106,7 +107,8 @@ class CSSSelector(etree.XPath):
class Stylizer(object):
STYLESHEETS = WeakKeyDictionary()
def __init__(self, tree, path, oeb, profile=PROFILES['PRS505'], extra_css=''):
def __init__(self, tree, path, oeb, profile=PROFILES['PRS505'],
extra_css='', user_css=''):
self.oeb = oeb
self.profile = profile
self.logger = oeb.logger
@ -115,7 +117,8 @@ class Stylizer(object):
cssname = os.path.splitext(basename)[0] + '.css'
stylesheets = [HTML_CSS_STYLESHEET]
head = xpath(tree, '/h:html/h:head')[0]
parser = cssutils.CSSParser(fetcher=self._fetch_css_file)
parser = cssutils.CSSParser(fetcher=self._fetch_css_file,
log=logging.getLogger('calibre.css'))
for elem in head:
if elem.tag == XHTML('style') and elem.text \
and elem.get('type', CSS_MIME) in OEB_STYLES:
@ -135,8 +138,9 @@ class Stylizer(object):
(path, item.href))
continue
stylesheets.append(sitem.data)
if extra_css:
text = XHTML_CSS_NAMESPACE + extra_css
for x in (extra_css, user_css):
if x:
text = XHTML_CSS_NAMESPACE + x
stylesheet = parser.parseString(text, href=cssname)
stylesheet.namespaces['h'] = XHTML_NS
stylesheets.append(stylesheet)
@ -288,6 +292,9 @@ class Style(object):
self._lineHeight = None
stylizer._styles[element] = self
def set(self, prop, val):
    '''Assign `val` to the key `prop` in this object's `_style` mapping.'''
    style = self._style
    style[prop] = val
def _update_cssdict(self, cssdict):
self._style.update(cssdict)

View File

@ -114,12 +114,27 @@ class CSSFlattener(object):
def stylize_spine(self):
self.stylizers = {}
profile = self.context.source
css = ''
for item in self.oeb.spine:
html = item.data
body = html.find(XHTML('body'))
bs = body.get('style', '').split(';')
bs.append('margin-top: 0pt')
bs.append('margin-bottom: 0pt')
bs.append('margin-left : %fpt'%\
float(self.context.margin_left))
bs.append('margin-right : %fpt'%\
float(self.context.margin_right))
bs.append('text-align: '+ \
('left' if self.context.dont_justify else 'justify'))
body.set('style', '; '.join(bs))
stylizer = Stylizer(html, item.href, self.oeb, profile,
extra_css=self.context.extra_css)
user_css=self.context.extra_css,
extra_css=css)
self.stylizers[item] = stylizer
def baseline_node(self, node, stylizer, sizes, csize):
csize = stylizer.style(node)['font-size']
if node.text:
@ -219,6 +234,15 @@ class CSSFlattener(object):
if self.lineh and 'line-height' not in cssdict:
lineh = self.lineh / psize
cssdict['line-height'] = "%0.5fem" % lineh
if (self.context.remove_paragraph_spacing or
self.context.insert_blank_line) and tag in ('p', 'div'):
for prop in ('margin', 'padding', 'border'):
for edge in ('top', 'bottom'):
cssdict['%s-%s'%(prop, edge)] = '0pt'
if self.context.insert_blank_line:
cssdict['margin-top'] = cssdict['margin-bottom'] = '0.5em'
if self.context.remove_paragraph_spacing:
cssdict['text-indent'] = '1.5em'
if cssdict:
items = cssdict.items()
items.sort()
@ -253,7 +277,11 @@ class CSSFlattener(object):
href = item.relhref(href)
etree.SubElement(head, XHTML('link'),
rel='stylesheet', type=CSS_MIME, href=href)
if stylizer.page_rule:
stylizer.page_rule['margin-top'] = '%fpt'%\
float(self.context.margin_top)
stylizer.page_rule['margin-bottom'] = '%fpt'%\
float(self.context.margin_bottom)
items = stylizer.page_rule.items()
items.sort()
css = '; '.join("%s: %s" % (key, val) for key, val in items)
@ -285,3 +313,4 @@ class CSSFlattener(object):
for item in self.oeb.spine:
stylizer = self.stylizers[item]
self.flatten_head(item, stylizer, href)

View File

@ -0,0 +1,47 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
class Clean(object):
    '''Clean up guide, leaving only a pointer to the cover'''

    def __call__(self, oeb, opts):
        '''
        Reduce the guide of `oeb` to its cover reference.

        If the guide has no ``cover`` entry, the Microsoft-style cover/title/
        thumbnail entries are examined and the one whose manifest item has
        the largest data is promoted to ``cover``. Every other guide entry is
        then removed from the guide, and the manifest item it points at is
        removed too unless that item is the cover itself.

        :param oeb: The book being converted; its ``guide``, ``manifest``
                    and ``log`` attributes are used.
        :param opts: Conversion options (stored on the instance, not read here).
        '''
        from calibre.ebooks.oeb.base import urldefrag
        self.oeb, self.log, self.opts = oeb, oeb.log, opts
        guide, manifest = self.oeb.guide, self.oeb.manifest

        cover_href = ''
        if 'cover' in guide:
            # Remember the existing cover so the cleanup loop below never
            # deletes its manifest item through another guide entry that
            # happens to share the same href.
            cover_href = urldefrag(guide['cover'].href)[0]
        else:
            covers = []
            for x in ('other.ms-coverimage-standard',
                    'other.ms-titleimage-standard', 'other.ms-titleimage',
                    'other.ms-coverimage', 'other.ms-thumbimage-standard',
                    'other.ms-thumbimage'):
                if x not in guide:
                    continue
                ref = guide[x]
                try:
                    item = manifest.hrefs[ref.href]
                except KeyError:
                    # The guide entry points at a file that is not in the
                    # manifest; it cannot serve as a cover.
                    continue
                covers.append([ref, len(item.data)])
            # Largest candidate first. key= is equivalent to the former
            # cmp= comparison on the size field (sorting is stable).
            covers.sort(key=lambda c: c[1], reverse=True)
            if covers:
                ref = covers[0][0]
                if len(covers) > 1:
                    self.log('Choosing %s:%s as the cover'%(ref.type, ref.href))
                ref.type = 'cover'
                guide.refs['cover'] = ref
                cover_href = urldefrag(ref.href)[0]

        # Iterate over a snapshot since entries are removed inside the loop
        for x in list(guide):
            href = urldefrag(guide[x].href)[0]
            if x.lower() != 'cover':
                try:
                    if href != cover_href:
                        manifest.remove(manifest.hrefs[href])
                except KeyError:
                    # Nothing in the manifest for this href; only the guide
                    # entry needs to go.
                    pass
                guide.remove(x)

View File

@ -0,0 +1,66 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import textwrap
from lxml import etree
from calibre.ebooks.oeb.base import XPNSMAP
from calibre import guess_type
class Jacket(object):
    '''
    Book jacket manipulation. Remove first image and insert comments at start of
    book.
    '''

    # %-style template for the generated jacket page; filled in by
    # insert_comments() with xmlns, title, jacket and comments.
    JACKET_TEMPLATE = textwrap.dedent(u'''\
    <html xmlns="%(xmlns)s">
        <head>
            <title>%(title)s</title>
        </head>
        <body>
            <h1 style="text-align: center">%(title)s</h1>
            <h2 style="text-align: center">%(jacket)s</h2>
            <div>
                %(comments)s
            </div>
        </body>
    </html>
    ''')

    def remove_first_image(self):
        '''
        Remove the first ``<img>`` whose source resolves to a manifest item.

        Only the first three spine items are searched. The image is removed
        both from the manifest and from the document that references it.
        '''
        for i, item in enumerate(self.oeb.spine):
            if i > 2:
                break
            # lxml's keyword for prefix bindings is `namespaces`; any other
            # keyword is treated as an XPath variable and leaves `h:` unbound.
            for img in item.data.xpath('//h:img[@src]', namespaces=XPNSMAP):
                href = item.abshref(img.get('src'))
                image = self.oeb.manifest.hrefs.get(href, None)
                if image is not None:
                    self.log('Removing first image', img.get('src'))
                    self.oeb.manifest.remove(image)
                    img.getparent().remove(img)
                    return

    def insert_comments(self, comments):
        '''
        Build a jacket page from `comments` and insert it at the start of
        the spine.

        :param comments: Comment text; blank lines are converted to
                         ``<br/><br/>`` before being placed in the template.
        '''
        self.log('Inserting metadata comments into book...')
        comments = comments.replace('\r\n', '\n').replace('\n\n', '<br/><br/>')
        html = self.JACKET_TEMPLATE%dict(xmlns=XPNSMAP['h'],
                title=self.opts.title, comments=comments,
                jacket=_('Book Jacket'))
        id, href = self.oeb.manifest.generate('jacket', 'jacket.xhtml')
        root = etree.fromstring(html)
        item = self.oeb.manifest.add(id, href, guess_type(href)[0], data=root)
        self.oeb.spine.insert(0, item, True)

    def __call__(self, oeb, opts):
        '''
        Apply the jacket transforms selected in `opts` to `oeb`.

        :param oeb: The book being converted.
        :param opts: Conversion options; ``remove_first_image``,
                     ``insert_comments`` and ``comments`` are read here.
        '''
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        if opts.remove_first_image:
            self.remove_first_image()
        if opts.insert_comments and opts.comments:
            self.insert_comments(opts.comments)

View File

@ -0,0 +1,84 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
class MergeMetadata(object):
    'Merge in user metadata, including cover'

    def __call__(self, oeb, mi, prefer_metadata_cover=False):
        '''
        Copy the fields set on `mi` into the metadata of `oeb`.

        For each field that is set on `mi`, the matching entries in
        ``oeb.metadata`` are replaced. Finally the cover is handled by
        :meth:`set_cover`.

        :param oeb: The book being converted.
        :param mi: User supplied metadata object.
        :param prefer_metadata_cover: Passed through to :meth:`set_cover`.
        '''
        from calibre.ebooks.oeb.base import DC
        self.oeb, self.log = oeb, oeb.log
        m = self.oeb.metadata
        self.log('Merging user specified metadata...')
        if mi.title:
            m.clear('title')
            m.add('title', mi.title)
        if mi.title_sort:
            if not m.title:
                m.add(DC('title'), mi.title_sort)
            m.title[0].file_as = mi.title_sort
        if mi.authors:
            # `role` may be unset on a creator, so guard before lowercasing
            m.filter('creator', lambda x : (x.role or '').lower() == 'aut')
            for a in mi.authors:
                attrib = {'role':'aut'}
                if mi.author_sort:
                    attrib['file_as'] = mi.author_sort
                m.add('creator', a, attrib=attrib)
        if mi.comments:
            m.clear('description')
            m.add('description', mi.comments)
        if mi.publisher:
            m.clear('publisher')
            m.add('publisher', mi.publisher)
        if mi.series:
            m.clear('series')
            m.add('series', mi.series)
        if mi.isbn:
            has = False
            for x in m.identifier:
                # `scheme` may be unset on an identifier, so guard as above
                if (x.scheme or '').lower() == 'isbn':
                    x.content = mi.isbn
                    has = True
            if not has:
                m.add('identifier', mi.isbn, scheme='ISBN')
        if mi.language:
            m.clear('language')
            m.add('language', mi.language)
        if mi.book_producer:
            m.filter('creator', lambda x : (x.role or '').lower() == 'bkp')
            m.add('creator', mi.book_producer, role='bkp')
        if mi.series_index is not None:
            m.clear('series_index')
            m.add('series_index', '%.2f'%mi.series_index)
        if mi.rating is not None:
            m.clear('rating')
            m.add('rating', '%.2f'%mi.rating)
        if mi.tags:
            m.clear('subject')
            for t in mi.tags:
                m.add('subject', t)

        self.set_cover(mi, prefer_metadata_cover)

    def set_cover(self, mi, prefer_metadata_cover):
        '''
        Install the cover described by `mi` into the book.

        The cover bytes are read from the file `mi.cover` when it is
        readable, otherwise taken from the last element of `mi.cover_data`.
        If no cover data is found nothing happens. When the guide already has
        a cover, its manifest item's data is overwritten unless
        `prefer_metadata_cover` is true; otherwise a new ``cover.jpg``
        manifest item and a ``cover`` guide entry are added.
        '''
        cdata = ''
        if mi.cover and os.access(mi.cover, os.R_OK):
            # Close the file deterministically instead of leaking the handle
            with open(mi.cover, 'rb') as f:
                cdata = f.read()
        elif mi.cover_data and mi.cover_data[-1]:
            # Read the same element that was just tested
            cdata = mi.cover_data[-1]
        if not cdata:
            return
        if 'cover' in self.oeb.guide:
            if not prefer_metadata_cover:
                href = self.oeb.guide['cover'].href
                self.oeb.manifest.hrefs[href]._data = cdata
        else:
            id, href = self.oeb.manifest.generate('cover', 'cover.jpg')
            self.oeb.manifest.add(id, href, 'image/jpeg', data=cdata)
            self.oeb.guide.add('cover', 'Cover', href)

View File

@ -16,7 +16,7 @@ from lxml import etree
from lxml.cssselect import CSSSelector
from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \
urldefrag, rewrite_links
urldefrag, rewrite_links, urlunquote
from calibre.ebooks.epub import tostring, rules
@ -142,7 +142,7 @@ class Split(object):
nhref = anchor_map[frag if frag else None]
nhref = self.current_item.relhref(nhref)
if frag:
nhref = '#'.join((nhref, frag))
nhref = '#'.join((urlunquote(nhref), frag))
return nhref
return url