From 4cd285859b6721c48eefd9b23fe47b0bfc5ab871 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 23 Apr 2009 22:31:11 -0700 Subject: [PATCH] Initial implementation of EPUB Output plugin --- src/calibre/customize/builtins.py | 3 +- src/calibre/customize/profiles.py | 4 +- src/calibre/ebooks/epub/__init__.py | 173 ------ src/calibre/ebooks/epub/fonts.py | 300 ---------- src/calibre/ebooks/epub/from_any.py | 93 --- src/calibre/ebooks/epub/from_feeds.py | 71 --- src/calibre/ebooks/epub/from_html.py | 547 ------------------ src/calibre/ebooks/epub/output.py | 221 ++++++- src/calibre/ebooks/oeb/iterator.py | 4 +- src/calibre/ebooks/oeb/transforms/guide.py | 13 +- src/calibre/ebooks/oeb/transforms/rescale.py | 37 ++ src/calibre/ebooks/oeb/transforms/split.py | 5 +- .../ebooks/oeb/transforms/structure.py | 19 +- 13 files changed, 285 insertions(+), 1205 deletions(-) delete mode 100644 src/calibre/ebooks/epub/fonts.py delete mode 100644 src/calibre/ebooks/epub/from_any.py delete mode 100644 src/calibre/ebooks/epub/from_feeds.py delete mode 100644 src/calibre/ebooks/epub/from_html.py create mode 100644 src/calibre/ebooks/oeb/transforms/rescale.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index e0e9158f0e..c726a19b2a 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -287,13 +287,14 @@ from calibre.ebooks.odt.input import ODTInput from calibre.ebooks.rtf.input import RTFInput from calibre.ebooks.html.input import HTMLInput from calibre.ebooks.oeb.output import OEBOutput +from calibre.ebooks.epub.output import EPUBOutput from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput from calibre.customize.profiles import input_profiles, output_profiles plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, - FB2Input, ODTInput, RTFInput] + FB2Input, ODTInput, RTFInput, EPUBOutput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index c11529f025..67dd920135 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -3,7 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import sys, re +import re from itertools import izip from calibre.customize import Plugin as _Plugin @@ -22,7 +22,7 @@ class Plugin(_Plugin): fbase = 12 fsizes = [5, 7, 9, 12, 13.5, 17, 20, 22, 24] - screen_size = (800, 600) + screen_size = (1600, 1200) dpi = 100 def __init__(self, *args, **kwargs): diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index 2bc076a8ad..f5de8421e0 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -6,32 +6,7 @@ __docformat__ = 'restructuredtext en' ''' Conversion to EPUB. ''' -import sys, textwrap, re, os, uuid -from itertools import cycle -from calibre.utils.config import Config, StringConfig from calibre.utils.zipfile import ZipFile, ZIP_STORED -from calibre.ebooks.html import tostring -from lxml import etree - -class DefaultProfile(object): - - flow_size = sys.maxint - screen_size = None - remove_special_chars = False - remove_object_tags = False - -class PRS505(DefaultProfile): - - flow_size = 270000 - screen_size = (590, 765) - remove_special_chars = re.compile(u'[\u200b\u00ad]') - remove_object_tags = True - - -PROFILES = { - 'PRS505' : PRS505, - 'None' : DefaultProfile, - } def rules(stylesheets): for s in stylesheets: @@ -58,152 +33,4 @@ def initialize_container(path_to_container, opf_name='metadata.opf'): zf.writestr('META-INF/container.xml', CONTAINER) return zf -def config(defaults=None, name='epub'): - desc = _('Options to control the conversion to EPUB') - if defaults is None: - c = Config(name, desc) - else: - c = StringConfig(defaults, desc) - c.update(common_config()) - c.remove_opt('output') - c.remove_opt('zip') - - c.add_opt('output', ['-o', '--output'], default=None, - help=_('The output EPUB file. If not specified, it is ' - 'derived from the input file name.')) - c.add_opt('profile', ['--profile'], default='PRS505', choices=list(PROFILES.keys()), - help=_('Profile of the target device this EPUB is meant for. ' - 'Set to None to create a device independent EPUB. ' - 'The profile is used for device specific restrictions ' - 'on the EPUB. Choices are: ')+str(list(PROFILES.keys()))) - c.add_opt('override_css', ['--override-css'], default=None, - help=_('Either the path to a CSS stylesheet or raw CSS. ' - 'This CSS will override any existing CSS ' - 'declarations in the source files.')) - structure = c.add_group('structure detection', - _('Control auto-detection of document structure.')) - structure('chapter', ['--chapter'], - default="//*[re:match(name(), 'h[1-2]') and " - "re:test(., 'chapter|book|section|part', 'i')] | " - "//*[@class = 'chapter']", - help=_('''\ -An XPath expression to detect chapter titles. The default is to consider

or -

tags that contain the words "chapter","book","section" or "part" as chapter titles as -well as any tags that have class="chapter". -The expression used must evaluate to a list of elements. To disable chapter detection, -use the expression "/". See the XPath Tutorial in the calibre User Manual for further -help on using this feature. -''').replace('\n', ' ')) - structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both', 'none'], - default='pagebreak', - help=_('Specify how to mark detected chapters. A value of ' - '"pagebreak" will insert page breaks before chapters. ' - 'A value of "rule" will insert a line before chapters. ' - 'A value of "none" will disable chapter marking and a ' - 'value of "both" will use both page breaks and lines ' - 'to mark chapters.')) - structure('cover', ['--cover'], default=None, - help=_('Path to the cover to be used for this book')) - structure('prefer_metadata_cover', ['--prefer-metadata-cover'], default=False, - action='store_true', - help=_('Use the cover detected from the source file in preference ' - 'to the specified cover.')) - structure('remove_first_image', ['--remove-first-image'], default=False, - help=_('Remove the first image from the input ebook. Useful if ' - 'the first image in the source file is a cover and you ' - 'are specifying an external cover.')) - structure('dont_split_on_page_breaks', ['--dont-split-on-page-breaks'], default=False, - help=_('Turn off splitting at page breaks. Normally, input files ' - 'are automatically split at every page break into ' - 'two files. This gives an output ebook that can be parsed ' - 'faster and with less resources. However, splitting is ' - 'slow and if your source file contains a very large ' - 'number of page breaks, you should turn off splitting ' - 'on page breaks.')) - structure('page', ['--page'], default=None, - help=_('XPath expression to detect page boundaries for building ' - 'a custom pagination map, as used by AdobeDE. Default is ' - 'not to build an explicit pagination map.')) - structure('page_names', ['--page-names'], default=None, - help=_('XPath expression to find the name of each page in the ' - 'pagination map relative to its boundary element. ' - 'Default is to number all pages staring with 1.')) - toc = c.add_group('toc', - _('''\ -Control the automatic generation of a Table of Contents. If an OPF file is detected -and it specifies a Table of Contents, then that will be used rather than trying -to auto-generate a Table of Contents. -''').replace('\n', ' ')) - toc('max_toc_links', ['--max-toc-links'], default=50, - help=_('Maximum number of links to insert into the TOC. Set to 0 ' - 'to disable. Default is: %default. Links are only added to the ' - 'TOC if less than the --toc-threshold number of chapters were detected.')) - toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False, - help=_("Don't add auto-detected chapters to the Table of Contents.")) - toc('toc_threshold', ['--toc-threshold'], default=6, - help=_('If fewer than this number of chapters is detected, then links ' - 'are added to the Table of Contents. Default: %default')) - toc('level1_toc', ['--level1-toc'], default=None, - help=_('XPath expression that specifies all tags that should be added ' - 'to the Table of Contents at level one. If this is specified, ' - 'it takes precedence over other forms of auto-detection.')) - toc('level2_toc', ['--level2-toc'], default=None, - help=_('XPath expression that specifies all tags that should be added ' - 'to the Table of Contents at level two. Each entry is added ' - 'under the previous level one entry.')) - toc('level3_toc', ['--level3-toc'], default=None, - help=_('XPath expression that specifies all tags that should be added ' - 'to the Table of Contents at level three. Each entry is added ' - 'under the previous level two entry.')) - toc('from_ncx', ['--from-ncx'], default=None, - help=_('Path to a .ncx file that contains the table of contents to use ' - 'for this ebook. The NCX file should contain links relative to ' - 'the directory it is placed in. See ' - 'http://www.niso.org/workrooms/daisy/Z39-86-2005.html#NCX for ' - 'an overview of the NCX format.')) - toc('use_auto_toc', ['--use-auto-toc'], default=False, - help=_('Normally, if the source file already has a Table of Contents, ' - 'it is used in preference to the auto-generated one. ' - 'With this option, the auto-generated one is always used.')) - - layout = c.add_group('page layout', _('Control page layout')) - layout('margin_top', ['--margin-top'], default=5.0, - help=_('Set the top margin in pts. Default is %default')) - layout('margin_bottom', ['--margin-bottom'], default=5.0, - help=_('Set the bottom margin in pts. Default is %default')) - layout('margin_left', ['--margin-left'], default=5.0, - help=_('Set the left margin in pts. Default is %default')) - layout('margin_right', ['--margin-right'], default=5.0, - help=_('Set the right margin in pts. Default is %default')) - layout('base_font_size2', ['--base-font-size'], default=12.0, - help=_('The base font size in pts. Default is %defaultpt. ' - 'Set to 0 to disable rescaling of fonts.')) - layout('remove_paragraph_spacing', ['--remove-paragraph-spacing'], default=False, - help=_('Remove spacing between paragraphs. ' - 'Also sets a indent on paragraphs of 1.5em. ' - 'You can override this by adding p {text-indent: 0cm} to ' - '--override-css. Spacing removal will not work if the source ' - 'file forces inter-paragraph spacing.')) - layout('no_justification', ['--no-justification'], default=False, - help=_('Do not force text to be justified in output.')) - layout('linearize_tables', ['--linearize-tables'], default=False, - help=_('Remove table markup, converting it into paragraphs. ' - 'This is useful if your source file uses a table to manage layout.')) - layout('preserve_tag_structure', ['--preserve-tag-structure'], default=False, - help=_('Preserve the HTML tag structure while splitting large HTML files. ' - 'This is only neccessary if the HTML files contain CSS that ' - 'uses sibling selectors. Enabling this greatly slows down ' - 'processing of large HTML files.')) - - c.add_opt('show_opf', ['--show-opf'], default=False, group='debug', - help=_('Print generated OPF file to stdout')) - c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug', - help=_('Print generated NCX file to stdout')) - c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug', - default=False, - help=_('Keep intermediate files during processing by html2epub')) - c.add_opt('extract_to', ['--extract-to'], group='debug', default=None, - help=_('Extract the contents of the produced EPUB file to the ' - 'specified directory.')) - return c diff --git a/src/calibre/ebooks/epub/fonts.py b/src/calibre/ebooks/epub/fonts.py deleted file mode 100644 index 67e6066ed1..0000000000 --- a/src/calibre/ebooks/epub/fonts.py +++ /dev/null @@ -1,300 +0,0 @@ -#!/usr/bin/env python -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Font size rationalization. See :function:`relativize`. -''' - -import logging, re, operator, functools, collections, unittest, copy, sys -from xml.dom import SyntaxErr - -from lxml.cssselect import CSSSelector -from lxml import etree -from lxml.html import HtmlElement - -from calibre.ebooks.html_old import fromstring -from calibre.ebooks.epub import rules -from cssutils import CSSParser - -num = r'[-]?\d+|[-]?\d*\.\d+' -length = r'(?P0)|(?P{num})(?P%|em|ex|px|in|cm|mm|pt|pc)'.replace('{num}', num) -absolute_size = r'(?P(x?x-)?(small|large)|medium)' -relative_size = r'(?Psmaller|larger)' - -font_size_pat = re.compile('|'.join((relative_size, absolute_size, length)), re.I) -line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num)) - -PTU = { - 'in' : 72., - 'cm' : 72/2.54, - 'mm' : 72/25.4, - 'pt' : 1.0, - 'pc' : 1/12., - } - -DEFAULT_FONT_SIZE = 12 - -class Rationalizer(object): - - @classmethod - def specificity(cls, s): - '''Map CSS specificity tuple to a single integer''' - return sum([10**(4-i) + x for i,x in enumerate(s)]) - - @classmethod - def compute_font_size(cls, elem): - ''' - Calculate the effective font size of an element traversing its ancestors as far as - neccessary. - ''' - cfs = elem.computed_font_size - if cfs is not None: - return - sfs = elem.specified_font_size - if callable(sfs): - parent = elem.getparent() - cls.compute_font_size(parent) - elem.computed_font_size = sfs(parent.computed_font_size) - else: - elem.computed_font_size = sfs - - @classmethod - def calculate_font_size(cls, style): - 'Return font size in pts from style object. For relative units returns a callable' - match = font_size_pat.search(style.font) - fs = '' - if match: - fs = match.group() - if style.fontSize: - fs = style.fontSize - - match = font_size_pat.search(fs) - if match is None: - return None - match = match.groupdict() - unit = match.get('unit', '') - if unit: unit = unit.lower() - if unit in PTU.keys(): - return PTU[unit] * float(match['num']) - if unit in ('em', 'ex'): - return functools.partial(operator.mul, float(match['num'])) - if unit == '%': - return functools.partial(operator.mul, float(match['num'])/100.) - abs = match.get('abs', '') - if abs: abs = abs.lower() - if abs: - x = (1.2)**(abs.count('x') * (-1 if 'small' in abs else 1)) - return 12 * x - if match.get('zero', False): - return 0. - return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8) - - @classmethod - def resolve_rules(cls, stylesheets): - for sheet in stylesheets: - if hasattr(sheet, 'fs_rules'): - continue - sheet.fs_rules = [] - sheet.lh_rules = [] - for r in sheet: - if r.type == r.STYLE_RULE: - font_size = cls.calculate_font_size(r.style) - if font_size is not None: - for s in r.selectorList: - sheet.fs_rules.append([CSSSelector(s.selectorText), font_size]) - orig = line_height_pat.search(r.style.lineHeight) - if orig is not None: - for s in r.selectorList: - sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]]) - - - @classmethod - def apply_font_size_rules(cls, stylesheets, root): - 'Add a ``specified_font_size`` attribute to every element that has a specified font size' - cls.resolve_rules(stylesheets) - for sheet in stylesheets: - for selector, font_size in sheet.fs_rules: - elems = selector(root) - for elem in elems: - elem.specified_font_size = font_size - - @classmethod - def remove_font_size_information(cls, stylesheets): - for r in rules(stylesheets): - r.style.removeProperty('font-size') - try: - new = font_size_pat.sub('', r.style.font).strip() - if new: - r.style.font = new - else: - r.style.removeProperty('font') - except SyntaxErr: - r.style.removeProperty('font') - if line_height_pat.search(r.style.lineHeight) is not None: - r.style.removeProperty('line-height') - - @classmethod - def compute_font_sizes(cls, root, stylesheets, base=12): - stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')] - cls.apply_font_size_rules(stylesheets, root) - - # Compute the effective font size of all tags - root.computed_font_size = DEFAULT_FONT_SIZE - for elem in root.iter(etree.Element): - cls.compute_font_size(elem) - - extra_css = {} - if base > 0: - # Calculate the "base" (i.e. most common) font size - font_sizes = collections.defaultdict(lambda : 0) - body = root.xpath('//body')[0] - IGNORE = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6') - for elem in body.iter(etree.Element): - if elem.tag not in IGNORE: - t = getattr(elem, 'text', '') - if t: t = t.strip() - if t: - font_sizes[elem.computed_font_size] += len(t) - - t = getattr(elem, 'tail', '') - if t: t = t.strip() - if t: - parent = elem.getparent() - if parent.tag not in IGNORE: - font_sizes[parent.computed_font_size] += len(t) - - try: - most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0] - scale = base/most_common if most_common > 0 else 1. - except ValueError: - scale = 1. - - # rescale absolute line-heights - counter = 0 - for sheet in stylesheets: - for selector, lh in sheet.lh_rules: - for elem in selector(root): - elem.set('id', elem.get('id', 'cfs_%d'%counter)) - counter += 1 - if not extra_css.has_key(elem.get('id')): - extra_css[elem.get('id')] = [] - extra_css[elem.get('id')].append('line-height:%fpt'%(lh*scale)) - - - - # Rescale all computed font sizes - for elem in body.iter(etree.Element): - if isinstance(elem, HtmlElement): - elem.computed_font_size *= scale - - # Remove all font size specifications from the last stylesheet - cls.remove_font_size_information(stylesheets[-1:]) - - # Create the CSS to implement the rescaled font sizes - for elem in body.iter(etree.Element): - cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent())) - if abs(cfs-pcfs) > 1/12. and abs(pcfs) > 1/12.: - elem.set('id', elem.get('id', 'cfs_%d'%counter)) - counter += 1 - if not extra_css.has_key(elem.get('id')): - extra_css[elem.get('id')] = [] - extra_css[elem.get('id')].append('font-size: %f%%'%(100*(cfs/pcfs))) - - css = CSSParser(loglevel=logging.ERROR).parseString('') - for id, r in extra_css.items(): - css.add('#%s {%s}'%(id, ';'.join(r))) - return css - - @classmethod - def rationalize(cls, stylesheets, root, opts): - logger = logging.getLogger('html2epub') - logger.info('\t\tRationalizing fonts...') - extra_css = None - if opts.base_font_size2 > 0: - try: - extra_css = cls.compute_font_sizes(root, stylesheets, base=opts.base_font_size2) - except: - logger.warning('Failed to rationalize font sizes.') - if opts.verbose > 1: - logger.exception('') - finally: - root.remove_font_size_information() - logger.debug('\t\tDone rationalizing') - return extra_css - -################################################################################ -############## Testing -################################################################################ - -class FontTest(unittest.TestCase): - - def setUp(self): - from calibre.ebooks.epub import config - self.opts = config(defaults='').parse() - self.html = ''' - - - Test document - - -
- -

Some text

-
-

Some other text.

-

The longest piece of single font size text in this entire file. Used to test resizing.

- - - ''' - self.root = fromstring(self.html) - - def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1): - root1 = copy.deepcopy(self.root) - root1.computed_font_size = DEFAULT_FONT_SIZE - stylesheet = CSSParser(loglevel=logging.ERROR).parseString(css) - stylesheet2 = Rationalizer.compute_font_sizes(root1, [stylesheet], base) - root2 = copy.deepcopy(root1) - root2.remove_font_size_information() - root2.computed_font_size = DEFAULT_FONT_SIZE - Rationalizer.apply_font_size_rules([stylesheet2], root2) - for elem in root2.iter(etree.Element): - Rationalizer.compute_font_size(elem) - for e1, e2 in zip(root1.xpath('//body')[0].iter(etree.Element), root2.xpath('//body')[0].iter(etree.Element)): - self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size, - msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\ - (root1.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size)) - return stylesheet2.cssText - - def testStripping(self): - 'Test that any original entries are removed from the CSS' - css = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }' - css = CSSParser(loglevel=logging.ERROR).parseString(css) - Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [css]) - self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''), - 'p{font:bolditalic}') - - def testIdentity(self): - 'Test that no unnecessary font size changes are made' - extra_css = self.do_test('div {font-size:12pt} \nspan {font-size:100%}') - self.assertEqual(extra_css.strip(), '') - - def testRelativization(self): - 'Test conversion of absolute to relative sizes' - self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}') - - def testResizing(self): - 'Test resizing of fonts' - self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}') - - -def suite(): - return unittest.TestLoader().loadTestsFromTestCase(FontTest) - -def test(): - unittest.TextTestRunner(verbosity=2).run(suite()) - -if __name__ == '__main__': - sys.exit(test()) - diff --git a/src/calibre/ebooks/epub/from_any.py b/src/calibre/ebooks/epub/from_any.py deleted file mode 100644 index 2f3f81124f..0000000000 --- a/src/calibre/ebooks/epub/from_any.py +++ /dev/null @@ -1,93 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Convert any ebook format to epub. -''' - -import sys, os, re -from contextlib import nested - -from calibre import extract, walk -from calibre.ebooks import DRMError -from calibre.ebooks.epub import config as common_config -from calibre.ebooks.epub.from_html import convert as html2epub, find_html_index -from calibre.ptempfile import TemporaryDirectory -from calibre.utils.zipfile import ZipFile -from calibre.customize.ui import run_plugins_on_preprocess - - -SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf', - 'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub'] - -def unarchive(path, tdir): - extract(path, tdir) - files = list(walk(tdir)) - - for ext in ['opf'] + list(MAP.keys()): - for f in files: - if f.lower().endswith('.'+ext): - if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048: - continue - return f, ext - return find_html_index(files) - -def any2epub(opts, path, notification=None, create_epub=True, - oeb_cover=False, extract_to=None): - path = run_plugins_on_preprocess(path) - ext = os.path.splitext(path)[1] - if not ext: - raise ValueError('Unknown file type: '+path) - ext = ext.lower()[1:] - - if opts.output is None: - opts.output = os.path.splitext(os.path.basename(path))[0]+'.epub' - - with nested(TemporaryDirectory('_any2epub1'), TemporaryDirectory('_any2epub2')) as (tdir1, tdir2): - if ext in ['rar', 'zip', 'oebzip']: - path, ext = unarchive(path, tdir1) - print 'Found %s file in archive'%(ext.upper()) - - if ext in MAP.keys(): - path = MAP[ext](path, tdir2, opts) - ext = 'opf' - - - if re.match(r'((x){0,1}htm(l){0,1})|opf', ext) is None: - raise ValueError('Conversion from %s is not supported'%ext.upper()) - - print 'Creating EPUB file...' - html2epub(path, opts, notification=notification, - create_epub=create_epub, oeb_cover=oeb_cover, - extract_to=extract_to) - -def config(defaults=None): - return common_config(defaults=defaults) - - -def formats(): - return ['html', 'rar', 'zip', 'oebzip']+list(MAP.keys()) - -USAGE = _('''\ -%%prog [options] filename - -Convert any of a large number of ebook formats to a %s file. Supported formats are: %s -''') - -def option_parser(usage=USAGE): - return config().option_parser(usage=usage%('EPUB', formats())) - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) < 2: - parser.print_help() - print 'No input file specified.' - return 1 - any2epub(opts, args[1]) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/epub/from_feeds.py b/src/calibre/ebooks/epub/from_feeds.py deleted file mode 100644 index 6a12353f50..0000000000 --- a/src/calibre/ebooks/epub/from_feeds.py +++ /dev/null @@ -1,71 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Convert periodical content into EPUB ebooks. -''' -import sys, glob, os -from calibre.web.feeds.main import config as feeds2disk_config, USAGE, run_recipe -from calibre.ebooks.epub.from_html import config as html2epub_config -from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.epub.from_html import convert as html2epub -from calibre import strftime, sanitize_file_name - -def config(defaults=None): - c = feeds2disk_config(defaults=defaults) - c.remove('lrf') - c.remove('epub') - c.remove('output_dir') - c.update(html2epub_config(defaults=defaults)) - c.remove('chapter_mark') - return c - -def option_parser(): - c = config() - return c.option_parser(usage=USAGE) - -def convert(opts, recipe_arg, notification=None): - opts.lrf = False - opts.epub = True - if opts.debug: - opts.verbose = 2 - parser = option_parser() - with TemporaryDirectory('_feeds2epub') as tdir: - opts.output_dir = tdir - recipe = run_recipe(opts, recipe_arg, parser, notification=notification) - c = config() - recipe_opts = c.parse_string(recipe.html2epub_options) - c.smart_update(recipe_opts, opts) - opts = recipe_opts - opts.chapter_mark = 'none' - opts.dont_split_on_page_breaks = True - opf = glob.glob(os.path.join(tdir, '*.opf')) - if not opf: - raise Exception('Downloading of recipe: %s failed'%recipe_arg) - opf = opf[0] - - if opts.output is None: - fname = recipe.title + strftime(recipe.timefmt) + '.epub' - opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname)) - - print 'Generating epub...' - opts.encoding = 'utf-8' - opts.remove_paragraph_spacing = True - html2epub(opf, opts, notification=notification) - - -def main(args=sys.argv, notification=None, handler=None): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) != 2 and opts.feeds is None: - parser.print_help() - return 1 - recipe_arg = args[1] if len(args) > 1 else None - convert(opts, recipe_arg, notification=notification) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py deleted file mode 100644 index 3e1ec4c811..0000000000 --- a/src/calibre/ebooks/epub/from_html.py +++ /dev/null @@ -1,547 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Conversion of HTML/OPF files follows several stages: - - * All links in the HTML files or in the OPF manifest are - followed to build up a list of HTML files to be converted. - This stage is implemented by - :function:`calibre.ebooks.html.traverse` and - :class:`calibre.ebooks.html.HTMLFile`. - - * The HTML is pre-processed to make it more semantic. - All links in the HTML files to other resources like images, - stylesheets, etc. are relativized. The resources are copied - into the `resources` sub directory. This is accomplished by - :class:`calibre.ebooks.html.PreProcessor` and - :class:`calibre.ebooks.html.Parser`. - - * The HTML is processed. Various operations are performed. - All style declarations are extracted and consolidated into - a single style sheet. Chapters are auto-detected and marked. - Various font related manipulations are performed. See - :class:`HTMLProcessor`. - - * The processed HTML is saved and the - :module:`calibre.ebooks.epub.split` module is used to split up - large HTML files into smaller chunks. - - * The EPUB container is created. -''' - -import os, sys, cStringIO, logging, re, functools, shutil - -from lxml.etree import XPath -from lxml import html, etree -from PyQt4.Qt import QApplication, QPixmap, Qt - -from calibre.ebooks.html_old import Processor, merge_metadata, get_filelist,\ - opf_traverse, create_metadata, rebase_toc, Link, parser -from calibre.ebooks.epub import config as common_config, tostring -from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.metadata.opf2 import OPF -from calibre.ebooks.epub import initialize_container, PROFILES -from calibre.ebooks.epub.split import split -from calibre.ebooks.epub.pages import add_page_map -from calibre.ebooks.epub.fonts import Rationalizer -from calibre.constants import preferred_encoding -from calibre.customize.ui import run_plugins_on_postprocess -from calibre import walk, CurrentDir, to_unicode, fit_image - -content = functools.partial(os.path.join, u'content') - -def remove_bad_link(element, attribute, link, pos): - if attribute is not None: - if element.tag in ['link']: - element.getparent().remove(element) - else: - element.set(attribute, '') - del element.attrib[attribute] - -def check_links(opf_path, pretty_print): - ''' - Find and remove all invalid links in the HTML files - ''' - logger = logging.getLogger('html2epub') - logger.info('\tChecking files for bad links...') - pathtoopf = os.path.abspath(opf_path) - with CurrentDir(os.path.dirname(pathtoopf)): - opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf)) - html_files = [] - for item in opf.itermanifest(): - if 'html' in item.get('media-type', '').lower(): - f = item.get('href').split('/')[-1] - if isinstance(f, str): - f = f.decode('utf-8') - html_files.append(os.path.abspath(content(f))) - - for path in html_files: - if not os.access(path, os.R_OK): - continue - base = os.path.dirname(path) - root = html.fromstring(open(content(path), 'rb').read(), parser=parser) - for element, attribute, link, pos in list(root.iterlinks()): - link = to_unicode(link) - plink = Link(link, base) - bad = False - if plink.path is not None and not os.path.exists(plink.path): - bad = True - if bad: - remove_bad_link(element, attribute, link, pos) - open(content(path), 'wb').write(tostring(root, pretty_print)) - -def find_html_index(files): - ''' - Given a list of files, find the most likely root HTML file in the - list. - ''' - html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE) - html_files = [f for f in files if html_pat.search(f) is not None] - if not html_files: - raise ValueError(_('Could not find an ebook inside the archive')) - html_files = [(f, os.stat(f).st_size) for f in html_files] - html_files.sort(cmp = lambda x, y: cmp(x[1], y[1])) - html_files = [f[0] for f in html_files] - for q in ('toc', 'index'): - for f in html_files: - if os.path.splitext(os.path.basename(f))[0].lower() == q: - return f, os.path.splitext(f)[1].lower()[1:] - return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:] - -def rescale_images(imgdir, screen_size, log): - pwidth, pheight = screen_size - if QApplication.instance() is None: - QApplication([]) - for f in os.listdir(imgdir): - path = os.path.join(imgdir, f) - if os.path.splitext(f)[1] in ('.css', '.js'): - continue - - p = QPixmap() - p.load(path) - if p.isNull(): - continue - width, height = p.width(), p.height() - scaled, new_width, new_height = fit_image(width, height, pwidth, - pheight) - if scaled: - log.info('Rescaling image: '+f) - p.scaled(new_width, new_height, Qt.IgnoreAspectRatio, - Qt.SmoothTransformation).save(path, 'JPEG') - - - - - -class HTMLProcessor(Processor, Rationalizer): - - def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, stylesheets): - Processor.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, - name='html2epub') - if opts.verbose > 2: - self.debug_tree('parsed') - self.detect_chapters() - - self.extract_css(stylesheets) - if self.opts.base_font_size2 > 0: - self.font_css = self.rationalize(self.external_stylesheets+[self.stylesheet], - self.root, self.opts) - if opts.verbose > 2: - self.debug_tree('nocss') - - if hasattr(self.body, 'xpath'): - for script in list(self.body.xpath('descendant::script')): - script.getparent().remove(script) - - self.fix_markup() - - def convert_image(self, img): - rpath = img.get('src', '') - path = os.path.join(os.path.dirname(self.save_path()), *rpath.split('/')) - if os.path.exists(path) and os.path.isfile(path): - if QApplication.instance() is None: - app = QApplication([]) - app - p = QPixmap() - p.load(path) - if not p.isNull(): - p.save(path + '_calibre_converted.jpg') - os.remove(path) - for key, val in self.resource_map.items(): - if val == rpath: - self.resource_map[key] = rpath+'_calibre_converted.jpg' - img.set('src', rpath+'_calibre_converted.jpg') - - def fix_markup(self): - ''' - Perform various markup transforms to get the output to render correctly - in the quirky ADE. - ''' - # Replace
that are children of as ADE doesn't handle them - if hasattr(self.body, 'xpath'): - for br in self.body.xpath('./br'): - if br.getparent() is None: - continue - try: - sibling = br.itersiblings().next() - except: - sibling = None - br.tag = 'p' - br.text = u'\u00a0' - if (br.tail and br.tail.strip()) or sibling is None or \ - getattr(sibling, 'tag', '') != 'br': - style = br.get('style', '').split(';') - style = filter(None, map(lambda x: x.strip(), style)) - style.append('margin: 0pt; border:0pt; height:0pt') - br.set('style', '; '.join(style)) - else: - sibling.getparent().remove(sibling) - if sibling.tail: - if not br.tail: - br.tail = '' - br.tail += sibling.tail - - - if self.opts.profile.remove_object_tags: - for tag in self.root.xpath('//embed'): - tag.getparent().remove(tag) - for tag in self.root.xpath('//object'): - if tag.get('type', '').lower().strip() in ('image/svg+xml',): - continue - tag.getparent().remove(tag) - - - for tag in self.root.xpath('//title|//style'): - if not tag.text: - tag.getparent().remove(tag) - for tag in self.root.xpath('//script'): - if not tag.text and not tag.get('src', False): - tag.getparent().remove(tag) - - for tag in self.root.xpath('//form'): - tag.getparent().remove(tag) - - for tag in self.root.xpath('//center'): - tag.tag = 'div' - tag.set('style', 'text-align:center') - - if self.opts.linearize_tables: - for tag in self.root.xpath('//table | //tr | //th | //td'): - tag.tag = 'div' - - # ADE can't handle & in an img url - for tag in self.root.xpath('//img[@src]'): - tag.set('src', tag.get('src', '').replace('&', '')) - - - def save(self): - for meta in list(self.root.xpath('//meta')): - meta.getparent().remove(meta) - # Strip all comments since Adobe DE is petrified of them - Processor.save(self, strip_comments=True) - - def remove_first_image(self): - images = self.root.xpath('//img') - if images: - images[0].getparent().remove(images[0]) - return True - return False - - - - -def config(defaults=None): - return common_config(defaults=defaults) - -def option_parser(): - c = config() - return c.option_parser(usage=_('''\ -%prog [options] file.html|opf - -Convert a HTML file to an EPUB ebook. Recursively follows links in the HTML file. -If you specify an OPF file instead of an HTML file, the list of links is takes from -the element of the OPF file. -''')) - -def parse_content(filelist, opts, tdir): - os.makedirs(os.path.join(tdir, 'content', 'resources')) - resource_map, stylesheets = {}, {} - toc = TOC(base_path=tdir, type='root') - stylesheet_map = {} - first_image_removed = False - for htmlfile in filelist: - logging.getLogger('html2epub').debug('Processing %s...'%htmlfile) - hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'), - resource_map, filelist, stylesheets) - if not first_image_removed and opts.remove_first_image: - first_image_removed = hp.remove_first_image() - hp.populate_toc(toc) - hp.save() - stylesheet_map[os.path.basename(hp.save_path())] = \ - [s for s in hp.external_stylesheets + [hp.stylesheet, hp.font_css, hp.override_css] if s is not None] - - logging.getLogger('html2epub').debug('Saving stylesheets...') - if opts.base_font_size2 > 0: - Rationalizer.remove_font_size_information(stylesheets.values()) - for path, css in stylesheets.items(): - raw = getattr(css, 'cssText', css) - if isinstance(raw, unicode): - raw = raw.encode('utf-8') - open(path, 'wb').write(raw) - if toc.count('chapter') > opts.toc_threshold: - toc.purge(['file', 'link', 'unknown']) - if toc.count('chapter') + toc.count('file') > opts.toc_threshold: - toc.purge(['link', 'unknown']) - toc.purge(['link'], max=opts.max_toc_links) - - return resource_map, hp.htmlfile_map, toc, stylesheet_map - -TITLEPAGE = '''\ - - - Cover - - - -
- cover -
- - -''' - -def create_cover_image(src, dest, screen_size, rescale_cover=True): - try: - from PyQt4.Qt import QImage, Qt - if QApplication.instance() is None: - QApplication([]) - im = QImage() - im.load(src) - if im.isNull(): - raise ValueError('Invalid cover image') - if rescale_cover and screen_size is not None: - width, height = im.width(), im.height() - dw, dh = (screen_size[0]-width)/float(width), (screen_size[1]-height)/float(height) - delta = min(dw, dh) - if delta > 0: - nwidth = int(width + delta*(width)) - nheight = int(height + delta*(height)) - im = im.scaled(int(nwidth), int(nheight), Qt.IgnoreAspectRatio, Qt.SmoothTransformation) - im.save(dest) - except: - import traceback - traceback.print_exc() - return False - return True - -def process_title_page(mi, filelist, htmlfilemap, opts, tdir): - old_title_page = None - f = lambda x : os.path.normcase(os.path.normpath(x)) - if not isinstance(mi.cover, basestring): - mi.cover = None - if mi.cover: - if f(filelist[0].path) == f(mi.cover): - old_title_page = htmlfilemap[filelist[0].path] - #logger = logging.getLogger('html2epub') - metadata_cover = mi.cover - if metadata_cover and not os.path.exists(metadata_cover): - metadata_cover = None - - cpath = '/'.join(('resources', '_cover_.jpg')) - cover_dest = os.path.join(tdir, 'content', *cpath.split('/')) - if metadata_cover is not None: - if not create_cover_image(metadata_cover, cover_dest, - opts.profile.screen_size): - metadata_cover = None - specified_cover = opts.cover - if specified_cover and not os.path.exists(specified_cover): - specified_cover = None - if specified_cover is not None: - if not create_cover_image(specified_cover, cover_dest, - opts.profile.screen_size): - specified_cover = None - - cover = metadata_cover if specified_cover is None or (opts.prefer_metadata_cover and metadata_cover is not None) else specified_cover - - if cover is not None: - titlepage = TITLEPAGE%cpath - tp = 'calibre_title_page.html' if old_title_page is None else old_title_page - tppath = os.path.join(tdir, 'content', tp) - with open(tppath, 'wb') as f: - f.write(titlepage) - return tp if old_title_page is None else None, True - elif os.path.exists(cover_dest): - os.remove(cover_dest) - return None, old_title_page is not None - -def find_oeb_cover(htmlfile): - if os.stat(htmlfile).st_size > 2048: - return None - match = re.search(r'(?i)]+src\s*=\s*[\'"](.+?)[\'"]', open(htmlfile, 'rb').read()) - if match: - return match.group(1) - -def condense_ncx(ncx_path): - tree = etree.parse(ncx_path) - for tag in tree.getroot().iter(tag=etree.Element): - if tag.text: - tag.text = tag.text.strip() - if tag.tail: - tag.tail = tag.tail.strip() - compressed = etree.tostring(tree.getroot(), encoding='utf-8') - open(ncx_path, 'wb').write(compressed) - -def convert(htmlfile, opts, notification=None, create_epub=True, - oeb_cover=False, extract_to=None): - htmlfile = os.path.abspath(htmlfile) - if opts.output is None: - opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub' - opts.profile = PROFILES[opts.profile] - opts.output = os.path.abspath(opts.output) - if opts.override_css is not None: - try: - opts.override_css = open(opts.override_css, 'rb').read().decode(preferred_encoding, 'replace') - except: - opts.override_css = opts.override_css.decode(preferred_encoding, 'replace') - if opts.from_opf: - opts.from_opf = os.path.abspath(opts.from_opf) - if opts.from_ncx: - opts.from_ncx = os.path.abspath(opts.from_ncx) - if htmlfile.lower().endswith('.opf'): - opf = OPF(htmlfile, os.path.dirname(os.path.abspath(htmlfile))) - filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding) - if not filelist: - # Bad OPF look for a HTML file instead - htmlfile = find_html_index(walk(os.path.dirname(htmlfile)))[0] - if htmlfile is None: - raise ValueError('Could not find suitable file to convert.') - filelist = get_filelist(htmlfile, opts)[1] - mi = merge_metadata(None, opf, opts) - else: - opf, filelist = get_filelist(htmlfile, opts) - mi = merge_metadata(htmlfile, opf, opts) - opts.chapter = XPath(opts.chapter, - namespaces={'re':'http://exslt.org/regular-expressions'}) - for x in (1, 2, 3): - attr = 'level%d_toc'%x - if getattr(opts, attr): - setattr(opts, attr, XPath(getattr(opts, attr), - namespaces={'re':'http://exslt.org/regular-expressions'})) - else: - setattr(opts, attr, None) - - with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir: - if opts.keep_intermediate: - print 'Intermediate files in', tdir - resource_map, htmlfile_map, generated_toc, stylesheet_map = \ - parse_content(filelist, opts, tdir) - logger = logging.getLogger('html2epub') - resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()] - - - title_page, has_title_page = process_title_page(mi, filelist, htmlfile_map, opts, tdir) - spine = [htmlfile_map[f.path] for f in filelist] - if not oeb_cover and title_page is not None: - spine = [title_page] + spine - mi.cover = None - mi.cover_data = (None, None) - - - mi = create_metadata(tdir, mi, spine, resources) - buf = cStringIO.StringIO() - if mi.toc: - rebase_toc(mi.toc, htmlfile_map, tdir) - if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2: - mi.toc = generated_toc - if opts.from_ncx: - toc = TOC() - toc.read_ncx_toc(opts.from_ncx) - mi.toc = toc - for item in mi.manifest: - if getattr(item, 'mime_type', None) == 'text/html': - item.mime_type = 'application/xhtml+xml' - opf_path = os.path.join(tdir, 'metadata.opf') - with open(opf_path, 'wb') as f: - mi.render(f, buf, 'toc.ncx') - toc = buf.getvalue() - if toc: - with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f: - f.write(toc) - if opts.show_ncx: - print toc - split(opf_path, opts, stylesheet_map) - if opts.page: - logger.info('\tBuilding page map...') - add_page_map(opf_path, opts) - check_links(opf_path, opts.pretty_print) - - opf = OPF(opf_path, tdir) - opf.remove_guide() - oeb_cover_file = None - if oeb_cover and title_page is not None: - oeb_cover_file = find_oeb_cover(os.path.join(tdir, 'content', title_page)) - if has_title_page or (oeb_cover and oeb_cover_file): - opf.create_guide_element() - if has_title_page and not oeb_cover: - opf.add_guide_item('cover', 'Cover', 'content/'+spine[0]) - if oeb_cover and oeb_cover_file: - opf.add_guide_item('cover', 'Cover', 'content/'+oeb_cover_file) - - cpath = os.path.join(tdir, 'content', 'resources', '_cover_.jpg') - if os.path.exists(cpath): - opf.add_path_to_manifest(cpath, 'image/jpeg') - with open(opf_path, 'wb') as f: - f.write(opf.render()) - ncx_path = os.path.join(os.path.dirname(opf_path), 'toc.ncx') - if os.path.exists(ncx_path) and os.stat(ncx_path).st_size > opts.profile.flow_size: - logger.info('Condensing NCX from %d bytes...'%os.stat(ncx_path).st_size) - condense_ncx(ncx_path) - if os.stat(ncx_path).st_size > opts.profile.flow_size: - logger.warn('NCX still larger than allowed size at %d bytes. Menu based Table of Contents may not work on device.'%os.stat(ncx_path).st_size) - - if opts.profile.screen_size is not None: - rescale_images(os.path.join(tdir, 'content', 'resources'), - opts.profile.screen_size, logger) - - if create_epub: - epub = initialize_container(opts.output) - epub.add_dir(tdir) - epub.close() - run_plugins_on_postprocess(opts.output, 'epub') - logger.info(_('Output written to ')+opts.output) - - if opts.show_opf: - print open(opf_path, 'rb').read() - - if opts.extract_to is not None: - if os.path.exists(opts.extract_to): - shutil.rmtree(opts.extract_to) - shutil.copytree(tdir, opts.extract_to) - - if extract_to is not None: - if os.path.exists(extract_to): - shutil.rmtree(extract_to) - shutil.copytree(tdir, extract_to) - - - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) < 2: - parser.print_help() - print _('You must specify an input HTML file') - return 1 - convert(args[1], opts) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index 4ce13720e0..a43ca4e5e3 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -6,9 +6,15 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import os +from urllib import unquote from calibre.customize.conversion import OutputFormatPlugin -from calibre import CurrentDir +from calibre.ptempfile import TemporaryDirectory +from calibre.constants import __appname__, __version__ +from calibre import strftime, guess_type +from lxml import etree + class EPUBOutput(OutputFormatPlugin): @@ -16,7 +22,218 @@ class EPUBOutput(OutputFormatPlugin): author = 'Kovid Goyal' file_type = 'epub' + TITLEPAGE_COVER = '''\ + + + Cover + + + +
+ cover +
+ + +''' + + TITLEPAGE = '''\ + + + + + +

%(title)s

+

+
+
+ calibre +
+
+

%(date)s

+




+

%(author)s

+








+

Produced by %(app)s

+
+
+ + +''' + def convert(self, oeb, output_path, input_plugin, opts, log): - self.log, self.opts = log, opts + self.log, self.opts, self.oeb = log, opts, oeb + + self.workaround_ade_quirks() + + from calibre.ebooks.oeb.transforms.rescale import RescaleImages + RescaleImages()(oeb, opts) + self.insert_cover() + + with TemporaryDirectory('_epub_output') as tdir: + from calibre.customize.ui import plugin_for_output_format + oeb_output = plugin_for_output_format('oeb') + oeb_output.convert(oeb, tdir, input_plugin, opts, log) + opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0] + self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)\ + if x.endswith('.ncx')][0]) + + from calibre.epub import initialize_container + epub = initialize_container(output_path, os.path.basename(opf)) + epub.add_dir(tdir) + epub.close() + + def default_cover(self): + ''' + Create a generic cover for books that dont have a cover + ''' + try: + from calibre.gui2 import images_rc # Needed for access to logo + from PyQt4.Qt import QApplication, QFile, QIODevice + except: + return None + from calibre.ebooks.metadata import authors_to_string + images_rc + m = self.oeb.metadata + title = unicode(m.title[0]) + a = [unicode(x) for x in m.creators if m.role == 'aut'] + author = authors_to_string(a) + if QApplication.instance() is None: QApplication([]) + f = QFile(':/library') + f.open(QIODevice.ReadOnly) + img_data = str(f.readAll()) + id, href = self.oeb.manifest.generate('calibre-logo', + 'calibre-logo.png') + self.oeb.manifest.add(id, href, 'image/png', data=img_data) + html = self.TITLEPAGE%dict(title=title, author=author, + date=strftime('%d %b, %Y'), + app=__appname__ +' '+__version__, + img=href) + id, href = self.oeb.manifest.generate('calibre-titlepage', + 'calibre-titlepage.xhtml') + return self.oeb.manifest.add(id, href, guess_type('t.xhtml')[0], + data=etree.fromstring(html)) + + + def insert_cover(self): + from calibre.ebooks.oeb.base import urldefrag + from calibre import guess_type + g, m = self.oeb.guide, self.oeb.manifest + if 'titlepage' not in g: + if 'cover' in g: + tp = self.TITLEPAGE_COVER%unquote(g['cover'].href) + id, href = m.generate('titlepage', 'titlepage.xhtml') + item = m.add(id, href, guess_type('t.xhtml'), + data=etree.fromstring(tp)) + else: + item = self.default_cover() + else: + item = self.oeb.manifest.hrefs[ + urldefrag(self.oeb.guide['titlepage'].href)[0]] + if item is not None: + self.oeb.spine.insert(0, item, True) + self.oeb.guide.refs['cover'].href = item.href + self.oeb.guide.refs['titlepage'].href = item.href + + + + def condense_ncx(self, ncx_path): + if not self.opts.pretty_print: + tree = etree.parse(ncx_path) + for tag in tree.getroot().iter(tag=etree.Element): + if tag.text: + tag.text = tag.text.strip() + if tag.tail: + tag.tail = tag.tail.strip() + compressed = etree.tostring(tree.getroot(), encoding='utf-8') + open(ncx_path, 'wb').write(compressed) + + + + def workaround_ade_quirks(self): + ''' + Perform various markup transforms to get the output to render correctly + in the quirky ADE. + ''' + from calibre.ebooks.oeb.base import XPNSMAP, XHTML + from lxml.etree import XPath as _XPath + from functools import partial + XPath = partial(_XPath, namespaces=XPNSMAP) + + for x in self.oeb.spine: + root = x.data + body = XPath('//h:body')(root) + if body: + body = body[0] + # Replace
that are children of as ADE doesn't handle them + if hasattr(body, 'xpath'): + for br in body.xpath('./h:br'): + if br.getparent() is None: + continue + try: + sibling = br.itersiblings().next() + except: + sibling = None + br.tag = XHTML('p') + br.text = u'\u00a0' + if (br.tail and br.tail.strip()) or sibling is None or \ + getattr(sibling, 'tag', '') != XHTML('br'): + style = br.get('style', '').split(';') + style = filter(None, map(lambda x: x.strip(), style)) + style.append('margin: 0pt; border:0pt; height:0pt') + br.set('style', '; '.join(style)) + else: + sibling.getparent().remove(sibling) + if sibling.tail: + if not br.tail: + br.tail = '' + br.tail += sibling.tail + + + if self.opts.output_profile.remove_object_tags: + for tag in root.xpath('//h:embed'): + tag.getparent().remove(tag) + for tag in root.xpath('//h:object'): + if tag.get('type', '').lower().strip() in ('image/svg+xml',): + continue + tag.getparent().remove(tag) + + for tag in root.xpath('//h:title|//h:style'): + if not tag.text: + tag.getparent().remove(tag) + for tag in root.xpath('//h:script'): + if not tag.text and not tag.get('src', False): + tag.getparent().remove(tag) + + for tag in root.xpath('//h:form'): + tag.getparent().remove(tag) + + for tag in root.xpath('//h:center'): + tag.tag = XHTML('div') + tag.set('style', 'text-align:center') + + # ADE can't handle & in an img url + for tag in self.root.xpath('//h:img[@src]'): + tag.set('src', tag.get('src', '').replace('&', '')) + + stylesheet = self.oeb.manifest.hrefs['stylesheet.css'] + stylesheet.data.add('a { color: inherit; text-decoration: inherit; ' + 'cursor: default; }') + stylesheet.data.add('a[href] { color: blue; ' + 'text-decoration: underline; cursor:pointer; }') + diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index ab3e90083d..ffafa6d1a2 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -12,13 +12,15 @@ from cStringIO import StringIO from PyQt4.Qt import QFontDatabase from calibre.customize.ui import available_input_formats -from calibre.ebooks.epub.from_html import TITLEPAGE from calibre.ebooks.metadata.opf2 import OPF from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.zipfile import safe_replace, ZipFile from calibre.utils.config import DynamicConfig from calibre.utils.logging import Log +from calibre.ebooks.epub.output import EPUBOutput + +TITLEPAGE = EPUBOutput.TITLEPAGE_COVER def character_count(html): ''' diff --git a/src/calibre/ebooks/oeb/transforms/guide.py b/src/calibre/ebooks/oeb/transforms/guide.py index 06153c5a48..00830b1a8c 100644 --- a/src/calibre/ebooks/oeb/transforms/guide.py +++ b/src/calibre/ebooks/oeb/transforms/guide.py @@ -14,7 +14,10 @@ class Clean(object): from calibre.ebooks.oeb.base import urldefrag self.oeb, self.log, self.opts = oeb, oeb.log, opts - cover_href = '' + protected_hrefs = set([]) + if 'titlepage' in self.oeb.guide: + protected_hrefs.add(urldefrag( + self.oeb.guide['titlepage'].href)[0]) if 'cover' not in self.oeb.guide: covers = [] for x in ('other.ms-coverimage-standard', @@ -32,15 +35,15 @@ class Clean(object): self.log('Choosing %s:%s as the cover'%(ref.type, ref.href)) ref.type = 'cover' self.oeb.guide.refs['cover'] = ref - cover_href = urldefrag(ref.href)[0] + protected_hrefs.add(urldefrag(ref.href)[0]) else: - cover_href = urldefrag(self.oeb.guide.refs['cover'].href)[0] + protected_hrefs.add(urldefrag(self.oeb.guide.refs['cover'].href)[0]) for x in list(self.oeb.guide): href = urldefrag(self.oeb.guide[x].href)[0] - if x.lower() != 'cover': + if x.lower() != ('cover', 'titlepage'): try: - if href != cover_href: + if href not in protected_hrefs: self.oeb.manifest.remove(self.oeb.manifest.hrefs[href]) except KeyError: pass diff --git a/src/calibre/ebooks/oeb/transforms/rescale.py b/src/calibre/ebooks/oeb/transforms/rescale.py new file mode 100644 index 0000000000..5b62e5fda5 --- /dev/null +++ b/src/calibre/ebooks/oeb/transforms/rescale.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from calibre import fit_image + +class RescaleImages(object): + 'Rescale all images to fit inside given screen size' + + def __call__(self, oeb, opts): + from PyQt4.Qt import QApplication, QImage, Qt + from calibre.gui2 import pixmap_to_data + self.oeb, self.opts, self.log = oeb, opts, oeb.log + page_width, page_height = opts.dest.width, opts.dest.height + for item in oeb.manifest: + if item.media_type.startswith('image'): + raw = item.data + if not raw: continue + if QApplication.instance() is None: + QApplication([]) + + img = QImage(10, 10, QImage.Format_ARGB32_Premultiplied) + if not img.loadFromData(raw): continue + width, height = img.width(), img.height() + scaled, new_width, new_height = fit_image(width, height, + page_width, page_height) + if scaled: + self.log('Rescaling image', item.href) + img = img.scaled(new_width, new_height, + Qt.IgnoreAspectRatio, Qt.SmoothTransformation) + item.data = pixmap_to_data(img) + + diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index b54b0ebce0..d3505a5fd9 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -17,7 +17,7 @@ from lxml.cssselect import CSSSelector from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \ urldefrag, rewrite_links, urlunquote -from calibre.ebooks.epub import tostring, rules +from calibre.ebooks.epub import rules XPath = functools.partial(_XPath, namespaces=NAMESPACES) @@ -25,6 +25,9 @@ XPath = functools.partial(_XPath, namespaces=NAMESPACES) SPLIT_ATTR = 'cs' SPLIT_POINT_ATTR = 'csp' +def tostring(root): + return etree.tostring(root, encoding='utf-8') + class SplitError(ValueError): def __init__(self, path, root): diff --git a/src/calibre/ebooks/oeb/transforms/structure.py b/src/calibre/ebooks/oeb/transforms/structure.py index 6499a5e9c4..9240873346 100644 --- a/src/calibre/ebooks/oeb/transforms/structure.py +++ b/src/calibre/ebooks/oeb/transforms/structure.py @@ -11,7 +11,7 @@ import re from lxml import etree from urlparse import urlparse -from calibre.ebooks.oeb.base import XPNSMAP, TOC +from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML XPath = lambda x: etree.XPath(x, namespaces=XPNSMAP) class DetectStructure(object): @@ -63,11 +63,11 @@ class DetectStructure(object): if chapter_mark == 'none': continue elif chapter_mark == 'rule': - mark = etree.Element('hr') + mark = etree.Element(XHTML('hr')) elif chapter_mark == 'pagebreak': - mark = etree.Element('div', style=page_break_after) + mark = etree.Element(XHTML('div'), style=page_break_after) else: # chapter_mark == 'both': - mark = etree.Element('hr', style=page_break_before) + mark = etree.Element(XHTML('hr'), style=page_break_before) elem.addprevious(mark) def create_level_based_toc(self): @@ -114,12 +114,13 @@ class DetectStructure(object): def add_leveled_toc_items(self, item): level1 = XPath(self.opts.level1_toc)(item.data) level1_order = [] + document = item counter = 1 if level1: added = {} for elem in level1: - text, _href = self.elem_to_link(item, elem, counter) + text, _href = self.elem_to_link(document, elem, counter) counter += 1 if text: node = self.oeb.toc.add(text, _href, @@ -132,11 +133,11 @@ class DetectStructure(object): level2 = list(XPath(self.opts.level2_toc)(item.data)) for elem in level2: level1 = None - for item in item.data.iterdescendants(): + for item in document.data.iterdescendants(): if item in added.keys(): level1 = added[item] elif item == elem and level1 is not None: - text, _href = self.elem_to_link(item, elem, counter) + text, _href = self.elem_to_link(document, elem, counter) counter += 1 if text: added2[elem] = level1.add(text, _href, @@ -145,12 +146,12 @@ class DetectStructure(object): level3 = list(XPath(self.opts.level3_toc)(item.data)) for elem in level3: level2 = None - for item in item.data.iterdescendants(): + for item in document.data.iterdescendants(): if item in added2.keys(): level2 = added2[item] elif item == elem and level2 is not None: text, _href = \ - self.elem_to_link(item, elem, counter) + self.elem_to_link(document, elem, counter) counter += 1 if text: level2.add(text, _href,