Initial implementation of EPUB Output plugin

2025-07-07 10:14:46 -04:00 · 2009-04-23 22:31:11 -07:00 · 2009-04-23 22:31:11 -07:00 · 4cd285859b
commit 4cd285859b
parent b93029a4fe
13 changed files with 285 additions and 1205 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -287,13 +287,14 @@ from calibre.ebooks.odt.input import ODTInput
 from calibre.ebooks.rtf.input import RTFInput
 from calibre.ebooks.html.input import HTMLInput
 from calibre.ebooks.oeb.output import OEBOutput
 from calibre.ebooks.epub.output import EPUBOutput
 from calibre.ebooks.txt.output import TXTOutput
 from calibre.ebooks.pdf.output import PDFOutput
 from calibre.customize.profiles import input_profiles, output_profiles
 plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput,
        TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput,
-        FB2Input, ODTInput, RTFInput]
+        FB2Input, ODTInput, RTFInput, EPUBOutput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
                                        x.__name__.endswith('MetadataReader')]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@ -3,7 +3,7 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
-import sys, re
+import re
 from itertools import izip
 from calibre.customize import Plugin as _Plugin
@ -22,7 +22,7 @@ class Plugin(_Plugin):
    fbase  = 12
    fsizes = [5, 7, 9, 12, 13.5, 17, 20, 22, 24]
-    screen_size = (800, 600)
+    screen_size = (1600, 1200)
    dpi = 100
    def __init__(self, *args, **kwargs):
--- a/src/calibre/ebooks/epub/init.py
+++ b/src/calibre/ebooks/epub/init.py
@ -6,32 +6,7 @@ __docformat__ = 'restructuredtext en'
 '''
 Conversion to EPUB.
 '''
 import sys, textwrap, re, os, uuid
 from itertools import cycle
 from calibre.utils.config import Config, StringConfig
 from calibre.utils.zipfile import ZipFile, ZIP_STORED
 from calibre.ebooks.html import tostring
 from lxml import etree
 class DefaultProfile(object):
    '''
    Baseline set of EPUB restrictions. Registered in ``PROFILES`` under the
    key ``'None'``, which the ``--profile`` help text describes as producing a
    device independent EPUB.
    '''
    # NOTE(review): presumably the maximum size of a single flow before it is
    # split; the default of sys.maxint would then mean "no limit" -- confirm
    flow_size            = sys.maxint
    # NOTE(review): presumably (width, height) of the target screen -- confirm
    screen_size          = None
    # Either False or a compiled regular expression (see PRS505)
    remove_special_chars = False
    remove_object_tags   = False
 class PRS505(DefaultProfile):
    '''
    Profile registered in ``PROFILES`` under the key ``'PRS505'``. Overrides
    every attribute of :class:`DefaultProfile`.
    '''
    flow_size            = 270000
    screen_size          = (590, 765)
    # Matches U+200B (zero width space) and U+00AD (soft hyphen)
    remove_special_chars = re.compile(u'[\u200b\u00ad]')
    remove_object_tags   = True
 # Maps the accepted values of the --profile option to the profile classes.
 # The key 'None' selects the device independent DefaultProfile.
 PROFILES = {
            'PRS505' : PRS505,
            'None'   : DefaultProfile,
            }
 def rules(stylesheets):
    for s in stylesheets:
@ -58,152 +33,4 @@ def initialize_container(path_to_container, opf_name='metadata.opf'):
    zf.writestr('META-INF/container.xml', CONTAINER)
    return zf
 def config(defaults=None, name='epub'):
    '''
    Build the configuration object holding the options that control
    conversion to EPUB.

    The options are registered in this order: the common options, the
    ``output``, ``profile`` and ``override_css`` options, the
    "structure detection", "toc" and "page layout" groups and finally the
    options of the ``debug`` group.

    :param defaults: If None a :class:`Config` called `name` is created,
                     otherwise a :class:`StringConfig` is created from
                     `defaults`.
    :param name: Name passed to :class:`Config`. Unused when `defaults` is
                 not None.
    :return: The populated configuration object.
    '''
    desc = _('Options to control the conversion to EPUB')
    if defaults is None:
        c = Config(name, desc)
    else:
        c = StringConfig(defaults, desc)
    # NOTE(review): common_config is neither defined nor imported in the
    # visible part of this module -- confirm where it comes from
    c.update(common_config())
    # Drop the inherited output/zip options, output is re-added below with an
    # EPUB specific help text
    c.remove_opt('output')
    c.remove_opt('zip')
    c.add_opt('output', ['-o', '--output'], default=None,
             help=_('The output EPUB file. If not specified, it is '
                    'derived from the input file name.'))
    c.add_opt('profile', ['--profile'], default='PRS505', choices=list(PROFILES.keys()),
              help=_('Profile of the target device this EPUB is meant for. '
                     'Set to None to create a device independent EPUB. '
                     'The profile is used for device specific restrictions '
                     'on the EPUB. Choices are: ')+str(list(PROFILES.keys())))
    c.add_opt('override_css', ['--override-css'], default=None,
              help=_('Either the path to a CSS stylesheet or raw CSS. '
                     'This CSS will override any existing CSS '
                     'declarations in the source files.'))
    # Options controlling chapter, cover and page detection
    structure = c.add_group('structure detection',
                            _('Control auto-detection of document structure.'))
    structure('chapter', ['--chapter'],
              default="//*[re:match(name(), 'h[1-2]') and "
              "re:test(., 'chapter|book|section|part', 'i')] | "
              "//*[@class = 'chapter']",
            help=_('''\
 An XPath expression to detect chapter titles. The default is to consider <h1> or
 <h2> tags that contain the words "chapter","book","section" or "part" as chapter titles as
 well as any tags that have class="chapter".
 The expression used must evaluate to a list of elements. To disable chapter detection,
 use the expression "/". See the XPath Tutorial in the calibre User Manual for further
 help on using this feature.
 ''').replace('\n', ' '))
    structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both', 'none'],
              default='pagebreak',
              help=_('Specify how to mark detected chapters. A value of '
                     '"pagebreak" will insert page breaks before chapters. '
                     'A value of "rule" will insert a line before chapters. '
                     'A value of "none" will disable chapter marking and a '
                     'value of "both" will use both page breaks and lines '
                     'to mark chapters.'))
    structure('cover', ['--cover'], default=None,
              help=_('Path to the cover to be used for this book'))
    structure('prefer_metadata_cover', ['--prefer-metadata-cover'], default=False,
              action='store_true',
              help=_('Use the cover detected from the source file in preference '
                     'to the specified cover.'))
    structure('remove_first_image', ['--remove-first-image'], default=False,
              help=_('Remove the first image from the input ebook. Useful if '
                     'the first image in the source file is a cover and you '
                     'are specifying an external cover.'))
    structure('dont_split_on_page_breaks', ['--dont-split-on-page-breaks'], default=False,
              help=_('Turn off splitting at page breaks. Normally, input files '
                     'are automatically split at every page break into '
                     'two files. This gives an output ebook that can be parsed '
                     'faster and with less resources. However, splitting is '
                     'slow and if your source file contains a very large '
                     'number of page breaks, you should turn off splitting '
                     'on page breaks.'))
    structure('page', ['--page'], default=None,
              help=_('XPath expression to detect page boundaries for building '
                     'a custom pagination map, as used by AdobeDE. Default is '
                     'not to build an explicit pagination map.'))
    structure('page_names', ['--page-names'], default=None,
              help=_('XPath expression to find the name of each page in the '
                     'pagination map relative to its boundary element. '
                     'Default is to number all pages staring with 1.'))
    # Options controlling the generation of the Table of Contents
    toc = c.add_group('toc',
        _('''\
 Control the automatic generation of a Table of Contents. If an OPF file is detected
 and it specifies a Table of Contents, then that will be used rather than trying
 to auto-generate a Table of Contents.
 ''').replace('\n', ' '))
    toc('max_toc_links', ['--max-toc-links'], default=50,
        help=_('Maximum number of links to insert into the TOC. Set to 0 '
               'to disable. Default is: %default. Links are only added to the '
               'TOC if less than the --toc-threshold number of chapters were detected.'))
    toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False,
        help=_("Don't add auto-detected chapters to the Table of Contents."))
    toc('toc_threshold', ['--toc-threshold'], default=6,
        help=_('If fewer than this number of chapters is detected, then links '
               'are added to the Table of Contents. Default: %default'))
    toc('level1_toc', ['--level1-toc'], default=None,
        help=_('XPath expression that specifies all tags that should be added '
               'to the Table of Contents at level one. If this is specified, '
               'it takes precedence over other forms of auto-detection.'))
    toc('level2_toc', ['--level2-toc'], default=None,
        help=_('XPath expression that specifies all tags that should be added '
               'to the Table of Contents at level two. Each entry is added '
               'under the previous level one entry.'))
    toc('level3_toc', ['--level3-toc'], default=None,
        help=_('XPath expression that specifies all tags that should be added '
               'to the Table of Contents at level three. Each entry is added '
               'under the previous level two entry.'))
    toc('from_ncx', ['--from-ncx'], default=None,
        help=_('Path to a .ncx file that contains the table of contents to use '
               'for this ebook. The NCX file should contain links relative to '
               'the directory it is placed in. See '
               'http://www.niso.org/workrooms/daisy/Z39-86-2005.html#NCX for '
               'an overview of the NCX format.'))
    toc('use_auto_toc', ['--use-auto-toc'], default=False,
        help=_('Normally, if the source file already has a Table of Contents, '
               'it is used in preference to the auto-generated one. '
               'With this option, the auto-generated one is always used.'))
    # Options controlling margins, fonts and other layout aspects
    layout = c.add_group('page layout', _('Control page layout'))
    layout('margin_top', ['--margin-top'], default=5.0,
           help=_('Set the top margin in pts. Default is %default'))
    layout('margin_bottom', ['--margin-bottom'], default=5.0,
           help=_('Set the bottom margin in pts. Default is %default'))
    layout('margin_left', ['--margin-left'], default=5.0,
           help=_('Set the left margin in pts. Default is %default'))
    layout('margin_right', ['--margin-right'], default=5.0,
           help=_('Set the right margin in pts. Default is %default'))
    # Note that the option is stored as base_font_size2 although the command
    # line switch is --base-font-size
    layout('base_font_size2', ['--base-font-size'], default=12.0,
           help=_('The base font size in pts. Default is %defaultpt. '
                  'Set to 0 to disable rescaling of fonts.'))
    layout('remove_paragraph_spacing', ['--remove-paragraph-spacing'], default=False,
           help=_('Remove spacing between paragraphs. '
                  'Also sets a indent on paragraphs of 1.5em. '
                  'You can override this by adding p {text-indent: 0cm} to '
                  '--override-css. Spacing removal will not work if the source '
                  'file forces inter-paragraph spacing.'))
    layout('no_justification', ['--no-justification'], default=False,
           help=_('Do not force text to be justified in output.'))
    layout('linearize_tables', ['--linearize-tables'], default=False,
           help=_('Remove table markup, converting it into paragraphs. '
                  'This is useful if your source file uses a table to manage layout.'))
    layout('preserve_tag_structure', ['--preserve-tag-structure'], default=False,
           help=_('Preserve the HTML tag structure while splitting large HTML files. '
                  'This is only neccessary if the HTML files contain CSS that '
                  'uses sibling selectors. Enabling this greatly slows down '
                  'processing of large HTML files.'))
    # Debugging aids, all registered in the 'debug' group
    c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
              help=_('Print generated OPF file to stdout'))
    c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',
              help=_('Print generated NCX file to stdout'))
    c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug',
              default=False,
              help=_('Keep intermediate files during processing by html2epub'))
    c.add_opt('extract_to', ['--extract-to'], group='debug', default=None,
              help=_('Extract the contents of the produced EPUB file to the '
                     'specified directory.'))
    return c
--- a/src/calibre/ebooks/epub/fonts.py
+++ b/src/calibre/ebooks/epub/fonts.py
@ -1,300 +0,0 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 '''
 Font size rationalization. See :function:`relativize`.
 '''
 import logging, re, operator, functools, collections, unittest, copy, sys
 from xml.dom import SyntaxErr
 from lxml.cssselect import CSSSelector
 from lxml import etree
 from lxml.html import HtmlElement
 from calibre.ebooks.html_old import fromstring
 from calibre.ebooks.epub import rules
 from cssutils import CSSParser
 # Regular expression fragments used to recognise CSS font-size and
 # line-height values
 num           = r'[-]?\d+|[-]?\d*\.\d+'
 # The bare zero must not match the leading digit of a longer number, otherwise
 # a value such as "0.8em" is read as a font size of zero
 length        = r'(?P<zero>0(?![.\d]))|(?P<num>{num})(?P<unit>%|em|ex|px|in|cm|mm|pt|pc)'.replace('{num}', num)
 absolute_size = r'(?P<abs>(x?x-)?(small|large)|medium)'
 relative_size = r'(?P<rel>smaller|larger)'
 font_size_pat   = re.compile('|'.join((relative_size, absolute_size, length)), re.I)
 line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num))
 # Multipliers that convert a length in the given unit to points
 # NOTE(review): there is no entry for 'px' although both patterns above accept
 # it -- confirm how pixel lengths are meant to be converted
 PTU = {
       'in' : 72.,
       'cm' : 72/2.54,
       'mm' : 72/25.4,
       'pt' : 1.0,
       'pc' : 12.,  # one pica is twelve points
       }
 # Font size in pts assumed for elements without any specified font size
 DEFAULT_FONT_SIZE = 12
 class Rationalizer(object):
    '''
    Collection of class methods that compute the effective font size of every
    element of an HTML tree and generate CSS which expresses those sizes
    relative to the parent element, optionally rescaled to a base font size.
    See :meth:`rationalize` for the entry point.
    '''
    @classmethod
    def specificity(cls, s):
        '''
        Map a CSS specificity tuple to a single integer.

        The components are weighted with decreasing powers of ten, starting
        with ``10**4`` for the first one, so that an earlier component
        outweighs the later ones as long as they are smaller than ten.

        :param s: Iterable of integers, most significant component first.
        :return: The combined integer.
        '''
        return sum([10**(4-i) * x for i,x in enumerate(s)])
    @classmethod
    def compute_font_size(cls, elem):
        '''
        Calculate the effective font size of an element, traversing its
        ancestors as far as necessary. The result is stored in
        ``elem.computed_font_size``, nothing is returned.

        :param elem: Element providing the ``computed_font_size`` and
                     ``specified_font_size`` attributes.
        '''
        cfs = elem.computed_font_size
        if cfs is not None:
            # Already computed
            return
        sfs = elem.specified_font_size
        if callable(sfs):
            # Relative size: it is a function of the parent's computed size
            parent = elem.getparent()
            cls.compute_font_size(parent)
            elem.computed_font_size = sfs(parent.computed_font_size)
        else:
            elem.computed_font_size = sfs
    @classmethod
    def calculate_font_size(cls, style):
        '''
        Return the font size declared by a style object.

        The size is looked for in the ``font`` shorthand, an explicit
        ``fontSize`` takes precedence over it.

        :param style: Object with the string attributes ``font`` and
                      ``fontSize``.
        :return: None if no font size is found, a number of pts for absolute
                 sizes or a callable that maps the font size of the parent to
                 the font size of the element for relative sizes.
        '''
        match = font_size_pat.search(style.font)
        fs = ''
        if match:
            fs = match.group()
        if style.fontSize:
            fs = style.fontSize
        match = font_size_pat.search(fs)
        if match is None:
            return None
        match = match.groupdict()
        unit = match.get('unit', '')
        if unit: unit = unit.lower()
        if unit in PTU:
            return PTU[unit] * float(match['num'])
        if unit in ('em', 'ex'):
            return functools.partial(operator.mul, float(match['num']))
        if unit == '%':
            return functools.partial(operator.mul, float(match['num'])/100.)
        # Do not call the local "abs", that would shadow the builtin
        abs_size = match.get('abs', '')
        if abs_size: abs_size = abs_size.lower()
        if abs_size:
            # NOTE(review): "small", "medium" and "large" all evaluate to 12pt
            # as only the number of x's is counted -- confirm this is intended
            x = (1.2)**(abs_size.count('x') * (-1 if 'small' in abs_size else 1))
            return 12 * x
        if match.get('zero', False):
            return 0.
        # Reached for "smaller"/"larger" and for units without an entry in PTU
        return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8)
    @classmethod
    def resolve_rules(cls, stylesheets):
        '''
        Attach the lists ``fs_rules`` and ``lh_rules`` to every stylesheet that
        does not have them yet. Each entry is a ``[selector, value]`` pair,
        the value being the result of :meth:`calculate_font_size` or the
        line height in pts respectively.

        :param stylesheets: Iterable of parsed stylesheets.
        '''
        for sheet in stylesheets:
            if hasattr(sheet, 'fs_rules'):
                # Already resolved
                continue
            sheet.fs_rules = []
            sheet.lh_rules = []
            for r in sheet:
                if r.type == r.STYLE_RULE:
                    font_size = cls.calculate_font_size(r.style)
                    if font_size is not None:
                        for s in r.selectorList:
                            sheet.fs_rules.append([CSSSelector(s.selectorText), font_size])
                    orig = line_height_pat.search(r.style.lineHeight)
                    if orig is not None:
                        # NOTE(review): raises KeyError for a unit that
                        # line_height_pat accepts but PTU does not contain
                        for s in r.selectorList:
                            sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]])
    @classmethod
    def apply_font_size_rules(cls, stylesheets, root):
        '''
        Add a ``specified_font_size`` attribute to every element that has a
        specified font size.

        Rules are applied in stylesheet and rule order, a later rule
        overwrites the size set by an earlier one.
        '''
        cls.resolve_rules(stylesheets)
        for sheet in stylesheets:
            for selector, font_size in sheet.fs_rules:
                elems = selector(root)
                for elem in elems:
                    elem.specified_font_size = font_size
    @classmethod
    def remove_font_size_information(cls, stylesheets):
        '''
        Strip the font size from every rule of `stylesheets`: the
        ``font-size`` property, the size inside the ``font`` shorthand and any
        ``line-height`` matched by ``line_height_pat``.
        '''
        for r in rules(stylesheets):
            r.style.removeProperty('font-size')
            try:
                new = font_size_pat.sub('', r.style.font).strip()
                if new:
                    r.style.font = new
                else:
                    r.style.removeProperty('font')
            except SyntaxErr:
                # What is left of the shorthand is not valid, drop it entirely
                r.style.removeProperty('font')
            if line_height_pat.search(r.style.lineHeight) is not None:
                r.style.removeProperty('line-height')
    @classmethod
    def compute_font_sizes(cls, root, stylesheets, base=12):
        '''
        Compute the font size of every element and return a stylesheet that
        sets relative font sizes (and rescaled line heights) via id selectors.

        :param root: Root element of the HTML tree. Elements that need a rule
                     and have no id are given one of the form ``cfs_<n>``.
        :param stylesheets: Stylesheets of the document, objects without a
                            ``cssText`` attribute are ignored. Font size
                            information is removed from the last one.
        :param base: Font size in pts that the most common font size of the
                     document is rescaled to. Rescaling is skipped if it is
                     not positive.
        :return: The stylesheet with the generated rules.
        '''
        stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')]
        cls.apply_font_size_rules(stylesheets, root)
        # Compute the effective font size of all tags
        root.computed_font_size = DEFAULT_FONT_SIZE
        for elem in root.iter(etree.Element):
            cls.compute_font_size(elem)
        extra_css = {}
        # Both are needed after the rescaling step as well, so they must be
        # defined even when rescaling is disabled
        body = root.xpath('//body')[0]
        counter = 0
        if base > 0:
            # Calculate the "base" (i.e. most common) font size
            font_sizes = collections.defaultdict(lambda : 0)
            IGNORE = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
            for elem in body.iter(etree.Element):
                if elem.tag not in IGNORE:
                    t = getattr(elem, 'text', '')
                    if t: t = t.strip()
                    if t:
                        font_sizes[elem.computed_font_size] += len(t)
                # The tail is rendered with the font size of the parent
                t = getattr(elem, 'tail', '')
                if t: t = t.strip()
                if t:
                    parent = elem.getparent()
                    if parent.tag not in IGNORE:
                        font_sizes[parent.computed_font_size] += len(t)
            try:
                most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0]
                # float() guards against integer division of two ints
                scale = float(base)/most_common if most_common > 0 else 1.
            except ValueError:
                # No text at all
                scale = 1.
            # rescale absolute line-heights
            for sheet in stylesheets:
                for selector, lh in sheet.lh_rules:
                    for elem in selector(root):
                        elem.set('id', elem.get('id', 'cfs_%d'%counter))
                        counter += 1
                        extra_css.setdefault(elem.get('id'), []).append('line-height:%fpt'%(lh*scale))
            # Rescale all computed font sizes
            for elem in body.iter(etree.Element):
                if isinstance(elem, HtmlElement):
                    elem.computed_font_size *= scale
        # Remove all font size specifications from the last stylesheet
        cls.remove_font_size_information(stylesheets[-1:])
        # Create the CSS to implement the rescaled font sizes
        for elem in body.iter(etree.Element):
            cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent()))
            # Only emit a rule if the size differs noticeably from the parent
            if abs(cfs-pcfs) > 1/12. and abs(pcfs) > 1/12.:
                elem.set('id', elem.get('id', 'cfs_%d'%counter))
                counter += 1
                extra_css.setdefault(elem.get('id'), []).append('font-size: %f%%'%(100*(cfs/pcfs)))
        css = CSSParser(loglevel=logging.ERROR).parseString('')
        for id, r in extra_css.items():
            css.add('#%s {%s}'%(id, ';'.join(r)))
        return css
    @classmethod
    def rationalize(cls, stylesheets, root, opts):
        '''
        Rationalize the font sizes of a document.

        Nothing is done unless ``opts.base_font_size2`` is positive. A failure
        of :meth:`compute_font_sizes` is logged instead of being propagated.

        :param opts: Object with the attributes ``base_font_size2`` and
                     ``verbose``.
        :return: The stylesheet created by :meth:`compute_font_sizes` or None
                 if it was not run or failed.
        '''
        logger     = logging.getLogger('html2epub')
        logger.info('\t\tRationalizing fonts...')
        extra_css = None
        if opts.base_font_size2 > 0:
            try:
                extra_css = cls.compute_font_sizes(root, stylesheets, base=opts.base_font_size2)
            except Exception:
                # Best effort: the conversion goes on without rescaled fonts.
                # Interrupts and interpreter exits are deliberately not caught.
                logger.warning('Failed to rationalize font sizes.')
                if opts.verbose > 1:
                    logger.exception('')
            finally:
                # NOTE(review): relies on root providing a method of this
                # name, it is not the class method defined above -- confirm
                root.remove_font_size_information()
        logger.debug('\t\tDone rationalizing')
        return extra_css
 ################################################################################
 ############## Testing
 ################################################################################
 class FontTest(unittest.TestCase):
    # Tests for Rationalizer. Every test works on a copy of the small document
    # that setUp parses.
    def setUp(self):
        from calibre.ebooks.epub import config
        option_config = config(defaults='')
        self.opts = option_config.parse()
        self.html = '''
        <html>
            <head>
                <title>Test document</title>
            </head>
            <body>
                <div id="div1">
                <!-- A comment -->
                    <p id="p1">Some <b>text</b></p>
                </div>
                <p id="p2">Some other <span class="it">text</span>.</p>
                <p id="longest">The longest piece of single font size text in this entire file. Used to test resizing.</p>
            </body>
        </html>
        '''
        self.root = fromstring(self.html)
    def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1):
        # Compute the font sizes of one copy of the document from `css`, then
        # recompute them on a second copy from the generated stylesheet alone.
        # Both must agree. Returns the text of the generated stylesheet.
        original = copy.deepcopy(self.root)
        original.computed_font_size = DEFAULT_FONT_SIZE
        parser = CSSParser(loglevel=logging.ERROR)
        source_sheet = parser.parseString(css)
        generated_sheet = Rationalizer.compute_font_sizes(original, [source_sheet], base)
        processed = copy.deepcopy(original)
        processed.remove_font_size_information()
        processed.computed_font_size = DEFAULT_FONT_SIZE
        Rationalizer.apply_font_size_rules([generated_sheet], processed)
        for node in processed.iter(etree.Element):
            Rationalizer.compute_font_size(node)
        original_elems = original.xpath('//body')[0].iter(etree.Element)
        processed_elems = processed.xpath('//body')[0].iter(etree.Element)
        for before, after in zip(original_elems, processed_elems):
            location = original.getroottree().getpath(before)
            message = 'Computed font sizes for %s not equal. Original: %f Processed: %f'%\
                (location, before.computed_font_size, after.computed_font_size)
            self.assertAlmostEqual(before.computed_font_size, after.computed_font_size,
                msg=message)
        return generated_sheet.cssText
    def testStripping(self):
        'Test that any original entries are removed from the CSS'
        source = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }'
        sheet = CSSParser(loglevel=logging.ERROR).parseString(source)
        document = copy.deepcopy(self.root)
        Rationalizer.compute_font_sizes(document, [sheet])
        # Compare without any whitespace
        compact = sheet.cssText.replace(' ', '').replace('\n', '')
        self.assertEqual(compact, 'p{font:bolditalic}')
    def testIdentity(self):
        'Test that no unnecessary font size changes are made'
        generated = self.do_test('div {font-size:12pt} \nspan {font-size:100%}')
        self.assertEqual(generated.strip(), '')
    def testRelativization(self):
        'Test conversion of absolute to relative sizes'
        css = '#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}'
        self.do_test(css)
    def testResizing(self):
        'Test resizing of fonts'
        css = '#longest {font: 24pt} .it {font:20pt; line-height:22pt}'
        self.do_test(css)
 def suite():
    '''Return a test suite made of all the tests of :class:`FontTest`.'''
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(FontTest)
 def test():
    '''Run the suite returned by :func:`suite` with a verbose text runner.'''
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(suite())
 if __name__ == '__main__':
    sys.exit(test())
--- a/src/calibre/ebooks/epub/from_any.py
+++ b/src/calibre/ebooks/epub/from_any.py
@ -1,93 +0,0 @@
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 '''
 Convert any ebook format to epub.
 '''
 import sys, os, re
 from contextlib import nested
 from calibre import extract, walk
 from calibre.ebooks import DRMError
 from calibre.ebooks.epub import config as common_config
 from calibre.ebooks.epub.from_html import convert as html2epub, find_html_index
 from calibre.ptempfile import TemporaryDirectory
 from calibre.utils.zipfile import ZipFile
 from calibre.customize.ui import run_plugins_on_preprocess
 SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf',
                  'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub']
 def unarchive(path, tdir):
    '''
    Extract the archive `path` into the directory `tdir` and pick the file
    that should be converted.

    The extracted files are searched for an OPF file first and then for each
    of the extensions in ``MAP``. TXT and RTF files smaller than 2048 bytes
    are skipped. If nothing is found, the result of :func:`find_html_index`
    is returned instead.

    :return: A tuple of the path to the chosen file and its extension.
    '''
    extract(path, tdir)
    extracted = list(walk(tdir))
    candidates = ['opf'] + list(MAP.keys())
    for ext in candidates:
        suffix = '.'+ext
        for f in extracted:
            if not f.lower().endswith(suffix):
                continue
            if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048:
                continue
            return f, ext
    return find_html_index(extracted)
 def any2epub(opts, path, notification=None, create_epub=True,
             oeb_cover=False, extract_to=None):
    path = run_plugins_on_preprocess(path)
    ext = os.path.splitext(path)[1]
    if not ext:
        raise ValueError('Unknown file type: '+path)
    ext = ext.lower()[1:]
    if opts.output is None:
        opts.output = os.path.splitext(os.path.basename(path))[0]+'.epub'
    with nested(TemporaryDirectory('_any2epub1'), TemporaryDirectory('_any2epub2')) as (tdir1, tdir2):
        if ext in ['rar', 'zip', 'oebzip']:
            path, ext = unarchive(path, tdir1)
            print 'Found %s file in archive'%(ext.upper())
        if ext in MAP.keys():
            path = MAP[ext](path, tdir2, opts)
            ext = 'opf'
        if re.match(r'((x){0,1}htm(l){0,1})|opf', ext) is None:
            raise ValueError('Conversion from %s is not supported'%ext.upper())
        print 'Creating EPUB file...'
        html2epub(path, opts, notification=notification,
                  create_epub=create_epub, oeb_cover=oeb_cover,
                  extract_to=extract_to)
 def config(defaults=None):
    '''Return the common EPUB conversion configuration built from `defaults`.'''
    epub_config = common_config(defaults=defaults)
    return epub_config
 def formats():
    '''
    Return the list of supported input formats: HTML, the archive formats and
    every key of ``MAP``.
    '''
    supported = ['html', 'rar', 'zip', 'oebzip']
    supported.extend(MAP.keys())
    return supported
 USAGE = _('''\
 %%prog [options] filename
 Convert any of a large number of ebook formats to a %s file. Supported formats are: %s
 ''')
 def option_parser(usage=USAGE):
    '''
    Return the command line option parser.

    :param usage: Usage template with two placeholders, they are filled in
                  with ``'EPUB'`` and the result of :func:`formats`.
    '''
    parser_config = config()
    return parser_config.option_parser(usage=usage%('EPUB', formats()))
 def main(args=sys.argv):
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) < 2:
        parser.print_help()
        print 'No input file specified.'
        return 1
    any2epub(opts, args[1])
    return 0
 if __name__ == '__main__':
    sys.exit(main())
--- a/src/calibre/ebooks/epub/from_feeds.py
+++ b/src/calibre/ebooks/epub/from_feeds.py
@ -1,71 +0,0 @@
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 '''
 Convert periodical content into EPUB ebooks.
 '''
 import sys, glob, os
 from calibre.web.feeds.main import config as feeds2disk_config, USAGE, run_recipe
 from calibre.ebooks.epub.from_html import config as html2epub_config
 from calibre.ptempfile import TemporaryDirectory
 from calibre.ebooks.epub.from_html import convert as html2epub
 from calibre import strftime, sanitize_file_name
 def config(defaults=None):
    '''
    Return the feeds2disk configuration merged with the html2epub
    configuration.

    The options ``lrf``, ``epub`` and ``output_dir`` are removed before the
    merge and ``chapter_mark`` is removed after it.
    '''
    merged = feeds2disk_config(defaults=defaults)
    for option in ('lrf', 'epub', 'output_dir'):
        merged.remove(option)
    merged.update(html2epub_config(defaults=defaults))
    merged.remove('chapter_mark')
    return merged
 def option_parser():
    '''Return the command line option parser built from :func:`config`.'''
    return config().option_parser(usage=USAGE)
 def convert(opts, recipe_arg, notification=None):
    opts.lrf  = False
    opts.epub = True
    if opts.debug:
        opts.verbose = 2
    parser = option_parser()
    with TemporaryDirectory('_feeds2epub') as tdir:
        opts.output_dir = tdir
        recipe = run_recipe(opts, recipe_arg, parser, notification=notification)
        c = config()
        recipe_opts = c.parse_string(recipe.html2epub_options)
        c.smart_update(recipe_opts, opts)
        opts = recipe_opts
        opts.chapter_mark = 'none'
        opts.dont_split_on_page_breaks = True
        opf = glob.glob(os.path.join(tdir, '*.opf'))
        if not opf:
            raise Exception('Downloading of recipe: %s failed'%recipe_arg)
        opf = opf[0]
        if opts.output is None:
            fname = recipe.title + strftime(recipe.timefmt) + '.epub'
            opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname))
        print 'Generating epub...'
        opts.encoding = 'utf-8'
        opts.remove_paragraph_spacing = True
        html2epub(opf, opts, notification=notification)
 def main(args=sys.argv, notification=None, handler=None):
    '''
    Command line entry point.

    :param args: Argument list, the recipe is expected at index 1 of the
                 positional arguments.
    :param notification: Passed on to :func:`convert`.
    :param handler: Unused.
    :return: 0 after the conversion was run, 1 if the help was printed.
    '''
    parser = option_parser()
    opts, positional = parser.parse_args(args)
    if len(positional) != 2 and opts.feeds is None:
        parser.print_help()
        return 1
    if len(positional) > 1:
        recipe_arg = positional[1]
    else:
        recipe_arg = None
    convert(opts, recipe_arg, notification=notification)
    return 0
 if __name__ == '__main__':
    sys.exit(main())
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@ -1,547 +0,0 @@
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 '''
 Conversion of HTML/OPF files follows several stages:
    * All links in the HTML files or in the OPF manifest are
    followed to build up a list of HTML files to be converted.
    This stage is implemented by
    :function:`calibre.ebooks.html.traverse` and
    :class:`calibre.ebooks.html.HTMLFile`.
    * The HTML is pre-processed to make it more semantic.
    All links in the HTML files to other resources like images,
    stylesheets, etc. are relativized. The resources are copied
    into the `resources` sub directory. This is accomplished by
    :class:`calibre.ebooks.html.PreProcessor` and
    :class:`calibre.ebooks.html.Parser`.
    * The HTML is processed. Various operations are performed.
    All style declarations are extracted and consolidated into
    a single style sheet. Chapters are auto-detected and marked.
    Various font related manipulations are performed. See
    :class:`HTMLProcessor`.
    * The processed HTML is saved and the
    :module:`calibre.ebooks.epub.split` module is used to split up
    large HTML files into smaller chunks.
    * The EPUB container is created.
 '''
 import os, sys, cStringIO, logging, re, functools, shutil
 from lxml.etree import XPath
 from lxml import html, etree
 from PyQt4.Qt import QApplication, QPixmap, Qt
 from calibre.ebooks.html_old import Processor, merge_metadata, get_filelist,\
    opf_traverse, create_metadata, rebase_toc, Link, parser
 from calibre.ebooks.epub import config as common_config, tostring
 from calibre.ptempfile import TemporaryDirectory
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ebooks.epub import initialize_container, PROFILES
 from calibre.ebooks.epub.split import split
 from calibre.ebooks.epub.pages import add_page_map
 from calibre.ebooks.epub.fonts import Rationalizer
 from calibre.constants import preferred_encoding
 from calibre.customize.ui import run_plugins_on_postprocess
 from calibre import walk, CurrentDir, to_unicode, fit_image
 content = functools.partial(os.path.join, u'content')
 def remove_bad_link(element, attribute, link, pos):
    '''
    Neutralise a link that does not resolve.

    ``<link>`` elements are removed from their parent altogether; for any
    other element only the offending attribute is dropped. Links that do not
    live in an attribute (``attribute is None``) are left alone. `link` and
    `pos` are accepted but not used.
    '''
    if attribute is None:
        return
    if element.tag in ['link']:
        element.getparent().remove(element)
        return
    element.set(attribute, '')
    del element.attrib[attribute]
 def check_links(opf_path, pretty_print):
    '''
    Find and remove all invalid links in the HTML files

    Every manifest item whose media type mentions ``html`` is looked up in
    the ``content`` directory next to the OPF. Links whose resolved path
    does not exist are removed with :func:`remove_bad_link` and the file is
    rewritten in place.

    :param opf_path: Path to the OPF file.
    :param pretty_print: Passed through to `tostring` when re-serialising.
    '''
    logger = logging.getLogger('html2epub')
    logger.info('\tChecking files for bad links...')
    pathtoopf = os.path.abspath(opf_path)
    with CurrentDir(os.path.dirname(pathtoopf)):
        html_files = []
        # Keep the stream open while the manifest is consumed and make sure
        # it is closed afterwards.
        with open(pathtoopf, 'rb') as stream:
            opf = OPF(stream, os.path.dirname(pathtoopf))
            for item in opf.itermanifest():
                if 'html' in item.get('media-type', '').lower():
                    f = item.get('href').split('/')[-1]
                    if isinstance(f, str):
                        f = f.decode('utf-8')
                    html_files.append(os.path.abspath(content(f)))
        for path in html_files:
            if not os.access(path, os.R_OK):
                continue
            base = os.path.dirname(path)
            with open(content(path), 'rb') as src:
                raw = src.read()
            root = html.fromstring(raw, parser=parser)
            # iterlinks() is materialised first because removing elements
            # while iterating would disturb the traversal
            for element, attribute, link, pos in list(root.iterlinks()):
                plink = Link(to_unicode(link), base)
                if plink.path is not None and not os.path.exists(plink.path):
                    remove_bad_link(element, attribute, link, pos)
            # Serialise before opening for writing so that a failure in
            # tostring() cannot leave a truncated file behind
            data = tostring(root, pretty_print)
            with open(content(path), 'wb') as dest:
                dest.write(data)
 def find_html_index(files):
    '''
    Given a list of files, find the most likely root HTML file in the
    list.

    A file whose base name (without extension, case-insensitive) is ``toc``
    is preferred, then one named ``index``; if several match, the smallest
    one wins. Otherwise the largest HTML file is returned.

    :param files: Iterable of paths. Every path that looks like HTML must
                  exist, since its size is read with ``os.stat``.
    :return: ``(path, extension)`` with the extension lower-cased and
             without the leading dot.
    :raises ValueError: If no ``.htm``/``.html``/``.xhtm``/``.xhtml`` file
                        is present.
    '''
    html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE)
    html_files = [f for f in files if html_pat.search(f) is not None]
    if not html_files:
        raise ValueError(_('Could not find an ebook inside the archive'))
    # Ascending by size. key= yields the same stable order as the former
    # cmp= comparison and stats every file exactly once.
    html_files.sort(key=lambda path: os.stat(path).st_size)
    for q in ('toc', 'index'):
        for f in html_files:
            stem, ext = os.path.splitext(os.path.basename(f))
            if stem.lower() == q:
                return f, ext.lower()[1:]
    largest = html_files[-1]
    return largest, os.path.splitext(largest)[1].lower()[1:]
 def rescale_images(imgdir, screen_size, log):
    '''
    Shrink every loadable image in `imgdir` that `fit_image` reports as
    needing scaling for `screen_size`, overwriting the file in place.

    Files ending in ``.css`` or ``.js`` and files Qt cannot load are skipped.
    '''
    max_width, max_height = screen_size
    if QApplication.instance() is None:
        QApplication([])
    for name in os.listdir(imgdir):
        if os.path.splitext(name)[1] in ('.css', '.js'):
            continue
        full_path = os.path.join(imgdir, name)
        pixmap = QPixmap()
        pixmap.load(full_path)
        if pixmap.isNull():
            continue
        needs_scaling, new_width, new_height = fit_image(
                pixmap.width(), pixmap.height(), max_width, max_height)
        if not needs_scaling:
            continue
        log.info('Rescaling image: '+name)
        resized = pixmap.scaled(new_width, new_height, Qt.IgnoreAspectRatio,
                Qt.SmoothTransformation)
        resized.save(full_path, 'JPEG')
 class HTMLProcessor(Processor, Rationalizer):
    '''
    Processes a single HTML file for EPUB output: detects chapters, extracts
    CSS, optionally rationalizes font sizes and applies markup fixes for
    Adobe Digital Editions (ADE).
    '''
    def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, stylesheets):
        '''
        Parse `htmlfile` via `Processor.__init__` and immediately run the
        whole processing pipeline on it. `stylesheets` is handed to
        `extract_css`.
        '''
        Processor.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles,
                           name='html2epub')
        if opts.verbose > 2:
            self.debug_tree('parsed')
        self.detect_chapters()
        self.extract_css(stylesheets)
        # Font rationalization is only performed when a base font size was
        # requested
        if self.opts.base_font_size2 > 0:
            self.font_css = self.rationalize(self.external_stylesheets+[self.stylesheet],
                                             self.root, self.opts)
        if opts.verbose > 2:
            self.debug_tree('nocss')
        # Drop all <script> elements found below <body>
        if hasattr(self.body, 'xpath'):
            for script in list(self.body.xpath('descendant::script')):
                script.getparent().remove(script)
        self.fix_markup()
    def convert_image(self, img):
        '''
        Re-save the image referenced by `img` under its old path with
        ``_calibre_converted.jpg`` appended, delete the original and update
        `self.resource_map` accordingly.
        '''
        rpath = img.get('src', '')
        path = os.path.join(os.path.dirname(self.save_path()), *rpath.split('/'))
        if os.path.exists(path) and os.path.isfile(path):
            if QApplication.instance() is None:
                app = QApplication([])
                app
            p = QPixmap()
            p.load(path)
            if not p.isNull():
                p.save(path + '_calibre_converted.jpg')
                os.remove(path)
                for key, val in self.resource_map.items():
                    if val == rpath:
                        self.resource_map[key] = rpath+'_calibre_converted.jpg'
        # NOTE(review): src is rewritten even when the file was missing or
        # could not be loaded above - confirm this is intended
        img.set('src', rpath+'_calibre_converted.jpg')
    def fix_markup(self):
        '''
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.
        '''
        # Replace <br> that are children of <body> as ADE doesn't handle them
        if hasattr(self.body, 'xpath'):
            for br in self.body.xpath('./br'):
                # Skip <br> elements already detached as the sibling of an
                # earlier <br>
                if br.getparent() is None:
                    continue
                try:
                    sibling = br.itersiblings().next()
                except:
                    sibling = None
                # The <br> becomes a <p> holding a single non-breaking space
                br.tag = 'p'
                br.text = u'\u00a0'
                if (br.tail and br.tail.strip()) or sibling is None or \
                   getattr(sibling, 'tag', '') != 'br':
                    # Lone <br>: collapse the replacement paragraph to zero
                    # height while keeping any style it already had
                    style = br.get('style', '').split(';')
                    style = filter(None, map(lambda x: x.strip(), style))
                    style.append('margin: 0pt; border:0pt; height:0pt')
                    br.set('style', '; '.join(style))
                else:
                    # <br><br>: merge the pair into the one paragraph and
                    # keep the text that followed the second <br>
                    sibling.getparent().remove(sibling)
                    if sibling.tail:
                        if not br.tail:
                            br.tail = ''
                        br.tail += sibling.tail
        if self.opts.profile.remove_object_tags:
            for tag in self.root.xpath('//embed'):
                tag.getparent().remove(tag)
            for tag in self.root.xpath('//object'):
                # SVG objects are kept
                if tag.get('type', '').lower().strip() in ('image/svg+xml',):
                    continue
                tag.getparent().remove(tag)
        # Remove empty <title>/<style> and empty <script> without a src
        for tag in self.root.xpath('//title|//style'):
            if not tag.text:
                tag.getparent().remove(tag)
        for tag in self.root.xpath('//script'):
            if not tag.text and not tag.get('src', False):
                tag.getparent().remove(tag)
        for tag in self.root.xpath('//form'):
            tag.getparent().remove(tag)
        # <center> becomes a centered <div>
        for tag in self.root.xpath('//center'):
            tag.tag = 'div'
            tag.set('style', 'text-align:center')
        if self.opts.linearize_tables:
            for tag in self.root.xpath('//table | //tr | //th | //td'):
                tag.tag = 'div'
        # ADE can't handle &amp; in an img url
        for tag in self.root.xpath('//img[@src]'):
            tag.set('src', tag.get('src', '').replace('&', ''))
    def save(self):
        '''
        Remove all <meta> elements and save the tree with comments stripped.
        '''
        for meta in list(self.root.xpath('//meta')):
            meta.getparent().remove(meta)
        # Strip all comments since Adobe DE is petrified of them
        Processor.save(self, strip_comments=True)
    def remove_first_image(self):
        '''
        Remove the first <img> in the document.

        :return: True if an image was removed, False if there was none.
        '''
        images = self.root.xpath('//img')
        if images:
            images[0].getparent().remove(images[0])
            return True
        return False
 def config(defaults=None):
    '''
    Return the html2epub configuration, which is simply the common EPUB
    configuration.
    '''
    conf = common_config(defaults=defaults)
    return conf
 def option_parser():
    '''
    Return the html2epub command line parser built from :func:`config`.
    '''
    parser_config = config()
    usage = _('''\
 %prog [options] file.html|opf
 Convert a HTML file to an EPUB ebook. Recursively follows links in the HTML file.
 If you specify an OPF file instead of an HTML file, the list of links is takes from
 the <spine> element of the OPF file.
 ''')
    return parser_config.option_parser(usage=usage)
 def parse_content(filelist, opts, tdir):
    '''
    Run :class:`HTMLProcessor` over every file in `filelist`, saving the
    results below ``tdir/content``.

    :param filelist: HTML files to process, in order.
    :param opts: Conversion options.
    :param tdir: Working directory; ``content/resources`` is created in it.
    :return: ``(resource_map, htmlfile_map, toc, stylesheet_map)``.
             `htmlfile_map` is taken from the last processor and is an empty
             dict when `filelist` is empty (this used to raise NameError).
    '''
    os.makedirs(os.path.join(tdir, 'content', 'resources'))
    logger = logging.getLogger('html2epub')
    resource_map, stylesheets = {}, {}
    toc = TOC(base_path=tdir, type='root')
    stylesheet_map = {}
    htmlfile_map = {}
    first_image_removed = False
    for htmlfile in filelist:
        logger.debug('Processing %s...'%htmlfile)
        hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'),
                           resource_map, filelist, stylesheets)
        # Only the very first image across all files is ever removed
        if not first_image_removed and opts.remove_first_image:
            first_image_removed = hp.remove_first_image()
        hp.populate_toc(toc)
        hp.save()
        sheets = hp.external_stylesheets + [hp.stylesheet, hp.font_css, hp.override_css]
        stylesheet_map[os.path.basename(hp.save_path())] = \
            [s for s in sheets if s is not None]
        htmlfile_map = hp.htmlfile_map
    logger.debug('Saving stylesheets...')
    if opts.base_font_size2 > 0:
        Rationalizer.remove_font_size_information(stylesheets.values())
        for path, css in stylesheets.items():
            # Entries are either parsed sheets exposing cssText or raw text
            raw = getattr(css, 'cssText', css)
            if isinstance(raw, unicode):
                raw = raw.encode('utf-8')
            # Close the handle deterministically instead of leaking it
            with open(path, 'wb') as f:
                f.write(raw)
    # Prune the generated TOC progressively until it fits the threshold
    if toc.count('chapter') > opts.toc_threshold:
        toc.purge(['file', 'link', 'unknown'])
    if toc.count('chapter') + toc.count('file') > opts.toc_threshold:
        toc.purge(['link', 'unknown'])
    toc.purge(['link'], max=opts.max_toc_links)
    return resource_map, htmlfile_map, toc, stylesheet_map
 # XHTML template for the generated cover page. The single %s placeholder is
 # filled with the path of the cover image (see process_title_page); literal
 # percent signs are therefore doubled.
 TITLEPAGE = '''\
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
    <head>
        <title>Cover</title>
        <style type="text/css" title="override_css">
            @page {padding: 0pt; margin:0pt}
            body { text-align: center; padding:0pt; margin: 0pt; }
            div { margin: 0pt; padding: 0pt; }
        </style>
    </head>
    <body>
        <div>
            <img src="%s" alt="cover" style="height: 100%%" />
        </div>
    </body>
 </html>
 '''
 def create_cover_image(src, dest, screen_size, rescale_cover=True):
    '''
    Load the image at `src` and save it to `dest`, optionally enlarging it
    towards `screen_size`.

    The image is only ever scaled up: both dimensions are multiplied by the
    same factor, chosen so that the first dimension to reach the screen size
    stops the growth. Images already as large as the screen in either
    dimension are saved unscaled.

    :param src: Path of the source image.
    :param dest: Path to save to.
    :param screen_size: ``(width, height)`` or None to disable rescaling.
    :param rescale_cover: Set to False to disable rescaling.
    :return: True on success, False if anything went wrong (the traceback is
             printed).
    '''
    try:
        from PyQt4.Qt import QImage, Qt
        if QApplication.instance() is None:
            QApplication([])
        im = QImage()
        im.load(src)
        if im.isNull():
            raise ValueError('Invalid cover image')
        if rescale_cover and screen_size is not None:
            width, height = im.width(), im.height()
            # Relative growth each dimension could take before hitting the
            # screen size; the smaller one limits the scale factor
            dw = (screen_size[0]-width)/float(width)
            dh = (screen_size[1]-height)/float(height)
            delta = min(dw, dh)
            if delta > 0:
                nwidth = int(width + delta*(width))
                nheight = int(height + delta*(height))
                im = im.scaled(nwidth, nheight, Qt.IgnoreAspectRatio, Qt.SmoothTransformation)
        im.save(dest)
    except Exception:
        # Best effort: report and signal failure, but do not swallow
        # KeyboardInterrupt/SystemExit as the former bare except did
        import traceback
        traceback.print_exc()
        return False
    return True
 def process_title_page(mi, filelist, htmlfilemap, opts, tdir):
    '''
    Create ``content/resources/_cover_.jpg`` from the metadata cover and/or
    the cover given in `opts`, and write a title page that displays it.

    :return: ``(title_page, has_title_page)``. `title_page` is the name of a
             newly created title page file, or None when no page was created
             or an existing first file was overwritten instead.
    '''
    old_title_page = None
    f = lambda x : os.path.normcase(os.path.normpath(x))
    if not isinstance(mi.cover, basestring):
        mi.cover = None
    if mi.cover:
        # The metadata cover is the first file of the book itself: that file
        # will be overwritten with the generated title page
        if f(filelist[0].path) == f(mi.cover):
            old_title_page = htmlfilemap[filelist[0].path]
    #logger = logging.getLogger('html2epub')
    metadata_cover = mi.cover
    if metadata_cover and not os.path.exists(metadata_cover):
        metadata_cover = None
    cpath = '/'.join(('resources', '_cover_.jpg'))
    cover_dest = os.path.join(tdir, 'content', *cpath.split('/'))
    if metadata_cover is not None:
        if not create_cover_image(metadata_cover, cover_dest,
                                  opts.profile.screen_size):
            metadata_cover = None
    specified_cover = opts.cover
    if specified_cover and not os.path.exists(specified_cover):
        specified_cover = None
    # Written to the same destination, so a usable specified cover overwrites
    # the image produced from the metadata cover above
    if specified_cover is not None:
        if not create_cover_image(specified_cover, cover_dest,
                                  opts.profile.screen_size):
            specified_cover = None
    cover = metadata_cover if specified_cover is None or (opts.prefer_metadata_cover and metadata_cover is not None) else specified_cover
    if cover is not None:
        titlepage = TITLEPAGE%cpath
        tp = 'calibre_title_page.html' if old_title_page is None else old_title_page
        tppath = os.path.join(tdir, 'content', tp)
        # Note: rebinds f, which was the path normalising lambda until here
        with open(tppath, 'wb') as f:
            f.write(titlepage)
        return tp if old_title_page is None else None, True
    elif os.path.exists(cover_dest):
        # No usable cover: do not leave a stale image behind
        os.remove(cover_dest)
    return None, old_title_page is not None
 def find_oeb_cover(htmlfile):
    '''
    Return the ``src`` of the first ``<img>`` in `htmlfile`.

    Files larger than 2048 bytes are not considered to be cover pages and
    are not read at all.

    :return: The value of the src attribute, or None if the file is too
             large or contains no matching ``<img>`` tag.
    '''
    if os.stat(htmlfile).st_size > 2048:
        return None
    # Read through a context manager so the handle is closed promptly
    with open(htmlfile, 'rb') as f:
        raw = f.read()
    match = re.search(r'(?i)<img[^<>]+src\s*=\s*[\'"](.+?)[\'"]', raw)
    if match:
        return match.group(1)
    return None
 def condense_ncx(ncx_path):
    '''
    Shrink the NCX file at `ncx_path` in place by stripping leading and
    trailing whitespace from the text and tail of every element.
    '''
    tree = etree.parse(ncx_path)
    for tag in tree.getroot().iter(tag=etree.Element):
        if tag.text:
            tag.text = tag.text.strip()
        if tag.tail:
            tag.tail = tag.tail.strip()
    compressed = etree.tostring(tree.getroot(), encoding='utf-8')
    # Close the handle deterministically instead of leaking it
    with open(ncx_path, 'wb') as f:
        f.write(compressed)
 def convert(htmlfile, opts, notification=None, create_epub=True,
            oeb_cover=False, extract_to=None):
    '''
    Convert an HTML or OPF file to EPUB.

    :param htmlfile: Path to an HTML file, or to an OPF file (detected by the
                     ``.opf`` extension).
    :param opts: Conversion options. Many attributes are normalised in place
                 (paths made absolute, XPath expressions compiled, the
                 profile name replaced by the profile object).
    :param notification: Not used in this function.
    :param create_epub: If True, write the EPUB container to ``opts.output``.
    :param oeb_cover: If True, the generated title page is not added to the
                      spine and the guide's cover entry points at the image
                      found by `find_oeb_cover` instead.
    :param extract_to: If not None, the working directory is copied there
                       (replacing any existing directory).
    '''
    htmlfile = os.path.abspath(htmlfile)
    if opts.output is None:
        opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub'
    opts.profile = PROFILES[opts.profile]
    opts.output = os.path.abspath(opts.output)
    if opts.override_css is not None:
        # override_css may be a path to a CSS file or the CSS text itself:
        # try reading it as a file first
        try:
            opts.override_css = open(opts.override_css, 'rb').read().decode(preferred_encoding, 'replace')
        except:
            opts.override_css = opts.override_css.decode(preferred_encoding, 'replace')
    if opts.from_opf:
        opts.from_opf = os.path.abspath(opts.from_opf)
    if opts.from_ncx:
        opts.from_ncx = os.path.abspath(opts.from_ncx)
    if htmlfile.lower().endswith('.opf'):
        opf = OPF(htmlfile, os.path.dirname(os.path.abspath(htmlfile)))
        filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
        if not filelist:
            # Bad OPF look for a HTML file instead
            htmlfile = find_html_index(walk(os.path.dirname(htmlfile)))[0]
            if htmlfile is None:
                raise ValueError('Could not find suitable file to convert.')
            filelist = get_filelist(htmlfile, opts)[1]
        mi = merge_metadata(None, opf, opts)
    else:
        opf, filelist = get_filelist(htmlfile, opts)
        mi = merge_metadata(htmlfile, opf, opts)
    # Compile the user supplied XPath expressions once, with EXSLT regular
    # expression support
    opts.chapter = XPath(opts.chapter,
                    namespaces={'re':'http://exslt.org/regular-expressions'})
    for x in (1, 2, 3):
        attr = 'level%d_toc'%x
        if getattr(opts, attr):
            setattr(opts, attr, XPath(getattr(opts, attr),
                      namespaces={'re':'http://exslt.org/regular-expressions'}))
        else:
            setattr(opts, attr, None)
    with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir:
        if opts.keep_intermediate:
            print 'Intermediate files in', tdir
        resource_map, htmlfile_map, generated_toc, stylesheet_map = \
                                        parse_content(filelist, opts, tdir)
        logger = logging.getLogger('html2epub')
        resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()]
        title_page, has_title_page = process_title_page(mi, filelist, htmlfile_map, opts, tdir)
        spine = [htmlfile_map[f.path] for f in filelist]
        if not oeb_cover and title_page is not None:
            spine = [title_page] + spine
        mi.cover = None
        mi.cover_data = (None, None)
        mi = create_metadata(tdir, mi, spine, resources)
        buf = cStringIO.StringIO()
        if mi.toc:
            rebase_toc(mi.toc, htmlfile_map, tdir)
        # Fall back to the auto-generated TOC when requested or when the
        # existing one has fewer than two entries
        if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2:
            mi.toc = generated_toc
        # An explicitly supplied NCX overrides everything else
        if opts.from_ncx:
            toc = TOC()
            toc.read_ncx_toc(opts.from_ncx)
            mi.toc = toc
        for item in mi.manifest:
            if getattr(item, 'mime_type', None) == 'text/html':
                item.mime_type = 'application/xhtml+xml'
        opf_path = os.path.join(tdir, 'metadata.opf')
        # render() writes the OPF to f and the NCX into buf
        with open(opf_path, 'wb') as f:
            mi.render(f, buf, 'toc.ncx')
        toc = buf.getvalue()
        if toc:
            with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f:
                f.write(toc)
            if opts.show_ncx:
                print toc
        split(opf_path, opts, stylesheet_map)
        if opts.page:
            logger.info('\tBuilding page map...')
            add_page_map(opf_path, opts)
        check_links(opf_path, opts.pretty_print)
        # Re-read the OPF and rebuild its guide from scratch
        opf = OPF(opf_path, tdir)
        opf.remove_guide()
        oeb_cover_file = None
        if oeb_cover and title_page is not None:
            oeb_cover_file = find_oeb_cover(os.path.join(tdir, 'content', title_page))
        if has_title_page or (oeb_cover and oeb_cover_file):
            opf.create_guide_element()
            if has_title_page and not oeb_cover:
                opf.add_guide_item('cover', 'Cover', 'content/'+spine[0])
            if oeb_cover and oeb_cover_file:
                opf.add_guide_item('cover', 'Cover', 'content/'+oeb_cover_file)
        cpath = os.path.join(tdir, 'content', 'resources', '_cover_.jpg')
        if os.path.exists(cpath):
            opf.add_path_to_manifest(cpath, 'image/jpeg')
        with open(opf_path, 'wb') as f:
            f.write(opf.render())
        ncx_path = os.path.join(os.path.dirname(opf_path), 'toc.ncx')
        # Condense the NCX only when it exceeds the profile's flow size
        if os.path.exists(ncx_path) and os.stat(ncx_path).st_size > opts.profile.flow_size:
            logger.info('Condensing NCX from %d bytes...'%os.stat(ncx_path).st_size)
            condense_ncx(ncx_path)
            if os.stat(ncx_path).st_size > opts.profile.flow_size:
                logger.warn('NCX still larger than allowed size at %d bytes. Menu based Table of Contents may not work on device.'%os.stat(ncx_path).st_size)
        if opts.profile.screen_size is not None:
            rescale_images(os.path.join(tdir, 'content', 'resources'),
                    opts.profile.screen_size, logger)
        if create_epub:
            epub = initialize_container(opts.output)
            epub.add_dir(tdir)
            epub.close()
            run_plugins_on_postprocess(opts.output, 'epub')
            logger.info(_('Output written to ')+opts.output)
        if opts.show_opf:
            print open(opf_path, 'rb').read()
        # The working directory can be exported both via the option and via
        # the keyword argument; the two are handled independently
        if opts.extract_to is not None:
            if os.path.exists(opts.extract_to):
                shutil.rmtree(opts.extract_to)
            shutil.copytree(tdir, opts.extract_to)
        if extract_to is not None:
            if os.path.exists(extract_to):
                shutil.rmtree(extract_to)
            shutil.copytree(tdir, extract_to)
 def main(args=sys.argv):
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) < 2:
        parser.print_help()
        print _('You must specify an input HTML file')
        return 1
    convert(args[1], opts)
    return 0
 if __name__ == '__main__':
    sys.exit(main())
--- a/src/calibre/ebooks/epub/output.py
+++ b/src/calibre/ebooks/epub/output.py
@ -6,9 +6,15 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import os
 from urllib import unquote
 from calibre.customize.conversion import OutputFormatPlugin
-from calibre import CurrentDir
+from calibre.ptempfile import TemporaryDirectory
 from calibre.constants import __appname__, __version__
 from calibre import strftime, guess_type
 from lxml import etree
 class EPUBOutput(OutputFormatPlugin):
@ -16,7 +22,218 @@ class EPUBOutput(OutputFormatPlugin):
    author = 'Kovid Goyal'
    file_type = 'epub'
    # XHTML title page wrapping an existing cover image. The single %s
    # placeholder receives the href of the image (see insert_cover); literal
    # percent signs are therefore doubled.
    TITLEPAGE_COVER = '''\
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
    <head>
        <title>Cover</title>
        <style type="text/css" title="override_css">
            @page {padding: 0pt; margin:0pt}
            body { text-align: center; padding:0pt; margin: 0pt; }
            div { margin: 0pt; padding: 0pt; }
        </style>
    </head>
    <body>
        <div>
            <img src="%s" alt="cover" style="height: 100%%" />
        </div>
    </body>
 </html>
 '''
    TITLEPAGE = '''\
 <html  xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
    <head>
        <style type="text/css">
            body {
                background: white no-repeat fixed center center;
                text-align: center;
                vertical-align: center;
                overflow: hidden;
                font-size: 18px;
            }
            h1 { font-family: serif; }
            h2, h4 { font-family: monospace; }
        </style>
    </head>
    <body>
        <h1>%(title)s</h1>
        <br/><br/>
        <div style="position:relative">
            <div style="position: absolute; left: 0; top: 0; width:100%%; height:100%%; vertical-align:center">
                <img src="%(img)s" alt="calibre" style="opacity:0.3"/>
            </div>
            <div style="position: absolute; left: 0; top: 0; width:100%%; height:100%%; vertical-align:center">
                <h2>%(date)s</h2>
                <br/><br/><br/><br/><br/>
                <h3>%(author)s</h3>
                <br/><br/></br/><br/><br/><br/><br/><br/><br/>
                <h4>Produced by %(app)s</h4>
            </div>
        </div>
    </body>
 </html>
 '''
    def convert(self, oeb, output_path, input_plugin, opts, log):
-        self.log, self.opts = log, opts
+        self.log, self.opts, self.oeb = log, opts, oeb
        self.workaround_ade_quirks()
        from calibre.ebooks.oeb.transforms.rescale import RescaleImages
        RescaleImages()(oeb, opts)
        self.insert_cover()
        with TemporaryDirectory('_epub_output') as tdir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)\
                    if x.endswith('.ncx')][0])
            from calibre.epub import initialize_container
            epub = initialize_container(output_path, os.path.basename(opf))
            epub.add_dir(tdir)
            epub.close()
    def default_cover(self):
        '''
        Create a generic cover for books that don't have a cover

        Adds the calibre logo and a title page built from
        :attr:`TITLEPAGE` to the manifest.

        :return: The manifest item of the new title page, or None when the
                 GUI resources / PyQt4 cannot be imported.
        '''
        try:
            from calibre.gui2 import images_rc # Needed for access to logo
            from PyQt4.Qt import QApplication, QFile, QIODevice
        except Exception:
            # Best effort: without Qt there is simply no generated cover
            return None
        from calibre.ebooks.metadata import authors_to_string
        images_rc
        m = self.oeb.metadata
        title = unicode(m.title[0])
        # The role belongs to each creator entry, not to the metadata object
        # NOTE(review): confirm the attribute is really named `creators`
        a = [unicode(x) for x in m.creators if x.role == 'aut']
        author = authors_to_string(a)
        if QApplication.instance() is None: QApplication([])
        f = QFile(':/library')
        f.open(QIODevice.ReadOnly)
        img_data = str(f.readAll())
        logo_id, logo_href = self.oeb.manifest.generate('calibre-logo',
                'calibre-logo.png')
        self.oeb.manifest.add(logo_id, logo_href, 'image/png', data=img_data)
        html = self.TITLEPAGE%dict(title=title, author=author,
                date=strftime('%d %b, %Y'),
                app=__appname__ +' '+__version__,
                img=logo_href)
        page_id, page_href = self.oeb.manifest.generate('calibre-titlepage',
                'calibre-titlepage.xhtml')
        return self.oeb.manifest.add(page_id, page_href,
                guess_type('t.xhtml')[0], data=etree.fromstring(html))
    def insert_cover(self):
        '''
        Make sure the book starts with a title page.

        An existing ``titlepage`` guide entry is reused. Otherwise a page
        wrapping the ``cover`` guide entry is generated, or, if there is no
        cover either, the generic page from :meth:`default_cover`. The chosen
        item is inserted at the start of the spine and the existing
        ``cover``/``titlepage`` guide entries are pointed at it.
        '''
        from calibre.ebooks.oeb.base import urldefrag
        from calibre import guess_type
        g, m = self.oeb.guide, self.oeb.manifest
        if 'titlepage' not in g:
            if 'cover' in g:
                tp = self.TITLEPAGE_COVER%unquote(g['cover'].href)
                item_id, href = m.generate('titlepage', 'titlepage.xhtml')
                # Only the first element of guess_type()'s result is the
                # media type, exactly as default_cover() uses it
                item = m.add(item_id, href, guess_type('t.xhtml')[0],
                        data=etree.fromstring(tp))
            else:
                item = self.default_cover()
        else:
            item = m.hrefs[urldefrag(g['titlepage'].href)[0]]
        if item is not None:
            self.oeb.spine.insert(0, item, True)
            # Only retarget guide entries that exist: on the generated-page
            # paths there is no 'titlepage' entry (and possibly no 'cover'),
            # so unconditional indexing raised KeyError.
            # NOTE(review): a missing entry is skipped rather than created -
            # confirm whether a guide reference should be added here
            for ref_type in ('cover', 'titlepage'):
                if ref_type in g:
                    g.refs[ref_type].href = item.href
    def condense_ncx(self, ncx_path):
        '''
        Shrink the NCX file at `ncx_path` in place by stripping leading and
        trailing whitespace from the text and tail of every element. Does
        nothing when pretty printing was requested.
        '''
        if self.opts.pretty_print:
            return
        tree = etree.parse(ncx_path)
        for tag in tree.getroot().iter(tag=etree.Element):
            if tag.text:
                tag.text = tag.text.strip()
            if tag.tail:
                tag.tail = tag.tail.strip()
        compressed = etree.tostring(tree.getroot(), encoding='utf-8')
        # Close the handle deterministically instead of leaking it
        with open(ncx_path, 'wb') as f:
            f.write(compressed)
    def workaround_ade_quirks(self):
        '''
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.

        Applied to every spine item: top level <br> elements become
        paragraphs, <embed>/<object> are removed if the output profile asks
        for it, empty <title>/<style>/<script> and all <form> elements are
        removed, <center> becomes a centered <div> and ampersands are
        stripped from image URLs. Finally link styling rules are added to
        ``stylesheet.css``.
        '''
        from calibre.ebooks.oeb.base import XPNSMAP, XHTML
        from lxml.etree import XPath as _XPath
        from functools import partial
        # Every expression below uses the h: prefix, so all of them must be
        # compiled with the namespace map; a plain element.xpath() call
        # without it cannot resolve the prefix
        XPath = partial(_XPath, namespaces=XPNSMAP)
        for x in self.oeb.spine:
            root = x.data
            body = XPath('//h:body')(root)
            if body:
                body = body[0]
            # Replace <br> that are children of <body> as ADE doesn't handle them
            if hasattr(body, 'xpath'):
                for br in XPath('./h:br')(body):
                    # Skip <br> elements already detached as the sibling of
                    # an earlier <br>
                    if br.getparent() is None:
                        continue
                    try:
                        sibling = br.itersiblings().next()
                    except StopIteration:
                        sibling = None
                    br.tag = XHTML('p')
                    br.text = u'\u00a0'
                    if (br.tail and br.tail.strip()) or sibling is None or \
                    getattr(sibling, 'tag', '') != XHTML('br'):
                        # Lone <br>: collapse the replacement paragraph to
                        # zero height, keeping any style it already had
                        style = [part.strip() for part in
                                br.get('style', '').split(';') if part.strip()]
                        style.append('margin: 0pt; border:0pt; height:0pt')
                        br.set('style', '; '.join(style))
                    else:
                        # <br><br>: merge the pair into one paragraph and
                        # keep the text that followed the second <br>
                        sibling.getparent().remove(sibling)
                        if sibling.tail:
                            if not br.tail:
                                br.tail = ''
                            br.tail += sibling.tail
            if self.opts.output_profile.remove_object_tags:
                for tag in XPath('//h:embed')(root):
                    tag.getparent().remove(tag)
                for tag in XPath('//h:object')(root):
                    # SVG objects are kept
                    if tag.get('type', '').lower().strip() in ('image/svg+xml',):
                        continue
                    tag.getparent().remove(tag)
            for tag in XPath('//h:title|//h:style')(root):
                if not tag.text:
                    tag.getparent().remove(tag)
            for tag in XPath('//h:script')(root):
                if not tag.text and not tag.get('src', False):
                    tag.getparent().remove(tag)
            for tag in XPath('//h:form')(root):
                tag.getparent().remove(tag)
            for tag in XPath('//h:center')(root):
                tag.tag = XHTML('div')
                tag.set('style', 'text-align:center')
            # ADE can't handle &amp; in an img url
            # (the tree is the local `root`; the plugin has no self.root)
            for tag in XPath('//h:img[@src]')(root):
                tag.set('src', tag.get('src', '').replace('&', ''))
        # Added once, after the loop: doing this per spine item appended the
        # same two rules again for every document in the book
        stylesheet = self.oeb.manifest.hrefs['stylesheet.css']
        stylesheet.data.add('a { color: inherit; text-decoration: inherit; '
                'cursor: default; }')
        stylesheet.data.add('a[href] { color: blue; '
                'text-decoration: underline; cursor:pointer; }')
--- a/src/calibre/ebooks/oeb/iterator.py
+++ b/src/calibre/ebooks/oeb/iterator.py
@ -12,13 +12,15 @@ from cStringIO import StringIO
 from PyQt4.Qt import QFontDatabase
 from calibre.customize.ui import available_input_formats
 from calibre.ebooks.epub.from_html import TITLEPAGE
 from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ptempfile import TemporaryDirectory
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.zipfile import safe_replace, ZipFile
 from calibre.utils.config import DynamicConfig
 from calibre.utils.logging import Log
 from calibre.ebooks.epub.output import EPUBOutput
 TITLEPAGE = EPUBOutput.TITLEPAGE_COVER
 def character_count(html):
    '''
--- a/src/calibre/ebooks/oeb/transforms/guide.py
+++ b/src/calibre/ebooks/oeb/transforms/guide.py
@ -14,7 +14,10 @@ class Clean(object):
        from calibre.ebooks.oeb.base import urldefrag
        self.oeb, self.log, self.opts = oeb, oeb.log, opts
-        cover_href = ''
+        protected_hrefs = set([])
        if 'titlepage' in self.oeb.guide:
            protected_hrefs.add(urldefrag(
                self.oeb.guide['titlepage'].href)[0])
        if 'cover' not in self.oeb.guide:
            covers = []
            for x in ('other.ms-coverimage-standard',
@ -32,15 +35,15 @@ class Clean(object):
                    self.log('Choosing %s:%s as the cover'%(ref.type, ref.href))
                ref.type = 'cover'
                self.oeb.guide.refs['cover'] = ref
-                cover_href = urldefrag(ref.href)[0]
+                protected_hrefs.add(urldefrag(ref.href)[0])
        else:
-            cover_href = urldefrag(self.oeb.guide.refs['cover'].href)[0]
+            protected_hrefs.add(urldefrag(self.oeb.guide.refs['cover'].href)[0])
        for x in list(self.oeb.guide):
            href = urldefrag(self.oeb.guide[x].href)[0]
-            if x.lower() != 'cover':
+            if x.lower() != ('cover', 'titlepage'):
                try:
-                    if href != cover_href:
+                    if href not in protected_hrefs:
                        self.oeb.manifest.remove(self.oeb.manifest.hrefs[href])
                except KeyError:
                    pass
--- a/src/calibre/ebooks/oeb/transforms/rescale.py
+++ b/src/calibre/ebooks/oeb/transforms/rescale.py
@ -0,0 +1,37 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from calibre import fit_image
 class RescaleImages(object):
    '''
    Transform that shrinks manifest images which :func:`fit_image` reports
    as too large for the destination profile's page dimensions.
    '''

    def __call__(self, oeb, opts):
        '''
        Rescale, in place, every image item of ``oeb.manifest``.

        :param oeb: Book container; its ``manifest`` is iterated and its
                    ``log`` is used for progress messages.
        :param opts: Conversion options; ``opts.dest.width`` and
                     ``opts.dest.height`` give the bounding box.
        '''
        # Qt is imported lazily so that merely importing this module does
        # not pull in PyQt4.
        from PyQt4.Qt import QApplication, QImage, Qt
        from calibre.gui2 import pixmap_to_data

        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        max_w, max_h = opts.dest.width, opts.dest.height

        for entry in oeb.manifest:
            if not entry.media_type.startswith('image'):
                continue
            payload = entry.data
            if not payload:
                continue
            # NOTE(review): the QApplication created here is not bound to a
            # name -- confirm it stays alive for the rest of the loop.
            if QApplication.instance() is None:
                QApplication([])
            image = QImage(10, 10, QImage.Format_ARGB32_Premultiplied)
            if not image.loadFromData(payload):
                # Data that Qt cannot decode is left untouched
                continue
            needs_scaling, target_w, target_h = fit_image(
                    image.width(), image.height(), max_w, max_h)
            if not needs_scaling:
                continue
            self.log('Rescaling image', entry.href)
            # The target size from fit_image is applied verbatim, hence
            # IgnoreAspectRatio (presumably fit_image already keeps the
            # proportions -- confirm).
            resized = image.scaled(target_w, target_h,
                    Qt.IgnoreAspectRatio, Qt.SmoothTransformation)
            # NOTE(review): pixmap_to_data is called without an explicit
            # format; verify its default matches entry.media_type.
            entry.data = pixmap_to_data(resized)
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@ -17,7 +17,7 @@ from lxml.cssselect import CSSSelector
 from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \
        urldefrag, rewrite_links, urlunquote
-from calibre.ebooks.epub import tostring, rules
+from calibre.ebooks.epub import rules
 XPath = functools.partial(_XPath, namespaces=NAMESPACES)
@ -25,6 +25,9 @@ XPath = functools.partial(_XPath, namespaces=NAMESPACES)
 SPLIT_ATTR       = 'cs'
 SPLIT_POINT_ATTR = 'csp'
 def tostring(root):
    '''Serialize `root` via ``etree.tostring`` using the UTF-8 encoding.'''
    serialized = etree.tostring(root, encoding='utf-8')
    return serialized
 class SplitError(ValueError):
    def __init__(self, path, root):
--- a/src/calibre/ebooks/oeb/transforms/structure.py
+++ b/src/calibre/ebooks/oeb/transforms/structure.py
@ -11,7 +11,7 @@ import re
 from lxml import etree
 from urlparse import urlparse
-from calibre.ebooks.oeb.base import XPNSMAP, TOC
+from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML
 XPath = lambda x: etree.XPath(x, namespaces=XPNSMAP)
 class DetectStructure(object):
@ -63,11 +63,11 @@ class DetectStructure(object):
                if chapter_mark == 'none':
                    continue
                elif chapter_mark == 'rule':
-                    mark = etree.Element('hr')
+                    mark = etree.Element(XHTML('hr'))
                elif chapter_mark == 'pagebreak':
-                    mark = etree.Element('div', style=page_break_after)
+                    mark = etree.Element(XHTML('div'), style=page_break_after)
                else: # chapter_mark == 'both':
-                    mark = etree.Element('hr', style=page_break_before)
+                    mark = etree.Element(XHTML('hr'), style=page_break_before)
                elem.addprevious(mark)
    def create_level_based_toc(self):
@ -114,12 +114,13 @@ class DetectStructure(object):
    def add_leveled_toc_items(self, item):
        level1 = XPath(self.opts.level1_toc)(item.data)
        level1_order = []
        document = item
        counter = 1
        if level1:
            added = {}
            for elem in level1:
-                text, _href = self.elem_to_link(item, elem, counter)
+                text, _href = self.elem_to_link(document, elem, counter)
                counter += 1
                if text:
                    node = self.oeb.toc.add(text, _href,
@ -132,11 +133,11 @@ class DetectStructure(object):
                level2 = list(XPath(self.opts.level2_toc)(item.data))
                for elem in level2:
                    level1 = None
-                    for item in item.data.iterdescendants():
+                    for item in document.data.iterdescendants():
                        if item in added.keys():
                            level1 = added[item]
                        elif item == elem and level1 is not None:
-                            text, _href = self.elem_to_link(item, elem, counter)
+                            text, _href = self.elem_to_link(document, elem, counter)
                            counter += 1
                            if text:
                                added2[elem] = level1.add(text, _href,
@ -145,12 +146,12 @@ class DetectStructure(object):
                    # The level-2 scan rebinds `item` to a tree element, so the
                    # level-3 expression must be evaluated against the saved
                    # `document` rather than `item`.
                    level3 = list(XPath(self.opts.level3_toc)(document.data))
                    for elem in level3:
                        level2 = None
-                        for item in item.data.iterdescendants():
+                        for item in document.data.iterdescendants():
                            if item in added2.keys():
                                level2 = added2[item]
                            elif item == elem and level2 is not None:
                                text, _href = \
-                                        self.elem_to_link(item, elem, counter)
+                                        self.elem_to_link(document, elem, counter)
                                counter += 1
                                if text:
                                    level2.add(text, _href,