Merging of HTML files

2025-07-09 03:04:10 -04:00 · 2013-11-23 09:39:06 +05:30 · 2013-11-23 09:39:06 +05:30 · 099632502f
commit 099632502f
parent 5326452976
4 changed files with 205 additions and 15 deletions
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@ -539,7 +539,7 @@ class Container(object):  # {{{
            spine[-1].tail = last_tail
        self.dirty(self.opf_name)
-    def remove_item(self, name):
+    def remove_item(self, name, remove_from_guide=True):
        '''
        Remove the item identified by name from this container. This removes all
        references to the item in the OPF manifest, guide and spine as well as from
@ -571,10 +571,11 @@ class Container(object):  # {{{
                    self.remove_from_xml(meta)
                    self.dirty(self.opf_name)
-        for item in self.opf_xpath('//opf:guide/opf:reference[@href]'):
+        if remove_from_guide:
-            if self.href_to_name(item.get('href'), self.opf_name) == name:
+            for item in self.opf_xpath('//opf:guide/opf:reference[@href]'):
-                self.remove_from_xml(item)
+                if self.href_to_name(item.get('href'), self.opf_name) == name:
-                self.dirty(self.opf_name)
+                    self.remove_from_xml(item)
                    self.dirty(self.opf_name)
        path = self.name_path_map.pop(name, None)
        if path and os.path.exists(path):
@ -872,7 +873,7 @@ class EpubContainer(Container):
    def names_that_must_not_be_changed(self):
        return super(EpubContainer, self).names_that_must_not_be_changed | {'META-INF/' + x for x in self.META_INF}
-    def remove_item(self, name):
+    def remove_item(self, name, remove_from_guide=True):
        # Handle removal of obfuscated fonts
        if name == 'META-INF/encryption.xml':
            self.obfuscated_fonts.clear()
@ -890,7 +891,7 @@ class EpubContainer(Container):
                if name == self.href_to_name(cr.get('URI')):
                    self.remove_from_xml(em.getparent())
                    self.dirty('META-INF/encryption.xml')
-        super(EpubContainer, self).remove_item(name)
+        super(EpubContainer, self).remove_item(name, remove_from_guide=remove_from_guide)
    def process_encryption(self):
        fonts = {}
--- a/src/calibre/ebooks/oeb/polish/split.py
+++ b/src/calibre/ebooks/oeb/polish/split.py
@ -6,12 +6,16 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
-import copy
+import copy, os
 from future_builtins import map
 from urlparse import urlparse
-from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF
+from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML
 from calibre.ebooks.oeb.polish.toc import node_from_loc
 from calibre.ebooks.oeb.polish.replace import LinkRebaser
 class AbortError(ValueError):
    '''
    Raised by split() and merge() when the requested operation cannot be
    carried out, for example splitting inside a table or merging fewer than
    two files. It subclasses ValueError, so existing handlers that catch
    ValueError keep working.
    '''
 def in_table(node):
    while node is not None:
@ -167,9 +171,9 @@ def split(container, name, loc_or_xpath, before=True):
    else:
        split_point = node_from_loc(root, loc_or_xpath)
    if in_table(split_point):
-        raise ValueError('Cannot split inside tables')
+        raise AbortError('Cannot split inside tables')
    if split_point.tag.endswith('}body'):
-        raise ValueError('Cannot split on the <body> tag')
+        raise AbortError('Cannot split on the <body> tag')
    tree1, tree2 = do_split(split_point, container.log, before=before)
    root1, root2 = tree1.getroot(), tree2.getroot()
    anchors_in_top = frozenset(root1.xpath('//*/@id')) | frozenset(root1.xpath('//*/@name')) | {''}
@ -211,3 +215,157 @@ def split(container, name, loc_or_xpath, before=True):
    container.insert_into_xml(spine, si, index=index)
    container.dirty(container.opf_name)
    return bottom_name
 class MergeLinkReplacer(object):
    '''
    Callable that rewrites a URL found in the file base so that links to a
    merged file point at the master file instead.

    anchor_map maps the name of every merged file to a dict that maps the
    old fragment in that file to its new fragment in master. After a call
    that changed a URL, the attribute replaced is True.
    '''
    def __init__(self, base, anchor_map, master, container):
        self.base = base
        self.master = master
        self.anchor_map = anchor_map
        self.container = container
        self.replaced = False
    def __call__(self, url):
        # Links inside the current document are left untouched
        if url and url.startswith('#'):
            return url
        target = self.container.href_to_name(url, self.base)
        renamed = self.anchor_map.get(target, None)
        if renamed is None:
            # Does not point to one of the merged files
            return url
        fragment = urlparse(url).fragment or ''
        # Use the renamed anchor if there is one, otherwise keep the fragment
        fragment = renamed.get(fragment, fragment)
        new_url = self.container.name_to_href(self.master, self.base) + '#' + fragment
        self.replaced = True
        return new_url
 def add_text(body, text):
    '''
    Append text to the end of the element body: after its last child if it
    has children, otherwise to the text of body itself.
    '''
    if len(body) == 0:
        body.text = (body.text or '') + text
        return
    last_child = body[-1]
    last_child.tail = (last_child.tail or '') + text
 def all_anchors(root):
    ' Return the set of all id and name attribute values found under root '
    anchors = set(root.xpath('//*/@id'))
    anchors.update(root.xpath('//*/@name'))
    return anchors
 def all_stylesheets(container, name):
    '''
    Yield the container names of the stylesheets linked from the <head> of
    the HTML file name.

    A <link> is treated as a stylesheet when its type attribute is
    text/css or is missing.
    '''
    for link in XPath('//h:head/h:link[@href]')(container.parsed(name)):
        # Every href is relative to the HTML file, so name must not be
        # rebound here: it is the base for all the links, not just the first
        sheet = container.href_to_name(link.get('href'), name)
        if link.get('type', 'text/css') == 'text/css':
            yield sheet
 def unique_anchor(seen_anchors, current):
    '''
    Return an anchor based on current that is not in seen_anchors. current
    is returned unchanged if it is unused, otherwise _1, _2, ... are
    appended until an unused value is found. seen_anchors is not modified.
    '''
    if current not in seen_anchors:
        return current
    suffix = 1
    while True:
        candidate = '%s_%d' % (current, suffix)
        if candidate not in seen_anchors:
            return candidate
        suffix += 1
 def remove_name_attributes(root):
    '''
    Remove every name attribute under root. An element that has no id
    attribute gets the removed name as its id; an existing id is kept.
    '''
    for elem in root.xpath('//*[@name]'):
        old_name = elem.attrib.pop('name')
        if 'id' not in elem.attrib:
            elem.set('id', old_name)
 def merge_html(container, names, master):
    '''
    Merge the HTML files in names into the file master.

    For every file other than master: its stylesheets are linked into the
    <head> of master, its links are rebased if it lives in a different
    directory, anchors that clash with already seen anchors are renamed,
    the contents of its <body> tags are appended to the last <body> of
    master and the file is removed from the container (its guide entries
    are kept). Finally all links in the container that pointed to a merged
    file are changed to point to master.

    :param container: The container holding the book
    :param names: The names of the files to merge, including master
    :param master: The name of the file that receives the merged content
    '''
    p = container.parsed
    root = p(master)
    # Ensure master has a <head>
    head = root.find('h:head', namespaces=XPNSMAP)
    if head is None:
        head = root.makeelement(XHTML('head'))
        container.insert_into_xml(root, head, 0)
    seen_anchors = all_anchors(root)
    seen_stylesheets = set(all_stylesheets(container, master))
    master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
    master_base = os.path.dirname(master)
    # Maps merged file name -> {old anchor: new anchor}, the key '' maps to
    # the anchor of the first element of the merged file
    anchor_map = {n:{} for n in names if n != master}
    for name in names:
        if name == master:
            continue
        # Insert new stylesheets into master
        for sheet in all_stylesheets(container, name):
            if sheet not in seen_stylesheets:
                seen_stylesheets.add(sheet)
                link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master))
                container.insert_into_xml(head, link)
        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))
        root = p(name)
        # The leading text and the child elements of every <body>, in order
        children = []
        for body in p(name).findall('h:body', namespaces=XPNSMAP):
            children.append(body.text if body.text and body.text.strip() else '\n\n')
            children.extend(body)
        # Find the first child that is an element rather than a string
        first_child = ''
        for first_child in children:
            if not isinstance(first_child, basestring):
                break
        if isinstance(first_child, basestring):
            # Empty document, ignore
            # NOTE(review): the file is neither merged nor removed here, but
            # it keeps its (empty) entry in anchor_map, so links to it are
            # still redirected to master by the final loop -- confirm intended
            continue
        amap = anchor_map[name]
        remove_name_attributes(root)
        for elem in root.xpath('//*[@id]'):
            val = elem.get('id')
            if not val:
                continue
            if val in seen_anchors:
                nval = unique_anchor(seen_anchors, val)
                elem.set('id', nval)
                amap[val] = nval
                # Record the new anchor as well, otherwise the next clash on
                # val (in this or a later file) would be given the same nval
                seen_anchors.add(nval)
            else:
                seen_anchors.add(val)
        # Links to the merged file without a fragment go to its first element
        if 'id' not in first_child.attrib:
            first_child.set('id', unique_anchor(seen_anchors, 'top'))
            seen_anchors.add(first_child.get('id'))
        amap[''] = first_child.get('id')
        # Fix links that point to local changed anchors
        for a in XPath('//h:a[starts-with(@href, "#")]')(root):
            q = a.get('href')[1:]
            if q in amap:
                a.set('href', '#' + amap[q])
        for child in children:
            if isinstance(child, basestring):
                add_text(master_body, child)
            else:
                master_body.append(copy.deepcopy(child))
        container.remove_item(name, remove_from_guide=False)
    # Fix all links in the container that point to merged files
    for fname, media_type in container.mime_map.iteritems():
        repl = MergeLinkReplacer(fname, anchor_map, master, container)
        container.replace_links(fname, repl)
 def merge(container, category, names, master):
    '''
    Merge the files in names into the file master and mark master as dirty.

    :param category: Either 'text' (HTML files) or 'styles' (stylesheets)
    :param names: The names of the files to merge, must include master
    :param master: The name of the file that receives the merged content
    :raises AbortError: If category is not supported, fewer than two names
        are given or master is not one of names
    '''
    if category not in {'text', 'styles'}:
        raise AbortError('Cannot merge files of type: %s' % category)
    if len(names) < 2:
        raise AbortError('Must specify at least two files to be merged')
    if master not in names:
        raise AbortError('The master file must be one of the files being merged')
    # category has been validated above, so anything that is not 'text' is 'styles'
    # NOTE(review): merge_css is not defined in the visible source -- confirm
    # it exists before the 'styles' category is actually used
    merger = merge_html if category == 'text' else merge_css  # noqa
    merger(container, names, master)
    container.dirty(master)
--- a/src/calibre/ebooks/oeb/polish/tests/container.py
+++ b/src/calibre/ebooks/oeb/polish/tests/container.py
@ -12,7 +12,7 @@ from calibre.ebooks.oeb.polish.tests.base import BaseTest, get_simple_book, get_
 from calibre.ebooks.oeb.polish.container import get_container as _gc, clone_container, OCF_NS
 from calibre.ebooks.oeb.polish.replace import rename_files
-from calibre.ebooks.oeb.polish.split import split
+from calibre.ebooks.oeb.polish.split import split, merge
 from calibre.utils.filenames import nlinks_file
 from calibre.ptempfile import TemporaryFile
@ -188,3 +188,20 @@ class ContainerTests(BaseTest):
        self.assertEqual(1, len(root.xpath('//*[@id="container"]')), 'Split point was not adjusted')
        self.assertEqual(0, len(troot.xpath('//*[@id="container"]')), 'Split point was not adjusted')
        self.check_links(c)
    def test_merge_file(self):
        ' Test merging of files '
        # Merge two files that live in the same directory
        container = get_container(get_simple_book())
        merge(container, 'text', ('index_split_000.html', 'index_split_001.html'), 'index_split_000.html')
        self.check_links(container)
        # Merge two files from different directories that link to each other,
        # use clashing anchors and reference different stylesheets
        container = get_container(get_simple_book())
        first, second = 'one/one.html', 'two/two.html'
        container.add_file(first, b'<head><link href="../stylesheet.css"><p><a name="one" href="../two/two.html">1</a><a name="two" href="../two/two.html#one">2</a>')  # noqa
        container.add_file(second, b'<head><link href="../page_styles.css"><p><a name="one" href="two.html#two">1</a><a name="two" href="../one/one.html#one">2</a><a href="#one">3</a>')  # noqa
        merge(container, 'text', (first, second), first)
        self.check_links(container)
        # The stylesheet of the merged file must be linked exactly once in the master
        merged_root = container.parsed(first)
        self.assertEqual(1, len(merged_root.xpath('//*[@href="../page_styles.css"]')))
--- a/src/calibre/gui2/tweak_book/boss.py
+++ b/src/calibre/gui2/tweak_book/boss.py
@ -20,7 +20,7 @@ from calibre.ebooks.oeb.base import urlnormalize
 from calibre.ebooks.oeb.polish.main import SUPPORTED, tweak_polish
 from calibre.ebooks.oeb.polish.container import get_container as _gc, clone_container, guess_type
 from calibre.ebooks.oeb.polish.replace import rename_files
-from calibre.ebooks.oeb.polish.split import split
+from calibre.ebooks.oeb.polish.split import split, merge, AbortError
 from calibre.gui2 import error_dialog, choose_files, question_dialog, info_dialog
 from calibre.gui2.dialogs.confirm_delete import confirm
 from calibre.gui2.tweak_book import set_current_container, current_container, tprefs, actions, editors
@ -54,6 +54,7 @@ class Boss(QObject):
        fl.reorder_spine.connect(self.reorder_spine)
        fl.rename_requested.connect(self.rename_requested)
        fl.edit_file.connect(self.edit_file_requested)
        fl.merge_requested.connect(self.merge_requested)
        self.gui.central.current_editor_changed.connect(self.apply_current_editor_state)
        self.gui.central.close_requested.connect(self.editor_close_requested)
        self.gui.central.search_panel.search_triggered.connect(self.search)
@ -515,15 +516,28 @@ class Boss(QObject):
    def split_requested(self, name, loc):
        if not self.check_dirtied():
            return
-        self.add_savepoint(self.gui.elided_text(_('Split %s') % name))
+        self.add_savepoint(_('Split %s') % self.gui.elided_text(name))
        try:
            bottom_name = split(current_container(), name, loc)
-        except:
+        except AbortError:
            self.rewind_savepoint()
            raise
        self.apply_container_update_to_gui()
        self.edit_file(bottom_name, 'html')
    def merge_requested(self, category, names, master):
        '''
        Merge the files names into master in the current container, then
        refresh the GUI. Does nothing if check_dirtied() returns a false
        value. If the merge raises AbortError the savepoint created for it
        is rewound and the error is re-raised.
        '''
        if not self.check_dirtied():
            return
        savepoint_name = _('Merge files into %s') % self.gui.elided_text(master)
        self.add_savepoint(savepoint_name)
        try:
            merge(current_container(), category, names, master)
        except AbortError:
            self.rewind_savepoint()
            raise
        else:
            self.apply_container_update_to_gui()
            # Bring the master file to the front if it is open in an editor
            if master in editors:
                self.show_editor(master)
    def sync_editor_to_preview(self, name, lnum):
        editor = self.edit_file(name, 'html')
        self.ignore_preview_to_editor_sync = True