Merging of HTML files

2025-07-09 03:04:10 -04:00 · 2013-11-23 09:39:06 +05:30 · 2013-11-23 09:39:06 +05:30 · 099632502f
commit 099632502f
parent 5326452976
4 changed files with 205 additions and 15 deletions
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@ -539,7 +539,7 @@ class Container(object):  # {{{
            spine[-1].tail = last_tail
        self.dirty(self.opf_name)

-    def remove_item(self, name):
+    def remove_item(self, name, remove_from_guide=True):
        '''
        Remove the item identified by name from this container. This removes all
        references to the item in the OPF manifest, guide and spine as well as from
@ -571,6 +571,7 @@ class Container(object):  # {{{
                    self.remove_from_xml(meta)
                    self.dirty(self.opf_name)

+        if remove_from_guide:
            for item in self.opf_xpath('//opf:guide/opf:reference[@href]'):
                if self.href_to_name(item.get('href'), self.opf_name) == name:
                    self.remove_from_xml(item)
@ -872,7 +873,7 @@ class EpubContainer(Container):
    def names_that_must_not_be_changed(self):
        return super(EpubContainer, self).names_that_must_not_be_changed | {'META-INF/' + x for x in self.META_INF}

-    def remove_item(self, name):
+    def remove_item(self, name, remove_from_guide=True):
        # Handle removal of obfuscated fonts
        if name == 'META-INF/encryption.xml':
            self.obfuscated_fonts.clear()
@ -890,7 +891,7 @@ class EpubContainer(Container):
                if name == self.href_to_name(cr.get('URI')):
                    self.remove_from_xml(em.getparent())
                    self.dirty('META-INF/encryption.xml')
-        super(EpubContainer, self).remove_item(name)
+        super(EpubContainer, self).remove_item(name, remove_from_guide=remove_from_guide)

    def process_encryption(self):
        fonts = {}
--- a/src/calibre/ebooks/oeb/polish/split.py
+++ b/src/calibre/ebooks/oeb/polish/split.py
@ -6,12 +6,16 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

-import copy
+import copy, os
 from future_builtins import map
 from urlparse import urlparse

-from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF
+from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML
 from calibre.ebooks.oeb.polish.toc import node_from_loc
+from calibre.ebooks.oeb.polish.replace import LinkRebaser
+
+class AbortError(ValueError):
+    '''
+    Raised when a split or merge request is refused, for example when asked
+    to split inside a table or to merge fewer than two files.
+    '''

 def in_table(node):
    while node is not None:
@ -167,9 +171,9 @@ def split(container, name, loc_or_xpath, before=True):
    else:
        split_point = node_from_loc(root, loc_or_xpath)
    if in_table(split_point):
-        raise ValueError('Cannot split inside tables')
+        raise AbortError('Cannot split inside tables')
    if split_point.tag.endswith('}body'):
-        raise ValueError('Cannot split on the <body> tag')
+        raise AbortError('Cannot split on the <body> tag')
    tree1, tree2 = do_split(split_point, container.log, before=before)
    root1, root2 = tree1.getroot(), tree2.getroot()
    anchors_in_top = frozenset(root1.xpath('//*/@id')) | frozenset(root1.xpath('//*/@name')) | {''}
@ -211,3 +215,157 @@ def split(container, name, loc_or_xpath, before=True):
    container.insert_into_xml(spine, si, index=index)
    container.dirty(container.opf_name)
    return bottom_name
+
+class MergeLinkReplacer(object):
+    '''
+    Link replacer (a callable taking a URL and returning a URL) that
+    redirects links pointing at a merged file to the master file.
+
+    :param base: Name of the file whose links are being processed, hrefs are
+                 resolved relative to it.
+    :param anchor_map: Mapping of merged file name to a mapping of old
+                       fragment to new fragment. The key '' holds the
+                       fragment to use for links that have no fragment.
+    :param master: Name of the file the other files were merged into.
+    :param container: The container, used to convert between hrefs and names.
+
+    After use, the ``replaced`` attribute is True if at least one URL was
+    changed.
+    '''
+
+    def __init__(self, base, anchor_map, master, container):
+        self.container, self.anchor_map = container, anchor_map
+        self.master = master
+        self.base = base
+        self.replaced = False
+
+    def __call__(self, url):
+        # Purely local links never point at another file
+        if url and url.startswith('#'):
+            return url
+        name = self.container.href_to_name(url, self.base)
+        amap = self.anchor_map.get(name, None)
+        if amap is None:
+            # Not a link to one of the merged files
+            return url
+        purl = urlparse(url)
+        frag = purl.fragment or ''
+        frag = amap.get(frag, frag)
+        url = self.container.name_to_href(self.master, self.base)
+        if frag:
+            # Only add a fragment when there is one, a merged file without a
+            # mapped top anchor must not produce a dangling "#"
+            url += '#' + frag
+        self.replaced = True
+        return url
+
+
+def add_text(body, text):
+    '''
+    Append text to the end of body: to the tail of its last child if it has
+    children, otherwise to its own text.
+    '''
+    if len(body) == 0:
+        body.text = (body.text or '') + text
+        return
+    last = body[-1]
+    last.tail = (last.tail or '') + text
+
+def all_anchors(root):
+    ' Return the set of all id and name attribute values in the tree '
+    anchors = set(root.xpath('//*/@id'))
+    anchors.update(root.xpath('//*/@name'))
+    return anchors
+
+def all_stylesheets(container, name):
+    '''
+    Yield the names of the stylesheets linked from the <head> of the HTML
+    file name. Only <link> tags that have an href and whose type is text/css
+    (the default when type is absent) are considered.
+    '''
+    for link in XPath('//h:head/h:link[@href]')(container.parsed(name)):
+        # Use a separate variable for the result. Re-binding name here would
+        # make every subsequent href resolve relative to the previous
+        # stylesheet instead of relative to the HTML file.
+        sheet = container.href_to_name(link.get('href'), name)
+        typ = link.get('type', 'text/css')
+        if typ == 'text/css':
+            yield sheet
+
+def unique_anchor(seen_anchors, current):
+    '''
+    Return current if it is not in seen_anchors, otherwise the first of
+    current_1, current_2, ... that is not in seen_anchors. seen_anchors is
+    not modified.
+    '''
+    if current not in seen_anchors:
+        return current
+    suffix = 1
+    while True:
+        candidate = '%s_%d' % (current, suffix)
+        if candidate not in seen_anchors:
+            return candidate
+        suffix += 1
+
+def remove_name_attributes(root):
+    '''
+    Remove every name attribute in the tree. If the element has no id
+    attribute, the value of name becomes its id, otherwise the existing id is
+    kept and the value of name is discarded.
+    '''
+    for elem in root.xpath('//*[@name]'):
+        anchor = elem.attrib.pop('name')
+        if elem.get('id') is None:
+            elem.set('id', anchor)
+
+def merge_html(container, names, master):
+    '''
+    Merge the HTML files in names into the file master.
+
+    For every file in names other than master, in order: stylesheets it
+    links to that master does not already link to are linked from master's
+    <head>, its links are rebased if it lives in a different directory than
+    master, its name attributes are converted to ids, ids that clash with ids
+    already seen are renamed, and the contents of all its <body> tags are
+    appended to the last <body> of master. The file is then removed from the
+    container (with remove_from_guide=False). Files whose bodies contain no
+    elements are skipped and left in the container.
+
+    Finally, links in every file of the container that point to a merged
+    file are changed to point to master.
+
+    :param names: Names of the files to merge, including master.
+    :param master: Name of the file that receives the merged content.
+    '''
+    p = container.parsed
+    root = p(master)
+
+    # Ensure master has a <head>
+    head = root.find('h:head', namespaces=XPNSMAP)
+    if head is None:
+        head = root.makeelement(XHTML('head'))
+        container.insert_into_xml(root, head, 0)
+
+    seen_anchors = all_anchors(root)
+    seen_stylesheets = set(all_stylesheets(container, master))
+    master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
+    master_base = os.path.dirname(master)
+    # Maps merged file name -> {old anchor: new anchor}, the key '' maps to
+    # the anchor that marks the start of the merged content
+    anchor_map = {n:{} for n in names if n != master}
+
+    for name in names:
+        if name == master:
+            continue
+        # Insert new stylesheets into master
+        for sheet in all_stylesheets(container, name):
+            if sheet not in seen_stylesheets:
+                seen_stylesheets.add(sheet)
+                link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master))
+                container.insert_into_xml(head, link)
+
+        # Rebase links if master is in a different directory
+        if os.path.dirname(name) != master_base:
+            container.replace_links(name, LinkRebaser(container, name, master))
+
+        root = p(name)
+        # Text (strings) and elements of all bodies, in document order
+        children = []
+        for body in p(name).findall('h:body', namespaces=XPNSMAP):
+            children.append(body.text if body.text and body.text.strip() else '\n\n')
+            children.extend(body)
+
+        # Find the first element (as opposed to text) to be merged
+        first_child = ''
+        for first_child in children:
+            if not isinstance(first_child, basestring):
+                break
+        if isinstance(first_child, basestring):
+            # Empty document, ignore
+            continue
+
+        amap = anchor_map[name]
+        remove_name_attributes(root)
+
+        for elem in root.xpath('//*[@id]'):
+            val = elem.get('id')
+            if not val:
+                continue
+            if val in seen_anchors:
+                nval = unique_anchor(seen_anchors, val)
+                elem.set('id', nval)
+                amap[val] = nval
+                # The replacement is now in use as well. Without recording
+                # it, a later clash on the same id would be given this very
+                # same replacement, resulting in duplicate ids in master.
+                seen_anchors.add(nval)
+            else:
+                seen_anchors.add(val)
+
+        if 'id' not in first_child.attrib:
+            first_child.set('id', unique_anchor(seen_anchors, 'top'))
+            seen_anchors.add(first_child.get('id'))
+
+        # Links to this file without a fragment should go to its first element
+        amap[''] = first_child.get('id')
+
+        # Fix links that point to local changed anchors
+        for a in XPath('//h:a[starts-with(@href, "#")]')(root):
+            q = a.get('href')[1:]
+            if q in amap:
+                a.set('href', '#' + amap[q])
+
+        for child in children:
+            if isinstance(child, basestring):
+                add_text(master_body, child)
+            else:
+                master_body.append(copy.deepcopy(child))
+
+        container.remove_item(name, remove_from_guide=False)
+
+    # Fix all links in the container that point to merged files
+    for fname, media_type in container.mime_map.iteritems():
+        repl = MergeLinkReplacer(fname, anchor_map, master, container)
+        container.replace_links(fname, repl)
+
+
+def merge(container, category, names, master):
+    '''
+    Merge the files in names into the file master and mark master as dirty.
+
+    :param category: Either 'text' or 'styles'.
+    :param names: Names of the files to merge, must contain at least two
+                  entries, one of which is master.
+    :param master: Name of the file the others are merged into.
+
+    Raises AbortError if any of the above conditions is not met.
+    '''
+    mergeable = {'text', 'styles'}
+    if category not in mergeable:
+        raise AbortError('Cannot merge files of type: %s' % category)
+    if len(names) <= 1:
+        raise AbortError('Must specify at least two files to be merged')
+    if master not in names:
+        raise AbortError('The master file must be one of the files being merged')
+
+    if category == 'styles':
+        # NOTE(review): merge_css is not defined in the code visible here,
+        # confirm it exists before relying on the 'styles' category
+        merge_css(container, names, master)  # noqa
+    elif category == 'text':
+        merge_html(container, names, master)
+
+    container.dirty(master)
+
--- a/src/calibre/ebooks/oeb/polish/tests/container.py
+++ b/src/calibre/ebooks/oeb/polish/tests/container.py
@ -12,7 +12,7 @@ from calibre.ebooks.oeb.polish.tests.base import BaseTest, get_simple_book, get_

 from calibre.ebooks.oeb.polish.container import get_container as _gc, clone_container, OCF_NS
 from calibre.ebooks.oeb.polish.replace import rename_files
-from calibre.ebooks.oeb.polish.split import split
+from calibre.ebooks.oeb.polish.split import split, merge
 from calibre.utils.filenames import nlinks_file
 from calibre.ptempfile import TemporaryFile

@ -188,3 +188,20 @@ class ContainerTests(BaseTest):
        self.assertEqual(1, len(root.xpath('//*[@id="container"]')), 'Split point was not adjusted')
        self.assertEqual(0, len(troot.xpath('//*[@id="container"]')), 'Split point was not adjusted')
        self.check_links(c)
+
+    def test_merge_file(self):
+        ' Test merging of files '
+        # Two files from the book itself
+        container = get_container(get_simple_book())
+        first, second = 'index_split_000.html', 'index_split_001.html'
+        merge(container, 'text', (first, second), first)
+        self.check_links(container)
+
+        # Two added files in different directories, with clashing anchors,
+        # links to each other and different stylesheets
+        container = get_container(get_simple_book())
+        one, two = 'one/one.html', 'two/two.html'
+        container.add_file(one, b'<head><link href="../stylesheet.css"><p><a name="one" href="../two/two.html">1</a><a name="two" href="../two/two.html#one">2</a>')  # noqa
+        container.add_file(two, b'<head><link href="../page_styles.css"><p><a name="one" href="two.html#two">1</a><a name="two" href="../one/one.html#one">2</a><a href="#one">3</a>')  # noqa
+        merge(container, 'text', (one, two), one)
+        self.check_links(container)
+        # The stylesheet used only by the merged file must now be linked from the master
+        merged_root = container.parsed(one)
+        self.assertEqual(1, len(merged_root.xpath('//*[@href="../page_styles.css"]')))
--- a/src/calibre/gui2/tweak_book/boss.py
+++ b/src/calibre/gui2/tweak_book/boss.py
@ -20,7 +20,7 @@ from calibre.ebooks.oeb.base import urlnormalize
 from calibre.ebooks.oeb.polish.main import SUPPORTED, tweak_polish
 from calibre.ebooks.oeb.polish.container import get_container as _gc, clone_container, guess_type
 from calibre.ebooks.oeb.polish.replace import rename_files
-from calibre.ebooks.oeb.polish.split import split
+from calibre.ebooks.oeb.polish.split import split, merge, AbortError
 from calibre.gui2 import error_dialog, choose_files, question_dialog, info_dialog
 from calibre.gui2.dialogs.confirm_delete import confirm
 from calibre.gui2.tweak_book import set_current_container, current_container, tprefs, actions, editors
@ -54,6 +54,7 @@ class Boss(QObject):
        fl.reorder_spine.connect(self.reorder_spine)
        fl.rename_requested.connect(self.rename_requested)
        fl.edit_file.connect(self.edit_file_requested)
+        fl.merge_requested.connect(self.merge_requested)
        self.gui.central.current_editor_changed.connect(self.apply_current_editor_state)
        self.gui.central.close_requested.connect(self.editor_close_requested)
        self.gui.central.search_panel.search_triggered.connect(self.search)
@ -515,15 +516,28 @@ class Boss(QObject):
    def split_requested(self, name, loc):
        if not self.check_dirtied():
            return
-        self.add_savepoint(self.gui.elided_text(_('Split %s') % name))
+        self.add_savepoint(_('Split %s') % self.gui.elided_text(name))
        try:
            bottom_name = split(current_container(), name, loc)
-        except:
+        except AbortError:
            self.rewind_savepoint()
            raise
        self.apply_container_update_to_gui()
        self.edit_file(bottom_name, 'html')

+    def merge_requested(self, category, names, master):
+        '''
+        Merge the files names of the given category into master in the
+        current container, then update the GUI. Does nothing if
+        check_dirtied() returns a false value.
+        '''
+        if not self.check_dirtied():
+            return
+        self.add_savepoint(_('Merge files into %s') % self.gui.elided_text(master))
+        try:
+            merge(current_container(), category, names, master)
+        except AbortError:
+            # The merge was refused: rewind the savepoint added above, then
+            # let the error propagate
+            self.rewind_savepoint()
+            raise
+        else:
+            self.apply_container_update_to_gui()
+            if master in editors:
+                self.show_editor(master)
+
    def sync_editor_to_preview(self, name, lnum):
        editor = self.edit_file(name, 'html')
        self.ignore_preview_to_editor_sync = True