Merging of HTML files

This commit is contained in:
Kovid Goyal 2013-11-23 09:39:06 +05:30
parent 5326452976
commit 099632502f
4 changed files with 205 additions and 15 deletions

View File

@ -539,7 +539,7 @@ class Container(object): # {{{
spine[-1].tail = last_tail spine[-1].tail = last_tail
self.dirty(self.opf_name) self.dirty(self.opf_name)
def remove_item(self, name): def remove_item(self, name, remove_from_guide=True):
''' '''
Remove the item identified by name from this container. This removes all Remove the item identified by name from this container. This removes all
references to the item in the OPF manifest, guide and spine as well as from references to the item in the OPF manifest, guide and spine as well as from
@ -571,10 +571,11 @@ class Container(object): # {{{
self.remove_from_xml(meta) self.remove_from_xml(meta)
self.dirty(self.opf_name) self.dirty(self.opf_name)
for item in self.opf_xpath('//opf:guide/opf:reference[@href]'): if remove_from_guide:
if self.href_to_name(item.get('href'), self.opf_name) == name: for item in self.opf_xpath('//opf:guide/opf:reference[@href]'):
self.remove_from_xml(item) if self.href_to_name(item.get('href'), self.opf_name) == name:
self.dirty(self.opf_name) self.remove_from_xml(item)
self.dirty(self.opf_name)
path = self.name_path_map.pop(name, None) path = self.name_path_map.pop(name, None)
if path and os.path.exists(path): if path and os.path.exists(path):
@ -872,7 +873,7 @@ class EpubContainer(Container):
def names_that_must_not_be_changed(self): def names_that_must_not_be_changed(self):
return super(EpubContainer, self).names_that_must_not_be_changed | {'META-INF/' + x for x in self.META_INF} return super(EpubContainer, self).names_that_must_not_be_changed | {'META-INF/' + x for x in self.META_INF}
def remove_item(self, name): def remove_item(self, name, remove_from_guide=True):
# Handle removal of obfuscated fonts # Handle removal of obfuscated fonts
if name == 'META-INF/encryption.xml': if name == 'META-INF/encryption.xml':
self.obfuscated_fonts.clear() self.obfuscated_fonts.clear()
@ -890,7 +891,7 @@ class EpubContainer(Container):
if name == self.href_to_name(cr.get('URI')): if name == self.href_to_name(cr.get('URI')):
self.remove_from_xml(em.getparent()) self.remove_from_xml(em.getparent())
self.dirty('META-INF/encryption.xml') self.dirty('META-INF/encryption.xml')
super(EpubContainer, self).remove_item(name) super(EpubContainer, self).remove_item(name, remove_from_guide=remove_from_guide)
def process_encryption(self): def process_encryption(self):
fonts = {} fonts = {}

View File

@ -6,12 +6,16 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import copy import copy, os
from future_builtins import map from future_builtins import map
from urlparse import urlparse from urlparse import urlparse
from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML
from calibre.ebooks.oeb.polish.toc import node_from_loc from calibre.ebooks.oeb.polish.toc import node_from_loc
from calibre.ebooks.oeb.polish.replace import LinkRebaser
class AbortError(ValueError):
    ''' Raised when a requested split or merge operation is refused '''
def in_table(node): def in_table(node):
while node is not None: while node is not None:
@ -167,9 +171,9 @@ def split(container, name, loc_or_xpath, before=True):
else: else:
split_point = node_from_loc(root, loc_or_xpath) split_point = node_from_loc(root, loc_or_xpath)
if in_table(split_point): if in_table(split_point):
raise ValueError('Cannot split inside tables') raise AbortError('Cannot split inside tables')
if split_point.tag.endswith('}body'): if split_point.tag.endswith('}body'):
raise ValueError('Cannot split on the <body> tag') raise AbortError('Cannot split on the <body> tag')
tree1, tree2 = do_split(split_point, container.log, before=before) tree1, tree2 = do_split(split_point, container.log, before=before)
root1, root2 = tree1.getroot(), tree2.getroot() root1, root2 = tree1.getroot(), tree2.getroot()
anchors_in_top = frozenset(root1.xpath('//*/@id')) | frozenset(root1.xpath('//*/@name')) | {''} anchors_in_top = frozenset(root1.xpath('//*/@id')) | frozenset(root1.xpath('//*/@name')) | {''}
@ -211,3 +215,157 @@ def split(container, name, loc_or_xpath, before=True):
container.insert_into_xml(spine, si, index=index) container.insert_into_xml(spine, si, index=index)
container.dirty(container.opf_name) container.dirty(container.opf_name)
return bottom_name return bottom_name
class MergeLinkReplacer(object):
    '''
    Link rewriting callable that redirects links aimed at merged files so
    that they point into the master file instead.

    An instance is called with each URL found in the file named base and
    returns the URL to use in its place. The replaced attribute becomes True
    once at least one URL has been rewritten.
    '''

    def __init__(self, base, anchor_map, master, container):
        # base: name of the file whose links are being rewritten
        # anchor_map: maps the name of each merged file to a dictionary of
        #             {old fragment: new fragment}
        # master: name of the file the other files were merged into
        self.container, self.anchor_map = container, anchor_map
        self.master = master
        self.base = base
        self.replaced = False

    def __call__(self, url):
        if url and url.startswith('#'):
            # Same-document links are left untouched here
            return url
        name = self.container.href_to_name(url, self.base)
        amap = self.anchor_map.get(name, None)
        if amap is None:
            # Does not point at one of the merged files
            return url
        frag = urlparse(url).fragment or ''
        frag = amap.get(frag, frag)
        url = self.container.name_to_href(self.master, self.base)
        if frag:
            # Only append a fragment when there is one, so that a link to a
            # merged file without any known anchor does not end in a
            # dangling '#'
            url += '#' + frag
        self.replaced = True
        return url
def add_text(body, text):
    ''' Append text after whatever content body already has '''
    if len(body) == 0:
        # No child elements, so the text belongs to the element itself
        body.text = (body.text or '') + text
        return
    # Otherwise the text trails the last child element
    last = body[-1]
    last.tail = (last.tail or '') + text
def all_anchors(root):
    ''' Return the set of all id and name attribute values found in root '''
    anchors = set(root.xpath('//*/@id'))
    anchors.update(root.xpath('//*/@name'))
    return anchors
def all_stylesheets(container, name):
    '''
    Yield the names of the stylesheets linked from the <head> of the HTML
    file name. A <link> that has no type attribute is treated as CSS.
    '''
    for link in XPath('//h:head/h:link[@href]')(container.parsed(name)):
        if link.get('type', 'text/css') != 'text/css':
            continue
        # Do not rebind name here: every href must be resolved relative to
        # the HTML file, not relative to the previously found stylesheet
        yield container.href_to_name(link.get('href'), name)
def unique_anchor(seen_anchors, current):
    '''
    Return current if it is not in seen_anchors, otherwise the first of
    current_1, current_2, ... that is not. seen_anchors is not modified.
    '''
    if current not in seen_anchors:
        return current
    suffix = 1
    candidate = '%s_%d' % (current, suffix)
    while candidate in seen_anchors:
        suffix += 1
        candidate = '%s_%d' % (current, suffix)
    return candidate
def remove_name_attributes(root):
    ''' Replace every name attribute with an id attribute, an existing id wins '''
    # Where both attributes are present the id is kept and the name dropped
    for both in root.xpath('//*[@id and @name]'):
        both.attrib.pop('name')
    # Anything still carrying a name has no id, so the name becomes the id
    for named in root.xpath('//*[@name]'):
        value = named.attrib.pop('name')
        named.set('id', value)
def merge_html(container, names, master):
    '''
    Merge the HTML files in names into the single file master.

    The children of every <body> of each file other than master are
    appended, in the order given by names, to the last <body> of master.
    Stylesheets linked only from the merged files get linked from master,
    anchors that clash with ones already seen are renamed, and the merged
    files are removed from the container. Finally, links throughout the
    container that pointed into a merged file are redirected to master.

    :param container: The container holding the files
    :param names: Names of the files to merge, master included
    :param master: Name of the file that receives the merged content
    '''
    p = container.parsed
    root = p(master)

    # Ensure master has a <head>
    head = root.find('h:head', namespaces=XPNSMAP)
    if head is None:
        head = root.makeelement(XHTML('head'))
        container.insert_into_xml(root, head, 0)

    seen_anchors = all_anchors(root)
    seen_stylesheets = set(all_stylesheets(container, master))
    master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
    master_base = os.path.dirname(master)
    # Per merged file: {old anchor: new anchor}, the key '' maps to the
    # anchor marking where the content of that file starts in master
    anchor_map = {n:{} for n in names if n != master}

    for name in names:
        if name == master:
            continue
        # Insert new stylesheets into master
        for sheet in all_stylesheets(container, name):
            if sheet not in seen_stylesheets:
                seen_stylesheets.add(sheet)
                link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master))
                container.insert_into_xml(head, link)

        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))

        root = p(name)
        # Collect the leading text and the child elements of every <body>
        children = []
        for body in p(name).findall('h:body', namespaces=XPNSMAP):
            children.append(body.text if body.text and body.text.strip() else '\n\n')
            children.extend(body)

        # Find the first actual element, it marks the top of this file
        first_child = ''
        for first_child in children:
            if not isinstance(first_child, basestring):
                break
        if isinstance(first_child, basestring):
            # Empty document, ignore
            # NOTE(review): the file is not removed from the container, yet
            # its (empty) entry stays in anchor_map, so the final pass below
            # still redirects links that point to it -- confirm intended
            continue

        amap = anchor_map[name]
        remove_name_attributes(root)

        for elem in root.xpath('//*[@id]'):
            val = elem.get('id')
            if not val:
                continue
            if val in seen_anchors:
                nval = unique_anchor(seen_anchors, val)
                elem.set('id', nval)
                amap[val] = nval
                # Record the replacement as well, otherwise a later file with
                # the same clashing id would be given the very same name
                seen_anchors.add(nval)
            else:
                seen_anchors.add(val)

        if 'id' not in first_child.attrib:
            first_child.set('id', unique_anchor(seen_anchors, 'top'))
            seen_anchors.add(first_child.get('id'))
        amap[''] = first_child.get('id')

        # Fix links that point to local changed anchors
        for a in XPath('//h:a[starts-with(@href, "#")]')(root):
            q = a.get('href')[1:]
            if q in amap:
                a.set('href', '#' + amap[q])

        for child in children:
            if isinstance(child, basestring):
                add_text(master_body, child)
            else:
                master_body.append(copy.deepcopy(child))

        # Guide references are kept (presumably so that the link fixing pass
        # below can redirect them to master -- TODO confirm)
        container.remove_item(name, remove_from_guide=False)

    # Fix all links in the container that point to merged files
    for fname, media_type in container.mime_map.iteritems():
        repl = MergeLinkReplacer(fname, anchor_map, master, container)
        container.replace_links(fname, repl)
def merge(container, category, names, master):
    '''
    Merge the files in names into master and mark master as dirty.

    :param category: Either 'text' or 'styles'
    :param names: Names of the files to merge, at least two, master included
    :param master: Name of the file the others are merged into
    :raises AbortError: If the request is invalid
    '''
    # Validate the request before touching the container
    if category not in {'text', 'styles'}:
        raise AbortError('Cannot merge files of type: %s' % category)
    if len(names) < 2:
        raise AbortError('Must specify at least two files to be merged')
    if master not in names:
        raise AbortError('The master file must be one of the files being merged')

    if category == 'text':
        merge_html(container, names, master)
    else:
        # Only 'styles' can get here
        merge_css(container, names, master)  # noqa
    container.dirty(master)

View File

@ -12,7 +12,7 @@ from calibre.ebooks.oeb.polish.tests.base import BaseTest, get_simple_book, get_
from calibre.ebooks.oeb.polish.container import get_container as _gc, clone_container, OCF_NS from calibre.ebooks.oeb.polish.container import get_container as _gc, clone_container, OCF_NS
from calibre.ebooks.oeb.polish.replace import rename_files from calibre.ebooks.oeb.polish.replace import rename_files
from calibre.ebooks.oeb.polish.split import split from calibre.ebooks.oeb.polish.split import split, merge
from calibre.utils.filenames import nlinks_file from calibre.utils.filenames import nlinks_file
from calibre.ptempfile import TemporaryFile from calibre.ptempfile import TemporaryFile
@ -188,3 +188,20 @@ class ContainerTests(BaseTest):
self.assertEqual(1, len(root.xpath('//*[@id="container"]')), 'Split point was not adjusted') self.assertEqual(1, len(root.xpath('//*[@id="container"]')), 'Split point was not adjusted')
self.assertEqual(0, len(troot.xpath('//*[@id="container"]')), 'Split point was not adjusted') self.assertEqual(0, len(troot.xpath('//*[@id="container"]')), 'Split point was not adjusted')
self.check_links(c) self.check_links(c)
def test_merge_file(self):
    ' Test merging of files '
    # Merge two files that already exist in the sample book
    container = get_container(get_simple_book())
    master = 'index_split_000.html'
    merge(container, 'text', (master, 'index_split_001.html'), master)
    self.check_links(container)

    # Merge two files that live in different directories, link to each
    # other, use clashing anchor names and link to different stylesheets
    container = get_container(get_simple_book())
    one, two = 'one/one.html', 'two/two.html'
    container.add_file(one, b'<head><link href="../stylesheet.css"><p><a name="one" href="../two/two.html">1</a><a name="two" href="../two/two.html#one">2</a>')  # noqa
    container.add_file(two, b'<head><link href="../page_styles.css"><p><a name="one" href="two.html#two">1</a><a name="two" href="../one/one.html#one">2</a><a href="#one">3</a>')  # noqa
    merge(container, 'text', (one, two), one)
    self.check_links(container)
    # The stylesheet of the merged file must now be linked from the master
    merged = container.parsed(one)
    self.assertEqual(1, len(merged.xpath('//*[@href="../page_styles.css"]')))

View File

@ -20,7 +20,7 @@ from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.polish.main import SUPPORTED, tweak_polish from calibre.ebooks.oeb.polish.main import SUPPORTED, tweak_polish
from calibre.ebooks.oeb.polish.container import get_container as _gc, clone_container, guess_type from calibre.ebooks.oeb.polish.container import get_container as _gc, clone_container, guess_type
from calibre.ebooks.oeb.polish.replace import rename_files from calibre.ebooks.oeb.polish.replace import rename_files
from calibre.ebooks.oeb.polish.split import split from calibre.ebooks.oeb.polish.split import split, merge, AbortError
from calibre.gui2 import error_dialog, choose_files, question_dialog, info_dialog from calibre.gui2 import error_dialog, choose_files, question_dialog, info_dialog
from calibre.gui2.dialogs.confirm_delete import confirm from calibre.gui2.dialogs.confirm_delete import confirm
from calibre.gui2.tweak_book import set_current_container, current_container, tprefs, actions, editors from calibre.gui2.tweak_book import set_current_container, current_container, tprefs, actions, editors
@ -54,6 +54,7 @@ class Boss(QObject):
fl.reorder_spine.connect(self.reorder_spine) fl.reorder_spine.connect(self.reorder_spine)
fl.rename_requested.connect(self.rename_requested) fl.rename_requested.connect(self.rename_requested)
fl.edit_file.connect(self.edit_file_requested) fl.edit_file.connect(self.edit_file_requested)
fl.merge_requested.connect(self.merge_requested)
self.gui.central.current_editor_changed.connect(self.apply_current_editor_state) self.gui.central.current_editor_changed.connect(self.apply_current_editor_state)
self.gui.central.close_requested.connect(self.editor_close_requested) self.gui.central.close_requested.connect(self.editor_close_requested)
self.gui.central.search_panel.search_triggered.connect(self.search) self.gui.central.search_panel.search_triggered.connect(self.search)
@ -515,15 +516,28 @@ class Boss(QObject):
def split_requested(self, name, loc): def split_requested(self, name, loc):
if not self.check_dirtied(): if not self.check_dirtied():
return return
self.add_savepoint(self.gui.elided_text(_('Split %s') % name)) self.add_savepoint(_('Split %s') % self.gui.elided_text(name))
try: try:
bottom_name = split(current_container(), name, loc) bottom_name = split(current_container(), name, loc)
except: except AbortError:
self.rewind_savepoint() self.rewind_savepoint()
raise raise
self.apply_container_update_to_gui() self.apply_container_update_to_gui()
self.edit_file(bottom_name, 'html') self.edit_file(bottom_name, 'html')
def merge_requested(self, category, names, master):
    '''
    Handle a request to merge the files names into master. A savepoint is
    added first and rewound again if the merge is refused with an
    AbortError, which is then re-raised.
    '''
    if not self.check_dirtied():
        return
    self.add_savepoint(_('Merge files into %s') % self.gui.elided_text(master))
    try:
        merge(current_container(), category, names, master)
    except AbortError:
        # The merge was refused, discard the savepoint that was just added
        self.rewind_savepoint()
        raise
    else:
        self.apply_container_update_to_gui()
        # Bring the master file to the front if it is already being edited
        if master in editors:
            self.show_editor(master)
def sync_editor_to_preview(self, name, lnum): def sync_editor_to_preview(self, name, lnum):
editor = self.edit_file(name, 'html') editor = self.edit_file(name, 'html')
self.ignore_preview_to_editor_sync = True self.ignore_preview_to_editor_sync = True