Merging of HTML files

This commit is contained in:
Kovid Goyal 2013-11-23 09:39:06 +05:30
parent 5326452976
commit 099632502f
4 changed files with 205 additions and 15 deletions

View File

@ -539,7 +539,7 @@ class Container(object): # {{{
spine[-1].tail = last_tail
self.dirty(self.opf_name)
def remove_item(self, name):
def remove_item(self, name, remove_from_guide=True):
'''
Remove the item identified by name from this container. This removes all
references to the item in the OPF manifest, guide and spine as well as from
@ -571,6 +571,7 @@ class Container(object): # {{{
self.remove_from_xml(meta)
self.dirty(self.opf_name)
if remove_from_guide:
for item in self.opf_xpath('//opf:guide/opf:reference[@href]'):
if self.href_to_name(item.get('href'), self.opf_name) == name:
self.remove_from_xml(item)
@ -872,7 +873,7 @@ class EpubContainer(Container):
def names_that_must_not_be_changed(self):
return super(EpubContainer, self).names_that_must_not_be_changed | {'META-INF/' + x for x in self.META_INF}
def remove_item(self, name):
def remove_item(self, name, remove_from_guide=True):
# Handle removal of obfuscated fonts
if name == 'META-INF/encryption.xml':
self.obfuscated_fonts.clear()
@ -890,7 +891,7 @@ class EpubContainer(Container):
if name == self.href_to_name(cr.get('URI')):
self.remove_from_xml(em.getparent())
self.dirty('META-INF/encryption.xml')
super(EpubContainer, self).remove_item(name)
super(EpubContainer, self).remove_item(name, remove_from_guide=remove_from_guide)
def process_encryption(self):
fonts = {}

View File

@ -6,12 +6,16 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import copy
import copy, os
from future_builtins import map
from urlparse import urlparse
from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF
from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML
from calibre.ebooks.oeb.polish.toc import node_from_loc
from calibre.ebooks.oeb.polish.replace import LinkRebaser
class AbortError(ValueError):
    ''' Raised when a requested split or merge operation cannot be performed. '''
def in_table(node):
while node is not None:
@ -167,9 +171,9 @@ def split(container, name, loc_or_xpath, before=True):
else:
split_point = node_from_loc(root, loc_or_xpath)
if in_table(split_point):
raise ValueError('Cannot split inside tables')
raise AbortError('Cannot split inside tables')
if split_point.tag.endswith('}body'):
raise ValueError('Cannot split on the <body> tag')
raise AbortError('Cannot split on the <body> tag')
tree1, tree2 = do_split(split_point, container.log, before=before)
root1, root2 = tree1.getroot(), tree2.getroot()
anchors_in_top = frozenset(root1.xpath('//*/@id')) | frozenset(root1.xpath('//*/@name')) | {''}
@ -211,3 +215,157 @@ def split(container, name, loc_or_xpath, before=True):
container.insert_into_xml(spine, si, index=index)
container.dirty(container.opf_name)
return bottom_name
class MergeLinkReplacer(object):
    '''
    Link rewriting callable that redirects links pointing at merged files to
    the master file.

    :param base: Name of the file whose links are being rewritten
    :param anchor_map: Maps the name of every merged file to a dict mapping
        old fragments to new fragments
    :param master: Name of the file the other files were merged into
    :param container: Object providing href_to_name() and name_to_href()

    After use, the replaced attribute is True if at least one URL was changed.
    '''

    def __init__(self, base, anchor_map, master, container):
        self.base = base
        self.master = master
        self.anchor_map = anchor_map
        self.container = container
        self.replaced = False

    def __call__(self, url):
        # Links that are purely a fragment stay inside the current file
        if url and url.startswith('#'):
            return url
        target = self.container.href_to_name(url, self.base)
        fragment_map = self.anchor_map.get(target)
        if fragment_map is None:
            # Not a link to one of the merged files
            return url
        fragment = urlparse(url).fragment or ''
        # Anchors that were not renamed keep their original fragment
        fragment = fragment_map.get(fragment, fragment)
        new_url = self.container.name_to_href(self.master, self.base) + '#' + fragment
        self.replaced = True
        return new_url
def add_text(body, text):
    ''' Append text to the very end of the content of body. '''
    if len(body) == 0:
        # No child elements, so the text belongs to body itself
        body.text = (body.text or '') + text
        return
    # Text after the last child is stored as the tail of that child
    last = body[-1]
    last.tail = (last.tail or '') + text
def all_anchors(root):
    ''' Return the set of all id and name attribute values found in root. '''
    anchors = set(root.xpath('//*/@id'))
    anchors |= set(root.xpath('//*/@name'))
    return anchors
def all_stylesheets(container, name):
    '''
    Yield the container names of the stylesheets linked from the <head> of
    the file identified by name.

    Only <link> elements that have an href attribute and whose type attribute
    is either missing or equal to text/css are reported.
    '''
    for link in XPath('//h:head/h:link[@href]')(container.parsed(name)):
        # Always resolve the href relative to the HTML file itself. Rebinding
        # name here would make every subsequent link resolve relative to the
        # previously found stylesheet instead.
        sheet = container.href_to_name(link.get('href'), name)
        if link.get('type', 'text/css') == 'text/css':
            yield sheet
def unique_anchor(seen_anchors, current):
    '''
    Return an anchor based on current that is not present in seen_anchors.

    current itself is returned if it is unused, otherwise the first unused
    name of the form current_1, current_2, ... is returned. seen_anchors is
    not modified.
    '''
    candidate, suffix = current, 0
    while True:
        if candidate not in seen_anchors:
            return candidate
        suffix += 1
        candidate = '%s_%d' % (current, suffix)
def remove_name_attributes(root):
    '''
    Get rid of every name attribute in root. An element that already has an
    id simply loses its name, any other element gets its name turned into
    an id.
    '''
    # Elements that have both attributes keep their existing id
    for both in root.xpath('//*[@id and @name]'):
        both.attrib.pop('name')
    # Whatever still has a name at this point has no id
    for named in root.xpath('//*[@name]'):
        value = named.attrib.pop('name')
        named.set('id', value)
def merge_html(container, names, master):
    '''
    Merge the HTML files identified by names into the file master.

    For every file in names other than master: the stylesheets it links to
    are linked into the <head> of master, its links are rebased if it lives
    in a different directory than master, ids that clash with already seen
    anchors are renamed, the contents of its <body> tags are appended to the
    last <body> of master and the file is removed from the container
    (references to it in the guide are left alone). Files whose bodies
    contain no elements are skipped. Finally, links in all files in the
    container that point to the merged files are redirected to master.

    :param container: The container holding the files
    :param names: The names of the files to merge, including master
    :param master: The name of the file that receives the merged content
    '''
    p = container.parsed
    root = p(master)

    # Ensure master has a <head>
    head = root.find('h:head', namespaces=XPNSMAP)
    if head is None:
        head = root.makeelement(XHTML('head'))
        container.insert_into_xml(root, head, 0)

    seen_anchors = all_anchors(root)
    seen_stylesheets = set(all_stylesheets(container, master))
    master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
    master_base = os.path.dirname(master)
    # Maps merged file name -> {old anchor: new anchor}
    anchor_map = {n:{} for n in names if n != master}

    for name in names:
        if name == master:
            continue
        # Insert new stylesheets into master
        for sheet in all_stylesheets(container, name):
            if sheet not in seen_stylesheets:
                seen_stylesheets.add(sheet)
                link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master))
                container.insert_into_xml(head, link)

        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))

        root = p(name)
        # Collect the leading text and the child elements of every <body>
        children = []
        for body in p(name).findall('h:body', namespaces=XPNSMAP):
            children.append(body.text if body.text and body.text.strip() else '\n\n')
            children.extend(body)

        # Find the first actual element (as opposed to text) to be merged
        first_child = ''
        for first_child in children:
            if not isinstance(first_child, basestring):
                break
        if isinstance(first_child, basestring):
            # Empty document, ignore
            continue

        amap = anchor_map[name]
        remove_name_attributes(root)

        for elem in root.xpath('//*[@id]'):
            val = elem.get('id')
            if not val:
                continue
            if val in seen_anchors:
                nval = unique_anchor(seen_anchors, val)
                elem.set('id', nval)
                amap[val] = nval
                # Record the replacement as well, otherwise the next clash on
                # val would be assigned the very same "unique" id
                seen_anchors.add(nval)
            else:
                seen_anchors.add(val)

        # Links to the file without a fragment are pointed at its first element
        if 'id' not in first_child.attrib:
            first_child.set('id', unique_anchor(seen_anchors, 'top'))
            seen_anchors.add(first_child.get('id'))
        amap[''] = first_child.get('id')

        # Fix links that point to local changed anchors
        for a in XPath('//h:a[starts-with(@href, "#")]')(root):
            q = a.get('href')[1:]
            if q in amap:
                a.set('href', '#' + amap[q])

        for child in children:
            if isinstance(child, basestring):
                add_text(master_body, child)
            else:
                master_body.append(copy.deepcopy(child))

        container.remove_item(name, remove_from_guide=False)

    # Fix all links in the container that point to merged files
    for fname, media_type in container.mime_map.iteritems():
        repl = MergeLinkReplacer(fname, anchor_map, master, container)
        container.replace_links(fname, repl)
def merge(container, category, names, master):
    '''
    Merge the files identified by names into the file master.

    :param container: The container holding the files
    :param category: The kind of files being merged, either text or styles
    :param names: The names of the files to merge, must include master
    :param master: The name of the file the others are merged into
    :raises AbortError: If category is not supported, fewer than two names
        are given or master is not one of names
    '''
    # Validate the request before touching the container
    if category not in {'text', 'styles'}:
        raise AbortError('Cannot merge files of type: %s' % category)
    if len(names) < 2:
        raise AbortError('Must specify at least two files to be merged')
    if master not in names:
        raise AbortError('The master file must be one of the files being merged')

    # Only the two categories checked above can reach this point
    if category == 'styles':
        merge_css(container, names, master)  # noqa
    else:
        merge_html(container, names, master)
    container.dirty(master)

View File

@ -12,7 +12,7 @@ from calibre.ebooks.oeb.polish.tests.base import BaseTest, get_simple_book, get_
from calibre.ebooks.oeb.polish.container import get_container as _gc, clone_container, OCF_NS
from calibre.ebooks.oeb.polish.replace import rename_files
from calibre.ebooks.oeb.polish.split import split
from calibre.ebooks.oeb.polish.split import split, merge
from calibre.utils.filenames import nlinks_file
from calibre.ptempfile import TemporaryFile
@ -188,3 +188,20 @@ class ContainerTests(BaseTest):
self.assertEqual(1, len(root.xpath('//*[@id="container"]')), 'Split point was not adjusted')
self.assertEqual(0, len(troot.xpath('//*[@id="container"]')), 'Split point was not adjusted')
self.check_links(c)
def test_merge_file(self):
    ' Test merging of files '
    # Merging two files that already exist in the simple book
    container = get_container(get_simple_book())
    merge(container, 'text', ('index_split_000.html', 'index_split_001.html'), 'index_split_000.html')
    self.check_links(container)

    # Merging files from different directories, with clashing anchors and
    # different stylesheets
    container = get_container(get_simple_book())
    one, two = 'one/one.html', 'two/two.html'
    one_markup = b'<head><link href="../stylesheet.css"><p><a name="one" href="../two/two.html">1</a><a name="two" href="../two/two.html#one">2</a>'  # noqa
    two_markup = b'<head><link href="../page_styles.css"><p><a name="one" href="two.html#two">1</a><a name="two" href="../one/one.html#one">2</a><a href="#one">3</a>'  # noqa
    container.add_file(one, one_markup)
    container.add_file(two, two_markup)
    merge(container, 'text', (one, two), one)
    self.check_links(container)
    # The stylesheet used only by the second file must now be linked from the master
    merged_root = container.parsed(one)
    self.assertEqual(1, len(merged_root.xpath('//*[@href="../page_styles.css"]')))

View File

@ -20,7 +20,7 @@ from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.polish.main import SUPPORTED, tweak_polish
from calibre.ebooks.oeb.polish.container import get_container as _gc, clone_container, guess_type
from calibre.ebooks.oeb.polish.replace import rename_files
from calibre.ebooks.oeb.polish.split import split
from calibre.ebooks.oeb.polish.split import split, merge, AbortError
from calibre.gui2 import error_dialog, choose_files, question_dialog, info_dialog
from calibre.gui2.dialogs.confirm_delete import confirm
from calibre.gui2.tweak_book import set_current_container, current_container, tprefs, actions, editors
@ -54,6 +54,7 @@ class Boss(QObject):
fl.reorder_spine.connect(self.reorder_spine)
fl.rename_requested.connect(self.rename_requested)
fl.edit_file.connect(self.edit_file_requested)
fl.merge_requested.connect(self.merge_requested)
self.gui.central.current_editor_changed.connect(self.apply_current_editor_state)
self.gui.central.close_requested.connect(self.editor_close_requested)
self.gui.central.search_panel.search_triggered.connect(self.search)
@ -515,15 +516,28 @@ class Boss(QObject):
def split_requested(self, name, loc):
    '''
    Split the file name at the location loc and open the newly created
    bottom half for editing.

    Does nothing if check_dirtied() reports that the operation should not
    proceed. A savepoint is added before splitting; if the split is refused
    with an AbortError the savepoint is rewound and the error re-raised.
    '''
    if not self.check_dirtied():
        return
    # Only the file name is elided, not the whole translated message
    self.add_savepoint(_('Split %s') % self.gui.elided_text(name))
    try:
        bottom_name = split(current_container(), name, loc)
    except AbortError:
        self.rewind_savepoint()
        raise
    self.apply_container_update_to_gui()
    self.edit_file(bottom_name, 'html')
def merge_requested(self, category, names, master):
    '''
    Merge the files names of the given category into master and, if master
    is currently open in an editor, show that editor.

    Does nothing if check_dirtied() reports that the operation should not
    proceed. A savepoint is added before merging; if the merge is refused
    with an AbortError the savepoint is rewound and the error re-raised.
    '''
    if not self.check_dirtied():
        return
    savepoint_label = _('Merge files into %s') % self.gui.elided_text(master)
    self.add_savepoint(savepoint_label)
    try:
        merge(current_container(), category, names, master)
    except AbortError:
        # Nothing was merged, so discard the savepoint added above
        self.rewind_savepoint()
        raise
    else:
        self.apply_container_update_to_gui()
        if master in editors:
            self.show_editor(master)
def sync_editor_to_preview(self, name, lnum):
editor = self.edit_file(name, 'html')
self.ignore_preview_to_editor_sync = True