Initial (untested) port of splitting code to OEBBook

This commit is contained in:
Kovid Goyal 2009-04-12 12:09:38 -07:00
parent dc5299b8a8
commit a423691dd5
4 changed files with 280 additions and 321 deletions

View File

@ -272,11 +272,26 @@ def XPath(expr):
def xpath(elem, expr): def xpath(elem, expr):
return elem.xpath(expr, namespaces=XPNSMAP) return elem.xpath(expr, namespaces=XPNSMAP)
def xml2str(root, pretty_print=False): def _prepare_xml_for_serialization(root):
return etree.tostring(root, encoding='utf-8', xml_declaration=True, root.set('xmlns', XHTML_NS)
root.set('{%s}xlink'%XHTML_NS, XLINK_NS)
for x in root.iter():
if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg':
x.set('xmlns', SVG_NS)
def xml2str(root, pretty_print=False, strip_comments=False):
_prepare_xml_for_serialization(root)
ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
pretty_print=pretty_print) pretty_print=pretty_print)
if strip_comments:
ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
return ans
def xml2unicode(root, pretty_print=False): def xml2unicode(root, pretty_print=False):
_prepare_xml_for_serialization(root)
return etree.tostring(root, pretty_print=pretty_print) return etree.tostring(root, pretty_print=pretty_print)
ASCII_CHARS = set(chr(x) for x in xrange(128)) ASCII_CHARS = set(chr(x) for x in xrange(128))
@ -826,6 +841,11 @@ class Manifest(object):
return xml2str(data, pretty_print=self.oeb.pretty_print) return xml2str(data, pretty_print=self.oeb.pretty_print)
if isinstance(data, unicode): if isinstance(data, unicode):
return data.encode('utf-8') return data.encode('utf-8')
if hasattr(data, 'cssText'):
data = data.cssText
if isinstance(data, unicode):
data = data.encode('utf-8')
return data
return str(data) return str(data)
def __unicode__(self): def __unicode__(self):
@ -834,6 +854,8 @@ class Manifest(object):
return xml2unicode(data, pretty_print=self.oeb.pretty_print) return xml2unicode(data, pretty_print=self.oeb.pretty_print)
if isinstance(data, unicode): if isinstance(data, unicode):
return data return data
if hasattr(data, 'cssText'):
return data.cssText
return unicode(data) return unicode(data)
def __eq__(self, other): def __eq__(self, other):
@ -1044,6 +1066,12 @@ class Spine(object):
self.items[i].spine_position = i self.items[i].spine_position = i
item.spine_position = None item.spine_position = None
def index(self, item):
for i, x in enumerate(self):
if item == x:
return i
return -1
def __iter__(self): def __iter__(self):
for item in self.items: for item in self.items:
yield item yield item

View File

@ -162,7 +162,6 @@ class EbookIterator(object):
s.pages = p s.pages = p
start = 1 start = 1
for s in self.spine: for s in self.spine:
s.start_page = start s.start_page = start
start += s.pages start += s.pages

View File

@ -22,7 +22,6 @@ class OEBOutput(OutputFormatPlugin):
if not os.path.exists(output_path): if not os.path.exists(output_path):
os.makedirs(output_path) os.makedirs(output_path)
from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME
from calibre.ebooks.html import tostring as html_tostring
with CurrentDir(output_path): with CurrentDir(output_path):
results = oeb_book.to_opf2(page_map=True) results = oeb_book.to_opf2(page_map=True)
for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME): for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
@ -38,16 +37,7 @@ class OEBOutput(OutputFormatPlugin):
dir = os.path.dirname(path) dir = os.path.dirname(path)
if not os.path.exists(dir): if not os.path.exists(dir):
os.makedirs(dir) os.makedirs(dir)
raw = item.data
if not isinstance(raw, basestring):
if hasattr(raw, 'cssText'):
raw = raw.cssText
else:
raw = html_tostring(raw,
pretty_print=opts.pretty_print)
if isinstance(raw, unicode):
raw = raw.encode('utf-8')
with open(path, 'wb') as f: with open(path, 'wb') as f:
f.write(raw) f.write(str(item))

View File

@ -4,21 +4,25 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
''' '''
Split the flows in an epub file to conform to size limitations. Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
forced at "likely" locations to conform to size limitations. This transform
assumes a prior call to the flatcss transform.
''' '''
import os, math, functools, collections, re, copy, sys import os, math, functools, collections, re, copy
from lxml.etree import XPath as _XPath from lxml.etree import XPath as _XPath
from lxml import etree, html from lxml import etree, html
from lxml.cssselect import CSSSelector from lxml.cssselect import CSSSelector
from calibre.ebooks.metadata.opf2 import OPF from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP, urldefrag, \
rewrite_links
from calibre.ebooks.epub import tostring, rules from calibre.ebooks.epub import tostring, rules
from calibre import CurrentDir
XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'}) NAMESPACES = dict(XPNSMAP)
content = functools.partial(os.path.join, 'content') NAMESPACES['re'] = 'http://exslt.org/regular-expressions'
XPath = functools.partial(_XPath, namespaces=NAMESPACES)
SPLIT_ATTR = 'cs' SPLIT_ATTR = 'cs'
SPLIT_POINT_ATTR = 'csp' SPLIT_POINT_ATTR = 'csp'
@ -27,149 +31,166 @@ class SplitError(ValueError):
def __init__(self, path, root): def __init__(self, path, root):
size = len(tostring(root))/1024. size = len(tostring(root))/1024.
ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')% ValueError.__init__(self,
(os.path.basename(path), size)) _('Could not find reasonable point at which to split: '
'%s Sub-tree size: %d KB')%
(path, size))
class Split(object):
    '''
    Split the XHTML flows in an OEB book, either on page-break boundaries
    or to enforce a maximum flow size. Assumes a prior call to the flatcss
    transform so page-break CSS properties are resolved on the rules.
    '''

    def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None,
            max_flow_size=0):
        self.split_on_page_breaks = split_on_page_breaks
        self.page_breaks_xpath = page_breaks_xpath
        self.max_flow_size = max_flow_size
        if self.page_breaks_xpath is not None:
            # Pre-compile the user supplied XPath expression once
            self.page_breaks_xpath = XPath(self.page_breaks_xpath)

    def __call__(self, oeb, context):
        self.oeb = oeb
        self.log = oeb.log
        # Maps original item href -> {anchor: href of the split file}
        self.map = {}
        # Lazily populated cache of CSS page-break selectors
        self.page_break_selectors = None
        for item in self.oeb.manifest.items:
            if etree.iselement(item.data):
                self.split_item(item)
        # BUG FIX: was called with no argument while fix_links took an
        # unused ``opf`` parameter; the parameter has been removed.
        self.fix_links()

    def split_item(self, item):
        # Default to no page breaks so FlowSplitter can still be invoked
        # when split_on_page_breaks is False (previously a NameError).
        page_breaks, page_break_ids = [], []
        if self.split_on_page_breaks:
            if self.page_breaks_xpath is None:
                page_breaks, page_break_ids = self.find_page_breaks(item)
            else:
                # A raw XPath yields only the break elements; there are no
                # associated ids in this case (was wrongly unpacked into
                # two names before).
                page_breaks = self.page_breaks_xpath(item.data)
        splitter = FlowSplitter(item, page_breaks, page_break_ids,
                self.max_flow_size, self.oeb)
        if splitter.was_split:
            self.map[item.href] = dict(splitter.anchor_map)

    def find_page_breaks(self, item):
        '''
        Find elements in item on which the CSS requests a page break
        (before or after). Returns a list of (XPath selector, break-before)
        tuples and the list of ids of the matched elements.
        '''
        if self.page_break_selectors is None:
            # Cache the selectors on self, as the stylesheets are shared by
            # all items (previously a local variable shadowed this cache,
            # so it was never populated).
            self.page_break_selectors = set([])
            stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
                    OEB_STYLES]
            for rule in rules(stylesheets):
                before = getattr(rule.style.getPropertyCSSValue(
                    'page-break-before'), 'cssText', '').strip().lower()
                after = getattr(rule.style.getPropertyCSSValue(
                    'page-break-after'), 'cssText', '').strip().lower()
                try:
                    if before and before != 'avoid':
                        self.page_break_selectors.add(
                                (CSSSelector(rule.selectorText), True))
                except:
                    # Selector could not be compiled; ignore the rule
                    pass
                try:
                    if after and after != 'avoid':
                        self.page_break_selectors.add(
                                (CSSSelector(rule.selectorText), False))
                except:
                    pass
        page_breaks = set([])
        for selector, before in self.page_break_selectors:
            for elem in selector(item.data):
                elem.pb_before = before
                page_breaks.add(elem)
        # Order the break elements in document order
        for i, elem in enumerate(item.data.iter()):
            elem.pb_order = i
        page_breaks = list(page_breaks)
        page_breaks.sort(cmp=lambda x, y: cmp(x.pb_order, y.pb_order))
        page_break_ids, page_breaks_ = [], []
        for i, x in enumerate(page_breaks):
            # Ensure every break element has an id so it can be located
            # again after the tree is copied during splitting
            x.set('id', x.get('id', 'calibre_pb_%d' % i))
            id = x.get('id')
            page_breaks_.append((XPath('//*[@id="%s"]' % id), x.pb_before))
            page_break_ids.append(id)
        return page_breaks_, page_break_ids

    def fix_links(self):
        '''
        Fix references to the split files in other content files.
        '''
        for item in self.oeb.manifest:
            if etree.iselement(item.data):
                self.current_item = item
                rewrite_links(item.data, self.rewrite_links)

    def rewrite_links(self, url):
        href, frag = urldefrag(url)
        href = self.current_item.abshref(href)
        if href in self.map:
            anchor_map = self.map[href]
            nhref = anchor_map[frag if frag else None]
            if frag:
                # BUG FIX: was ``'#'.joinn(href, frag)`` -- a typo, the
                # wrong base href, and str.join takes a single iterable.
                nhref = '#'.join((nhref, frag))
            return nhref
        return url
class Splitter(object): class FlowSplitter(object):
def __init__(self, path, opts, stylesheet_map, opf): def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb):
self.setup_cli_handler(opts.verbose) self.item = item
self.path = path self.oeb = oeb
self.always_remove = not opts.preserve_tag_structure or \ self.log = oeb.log
os.stat(content(path)).st_size > 5*opts.profile.flow_size self.page_breaks = page_breaks
self.base = (os.path.splitext(path)[0].replace('%', '%%') + '_split_%d.html') self.page_break_ids = page_break_ids
self.opts = opts self.max_flow_size = max_flow_size
self.orig_size = os.stat(content(path)).st_size self.base = item.abshref(item.href)
self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
root = html.fromstring(open(content(path)).read())
self.page_breaks, self.trees = [], [] base, ext = os.path.splitext(self.base)
self.split_size = 0 self.base = base.replace('%', '%%')+'_split_%d'+ext
# Split on page breaks self.trees = [self.item.data]
self.splitting_on_page_breaks = True self.splitting_on_page_breaks = True
if not opts.dont_split_on_page_breaks: if self.page_breaks:
self.log_info('\tSplitting on page breaks...') self.split_on_page_breaks(self.item.data)
if self.path in stylesheet_map:
self.find_page_breaks(stylesheet_map[self.path], root)
self.split_on_page_breaks(root.getroottree())
trees = list(self.trees)
else:
self.trees = [root.getroottree()]
trees = list(self.trees)
# Split any remaining over-sized trees
self.splitting_on_page_breaks = False self.splitting_on_page_breaks = False
if self.opts.profile.flow_size < sys.maxint:
if self.max_flow_size > 0:
lt_found = False lt_found = False
self.log_info('\tLooking for large trees...') self.log('\tLooking for large trees...')
for i, tree in enumerate(list(trees)): trees = list(self.trees)
for i, tree in enumerate(list(self.trees)):
self.trees = [] self.trees = []
size = len(tostring(tree.getroot())) size = len(tostring(tree.getroot()))
if size > self.opts.profile.flow_size: if size > self.opts.profile.flow_size:
lt_found = True lt_found = True
try: self.split_to_size(tree)
self.split_to_size(tree)
except (SplitError, RuntimeError): # Splitting fails
if not self.always_remove:
self.always_remove = True
self.split_to_size(tree)
else:
raise
trees[i:i+1] = list(self.trees) trees[i:i+1] = list(self.trees)
if not lt_found: if not lt_found:
self.log_info('\tNo large trees found') self.log_info('\tNo large trees found')
self.trees = trees
self.trees = trees
self.was_split = len(self.trees) > 1 self.was_split = len(self.trees) > 1
if self.was_split: self.commit()
self.commit()
self.log_info('\t\tSplit into %d parts.', len(self.trees))
if self.opts.verbose:
for f in self.files:
self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
self.fix_opf(opf)
self.trees = None def split_on_page_breaks(self, orig_tree):
ordered_ids = []
for elem in orig_tree.xpath('//*[@id]'):
id = elem.get('id')
if id in self.page_break_ids:
ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
self.trees = []
def split_text(self, text, root, size): tree = orig_tree
self.log_debug('\t\t\tSplitting text of length: %d'%len(text)) for pattern, before in ordered_ids:
rest = text.replace('\r', '') self.log.debug('\t\tSplitting on page-break')
parts = re.split('\n\n', rest) elem = pattern(tree)
self.log_debug('\t\t\t\tFound %d parts'%len(parts)) if elem:
if max(map(len, parts)) > size: before, after = self.do_split(tree, elem[0], before)
raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root) self.trees.append(before)
ans = [] tree = after
buf = '' self.trees.append(tree)
for part in parts: self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
if len(buf) + len(part) < size:
buf += '\n\n'+part
else:
ans.append(buf)
buf = part
return ans
def split_to_size(self, tree):
    '''
    Recursively split tree into sub-trees no larger than the profile's
    flow_size. Oversized <pre> blocks are flattened to text and broken
    into fragments first, then the tree is split at a discovered split
    point and each half is either committed or split again.
    '''
    self.log_debug('\t\tSplitting...')
    root = tree.getroot()
    # Split large <pre> tags
    for pre in list(root.xpath('//pre')):
        # Flatten the <pre> to plain text, discarding child elements
        text = u''.join(pre.xpath('descendant::text()'))
        pre.text = text
        for child in list(pre.iterchildren()):
            pre.remove(child)
        if len(pre.text) > self.opts.profile.flow_size*0.5:
            # Break the text into fragments of at most 20% of flow_size
            frags = self.split_text(pre.text, root, int(0.2*self.opts.profile.flow_size))
            new_pres = []
            for frag in frags:
                pre2 = copy.copy(pre)
                pre2.text = frag
                pre2.tail = u''
                new_pres.append(pre2)
            # Preserve the original tail text on the last fragment
            new_pres[-1].tail = pre.tail
            p = pre.getparent()
            i = p.index(pre)
            p[i:i+1] = new_pres
    split_point, before = self.find_split_point(root)
    # Give up when no split point exists or the cumulative output has
    # ballooned past 6x the original size (markup duplication runaway)
    if split_point is None or self.split_size > 6*self.orig_size:
        if not self.always_remove:
            self.log_warn(_('\t\tToo much markup. Re-splitting without '
                'structure preservation. This may cause '
                'incorrect rendering.'))
        raise SplitError(self.path, root)
    for t in self.do_split(tree, split_point, before):
        r = t.getroot()
        # Drop halves that contain no visible content
        if self.is_page_empty(r):
            continue
        size = len(tostring(r))
        if size <= self.opts.profile.flow_size:
            self.trees.append(t)
            #print tostring(t.getroot(), pretty_print=True)
            self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
                len(self.trees), size/1024.)
            self.split_size += size
        else:
            # Still too big: recurse on this half
            self.split_to_size(t)
def is_page_empty(self, root):
    '''
    Return True if the document rooted at root has no visible content:
    at most 4 non-whitespace text characters in <body> and no <img>
    elements that are not styled display:none.
    '''
    body = root.find('body')
    if body is None:
        # No <body>: treat as non-empty so the tree is kept
        return False
    txt = re.sub(r'\s+', '', html.tostring(body, method='text', encoding=unicode))
    if len(txt) > 4:
        #if len(txt) < 100:
        # print 1111111, html.tostring(body, method='html', encoding=unicode)
        return False
    for img in root.xpath('//img'):
        if img.get('style', '') != 'display:none':
            return False
    return True
def do_split(self, tree, split_point, before): def do_split(self, tree, split_point, before):
''' '''
@ -190,7 +211,7 @@ class Splitter(object):
split_point2 = root2.xpath(path)[0] split_point2 = root2.xpath(path)[0]
def nix_element(elem, top=True): def nix_element(elem, top=True):
if self.always_remove: if True:
parent = elem.getparent() parent = elem.getparent()
index = parent.index(elem) index = parent.index(elem)
if top: if top:
@ -198,7 +219,6 @@ class Splitter(object):
else: else:
index = parent.index(elem) index = parent.index(elem)
parent[index:index+1] = list(elem.iterchildren()) parent[index:index+1] = list(elem.iterchildren())
else: else:
elem.text = u'' elem.text = u''
elem.tail = u'' elem.tail = u''
@ -241,67 +261,76 @@ class Splitter(object):
return tree, tree2 return tree, tree2
def is_page_empty(self, root):
    '''
    Return True if the document rooted at root has no visible content:
    at most 4 non-whitespace text characters in <body> and no <img>
    elements that are not styled display:none.
    '''
    body = root.find('body')
    if body is None:
        # No <body>: treat as non-empty so the tree is kept
        return False
    txt = re.sub(r'\s+', '', html.tostring(body, method='text', encoding=unicode))
    if len(txt) > 4:
        return False
    for img in root.xpath('//img'):
        if img.get('style', '') != 'display:none':
            return False
    return True
def split_on_page_breaks(self, orig_tree): def split_text(self, text, root, size):
ordered_ids = [] self.log.debug('\t\t\tSplitting text of length: %d'%len(text))
for elem in orig_tree.xpath('//*[@id]'): rest = text.replace('\r', '')
id = elem.get('id') parts = re.split('\n\n', rest)
if id in self.page_break_ids: self.log.debug('\t\t\t\tFound %d parts'%len(parts))
ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)]) if max(map(len, parts)) > size:
raise SplitError('Cannot split as file contains a <pre> tag '
self.trees = [] 'with a very large paragraph', root)
tree = orig_tree ans = []
for pattern, before in ordered_ids: buf = ''
self.log_info('\t\tSplitting on page-break') for part in parts:
elem = pattern(tree) if len(buf) + len(part) < size:
if elem: buf += '\n\n'+part
before, after = self.do_split(tree, elem[0], before) else:
self.trees.append(before) ans.append(buf)
tree = after buf = part
self.trees.append(tree) return ans
self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
def split_to_size(self, tree):
self.log.debug('\t\tSplitting...')
root = tree.getroot()
# Split large <pre> tags
for pre in list(root.xpath('//pre')):
text = u''.join(pre.xpath('descendant::text()'))
pre.text = text
for child in list(pre.iterchildren()):
pre.remove(child)
if len(pre.text) > self.max_flow_size*0.5:
frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
new_pres = []
for frag in frags:
pre2 = copy.copy(pre)
pre2.text = frag
pre2.tail = u''
new_pres.append(pre2)
new_pres[-1].tail = pre.tail
p = pre.getparent()
i = p.index(pre)
p[i:i+1] = new_pres
def find_page_breaks(self, stylesheets, root): split_point, before = self.find_split_point(root)
''' if split_point is None:
Find all elements that have either page-break-before or page-break-after set. raise SplitError(self.item.href, root)
Populates `self.page_breaks` with id based XPath selectors (for elements that don't
have ids, an id is created).
'''
page_break_selectors = set([])
for rule in rules(stylesheets):
before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
try:
if before and before != 'avoid':
page_break_selectors.add((CSSSelector(rule.selectorText), True))
except:
pass
try:
if after and after != 'avoid':
page_break_selectors.add((CSSSelector(rule.selectorText), False))
except:
pass
page_breaks = set([])
for selector, before in page_break_selectors:
for elem in selector(root):
elem.pb_before = before
page_breaks.add(elem)
for i, elem in enumerate(root.iter()):
elem.pb_order = i
page_breaks = list(page_breaks)
page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
self.page_break_ids = []
for i, x in enumerate(page_breaks):
x.set('id', x.get('id', 'calibre_pb_%d'%i))
id = x.get('id')
self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
self.page_break_ids.append(id)
for t in self.do_split(tree, split_point, before):
r = t.getroot()
if self.is_page_empty(r):
continue
size = len(tostring(r))
if size <= self.max_flow_size:
self.trees.append(t)
#print tostring(t.getroot(), pretty_print=True)
self.log.debug('\t\t\tCommitted sub-tree #%d (%d KB)',
len(self.trees), size/1024.)
self.split_size += size
else:
self.split_to_size(t)
def find_split_point(self, root): def find_split_point(self, root):
''' '''
@ -336,8 +365,7 @@ class Splitter(object):
'//br', '//br',
'//li', '//li',
): ):
elems = root.xpath(path, elems = root.xpath(path, namespaces=NAMESPACES)
namespaces={'re':'http://exslt.org/regular-expressions'})
elem = pick_elem(elems) elem = pick_elem(elems)
if elem is not None: if elem is not None:
try: try:
@ -355,6 +383,8 @@ class Splitter(object):
all anchors in the original tree. Internal links are re-directed. The all anchors in the original tree. Internal links are re-directed. The
original file is deleted and the split files are saved. original file is deleted and the split files are saved.
''' '''
if not self.was_split:
return
self.anchor_map = collections.defaultdict(lambda :self.base%0) self.anchor_map = collections.defaultdict(lambda :self.base%0)
self.files = [] self.files = []
@ -368,134 +398,46 @@ class Splitter(object):
elem.attrib.pop(SPLIT_ATTR, None) elem.attrib.pop(SPLIT_ATTR, None)
elem.attrib.pop(SPLIT_POINT_ATTR, '0') elem.attrib.pop(SPLIT_POINT_ATTR, '0')
for current, tree in zip(self.files, self.trees): spine_pos = self.item.spine_pos
for a in tree.getroot().xpath('//a[@href]'): for current, tree in zip(map(reversed, (self.files, self.trees))):
for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
href = a.get('href').strip() href = a.get('href').strip()
if href.startswith('#'): if href.startswith('#'):
anchor = href[1:] anchor = href[1:]
file = self.anchor_map[anchor] file = self.anchor_map[anchor]
if file != current: if file != current:
a.set('href', file+href) a.set('href', file+href)
open(content(current), 'wb').\
write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))
os.remove(content(self.path)) new_id = self.oeb.manifest.generate(id=self.item.id)[0]
new_item = self.oeb.manifest.add(new_id, current,
self.item.media_type, data=tree.getroot())
self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
if self.oeb.guide:
for ref in self.oeb.guide:
href, frag = urldefrag(ref.href)
if href == self.item.href:
nhref = self.anchor_map[frag if frag else None]
if frag:
nhref = '#'.join(nhref, frag)
ref.href = nhref
def fix_toc_entry(toc):
if toc.href:
href, frag = urldefrag(toc.href)
if href == self.item.href:
nhref = self.anchor_map[frag if frag else None]
if frag:
nhref = '#'.join(nhref, frag)
toc.href = nhref
for x in toc:
fix_toc_entry(x)
def fix_opf(self, opf): if self.oeb.toc:
''' fix_toc_entry(self.oeb.toc)
Fix references to the split file in the OPF.
'''
items = [item for item in opf.itermanifest() if item.get('href') == 'content/'+self.path]
new_items = [('content/'+f, None) for f in self.files]
id_map = {}
for item in items:
id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)
for id in id_map.keys(): self.oeb.manifest.remove(self.item)
opf.replace_spine_items_by_idref(id, id_map[id])
for ref in opf.iterguide():
href = ref.get('href', '')
if href.startswith('content/'+self.path):
href = href.split('#')
frag = None
if len(href) > 1:
frag = href[1]
if frag not in self.anchor_map:
self.log_warning('\t\tUnable to re-map OPF link', href)
continue
new_file = self.anchor_map[frag]
ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))
def fix_content_links(html_files, changes, opts):
split_files = [f.path for f in changes]
anchor_maps = [f.anchor_map for f in changes]
files = list(html_files)
for j, f in enumerate(split_files):
try:
i = files.index(f)
files[i:i+1] = changes[j].files
except ValueError:
continue
for htmlfile in files:
changed = False
root = html.fromstring(open(content(htmlfile), 'rb').read())
for a in root.xpath('//a[@href]'):
href = a.get('href')
if not href.startswith('#'):
href = href.split('#')
anchor = href[1] if len(href) > 1 else None
href = href[0]
if href in split_files:
try:
newf = anchor_maps[split_files.index(href)][anchor]
except:
print '\t\tUnable to remap HTML link:', href, anchor
continue
frag = ('#'+anchor) if anchor else ''
a.set('href', newf+frag)
changed = True
if changed:
open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
def fix_ncx(path, changes):
split_files = [f.path for f in changes]
anchor_maps = [f.anchor_map for f in changes]
tree = etree.parse(path)
changed = False
for content in tree.getroot().xpath('//x:content[@src]',
namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
href = content.get('src')
if not href.startswith('#'):
href = href.split('#')
anchor = href[1] if len(href) > 1 else None
href = href[0].split('/')[-1]
if href in split_files:
try:
newf = anchor_maps[split_files.index(href)][anchor]
except:
print 'Unable to remap NCX link:', href, anchor
frag = ('#'+anchor) if anchor else ''
content.set('src', 'content/'+newf+frag)
changed = True
if changed:
open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True))
def find_html_files(opf):
'''
Find all HTML files referenced by `opf`.
'''
html_files = []
for item in opf.itermanifest():
if 'html' in item.get('media-type', '').lower():
f = item.get('href').split('/')[-1]
f2 = f.replace('&', '%26')
if not os.path.exists(content(f)) and os.path.exists(content(f2)):
f = f2
item.set('href', item.get('href').replace('&', '%26'))
if os.path.exists(content(f)):
html_files.append(f)
return html_files
def split(pathtoopf, opts, stylesheet_map):
pathtoopf = os.path.abspath(pathtoopf)
opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
with CurrentDir(os.path.dirname(pathtoopf)):
html_files = find_html_files(opf)
changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files]
changes = [c for c in changes if c.was_split]
fix_content_links(html_files, changes, opts)
for item in opf.itermanifest():
if item.get('media-type', '') == 'application/x-dtbncx+xml':
fix_ncx(item.get('href'), changes)
break
open(pathtoopf, 'wb').write(opf.render())