Initial (untested) port of splitting code to OEBBook

This commit is contained in:
Kovid Goyal 2009-04-12 12:09:38 -07:00
parent dc5299b8a8
commit a423691dd5
4 changed files with 280 additions and 321 deletions

View File

@ -272,11 +272,26 @@ def XPath(expr):
def xpath(elem, expr):
return elem.xpath(expr, namespaces=XPNSMAP)
def xml2str(root, pretty_print=False):
return etree.tostring(root, encoding='utf-8', xml_declaration=True,
def _prepare_xml_for_serialization(root):
root.set('xmlns', XHTML_NS)
root.set('{%s}xlink'%XHTML_NS, XLINK_NS)
for x in root.iter():
if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg':
x.set('xmlns', SVG_NS)
def xml2str(root, pretty_print=False, strip_comments=False):
_prepare_xml_for_serialization(root)
ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
pretty_print=pretty_print)
if strip_comments:
ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
return ans
def xml2unicode(root, pretty_print=False):
    '''
    Serialize root to a unicode string (no XML declaration).

    :param root: lxml element tree root
    :param pretty_print: pretty-print the output
    '''
    _prepare_xml_for_serialization(root)
    # encoding=unicode makes lxml return a unicode object; without it
    # tostring returns an encoded byte string, contradicting this function's
    # name and breaking Manifest.__unicode__ which relies on it.
    return etree.tostring(root, pretty_print=pretty_print, encoding=unicode)
ASCII_CHARS = set(chr(x) for x in xrange(128))
@ -826,6 +841,11 @@ class Manifest(object):
return xml2str(data, pretty_print=self.oeb.pretty_print)
if isinstance(data, unicode):
return data.encode('utf-8')
if hasattr(data, 'cssText'):
data = data.cssText
if isinstance(data, unicode):
data = data.encode('utf-8')
return data
return str(data)
def __unicode__(self):
@ -834,6 +854,8 @@ class Manifest(object):
return xml2unicode(data, pretty_print=self.oeb.pretty_print)
if isinstance(data, unicode):
return data
if hasattr(data, 'cssText'):
return data.cssText
return unicode(data)
def __eq__(self, other):
@ -1044,6 +1066,12 @@ class Spine(object):
self.items[i].spine_position = i
item.spine_position = None
def index(self, item):
    # Position of item in the spine, or -1 when it is absent.
    return next((pos for pos, entry in enumerate(self) if item == entry), -1)
def __iter__(self):
    # Iterate over the spine items in order.
    return iter(self.items)

View File

@ -162,7 +162,6 @@ class EbookIterator(object):
s.pages = p
start = 1
for s in self.spine:
s.start_page = start
start += s.pages

View File

@ -22,7 +22,6 @@ class OEBOutput(OutputFormatPlugin):
if not os.path.exists(output_path):
os.makedirs(output_path)
from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME
from calibre.ebooks.html import tostring as html_tostring
with CurrentDir(output_path):
results = oeb_book.to_opf2(page_map=True)
for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
@ -38,16 +37,7 @@ class OEBOutput(OutputFormatPlugin):
dir = os.path.dirname(path)
if not os.path.exists(dir):
os.makedirs(dir)
raw = item.data
if not isinstance(raw, basestring):
if hasattr(raw, 'cssText'):
raw = raw.cssText
else:
raw = html_tostring(raw,
pretty_print=opts.pretty_print)
if isinstance(raw, unicode):
raw = raw.encode('utf-8')
with open(path, 'wb') as f:
f.write(raw)
f.write(str(item))

View File

@ -4,21 +4,25 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Split the flows in an epub file to conform to size limitations.
Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
forced at "likely" locations to conform to size limitations. This transform
assumes a prior call to the flatcss transform.
'''
import os, math, functools, collections, re, copy, sys
import os, math, functools, collections, re, copy
from lxml.etree import XPath as _XPath
from lxml import etree, html
from lxml.cssselect import CSSSelector
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP, urldefrag, \
rewrite_links
from calibre.ebooks.epub import tostring, rules
from calibre import CurrentDir
XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'})
content = functools.partial(os.path.join, 'content')
NAMESPACES = dict(XPNSMAP)
NAMESPACES['re'] = 'http://exslt.org/regular-expressions'
XPath = functools.partial(_XPath, namespaces=NAMESPACES)
SPLIT_ATTR = 'cs'
SPLIT_POINT_ATTR = 'csp'
@ -27,149 +31,166 @@ class SplitError(ValueError):
def __init__(self, path, root):
size = len(tostring(root))/1024.
ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')%
(os.path.basename(path), size))
ValueError.__init__(self,
_('Could not find reasonable point at which to split: '
'%s Sub-tree size: %d KB')%
(path, size))
class Split(object):
def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None,
max_flow_size=0):
self.split_on_page_breaks = split_on_page_breaks
self.page_breaks_xpath = page_breaks_xpath
self.max_flow_size = max_flow_size
if self.page_breaks_xpath is not None:
self.page_breaks_xpath = XPath(self.page_breaks_xpath)
def __call__(self, oeb, context):
self.oeb = oeb
self.log = oeb.log
self.map = {}
self.page_break_selectors = None
for item in self.oeb.manifest.items:
if etree.iselement(item.data):
self.split_item(item)
self.fix_links()
def split_item(self, item):
if self.split_on_page_breaks:
if self.page_breaks_xpath is None:
page_breaks, page_break_ids = self.find_page_breaks(item)
else:
page_breaks, page_break_ids = self.page_breaks_xpath(item.data)
splitter = FlowSplitter(item, page_breaks, page_break_ids,
self.max_flow_size, self.oeb)
if splitter.was_split:
self.map[item.href] = dict(splitter.anchor_map)
def find_page_breaks(self, item):
if self.page_break_selectors is None:
self.page_break_selectors = set([])
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
OEB_STYLES]
page_break_selectors = set([])
for rule in rules(stylesheets):
before = getattr(rule.style.getPropertyCSSValue(
'page-break-before'), 'cssText', '').strip().lower()
after = getattr(rule.style.getPropertyCSSValue(
'page-break-after'), 'cssText', '').strip().lower()
try:
if before and before != 'avoid':
page_break_selectors.add((CSSSelector(rule.selectorText),
True))
except:
pass
try:
if after and after != 'avoid':
page_break_selectors.add((CSSSelector(rule.selectorText),
False))
except:
pass
page_breaks = set([])
for selector, before in page_break_selectors:
for elem in selector(item.data):
elem.pb_before = before
page_breaks.add(elem)
for i, elem in enumerate(item.data.iter()):
elem.pb_order = i
page_breaks = list(page_breaks)
page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
page_break_ids, page_breaks_ = [], []
for i, x in enumerate(page_breaks):
x.set('id', x.get('id', 'calibre_pb_%d'%i))
id = x.get('id')
page_breaks_.append((XPath('//*[@id="%s"]'%id), x.pb_before))
page_break_ids.append(id)
return page_breaks_, page_break_ids
def fix_links(self, opf=None):
    '''
    Fix references to the split files in other content files.

    :param opf: unused; given a default because __call__ invokes this
        method with no arguments (``self.fix_links()``) -- TODO(review):
        drop the parameter once callers are settled.
    '''
    for item in self.oeb.manifest:
        if etree.iselement(item.data):
            self.current_item = item
            # rewrite_links here is the module-level helper from oeb.base;
            # self.rewrite_links is the per-URL callback below.
            rewrite_links(item.data, self.rewrite_links)
def rewrite_links(self, url):
    '''
    Callback used by fix_links: map a URL that pointed into a split file
    to the new file that now contains its target anchor. Returns the URL
    unchanged when it does not point into a split file.
    '''
    href, frag = urldefrag(url)
    href = self.current_item.abshref(href)
    if href in self.map:
        anchor_map = self.map[href]
        nhref = anchor_map[frag if frag else None]
        if frag:
            # was: '#'.joinn(href, frag) -- a typo, str.join takes a single
            # iterable, and joining the *original* href would undo the remap;
            # the fragment must be re-attached to the new href.
            nhref = '#'.join((nhref, frag))
        return nhref
    return url
class Splitter(object):
class FlowSplitter(object):
def __init__(self, path, opts, stylesheet_map, opf):
self.setup_cli_handler(opts.verbose)
self.path = path
self.always_remove = not opts.preserve_tag_structure or \
os.stat(content(path)).st_size > 5*opts.profile.flow_size
self.base = (os.path.splitext(path)[0].replace('%', '%%') + '_split_%d.html')
self.opts = opts
self.orig_size = os.stat(content(path)).st_size
self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
root = html.fromstring(open(content(path)).read())
def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb):
self.item = item
self.oeb = oeb
self.log = oeb.log
self.page_breaks = page_breaks
self.page_break_ids = page_break_ids
self.max_flow_size = max_flow_size
self.base = item.abshref(item.href)
self.page_breaks, self.trees = [], []
self.split_size = 0
base, ext = os.path.splitext(self.base)
self.base = base.replace('%', '%%')+'_split_%d'+ext
# Split on page breaks
self.trees = [self.item.data]
self.splitting_on_page_breaks = True
if not opts.dont_split_on_page_breaks:
self.log_info('\tSplitting on page breaks...')
if self.path in stylesheet_map:
self.find_page_breaks(stylesheet_map[self.path], root)
self.split_on_page_breaks(root.getroottree())
trees = list(self.trees)
else:
self.trees = [root.getroottree()]
trees = list(self.trees)
# Split any remaining over-sized trees
if self.page_breaks:
self.split_on_page_breaks(self.item.data)
self.splitting_on_page_breaks = False
if self.opts.profile.flow_size < sys.maxint:
if self.max_flow_size > 0:
lt_found = False
self.log_info('\tLooking for large trees...')
for i, tree in enumerate(list(trees)):
self.log('\tLooking for large trees...')
trees = list(self.trees)
for i, tree in enumerate(list(self.trees)):
self.trees = []
size = len(tostring(tree.getroot()))
if size > self.opts.profile.flow_size:
lt_found = True
try:
self.split_to_size(tree)
except (SplitError, RuntimeError): # Splitting fails
if not self.always_remove:
self.always_remove = True
self.split_to_size(tree)
else:
raise
self.split_to_size(tree)
trees[i:i+1] = list(self.trees)
if not lt_found:
self.log_info('\tNo large trees found')
self.trees = trees
self.trees = trees
self.was_split = len(self.trees) > 1
if self.was_split:
self.commit()
self.log_info('\t\tSplit into %d parts.', len(self.trees))
if self.opts.verbose:
for f in self.files:
self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
self.fix_opf(opf)
self.commit()
self.trees = None
def split_on_page_breaks(self, orig_tree):
ordered_ids = []
for elem in orig_tree.xpath('//*[@id]'):
id = elem.get('id')
if id in self.page_break_ids:
ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
def split_text(self, text, root, size):
self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
rest = text.replace('\r', '')
parts = re.split('\n\n', rest)
self.log_debug('\t\t\t\tFound %d parts'%len(parts))
if max(map(len, parts)) > size:
raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
ans = []
buf = ''
for part in parts:
if len(buf) + len(part) < size:
buf += '\n\n'+part
else:
ans.append(buf)
buf = part
return ans
def split_to_size(self, tree):
self.log_debug('\t\tSplitting...')
root = tree.getroot()
# Split large <pre> tags
for pre in list(root.xpath('//pre')):
text = u''.join(pre.xpath('descendant::text()'))
pre.text = text
for child in list(pre.iterchildren()):
pre.remove(child)
if len(pre.text) > self.opts.profile.flow_size*0.5:
frags = self.split_text(pre.text, root, int(0.2*self.opts.profile.flow_size))
new_pres = []
for frag in frags:
pre2 = copy.copy(pre)
pre2.text = frag
pre2.tail = u''
new_pres.append(pre2)
new_pres[-1].tail = pre.tail
p = pre.getparent()
i = p.index(pre)
p[i:i+1] = new_pres
split_point, before = self.find_split_point(root)
if split_point is None or self.split_size > 6*self.orig_size:
if not self.always_remove:
self.log_warn(_('\t\tToo much markup. Re-splitting without '
'structure preservation. This may cause '
'incorrect rendering.'))
raise SplitError(self.path, root)
for t in self.do_split(tree, split_point, before):
r = t.getroot()
if self.is_page_empty(r):
continue
size = len(tostring(r))
if size <= self.opts.profile.flow_size:
self.trees.append(t)
#print tostring(t.getroot(), pretty_print=True)
self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
len(self.trees), size/1024.)
self.split_size += size
else:
self.split_to_size(t)
def is_page_empty(self, root):
body = root.find('body')
if body is None:
return False
txt = re.sub(r'\s+', '', html.tostring(body, method='text', encoding=unicode))
if len(txt) > 4:
#if len(txt) < 100:
# print 1111111, html.tostring(body, method='html', encoding=unicode)
return False
for img in root.xpath('//img'):
if img.get('style', '') != 'display:none':
return False
return True
self.trees = []
tree = orig_tree
for pattern, before in ordered_ids:
self.log.debug('\t\tSplitting on page-break')
elem = pattern(tree)
if elem:
before, after = self.do_split(tree, elem[0], before)
self.trees.append(before)
tree = after
self.trees.append(tree)
self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
def do_split(self, tree, split_point, before):
'''
@ -190,7 +211,7 @@ class Splitter(object):
split_point2 = root2.xpath(path)[0]
def nix_element(elem, top=True):
if self.always_remove:
if True:
parent = elem.getparent()
index = parent.index(elem)
if top:
@ -198,7 +219,6 @@ class Splitter(object):
else:
index = parent.index(elem)
parent[index:index+1] = list(elem.iterchildren())
else:
elem.text = u''
elem.tail = u''
@ -241,67 +261,76 @@ class Splitter(object):
return tree, tree2
def is_page_empty(self, root):
body = root.find('body')
if body is None:
return False
txt = re.sub(r'\s+', '', html.tostring(body, method='text', encoding=unicode))
if len(txt) > 4:
return False
for img in root.xpath('//img'):
if img.get('style', '') != 'display:none':
return False
return True
def split_on_page_breaks(self, orig_tree):
ordered_ids = []
for elem in orig_tree.xpath('//*[@id]'):
id = elem.get('id')
if id in self.page_break_ids:
ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
self.trees = []
tree = orig_tree
for pattern, before in ordered_ids:
self.log_info('\t\tSplitting on page-break')
elem = pattern(tree)
if elem:
before, after = self.do_split(tree, elem[0], before)
self.trees.append(before)
tree = after
self.trees.append(tree)
self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
def split_text(self, text, root, size):
    '''
    Split a large block of text (from a <pre> tag) into chunks of roughly
    `size` characters, breaking only on blank lines.

    :raises SplitError: when a single paragraph already exceeds `size`.
    '''
    self.log.debug('\t\t\tSplitting text of length: %d'%len(text))
    rest = text.replace('\r', '')
    parts = re.split('\n\n', rest)
    self.log.debug('\t\t\t\tFound %d parts'%len(parts))
    if max(map(len, parts)) > size:
        raise SplitError('Cannot split as file contains a <pre> tag '
            'with a very large paragraph', root)
    ans = []
    buf = ''
    for part in parts:
        if len(buf) + len(part) < size:
            buf += '\n\n'+part
        else:
            ans.append(buf)
            buf = part
    if buf:
        # was missing: without this the final accumulated chunk was
        # silently dropped, losing the tail of the <pre> text
        ans.append(buf)
    return ans
def split_to_size(self, tree):
self.log.debug('\t\tSplitting...')
root = tree.getroot()
# Split large <pre> tags
for pre in list(root.xpath('//pre')):
text = u''.join(pre.xpath('descendant::text()'))
pre.text = text
for child in list(pre.iterchildren()):
pre.remove(child)
if len(pre.text) > self.max_flow_size*0.5:
frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
new_pres = []
for frag in frags:
pre2 = copy.copy(pre)
pre2.text = frag
pre2.tail = u''
new_pres.append(pre2)
new_pres[-1].tail = pre.tail
p = pre.getparent()
i = p.index(pre)
p[i:i+1] = new_pres
def find_page_breaks(self, stylesheets, root):
'''
Find all elements that have either page-break-before or page-break-after set.
Populates `self.page_breaks` with id based XPath selectors (for elements that don't
have ids, an id is created).
'''
page_break_selectors = set([])
for rule in rules(stylesheets):
before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
try:
if before and before != 'avoid':
page_break_selectors.add((CSSSelector(rule.selectorText), True))
except:
pass
try:
if after and after != 'avoid':
page_break_selectors.add((CSSSelector(rule.selectorText), False))
except:
pass
page_breaks = set([])
for selector, before in page_break_selectors:
for elem in selector(root):
elem.pb_before = before
page_breaks.add(elem)
for i, elem in enumerate(root.iter()):
elem.pb_order = i
page_breaks = list(page_breaks)
page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
self.page_break_ids = []
for i, x in enumerate(page_breaks):
x.set('id', x.get('id', 'calibre_pb_%d'%i))
id = x.get('id')
self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
self.page_break_ids.append(id)
split_point, before = self.find_split_point(root)
if split_point is None:
raise SplitError(self.item.href, root)
for t in self.do_split(tree, split_point, before):
r = t.getroot()
if self.is_page_empty(r):
continue
size = len(tostring(r))
if size <= self.max_flow_size:
self.trees.append(t)
#print tostring(t.getroot(), pretty_print=True)
self.log.debug('\t\t\tCommitted sub-tree #%d (%d KB)',
len(self.trees), size/1024.)
self.split_size += size
else:
self.split_to_size(t)
def find_split_point(self, root):
'''
@ -336,8 +365,7 @@ class Splitter(object):
'//br',
'//li',
):
elems = root.xpath(path,
namespaces={'re':'http://exslt.org/regular-expressions'})
elems = root.xpath(path, namespaces=NAMESPACES)
elem = pick_elem(elems)
if elem is not None:
try:
@ -355,6 +383,8 @@ class Splitter(object):
all anchors in the original tree. Internal links are re-directed. The
original file is deleted and the split files are saved.
'''
if not self.was_split:
return
self.anchor_map = collections.defaultdict(lambda :self.base%0)
self.files = []
@ -368,134 +398,46 @@ class Splitter(object):
elem.attrib.pop(SPLIT_ATTR, None)
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
for current, tree in zip(self.files, self.trees):
for a in tree.getroot().xpath('//a[@href]'):
spine_pos = self.item.spine_pos
for current, tree in zip(map(reversed, (self.files, self.trees))):
for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
href = a.get('href').strip()
if href.startswith('#'):
anchor = href[1:]
file = self.anchor_map[anchor]
if file != current:
a.set('href', file+href)
open(content(current), 'wb').\
write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))
os.remove(content(self.path))
new_id = self.oeb.manifest.generate(id=self.item.id)[0]
new_item = self.oeb.manifest.add(new_id, current,
self.item.media_type, data=tree.getroot())
self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
# Re-map guide references that pointed into the file that was split.
if self.oeb.guide:
    for ref in self.oeb.guide:
        href, frag = urldefrag(ref.href)
        if href == self.item.href:
            nhref = self.anchor_map[frag if frag else None]
            if frag:
                # str.join takes one iterable argument, not two strings
                nhref = '#'.join((nhref, frag))
            ref.href = nhref
def fix_toc_entry(toc):
    # Recursively re-map TOC entries that pointed into the split file to
    # the new file that now contains their target anchor.
    if toc.href:
        href, frag = urldefrag(toc.href)
        if href == self.item.href:
            nhref = self.anchor_map[frag if frag else None]
            if frag:
                # str.join takes one iterable argument, not two strings
                nhref = '#'.join((nhref, frag))
            toc.href = nhref
    for x in toc:
        fix_toc_entry(x)
def fix_opf(self, opf):
'''
Fix references to the split file in the OPF.
'''
items = [item for item in opf.itermanifest() if item.get('href') == 'content/'+self.path]
new_items = [('content/'+f, None) for f in self.files]
id_map = {}
for item in items:
id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)
if self.oeb.toc:
fix_toc_entry(self.oeb.toc)
for id in id_map.keys():
opf.replace_spine_items_by_idref(id, id_map[id])
for ref in opf.iterguide():
href = ref.get('href', '')
if href.startswith('content/'+self.path):
href = href.split('#')
frag = None
if len(href) > 1:
frag = href[1]
if frag not in self.anchor_map:
self.log_warning('\t\tUnable to re-map OPF link', href)
continue
new_file = self.anchor_map[frag]
ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))
self.oeb.manifest.remove(self.item)
def fix_content_links(html_files, changes, opts):
split_files = [f.path for f in changes]
anchor_maps = [f.anchor_map for f in changes]
files = list(html_files)
for j, f in enumerate(split_files):
try:
i = files.index(f)
files[i:i+1] = changes[j].files
except ValueError:
continue
for htmlfile in files:
changed = False
root = html.fromstring(open(content(htmlfile), 'rb').read())
for a in root.xpath('//a[@href]'):
href = a.get('href')
if not href.startswith('#'):
href = href.split('#')
anchor = href[1] if len(href) > 1 else None
href = href[0]
if href in split_files:
try:
newf = anchor_maps[split_files.index(href)][anchor]
except:
print '\t\tUnable to remap HTML link:', href, anchor
continue
frag = ('#'+anchor) if anchor else ''
a.set('href', newf+frag)
changed = True
if changed:
open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
def fix_ncx(path, changes):
split_files = [f.path for f in changes]
anchor_maps = [f.anchor_map for f in changes]
tree = etree.parse(path)
changed = False
for content in tree.getroot().xpath('//x:content[@src]',
namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
href = content.get('src')
if not href.startswith('#'):
href = href.split('#')
anchor = href[1] if len(href) > 1 else None
href = href[0].split('/')[-1]
if href in split_files:
try:
newf = anchor_maps[split_files.index(href)][anchor]
except:
print 'Unable to remap NCX link:', href, anchor
frag = ('#'+anchor) if anchor else ''
content.set('src', 'content/'+newf+frag)
changed = True
if changed:
open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True))
def find_html_files(opf):
'''
Find all HTML files referenced by `opf`.
'''
html_files = []
for item in opf.itermanifest():
if 'html' in item.get('media-type', '').lower():
f = item.get('href').split('/')[-1]
f2 = f.replace('&', '%26')
if not os.path.exists(content(f)) and os.path.exists(content(f2)):
f = f2
item.set('href', item.get('href').replace('&', '%26'))
if os.path.exists(content(f)):
html_files.append(f)
return html_files
def split(pathtoopf, opts, stylesheet_map):
pathtoopf = os.path.abspath(pathtoopf)
opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
with CurrentDir(os.path.dirname(pathtoopf)):
html_files = find_html_files(opf)
changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files]
changes = [c for c in changes if c.was_split]
fix_content_links(html_files, changes, opts)
for item in opf.itermanifest():
if item.get('media-type', '') == 'application/x-dtbncx+xml':
fix_ncx(item.get('href'), changes)
break
open(pathtoopf, 'wb').write(opf.render())