EPUB/AZW3 Output: Fix incorrect splitting of HTML at page-break-after page breaks in certain circumstances (when the split element is the first child of a parent that contains other split elements). Fixes #1139317 (Incorrect page break for Haodoo's PDB/uPDB to Epub conversion)

This commit is contained in:
Kovid Goyal 2013-03-02 14:09:33 +05:30
parent bde2a22e35
commit 70c75df551

View File

@ -10,6 +10,7 @@ assumes a prior call to the flatcss transform.
''' '''
import os, math, functools, collections, re, copy import os, math, functools, collections, re, copy
from collections import OrderedDict
from lxml.etree import XPath as _XPath from lxml.etree import XPath as _XPath
from lxml import etree from lxml import etree
@ -106,8 +107,7 @@ class Split(object):
continue continue
for elem in selector(body[0]): for elem in selector(body[0]):
if elem not in body: if elem not in body:
if before: elem.set('pb_before', '1' if before else '0')
elem.set('pb_before', '1')
page_breaks.add(elem) page_breaks.add(elem)
for i, elem in enumerate(item.data.iter()): for i, elem in enumerate(item.data.iter()):
@ -134,14 +134,12 @@ class Split(object):
id = 'calibre_pb_%d'%i id = 'calibre_pb_%d'%i
x.set('id', id) x.set('id', id)
xp = XPath('//*[@id=%r]'%id) xp = XPath('//*[@id=%r]'%id)
page_breaks_.append((xp, page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
x.get('pb_before', False)))
page_break_ids.append(id) page_break_ids.append(id)
for elem in item.data.iter(): for elem in item.data.iter():
elem.attrib.pop('pb_order', False) elem.attrib.pop('pb_order', False)
if elem.get('pb_before', False): elem.attrib.pop('pb_before', False)
elem.attrib.pop('pb_before')
return page_breaks_, page_break_ids return page_breaks_, page_break_ids
@ -223,22 +221,23 @@ class FlowSplitter(object):
self.commit() self.commit()
def split_on_page_breaks(self, orig_tree): def split_on_page_breaks(self, orig_tree):
ordered_ids = [] ordered_ids = OrderedDict()
for elem in orig_tree.xpath('//*[@id]'): all_page_break_ids = frozenset(self.page_break_ids)
id = elem.get('id') for elem_id in orig_tree.xpath('//*/@id'):
if id in self.page_break_ids: if elem_id in all_page_break_ids:
ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)]) ordered_ids[elem_id] = self.page_breaks[
self.page_break_ids.index(elem_id)]
self.trees = [] self.trees = []
tree = orig_tree tree = orig_tree
for pattern, before in ordered_ids: for pattern, before in ordered_ids.itervalues():
elem = pattern(tree) elem = pattern(tree)
if elem: if elem:
self.log.debug('\t\tSplitting on page-break at %s'% self.log.debug('\t\tSplitting on page-break at %s'%
elem[0].get('id')) elem[0].get('id'))
before, after = self.do_split(tree, elem[0], before) before_tree, after_tree = self.do_split(tree, elem[0], before)
self.trees.append(before) self.trees.append(before_tree)
tree = after tree = after_tree
self.trees.append(tree) self.trees.append(tree)
trees, ids = [], set([]) trees, ids = [], set([])
for tree in self.trees: for tree in self.trees:
@ -289,7 +288,6 @@ class FlowSplitter(object):
if self.opts.verbose > 3 and npath != path: if self.opts.verbose > 3 and npath != path:
self.log.debug('\t\t\tMoved split point %s to %s'%(path, npath)) self.log.debug('\t\t\tMoved split point %s to %s'%(path, npath))
return npath return npath
def do_split(self, tree, split_point, before): def do_split(self, tree, split_point, before):
@ -304,6 +302,10 @@ class FlowSplitter(object):
root = tree.getroot() root = tree.getroot()
root2 = tree2.getroot() root2 = tree2.getroot()
body, body2 = map(self.get_body, (root, root2)) body, body2 = map(self.get_body, (root, root2))
if before:
# We cannot adjust for after since moving an after split point to a
# parent will cause breakage if the parent contains any content
# after the original split point
path = self.adjust_split_point(root, path) path = self.adjust_split_point(root, path)
split_point = root.xpath(path)[0] split_point = root.xpath(path)[0]
split_point2 = root2.xpath(path)[0] split_point2 = root2.xpath(path)[0]