Fix #2358 (HTML to ePub conversion results in duplicated content / incorrect breaks)

Kovid Goyal 2009-04-29 20:31:18 -07:00
parent 6f072dc3d1
commit e869684a29

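The substance of the fix is in the @@ -60,29 +60,36 @@ hunk below: instead of splicing each over-sized tree's split results back into `trees` while that list is being iterated (which shifts indices and can duplicate or drop content), the splitter now records the results for each tree in `self.tree_map`, keyed by the original tree, and rebuilds the ordered list in a second pass. The following standalone sketch is not calibre code; the names (MAX_SIZE, split_to_size, split_large_items) and the toy string splitter are hypothetical, and calibre operates on lxml trees rather than strings. It only illustrates the same accumulate-then-flatten pattern.

# Hypothetical illustration of the pattern introduced by this commit.

MAX_SIZE = 4  # toy size limit, analogous to opts.profile.flow_size


def split_to_size(item):
    # Recursively halve an over-sized item until every piece fits.
    if len(item) <= MAX_SIZE:
        return [item]
    mid = len(item) // 2
    return split_to_size(item[:mid]) + split_to_size(item[mid:])


def split_large_items(items):
    # First pass: record split results per over-sized item, keyed by the
    # original item, instead of mutating the list being iterated.
    split_map = {}
    for item in items:
        if len(item) > MAX_SIZE:
            split_map[item] = split_to_size(item)

    # Second pass: rebuild the sequence in the original order, substituting
    # the recorded pieces and passing small items through untouched.
    result = []
    for item in items:
        result.extend(split_map.get(item, [item]))
    return result


if __name__ == '__main__':
    print(split_large_items(['abc', 'abcdefghij', 'xy']))
    # -> ['abc', 'ab', 'cde', 'fg', 'hij', 'xy']

Untouched items fall through `split_map.get(item, [item])` unchanged, so order is preserved and nothing is emitted twice.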

@@ -24,16 +24,16 @@ SPLIT_ATTR = 'cs'
SPLIT_POINT_ATTR = 'csp'

class SplitError(ValueError):

    def __init__(self, path, root):
        size = len(tostring(root))/1024.
        ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')%
                            (os.path.basename(path), size))

class Splitter(LoggingInterface):

    def __init__(self, path, opts, stylesheet_map, opf):
        LoggingInterface.__init__(self, logging.getLogger('htmlsplit'))
        self.setup_cli_handler(opts.verbose)
@@ -45,10 +45,10 @@ class Splitter(LoggingInterface):
        self.orig_size = os.stat(content(path)).st_size
        self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
        root = html.fromstring(open(content(path)).read())

        self.page_breaks, self.trees = [], []
        self.split_size = 0

        # Split on page breaks
        self.splitting_on_page_breaks = True
        if not opts.dont_split_on_page_breaks:
@@ -60,29 +60,36 @@ class Splitter(LoggingInterface):
        else:
            self.trees = [root.getroottree()]

        trees = list(self.trees)

        # Split any remaining over-sized trees
        self.splitting_on_page_breaks = False
        if self.opts.profile.flow_size < sys.maxint:
            lt_found = False
            self.log_info('\tLooking for large trees...')
+           self.tree_map = {}
            for i, tree in enumerate(list(trees)):
-               self.trees = []
+               self.split_trees = []
                size = len(tostring(tree.getroot()))
                if size > self.opts.profile.flow_size:
                    lt_found = True
                    try:
                        self.split_to_size(tree)
+                       self.tree_map[tree] = self.split_trees
                    except (SplitError, RuntimeError): # Splitting fails
                        if not self.always_remove:
                            self.always_remove = True
+                           self.split_trees = []
                            self.split_to_size(tree)
+                           self.tree_map[tree] = self.split_trees
                        else:
                            raise
-                   trees[i:i+1] = list(self.trees)
+           t = []
+           for x in trees:
+               t.extend(self.tree_map.get(x, [x]))
+           trees = t
            if not lt_found:
                self.log_info('\tNo large trees found')

        self.trees = trees
        self.was_split = len(self.trees) > 1
        if self.was_split:
@@ -92,17 +99,17 @@ class Splitter(LoggingInterface):
            for f in self.files:
                self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
            self.fix_opf(opf)
        self.trees = None

    def split_text(self, text, root, size):
        self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
        rest = text.replace('\r', '')
        parts = re.split('\n\n', rest)
        self.log_debug('\t\t\t\tFound %d parts'%len(parts))
        if max(map(len, parts)) > size:
            raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
        ans = []
        buf = ''
        for part in parts:
@@ -112,8 +119,8 @@ class Splitter(LoggingInterface):
                ans.append(buf)
                buf = part
        return ans

    def split_to_size(self, tree):
        self.log_debug('\t\tSplitting...')
        root = tree.getroot()
@@ -135,7 +142,7 @@ class Splitter(LoggingInterface):
                p = pre.getparent()
                i = p.index(pre)
                p[i:i+1] = new_pres

        split_point, before = self.find_split_point(root)
        if split_point is None or self.split_size > 6*self.orig_size:
            if not self.always_remove:
@@ -143,21 +150,21 @@ class Splitter(LoggingInterface):
                                'structure preservation. This may cause '
                                'incorrect rendering.'))
            raise SplitError(self.path, root)

        for t in self.do_split(tree, split_point, before):
            r = t.getroot()
            if self.is_page_empty(r):
                continue
            size = len(tostring(r))
            if size <= self.opts.profile.flow_size:
-               self.trees.append(t)
+               self.split_trees.append(t)
                #print tostring(t.getroot(), pretty_print=True)
                self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
-                   len(self.trees), size/1024.)
+                   len(self.split_trees), size/1024.)
                self.split_size += size
            else:
                self.split_to_size(t)

    def is_page_empty(self, root):
        body = root.find('body')
        if body is None:
@@ -171,14 +178,14 @@ class Splitter(LoggingInterface):
            if img.get('style', '') != 'display:none':
                return False
        return True

    def do_split(self, tree, split_point, before):
        '''
        Split ``tree`` into a *before* and *after* tree at ``split_point``,
        preserving tag structure, but not duplicating any text.
        All tags that have had their text and tail
        removed have the attribute ``calibre_split`` set to 1.

        :param before: If True tree is split before split_point, otherwise after split_point
        :return: before_tree, after_tree
        '''
@@ -189,7 +196,7 @@ class Splitter(LoggingInterface):
        body, body2 = root.body, root2.body
        split_point = root.xpath(path)[0]
        split_point2 = root2.xpath(path)[0]

        def nix_element(elem, top=True):
            if self.always_remove:
                parent = elem.getparent()
@@ -199,18 +206,18 @@ class Splitter(LoggingInterface):
                else:
                    index = parent.index(elem)
                    parent[index:index+1] = list(elem.iterchildren())
            else:
                elem.text = u''
                elem.tail = u''
                elem.set(SPLIT_ATTR, '1')
                if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']:
                    elem.set('style', 'display:none')

        def fix_split_point(sp):
            if not self.splitting_on_page_breaks:
                sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')

        # Tree 1
        hit_split_point = False
        for elem in list(body.iterdescendants(etree.Element)):
@@ -224,8 +231,8 @@ class Splitter(LoggingInterface):
                continue
            if hit_split_point:
                nix_element(elem)

        # Tree 2
        hit_split_point = False
        for elem in list(body2.iterdescendants(etree.Element)):
@@ -239,17 +246,17 @@ class Splitter(LoggingInterface):
                continue
            if not hit_split_point:
                nix_element(elem, top=False)

        return tree, tree2

    def split_on_page_breaks(self, orig_tree):
        ordered_ids = []
        for elem in orig_tree.xpath('//*[@id]'):
            id = elem.get('id')
            if id in self.page_break_ids:
                ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])

        self.trees = []
        tree = orig_tree
        for pattern, before in ordered_ids:
@@ -261,13 +268,13 @@ class Splitter(LoggingInterface):
            tree = after
        self.trees.append(tree)
        self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]

    def find_page_breaks(self, stylesheets, root):
        '''
        Find all elements that have either page-break-before or page-break-after set.
        Populates `self.page_breaks` with id based XPath selectors (for elements that don't
        have ids, an id is created).
        '''
        page_break_selectors = set([])
@@ -284,16 +291,16 @@ class Splitter(LoggingInterface):
                        page_break_selectors.add((CSSSelector(rule.selectorText), False))
                    except:
                        pass

        page_breaks = set([])
        for selector, before in page_break_selectors:
            for elem in selector(root):
                elem.pb_before = before
                page_breaks.add(elem)

        for i, elem in enumerate(root.iter()):
            elem.pb_order = i

        page_breaks = list(page_breaks)
        page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
        self.page_break_ids = []
@@ -301,12 +308,12 @@ class Splitter(LoggingInterface):
            x.set('id', x.get('id', 'calibre_pb_%d'%i))
            id = x.get('id')
            self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
            self.page_break_ids.append(id)

    def find_split_point(self, root):
        '''
        Find the tag at which to split the tree rooted at `root`.
        Search order is:
            * Heading tags
            * <div> tags
@@ -315,7 +322,7 @@ class Splitter(LoggingInterface):
            * <p> tags
            * <br> tags
            * <li> tags

        We try to split in the "middle" of the file (as defined by tag counts.
        '''
        def pick_elem(elems):
@@ -326,18 +333,18 @@ class Splitter(LoggingInterface):
            i = int(math.floor(len(elems)/2.))
            elems[i].set(SPLIT_POINT_ATTR, '1')
            return elems[i]

        for path in (
                     '//*[re:match(name(), "h[1-6]", "i")]',
                     '/html/body/div',
                     '//pre',
                     '//hr',
                     '//p',
                     '//div',
                     '//br',
                     '//li',
                     ):
            elems = root.xpath(path,
                    namespaces={'re':'http://exslt.org/regular-expressions'})
            elem = pick_elem(elems)
            if elem is not None:
@@ -346,9 +353,9 @@ class Splitter(LoggingInterface):
            except:
                continue
            return elem, True

        return None, True

    def commit(self):
        '''
        Commit all changes caused by the split. This removes the previously
@@ -358,7 +365,7 @@ class Splitter(LoggingInterface):
        '''
        self.anchor_map = collections.defaultdict(lambda :self.base%0)
        self.files = []

        for i, tree in enumerate(self.trees):
            root = tree.getroot()
            self.files.append(self.base%i)
@@ -368,7 +375,7 @@ class Splitter(LoggingInterface):
            for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)):
                elem.attrib.pop(SPLIT_ATTR, None)
                elem.attrib.pop(SPLIT_POINT_ATTR, '0')

        for current, tree in zip(self.files, self.trees):
            for a in tree.getroot().xpath('//a[@href]'):
                href = a.get('href').strip()
@@ -376,10 +383,10 @@ class Splitter(LoggingInterface):
                    anchor = href[1:]
                    file = self.anchor_map[anchor]
                    if file != current:
                        a.set('href', file+href)

            open(content(current), 'wb').\
                write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))

        os.remove(content(self.path))
@@ -392,12 +399,12 @@ class Splitter(LoggingInterface):
        id_map = {}
        for item in items:
            id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)

        for id in id_map.keys():
            opf.replace_spine_items_by_idref(id, id_map[id])

        for ref in opf.iterguide():
            href = ref.get('href', '')
            if href.startswith('content/'+self.path):
                href = href.split('#')
                frag = None
@@ -409,8 +416,8 @@ class Splitter(LoggingInterface):
                new_file = self.anchor_map[frag]
                ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))


def fix_content_links(html_files, changes, opts):
    split_files = [f.path for f in changes]
    anchor_maps = [f.anchor_map for f in changes]
@@ -421,7 +428,7 @@ def fix_content_links(html_files, changes, opts):
                files[i:i+1] = changes[j].files
            except ValueError:
                continue

    for htmlfile in files:
        changed = False
        root = html.fromstring(open(content(htmlfile), 'rb').read())
@@ -440,7 +447,7 @@ def fix_content_links(html_files, changes, opts):
                        frag = ('#'+anchor) if anchor else ''
                        a.set('href', newf+frag)
                        changed = True

        if changed:
            open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
@@ -449,7 +456,7 @@ def fix_ncx(path, changes):
    anchor_maps = [f.anchor_map for f in changes]
    tree = etree.parse(path)
    changed = False

    for content in tree.getroot().xpath('//x:content[@src]',
            namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
        href = content.get('src')
        if not href.startswith('#'):
@@ -482,21 +489,21 @@ def find_html_files(opf):
        if os.path.exists(content(f)):
            html_files.append(f)

    return html_files


def split(pathtoopf, opts, stylesheet_map):
    pathtoopf = os.path.abspath(pathtoopf)
    opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))

    with CurrentDir(os.path.dirname(pathtoopf)):
        html_files = find_html_files(opf)
        changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files]
        changes = [c for c in changes if c.was_split]

        fix_content_links(html_files, changes, opts)
        for item in opf.itermanifest():
            if item.get('media-type', '') == 'application/x-dtbncx+xml':
                fix_ncx(item.get('href'), changes)
                break

        open(pathtoopf, 'wb').write(opf.render())