diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py index 918f8bcc7e..a5cc6dfc7d 100644 --- a/src/calibre/ebooks/epub/split.py +++ b/src/calibre/ebooks/epub/split.py @@ -24,16 +24,16 @@ SPLIT_ATTR = 'cs' SPLIT_POINT_ATTR = 'csp' class SplitError(ValueError): - + def __init__(self, path, root): size = len(tostring(root))/1024. - ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')% + ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')% (os.path.basename(path), size)) - + class Splitter(LoggingInterface): - + def __init__(self, path, opts, stylesheet_map, opf): LoggingInterface.__init__(self, logging.getLogger('htmlsplit')) self.setup_cli_handler(opts.verbose) @@ -45,10 +45,10 @@ class Splitter(LoggingInterface): self.orig_size = os.stat(content(path)).st_size self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.) root = html.fromstring(open(content(path)).read()) - + self.page_breaks, self.trees = [], [] self.split_size = 0 - + # Split on page breaks self.splitting_on_page_breaks = True if not opts.dont_split_on_page_breaks: @@ -60,29 +60,36 @@ class Splitter(LoggingInterface): else: self.trees = [root.getroottree()] trees = list(self.trees) - + # Split any remaining over-sized trees self.splitting_on_page_breaks = False if self.opts.profile.flow_size < sys.maxint: lt_found = False self.log_info('\tLooking for large trees...') + self.tree_map = {} for i, tree in enumerate(list(trees)): - self.trees = [] - size = len(tostring(tree.getroot())) + self.split_trees = [] + size = len(tostring(tree.getroot())) if size > self.opts.profile.flow_size: lt_found = True try: self.split_to_size(tree) + self.tree_map[tree] = self.split_trees except (SplitError, RuntimeError): # Splitting fails if not self.always_remove: self.always_remove = True + self.split_trees = [] self.split_to_size(tree) + self.tree_map[tree] = self.split_trees else: raise - trees[i:i+1] = list(self.trees) + t = [] + for x in trees: + t.extend(self.tree_map.get(x, [x])) + trees = t if not lt_found: self.log_info('\tNo large trees found') - + self.trees = trees self.was_split = len(self.trees) > 1 if self.was_split: @@ -92,17 +99,17 @@ class Splitter(LoggingInterface): for f in self.files: self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.) self.fix_opf(opf) - + self.trees = None - - + + def split_text(self, text, root, size): self.log_debug('\t\t\tSplitting text of length: %d'%len(text)) rest = text.replace('\r', '') parts = re.split('\n\n', rest) self.log_debug('\t\t\t\tFound %d parts'%len(parts)) if max(map(len, parts)) > size: - raise SplitError('Cannot split as file contains a
tag with a very large paragraph', root) + raise SplitError('Cannot split as file contains atag with a very large paragraph', root) ans = [] buf = '' for part in parts: @@ -112,8 +119,8 @@ class Splitter(LoggingInterface): ans.append(buf) buf = part return ans - - + + def split_to_size(self, tree): self.log_debug('\t\tSplitting...') root = tree.getroot() @@ -135,7 +142,7 @@ class Splitter(LoggingInterface): p = pre.getparent() i = p.index(pre) p[i:i+1] = new_pres - + split_point, before = self.find_split_point(root) if split_point is None or self.split_size > 6*self.orig_size: if not self.always_remove: @@ -143,21 +150,21 @@ class Splitter(LoggingInterface): 'structure preservation. This may cause ' 'incorrect rendering.')) raise SplitError(self.path, root) - + for t in self.do_split(tree, split_point, before): r = t.getroot() if self.is_page_empty(r): continue size = len(tostring(r)) if size <= self.opts.profile.flow_size: - self.trees.append(t) + self.split_trees.append(t) #print tostring(t.getroot(), pretty_print=True) - self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', - len(self.trees), size/1024.) + self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', + len(self.split_trees), size/1024.) self.split_size += size else: self.split_to_size(t) - + def is_page_empty(self, root): body = root.find('body') if body is None: @@ -171,14 +178,14 @@ class Splitter(LoggingInterface): if img.get('style', '') != 'display:none': return False return True - + def do_split(self, tree, split_point, before): ''' - Split ``tree`` into a *before* and *after* tree at ``split_point``, - preserving tag structure, but not duplicating any text. + Split ``tree`` into a *before* and *after* tree at ``split_point``, + preserving tag structure, but not duplicating any text. All tags that have had their text and tail removed have the attribute ``calibre_split`` set to 1. - + :param before: If True tree is split before split_point, otherwise after split_point :return: before_tree, after_tree ''' @@ -189,7 +196,7 @@ class Splitter(LoggingInterface): body, body2 = root.body, root2.body split_point = root.xpath(path)[0] split_point2 = root2.xpath(path)[0] - + def nix_element(elem, top=True): if self.always_remove: parent = elem.getparent() @@ -199,18 +206,18 @@ class Splitter(LoggingInterface): else: index = parent.index(elem) parent[index:index+1] = list(elem.iterchildren()) - + else: elem.text = u'' elem.tail = u'' elem.set(SPLIT_ATTR, '1') if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']: elem.set('style', 'display:none') - + def fix_split_point(sp): if not self.splitting_on_page_breaks: - sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid') - + sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid') + # Tree 1 hit_split_point = False for elem in list(body.iterdescendants(etree.Element)): @@ -224,8 +231,8 @@ class Splitter(LoggingInterface): continue if hit_split_point: nix_element(elem) - - + + # Tree 2 hit_split_point = False for elem in list(body2.iterdescendants(etree.Element)): @@ -239,17 +246,17 @@ class Splitter(LoggingInterface): continue if not hit_split_point: nix_element(elem, top=False) - + return tree, tree2 - - + + def split_on_page_breaks(self, orig_tree): ordered_ids = [] for elem in orig_tree.xpath('//*[@id]'): id = elem.get('id') if id in self.page_break_ids: ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)]) - + self.trees = [] tree = orig_tree for pattern, before in ordered_ids: @@ -261,13 +268,13 @@ class Splitter(LoggingInterface): tree = after self.trees.append(tree) self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())] - - - + + + def find_page_breaks(self, stylesheets, root): ''' Find all elements that have either page-break-before or page-break-after set. - Populates `self.page_breaks` with id based XPath selectors (for elements that don't + Populates `self.page_breaks` with id based XPath selectors (for elements that don't have ids, an id is created). ''' page_break_selectors = set([]) @@ -284,16 +291,16 @@ class Splitter(LoggingInterface): page_break_selectors.add((CSSSelector(rule.selectorText), False)) except: pass - + page_breaks = set([]) for selector, before in page_break_selectors: for elem in selector(root): elem.pb_before = before page_breaks.add(elem) - + for i, elem in enumerate(root.iter()): elem.pb_order = i - + page_breaks = list(page_breaks) page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order)) self.page_break_ids = [] @@ -301,12 +308,12 @@ class Splitter(LoggingInterface): x.set('id', x.get('id', 'calibre_pb_%d'%i)) id = x.get('id') self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before)) - self.page_break_ids.append(id) - - + self.page_break_ids.append(id) + + def find_split_point(self, root): ''' - Find the tag at which to split the tree rooted at `root`. + Find the tag at which to split the tree rooted at `root`. Search order is: * Heading tags *tags @@ -315,7 +322,7 @@ class Splitter(LoggingInterface): *tags *
tags *tags - + We try to split in the "middle" of the file (as defined by tag counts. ''' def pick_elem(elems): @@ -326,18 +333,18 @@ class Splitter(LoggingInterface): i = int(math.floor(len(elems)/2.)) elems[i].set(SPLIT_POINT_ATTR, '1') return elems[i] - + for path in ( - '//*[re:match(name(), "h[1-6]", "i")]', + '//*[re:match(name(), "h[1-6]", "i")]', '/html/body/div', '//pre', - '//hr', + '//hr', '//p', '//div', '//br', '//li', ): - elems = root.xpath(path, + elems = root.xpath(path, namespaces={'re':'http://exslt.org/regular-expressions'}) elem = pick_elem(elems) if elem is not None: @@ -346,9 +353,9 @@ class Splitter(LoggingInterface): except: continue return elem, True - + return None, True - + def commit(self): ''' Commit all changes caused by the split. This removes the previously @@ -358,7 +365,7 @@ class Splitter(LoggingInterface): ''' self.anchor_map = collections.defaultdict(lambda :self.base%0) self.files = [] - + for i, tree in enumerate(self.trees): root = tree.getroot() self.files.append(self.base%i) @@ -368,7 +375,7 @@ class Splitter(LoggingInterface): for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)): elem.attrib.pop(SPLIT_ATTR, None) elem.attrib.pop(SPLIT_POINT_ATTR, '0') - + for current, tree in zip(self.files, self.trees): for a in tree.getroot().xpath('//a[@href]'): href = a.get('href').strip() @@ -376,10 +383,10 @@ class Splitter(LoggingInterface): anchor = href[1:] file = self.anchor_map[anchor] if file != current: - a.set('href', file+href) + a.set('href', file+href) open(content(current), 'wb').\ write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print)) - + os.remove(content(self.path)) @@ -392,12 +399,12 @@ class Splitter(LoggingInterface): id_map = {} for item in items: id_map[item.get('id')] = opf.replace_manifest_item(item, new_items) - + for id in id_map.keys(): opf.replace_spine_items_by_idref(id, id_map[id]) - + for ref in opf.iterguide(): - href = ref.get('href', '') + href = ref.get('href', '') if href.startswith('content/'+self.path): href = href.split('#') frag = None @@ -409,8 +416,8 @@ class Splitter(LoggingInterface): new_file = self.anchor_map[frag] ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag))) - - + + def fix_content_links(html_files, changes, opts): split_files = [f.path for f in changes] anchor_maps = [f.anchor_map for f in changes] @@ -421,7 +428,7 @@ def fix_content_links(html_files, changes, opts): files[i:i+1] = changes[j].files except ValueError: continue - + for htmlfile in files: changed = False root = html.fromstring(open(content(htmlfile), 'rb').read()) @@ -440,7 +447,7 @@ def fix_content_links(html_files, changes, opts): frag = ('#'+anchor) if anchor else '' a.set('href', newf+frag) changed = True - + if changed: open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print)) @@ -449,7 +456,7 @@ def fix_ncx(path, changes): anchor_maps = [f.anchor_map for f in changes] tree = etree.parse(path) changed = False - for content in tree.getroot().xpath('//x:content[@src]', + for content in tree.getroot().xpath('//x:content[@src]', namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}): href = content.get('src') if not href.startswith('#'): @@ -482,21 +489,21 @@ def find_html_files(opf): if os.path.exists(content(f)): html_files.append(f) return html_files - + def split(pathtoopf, opts, stylesheet_map): pathtoopf = os.path.abspath(pathtoopf) opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf)) - + with CurrentDir(os.path.dirname(pathtoopf)): html_files = find_html_files(opf) changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files] changes = [c for c in changes if c.was_split] - + fix_content_links(html_files, changes, opts) for item in opf.itermanifest(): if item.get('media-type', '') == 'application/x-dtbncx+xml': fix_ncx(item.get('href'), changes) - break + break open(pathtoopf, 'wb').write(opf.render())