EPUB Output: Make the file size splitting algorithm more intelligent. If a split results in a tree that is very small, choose another split point.

2025-07-07 18:24:30 -04:00 · 2010-05-02 15:17:09 -06:00 · 2010-05-02 15:17:09 -06:00 · 32f8611a76
commit 32f8611a76
parent 3df90b8926
2 changed files with 11 additions and 5 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -295,7 +295,7 @@ def xml2unicode(root, pretty_print=False):
    return etree.tostring(root, pretty_print=pretty_print)

 def xml2text(elem):
-    return etree.tostring(elem, method='text', encoding=unicode)
+    return etree.tostring(elem, method='text', encoding=unicode, with_tail=False)

 ASCII_CHARS   = set(chr(x) for x in xrange(128))
 UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@@ -5,7 +5,7 @@ __docformat__ = 'restructuredtext en'

 '''
 Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
-forces at "likely" locations to conform to size limitations. This transform
+forced at "likely" locations to conform to size limitations. This transform
 assumes a prior call to the flatcss transform.
 '''

@@ -385,12 +385,18 @@ class FlowSplitter(object):
            raise SplitError(self.item.href, root)
        self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point))

-        for t in self.do_split(tree, split_point, before):
+        trees = self.do_split(tree, split_point, before)
+        sizes = [len(tostring(t.getroot())) for t in trees]
+        if min(sizes) < 5*1024:
+            self.log.debug('\t\t\tSplit tree too small')
+            self.split_to_size(tree)
+            return
+
+        for t, size in zip(trees, sizes):
            r = t.getroot()
            if self.is_page_empty(r):
                continue
-            size = len(tostring(r))
-            if size <= self.max_flow_size:
+            elif size <= self.max_flow_size:
                self.split_trees.append(t)
                self.log.debug(
                    '\t\t\tCommitted sub-tree #%d (%d KB)'%(