diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index f770622952..2aabbf2e95 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -294,6 +294,9 @@ def xml2str(root, pretty_print=False, strip_comments=False): def xml2unicode(root, pretty_print=False): return etree.tostring(root, pretty_print=pretty_print) +def xml2text(elem): + return etree.tostring(elem, method='text', encoding=unicode, with_tail=False) + ASCII_CHARS = set(chr(x) for x in xrange(128)) UNIBYTE_CHARS = set(chr(x) for x in xrange(256)) URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index d62c6353ea..4633131dc0 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -5,7 +5,7 @@ __docformat__ = 'restructuredtext en' ''' Splitting of the XHTML flows. Splitting can happen on page boundaries or can be -forces at "likely" locations to conform to size limitations. This transform +forced at "likely" locations to conform to size limitations. This transform assumes a prior call to the flatcss transform. ''' @@ -385,12 +385,18 @@ class FlowSplitter(object): raise SplitError(self.item.href, root) self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point)) - for t in self.do_split(tree, split_point, before): + trees = self.do_split(tree, split_point, before) + sizes = [len(tostring(t.getroot())) for t in trees] + if min(sizes) < 5*1024: + self.log.debug('\t\t\tSplit tree too small') + self.split_to_size(tree) + return + + for t, size in zip(trees, sizes): r = t.getroot() if self.is_page_empty(r): continue - size = len(tostring(r)) - if size <= self.max_flow_size: + elif size <= self.max_flow_size: self.split_trees.append(t) self.log.debug( '\t\t\tCommitted sub-tree #%d (%d KB)'%( diff --git a/src/calibre/ebooks/oeb/transforms/structure.py b/src/calibre/ebooks/oeb/transforms/structure.py index 15e9675aa8..07235b4fb0 100644 --- a/src/calibre/ebooks/oeb/transforms/structure.py +++ b/src/calibre/ebooks/oeb/transforms/structure.py @@ -11,7 +11,7 @@ import re from lxml import etree from urlparse import urlparse -from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML +from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text from calibre.ebooks import ConversionError def XPath(x): @@ -79,8 +79,7 @@ class DetectStructure(object): page_break_before = 'display: block; page-break-before: always' page_break_after = 'display: block; page-break-after: always' for item, elem in self.detected_chapters: - text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) - text = text.strip() + text = xml2text(elem).strip() self.log('\tDetected chapter:', text[:50]) if chapter_mark == 'none': continue @@ -120,8 +119,7 @@ class DetectStructure(object): if frag: href = '#'.join((href, frag)) if not self.oeb.toc.has_href(href): - text = u' '.join([t.strip() for t in \ - a.xpath('descendant::text()')]) + text = xml2text(a) text = text[:100].strip() if not self.oeb.toc.has_text(text): num += 1 @@ -135,7 +133,7 @@ class DetectStructure(object): def elem_to_link(self, item, elem, counter): - text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) + text = xml2text(elem) text = text[:100].strip() id = elem.get('id', 'calibre_toc_%d'%counter) elem.set('id', id)