def isspace(x):
    """Return True if x is empty/None or contains only whitespace.

    Non-breaking spaces (U+00A0) count as whitespace. They are mapped to a
    plain space rather than removed, because removing them turns a string
    made only of non-breaking spaces into '' and ''.isspace() is False.
    """
    return not x or x.replace(u'\xa0', u' ').isspace()


def at_start(elem):
    """Return True if there is no content before elem.

    Every node yielded by iterating the enclosing body is inspected until
    elem itself is reached. An img or svg element, or any node carrying
    non-whitespace text, counts as content. A node's tail is ignored when
    the node is an ancestor of elem, since that tail comes after elem.

    Returns True when elem has no enclosing body, and False when elem is
    never reached while iterating the body.
    """
    # The 'h' prefix is presumably bound to the XHTML namespace by the
    # module's XPath helper -- confirm against XPNSMAP.
    body = XPath('ancestor-or-self::h:body')(elem)
    if not body:
        return True
    body = body[0]
    ancestors = frozenset(XPath('ancestor::*')(elem))
    for x in body.iter():
        if x is elem:
            return True
        tag = getattr(x, 'tag', None)
        # iter() also yields comments and processing instructions, whose
        # .tag is a callable rather than a string in lxml; only string tags
        # can be split into namespace and local name.
        if hasattr(tag, 'rpartition') and tag.rpartition('}')[-1] in {'img', 'svg'}:
            return False
        if isspace(getattr(x, 'text', None)) and (
                x in ancestors or isspace(getattr(x, 'tail', None))):
            continue
        return False
    return False
DetectStructure(object): chapter_mark = self.opts.chapter_mark page_break_before = 'display: block; page-break-before: always' page_break_after = 'display: block; page-break-after: always' + c = Counter() for item, elem in self.detected_chapters: + c[item] += 1 text = xml2text(elem).strip() text = re.sub(r'\s+', ' ', text.strip()) self.log('\tDetected chapter:', text[:50]) if chapter_mark == 'none': continue - elif chapter_mark == 'rule': + if chapter_mark == 'rule': mark = etree.Element(XHTML('hr')) elif chapter_mark == 'pagebreak': + if c[item] < 3 and at_start(elem): + # For the first two elements in this item, check if they + # are at the start of the file, in which case inserting a + # page break in unnecessary and can lead to extra blank + # pages in the PDF Output plugin. We need to use two as + # feedbooks epubs match both a heading tag and its + # containing div with the default chapter expression. + continue mark = etree.Element(XHTML('div'), style=page_break_after) - else: # chapter_mark == 'both': + else: # chapter_mark == 'both': mark = etree.Element(XHTML('hr'), style=page_break_before) try: elem.addprevious(mark) @@ -182,8 +213,6 @@ class DetectStructure(object): self.log('Maximum TOC links reached, stopping.') return - - def elem_to_link(self, item, elem, counter): text = xml2text(elem).strip() if not text: @@ -197,7 +226,6 @@ class DetectStructure(object): href = '#'.join((item.href, id)) return text, href - def add_leveled_toc_items(self): added = OrderedDict() added2 = OrderedDict() @@ -223,7 +251,7 @@ class DetectStructure(object): node = self.oeb.toc.add(text, _href, play_order=self.oeb.toc.next_play_order()) added[elem] = node - #node.add(_('Top'), _href) + # node.add(_('Top'), _href) if self.opts.level2_toc is not None and added: for elem in find_matches(self.opts.level2_toc, document.data): @@ -263,3 +291,4 @@ class DetectStructure(object): play_order=self.oeb.toc.next_play_order()) break +