Don't add page breaks for chapters at the start of the file

PDF Output: Fix extra blank page being inserted at the start of the chapter when converting some epub files from feedbooks
2025-07-09 03:04:10 -04:00 · 2013-07-18 11:34:39 +05:30 · 2013-07-18 11:34:39 +05:30 · 5542dcfbb3
commit 5542dcfbb3
parent 0c5959f298
1 changed files with 38 additions and 9 deletions
--- a/src/calibre/ebooks/oeb/transforms/structure.py
+++ b/src/calibre/ebooks/oeb/transforms/structure.py
@ -10,7 +10,7 @@ import re, uuid

 from lxml import etree
 from urlparse import urlparse
-from collections import OrderedDict
+from collections import OrderedDict, Counter

 from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text, barename
 from calibre.ebooks import ConversionError
@ -22,6 +22,26 @@ def XPath(x):
        raise ConversionError(
        'The syntax of the XPath expression %s is invalid.' % repr(x))

+def isspace(x):
+    return not x or x.replace(u'\xa0', u'').isspace()
+
+def at_start(elem):
+    ' Return True if there is no content before elem '
+    body = XPath('ancestor-or-self::h:body')(elem)
+    if not body:
+        return True
+    body = body[0]
+    ancestors = frozenset(XPath('ancestor::*')(elem))
+    for x in body.iter():
+        if x is elem:
+            return True
+        if getattr(x, 'tag', None) and x.tag.rpartition('}')[-1] in {'img', 'svg'}:
+            return False
+        if isspace(getattr(x, 'text', None)) and (x in ancestors or isspace(getattr(x, 'tail', None))):
+            continue
+        return False
+    return False
+
 class DetectStructure(object):

    def __call__(self, oeb, opts):
@ -51,7 +71,7 @@ class DetectStructure(object):
            regexp = re.compile(opts.toc_filter)
            for node in list(self.oeb.toc.iter()):
                if not node.title or regexp.search(node.title) is not None:
-                    self.log('Filtering', node.title if node.title else\
+                    self.log('Filtering', node.title if node.title else
                            'empty node', 'from TOC')
                    self.oeb.toc.remove(node)

@ -92,7 +112,8 @@ class DetectStructure(object):
                'Invalid start reading at XPath expression, ignoring: %s'%expr)
            return
        for item in self.oeb.spine:
-            if not hasattr(item.data, 'xpath'): continue
+            if not hasattr(item.data, 'xpath'):
+                continue
            matches = expr(item.data)
            if matches:
                elem = matches[0]
@ -129,17 +150,27 @@ class DetectStructure(object):
            chapter_mark = self.opts.chapter_mark
            page_break_before = 'display: block; page-break-before: always'
            page_break_after = 'display: block; page-break-after: always'
+            c = Counter()
            for item, elem in self.detected_chapters:
+                c[item] += 1
                text = xml2text(elem).strip()
                text = re.sub(r'\s+', ' ', text.strip())
                self.log('\tDetected chapter:', text[:50])
                if chapter_mark == 'none':
                    continue
-                elif chapter_mark == 'rule':
+                if chapter_mark == 'rule':
                    mark = etree.Element(XHTML('hr'))
                elif chapter_mark == 'pagebreak':
+                    if c[item] < 3 and at_start(elem):
+                        # For the first two elements in this item, check if they
+                        # are at the start of the file, in which case inserting a
+                        # page break is unnecessary and can lead to extra blank
+                        # pages in the PDF Output plugin. We need to use two as
+                        # feedbooks epubs match both a heading tag and its
+                        # containing div with the default chapter expression.
+                        continue
                    mark = etree.Element(XHTML('div'), style=page_break_after)
-                else: # chapter_mark == 'both':
+                else:  # chapter_mark == 'both':
                    mark = etree.Element(XHTML('hr'), style=page_break_before)
                try:
                    elem.addprevious(mark)
@ -182,8 +213,6 @@ class DetectStructure(object):
                            self.log('Maximum TOC links reached, stopping.')
                            return

-
-
    def elem_to_link(self, item, elem, counter):
        text = xml2text(elem).strip()
        if not text:
@ -197,7 +226,6 @@ class DetectStructure(object):
        href = '#'.join((item.href, id))
        return text, href

-
    def add_leveled_toc_items(self):
        added = OrderedDict()
        added2 = OrderedDict()
@ -223,7 +251,7 @@ class DetectStructure(object):
                    node = self.oeb.toc.add(text, _href,
                            play_order=self.oeb.toc.next_play_order())
                    added[elem] = node
-                    #node.add(_('Top'), _href)
+                    # node.add(_('Top'), _href)

            if self.opts.level2_toc is not None and added:
                for elem in find_matches(self.opts.level2_toc, document.data):
@ -263,3 +291,4 @@ class DetectStructure(object):
                                        play_order=self.oeb.toc.next_play_order())
                                break

+