remove br from top of page in chm conversion

2025-06-23 15:30:45 -04:00 · 2010-02-21 10:06:40 -08:00 · 2010-02-21 10:06:40 -08:00 · ac8ccceef8
commit ac8ccceef8
parent c91f022385
1 changed file with 18 additions and 1 deletion
--- a/src/calibre/ebooks/chm/input.py
+++ b/src/calibre/ebooks/chm/input.py
@ -11,7 +11,7 @@ from mimetypes import guess_type as guess_mimetype
 from htmlentitydefs import name2codepoint
 from pprint import PrettyPrinter

-from BeautifulSoup import BeautifulSoup
+from BeautifulSoup import BeautifulSoup, NavigableString
 from lxml import html, etree
 from pychm.chm import CHMFile
 from pychm.chmlib import (
@ -35,6 +35,17 @@ def match_string(s1, s2_already_lowered):
            return True
    return False

+def check_all_prev_empty(tag):
+    if tag is None:
+        return True
+    if tag.__class__ == NavigableString and not check_empty(tag):
+        return False
+    return check_all_prev_empty(tag.previousSibling)
+
+def check_empty(s, rex = re.compile(r'\S')):
+    return rex.search(s) is None
+
+
 def option_parser():
    parser = OptionParser(usage=_('%prog [options] mybook.chm'))
    parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output')
@ -160,6 +171,12 @@ class CHMReader(CHMFile):
                t[-1].extract()
        # for some very odd reason each page's content appears to be in a table
        # too. and this table has sub-tables for random asides... grr.
+        
+        # remove a leading <br> left at the top of the page once the nav bars have been removed
+        br = html('br')
+        if br:
+            if check_all_prev_empty(br[0].previousSibling):
+                br[0].extract()

        # some images seem to be broken in some chm's :/
        for img in html('img'):