From ac8ccceef86423fbacb500fb6f5da842cf785573 Mon Sep 17 00:00:00 2001
From: James Ralston <>
Date: Sun, 21 Feb 2010 10:06:40 -0800
Subject: [PATCH] remove br from top of page in chm conversion

---
 src/calibre/ebooks/chm/input.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py
index a2976c944a..784848929d 100644
--- a/src/calibre/ebooks/chm/input.py
+++ b/src/calibre/ebooks/chm/input.py
@@ -11,7 +11,7 @@ from mimetypes import guess_type as guess_mimetype
 from htmlentitydefs import name2codepoint
 from pprint import PrettyPrinter
 
-from BeautifulSoup import BeautifulSoup
+from BeautifulSoup import BeautifulSoup, NavigableString
 from lxml import html, etree
 from pychm.chm import CHMFile
 from pychm.chmlib import (
@@ -35,6 +35,17 @@ def match_string(s1, s2_already_lowered):
             return True
     return False
 
+def check_all_prev_empty(tag):
+    if tag is None:
+        return True
+    if tag.__class__ == NavigableString and not check_empty(tag):
+        return False
+    return check_all_prev_empty(tag.previousSibling)
+
+def check_empty(s, rex = re.compile(r'\S')):
+    return rex.search(s) is None
+
+
 def option_parser():
     parser = OptionParser(usage=_('%prog [options] mybook.chm'))
     parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output')
@@ -160,6 +171,12 @@ class CHMReader(CHMFile):
                 t[-1].extract()
         # for some very odd reason each page's content appears to be in a table
         # too. and this table has sub-tables for random asides... grr.
+
+        # remove br at top of page if present after nav bars removed
+        br = html('br')
+        if br:
+            if check_all_prev_empty(br[0].previousSibling):
+                br[0].extract()
 
         # some images seem to be broken in some chm's :/
         for img in html('img'):