From ac8ccceef86423fbacb500fb6f5da842cf785573 Mon Sep 17 00:00:00 2001 From: James Ralston <> Date: Sun, 21 Feb 2010 10:06:40 -0800 Subject: [PATCH 1/5] remove br from top of page in chm conversion --- src/calibre/ebooks/chm/input.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py index a2976c944a..784848929d 100644 --- a/src/calibre/ebooks/chm/input.py +++ b/src/calibre/ebooks/chm/input.py @@ -11,7 +11,7 @@ from mimetypes import guess_type as guess_mimetype from htmlentitydefs import name2codepoint from pprint import PrettyPrinter -from BeautifulSoup import BeautifulSoup +from BeautifulSoup import BeautifulSoup, NavigableString from lxml import html, etree from pychm.chm import CHMFile from pychm.chmlib import ( @@ -35,6 +35,17 @@ def match_string(s1, s2_already_lowered): return True return False +def check_all_prev_empty(tag): + if tag is None: + return True + if tag.__class__ == NavigableString and not check_empty(tag): + return False + return check_all_prev_empty(tag.previousSibling) + +def check_empty(s, rex = re.compile(r'\S')): + return rex.search(s) is None + + def option_parser(): parser = OptionParser(usage=_('%prog [options] mybook.chm')) parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output') @@ -160,6 +171,12 @@ class CHMReader(CHMFile): t[-1].extract() # for some very odd reason each page's content appears to be in a table # too. and this table has sub-tables for random asides... grr. + + # remove br at top of page if present after nav bars removed + br = html('br') + if br: + if check_all_prev_empty(br[0].previousSibling): + br[0].extract() # some images seem to be broken in some chm's :/ for img in html('img'): From 91a2881a0c3ede8982c451d2e9a198c371bef79e Mon Sep 17 00:00:00 2001 From: James Ralston <> Date: Sun, 21 Feb 2010 11:01:12 -0800 Subject: [PATCH 2/5] strip br from top of page in chm conversion --- src/calibre/ebooks/chm/input.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py index ecb54dffdb..3b08854532 100644 --- a/src/calibre/ebooks/chm/input.py +++ b/src/calibre/ebooks/chm/input.py @@ -4,11 +4,11 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ,' \ ' and Alex Bramley .' -import os, shutil, uuid +import os, shutil, uuid, re from tempfile import mkdtemp from mimetypes import guess_type as guess_mimetype -from BeautifulSoup import BeautifulSoup +from BeautifulSoup import BeautifulSoup, NavigableString from lxml import html from pychm.chm import CHMFile from pychm.chmlib import ( @@ -29,6 +29,17 @@ def match_string(s1, s2_already_lowered): return True return False +def check_all_prev_empty(tag): + if tag is None: + return True + if tag.__class__ == NavigableString and not check_empty(tag): + return False + return check_all_prev_empty(tag.previousSibling) + +def check_empty(s, rex = re.compile(r'\S')): + return rex.search(s) is None + + def option_parser(): parser = OptionParser(usage=_('%prog [options] mybook.chm')) parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output') @@ -155,6 +166,12 @@ class CHMReader(CHMFile): # for some very odd reason each page's content appears to be in a table # too. and this table has sub-tables for random asides... grr. + # remove br at top of page if present after nav bars removed + br = soup('br') + if br: + if check_all_prev_empty(br[0].previousSibling): + br[0].extract() + # some images seem to be broken in some chm's :/ for img in soup('img'): try: From 9f01f0b1264a9313699ff18606dd3550f40f304a Mon Sep 17 00:00:00 2001 From: James Ralston <> Date: Sat, 6 Mar 2010 11:55:20 -0800 Subject: [PATCH 3/5] catch UnicodeDecodeError exception --- src/calibre/ebooks/chm/metadata.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py index 7386d54658..0ce1f0b07f 100644 --- a/src/calibre/ebooks/chm/metadata.py +++ b/src/calibre/ebooks/chm/metadata.py @@ -15,7 +15,10 @@ from calibre.utils.logging import default_log from calibre.ptempfile import TemporaryFile def _clean(s): - return s.replace(u'\u00a0', u' ') + try: + return s.replace(u'\u00a0', u' ') + except UnicodeDecodeError: + return u"" def _detag(tag): str = u"" From 043223eac6a682ba6559a7385c5d5a1ac8061e8d Mon Sep 17 00:00:00 2001 From: James Ralston <> Date: Sun, 7 Mar 2010 22:03:08 -0800 Subject: [PATCH 4/5] renderContents as unicode --- src/calibre/ebooks/chm/metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py index 0ce1f0b07f..2f0c246d10 100644 --- a/src/calibre/ebooks/chm/metadata.py +++ b/src/calibre/ebooks/chm/metadata.py @@ -37,7 +37,7 @@ def _metadata_from_table(soup, searchfor): td = td.parent # there appears to be multiple ways of structuring the metadata # on the home page. cue some nasty special-case hacks... - if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(), flags=re.I): + if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(None), flags=re.I): meta = _detag(td.findNextSibling('td')) return re.sub('^:', '', meta).strip() else: @@ -49,7 +49,7 @@ def _metadata_from_span(soup, searchfor): if span is None: return None # this metadata might need some cleaning up still :/ - return _detag(span.renderContents().strip()) + return _detag(span.renderContents(None).strip()) def _get_authors(soup): aut = (_metadata_from_span(soup, r'author') From 43d6a53d7b0de7acc70d7ce67e1eb7b62add8596 Mon Sep 17 00:00:00 2001 From: James Ralston <> Date: Sun, 7 Mar 2010 22:21:55 -0800 Subject: [PATCH 5/5] renderContents as unicode --- src/calibre/ebooks/chm/metadata.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py index 2f0c246d10..d6a1d24024 100644 --- a/src/calibre/ebooks/chm/metadata.py +++ b/src/calibre/ebooks/chm/metadata.py @@ -15,10 +15,7 @@ from calibre.utils.logging import default_log from calibre.ptempfile import TemporaryFile def _clean(s): - try: - return s.replace(u'\u00a0', u' ') - except UnicodeDecodeError: - return u"" + return s.replace(u'\u00a0', u' ') def _detag(tag): str = u""