From ac8ccceef86423fbacb500fb6f5da842cf785573 Mon Sep 17 00:00:00 2001
From: James Ralston <>
Date: Sun, 21 Feb 2010 10:06:40 -0800
Subject: [PATCH 1/5] remove br from top of page in chm conversion

---
 src/calibre/ebooks/chm/input.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py
index a2976c944a..784848929d 100644
--- a/src/calibre/ebooks/chm/input.py
+++ b/src/calibre/ebooks/chm/input.py
@@ -11,7 +11,7 @@ from mimetypes import guess_type as guess_mimetype
 from htmlentitydefs import name2codepoint
 from pprint import PrettyPrinter
 
-from BeautifulSoup import BeautifulSoup
+from BeautifulSoup import BeautifulSoup, NavigableString
 from lxml import html, etree
 from pychm.chm import CHMFile
 from pychm.chmlib import (
@@ -35,6 +35,17 @@ def match_string(s1, s2_already_lowered):
             return True
     return False
 
+def check_all_prev_empty(tag):
+    if tag is None:
+        return True
+    if tag.__class__ == NavigableString and not check_empty(tag):
+        return False
+    return check_all_prev_empty(tag.previousSibling)
+
+def check_empty(s, rex = re.compile(r'\S')):
+    return rex.search(s) is None
+
+
 def option_parser():
     parser = OptionParser(usage=_('%prog [options] mybook.chm'))
     parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output')
@@ -160,6 +171,12 @@ class CHMReader(CHMFile):
                 t[-1].extract()
         # for some very odd reason each page's content appears to be in a table
         # too. and this table has sub-tables for random asides... grr.
+        
+        # remove br at top of page if present after nav bars removed
+        br = html('br')
+        if br:
+            if check_all_prev_empty(br[0].previousSibling):
+                br[0].extract()
 
         # some images seem to be broken in some chm's :/
         for img in html('img'):

From 91a2881a0c3ede8982c451d2e9a198c371bef79e Mon Sep 17 00:00:00 2001
From: James Ralston <>
Date: Sun, 21 Feb 2010 11:01:12 -0800
Subject: [PATCH 2/5] strip br from top of page in chm conversion

---
 src/calibre/ebooks/chm/input.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py
index ecb54dffdb..3b08854532 100644
--- a/src/calibre/ebooks/chm/input.py
+++ b/src/calibre/ebooks/chm/input.py
@@ -4,11 +4,11 @@ __license__ = 'GPL v3'
 __copyright__  = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
                  ' and Alex Bramley <a.bramley at gmail.com>.'
 
-import os, shutil, uuid
+import os, shutil, uuid, re
 from tempfile import mkdtemp
 from mimetypes import guess_type as guess_mimetype
 
-from BeautifulSoup import BeautifulSoup
+from BeautifulSoup import BeautifulSoup, NavigableString
 from lxml import html
 from pychm.chm import CHMFile
 from pychm.chmlib import (
@@ -29,6 +29,17 @@ def match_string(s1, s2_already_lowered):
             return True
     return False
 
+def check_all_prev_empty(tag):
+    if tag is None:
+        return True
+    if tag.__class__ == NavigableString and not check_empty(tag):
+        return False
+    return check_all_prev_empty(tag.previousSibling)
+
+def check_empty(s, rex = re.compile(r'\S')):
+    return rex.search(s) is None
+
+
 def option_parser():
     parser = OptionParser(usage=_('%prog [options] mybook.chm'))
     parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output')
@@ -155,6 +166,12 @@ class CHMReader(CHMFile):
         # for some very odd reason each page's content appears to be in a table
         # too. and this table has sub-tables for random asides... grr.
 
+        # remove br at top of page if present after nav bars removed
+        br = soup('br')
+        if br:
+            if check_all_prev_empty(br[0].previousSibling):
+                br[0].extract()
+
         # some images seem to be broken in some chm's :/
         for img in soup('img'):
             try:

From 9f01f0b1264a9313699ff18606dd3550f40f304a Mon Sep 17 00:00:00 2001
From: James Ralston <>
Date: Sat, 6 Mar 2010 11:55:20 -0800
Subject: [PATCH 3/5] catch UnicodeDecodeError exception

---
 src/calibre/ebooks/chm/metadata.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py
index 7386d54658..0ce1f0b07f 100644
--- a/src/calibre/ebooks/chm/metadata.py
+++ b/src/calibre/ebooks/chm/metadata.py
@@ -15,7 +15,10 @@ from calibre.utils.logging import default_log
 from calibre.ptempfile import TemporaryFile
 
 def _clean(s):
-    return s.replace(u'\u00a0', u' ')
+    try:
+        return s.replace(u'\u00a0', u' ')
+    except UnicodeDecodeError:
+        return u""
 
 def _detag(tag):
     str = u""

From 043223eac6a682ba6559a7385c5d5a1ac8061e8d Mon Sep 17 00:00:00 2001
From: James Ralston <>
Date: Sun, 7 Mar 2010 22:03:08 -0800
Subject: [PATCH 4/5] renderContents as unicode

---
 src/calibre/ebooks/chm/metadata.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py
index 0ce1f0b07f..2f0c246d10 100644
--- a/src/calibre/ebooks/chm/metadata.py
+++ b/src/calibre/ebooks/chm/metadata.py
@@ -37,7 +37,7 @@ def _metadata_from_table(soup, searchfor):
     td = td.parent
     # there appears to be multiple ways of structuring the metadata
     # on the home page. cue some nasty special-case hacks...
-    if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(), flags=re.I):
+    if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(None), flags=re.I):
         meta = _detag(td.findNextSibling('td'))
         return re.sub('^:', '', meta).strip()
     else:
@@ -49,7 +49,7 @@ def _metadata_from_span(soup, searchfor):
     if span is None:
         return None
     # this metadata might need some cleaning up still :/
-    return _detag(span.renderContents().strip())
+    return _detag(span.renderContents(None).strip())
 
 def _get_authors(soup):
     aut = (_metadata_from_span(soup, r'author')

From 43d6a53d7b0de7acc70d7ce67e1eb7b62add8596 Mon Sep 17 00:00:00 2001
From: James Ralston <>
Date: Sun, 7 Mar 2010 22:21:55 -0800
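Subject: [PATCH 5/5] remove UnicodeDecodeError catch from _clean

Reverts the try/except added to _clean() in patch 3/5. Follow-up to
patch 4/5 (renderContents as unicode).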

---
 src/calibre/ebooks/chm/metadata.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py
index 2f0c246d10..d6a1d24024 100644
--- a/src/calibre/ebooks/chm/metadata.py
+++ b/src/calibre/ebooks/chm/metadata.py
@@ -15,10 +15,7 @@ from calibre.utils.logging import default_log
 from calibre.ptempfile import TemporaryFile
 
 def _clean(s):
-    try:
-        return s.replace(u'\u00a0', u' ')
-    except UnicodeDecodeError:
-        return u""
+    return s.replace(u'\u00a0', u' ')
 
 def _detag(tag):
     str = u""