From a6945bdadbc9a4022d45e07539a204945ed8c410 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 22 Apr 2010 07:54:21 -0600
Subject: [PATCH] Fix #5336 (Text within angle brackets disappears in .mobi
 books)

---
 src/calibre/__init__.py           | 36 ++++++++++++++++++++++++++----------
 src/calibre/ebooks/mobi/reader.py |  7 ++++++-
 2 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index d21fcc8e87..6d104650bc 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -399,38 +399,54 @@ def my_unichr(num):
     except ValueError:
         return u'?'
 
-def entity_to_unicode(match, exceptions=[], encoding='cp1252'):
+def entity_to_unicode(match, exceptions=[], encoding='cp1252',
+        result_exceptions={}):
     '''
     @param match: A match object such that '&'+match.group(1)';' is the entity.
     @param exceptions: A list of entities to not convert (Each entry is the name of the entity, for e.g. 'apos' or '#1234'
     @param encoding: The encoding to use to decode numeric entities between 128 and 256.
     If None, the Unicode UCS encoding is used. A common encoding is cp1252.
+    @param result_exceptions: A mapping from a converted character to the text
+    that is returned in its place, for e.g. {'<': '&lt;'}. Unlike exceptions,
+    it is applied to the result of the conversion, so one entry covers every
+    entity that resolves to that character (named, decimal or hexadecimal).
     '''
+    def check(ch):
+        return result_exceptions.get(ch, ch)
+
     ent = match.group(1)
     if ent in exceptions:
         return '&'+ent+';'
     if ent == 'apos':
-        return "'"
+        return check("'")
     if ent == 'hellips':
         ent = 'hellip'
-    if ent.startswith(u'#x'):
-        num = int(ent[2:], 16)
+    if ent.lower().startswith(u'#x'):
+        try:
+            num = int(ent[2:], 16)
+        except ValueError:
+            # Malformed hex entity: leave it untouched, like the decimal branch
+            return '&'+ent+';'
         if encoding is None or num > 255:
-            return my_unichr(num)
-        return chr(num).decode(encoding)
+            return check(my_unichr(num))
+        try:
+            return check(chr(num).decode(encoding))
+        except UnicodeDecodeError:
+            # Byte has no mapping in encoding: same fallback as decimal branch
+            return check(my_unichr(num))
     if ent.startswith(u'#'):
         try:
             num = int(ent[1:])
         except ValueError:
             return '&'+ent+';'
         if encoding is None or num > 255:
-            return my_unichr(num)
+            return check(my_unichr(num))
         try:
-            return chr(num).decode(encoding)
+            return check(chr(num).decode(encoding))
         except UnicodeDecodeError:
-            return my_unichr(num)
+            return check(my_unichr(num))
     try:
-        return my_unichr(name2codepoint[ent])
+        return check(my_unichr(name2codepoint[ent]))
     except KeyError:
         return '&'+ent+';'
 
diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py
index ace60673d7..1b266740d7 100644
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -303,7 +303,12 @@ class MobiReader(object):
         for pat in ENCODING_PATS:
             self.processed_html = pat.sub('', self.processed_html)
         e2u = functools.partial(entity_to_unicode,
-            exceptions=['lt', 'gt', 'amp', 'apos', 'quot', '#60', '#62'])
+            result_exceptions={
+                '<' : u'&lt;',
+                '>' : u'&gt;',
+                '&' : u'&amp;',
+                '"' : u'&quot;',
+                "'" : u'&apos;'})
         self.processed_html = re.sub(r'&(\S+?);', e2u,
             self.processed_html)
         self.extract_images(processed_records, output_dir)