From a6945bdadbc9a4022d45e07539a204945ed8c410 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 22 Apr 2010 07:54:21 -0600 Subject: [PATCH] Fix #5336 (Text within angle brackets disappears in .mobi books) --- src/calibre/__init__.py | 22 +++++++++++++--------- src/calibre/ebooks/mobi/reader.py | 7 ++++++- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index d21fcc8e87..6d104650bc 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -399,38 +399,42 @@ def my_unichr(num): except ValueError: return u'?' -def entity_to_unicode(match, exceptions=[], encoding='cp1252'): +def entity_to_unicode(match, exceptions=[], encoding='cp1252', + result_exceptions={}): ''' @param match: A match object such that '&'+match.group(1)';' is the entity. @param exceptions: A list of entities to not convert (Each entry is the name of the entity, for e.g. 'apos' or '#1234' @param encoding: The encoding to use to decode numeric entities between 128 and 256. If None, the Unicode UCS encoding is used. A common encoding is cp1252. 
''' + def check(ch): + return result_exceptions.get(ch, ch) + ent = match.group(1) if ent in exceptions: return '&'+ent+';' if ent == 'apos': - return "'" + return check("'") if ent == 'hellips': ent = 'hellip' - if ent.startswith(u'#x'): + if ent.lower().startswith(u'#x'): num = int(ent[2:], 16) if encoding is None or num > 255: - return my_unichr(num) - return chr(num).decode(encoding) + return check(my_unichr(num)) + return check(chr(num).decode(encoding)) if ent.startswith(u'#'): try: num = int(ent[1:]) except ValueError: return '&'+ent+';' if encoding is None or num > 255: - return my_unichr(num) + return check(my_unichr(num)) try: - return chr(num).decode(encoding) + return check(chr(num).decode(encoding)) except UnicodeDecodeError: - return my_unichr(num) + return check(my_unichr(num)) try: - return my_unichr(name2codepoint[ent]) + return check(my_unichr(name2codepoint[ent])) except KeyError: return '&'+ent+';' diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index ace60673d7..1b266740d7 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -303,7 +303,12 @@ class MobiReader(object): for pat in ENCODING_PATS: self.processed_html = pat.sub('', self.processed_html) e2u = functools.partial(entity_to_unicode, - exceptions=['lt', 'gt', 'amp', 'apos', 'quot', '#60', '#62']) + result_exceptions={ + '<' : u'&lt;', + '>' : u'&gt;', + '&' : u'&amp;', + '"' : u'&quot;', + "'" : u'&apos;'}) self.processed_html = re.sub(r'&(\S+?);', e2u, self.processed_html) self.extract_images(processed_records, output_dir)