From 2cca07250bd28e5835a1b63ae8612f84ce454ca5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 13 Feb 2009 10:27:12 -0800 Subject: [PATCH] MOBI Input:Fix handling of numeric entities and convert empty
 <pre> tags to <div>
as they cause incorrect rendering in most HTML renderers --- src/calibre/ebooks/mobi/reader.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index bfbe8f5ae5..dbd326bfe6 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -186,6 +186,8 @@ class MobiReader(object): self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore') for pat in ENCODING_PATS: self.processed_html = pat.sub('', self.processed_html) + self.processed_html = re.sub(r'&(\S+?);', entity_to_unicode, + self.processed_html) self.extract_images(processed_records, output_dir) self.replace_page_breaks() self.cleanup_html() @@ -271,6 +273,8 @@ class MobiReader(object): for key in tag.attrib.keys(): tag.attrib.pop(key) continue + if tag.tag == 'pre' and not tag.text: + tag.tag = 'div' styles, attrib = [], tag.attrib if attrib.has_key('style'): style = attrib.pop('style').strip() @@ -451,6 +455,7 @@ class MobiReader(object): self.processed_html += self.mobi_html[pos:end] + (anchor % oend) pos = end self.processed_html += self.mobi_html[pos:] + def extract_images(self, processed_records, output_dir): if self.verbose: