From 42989d47a3317a2b157fa3814c74b98ec93a94e9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 3 May 2013 18:35:49 +0530 Subject: [PATCH] MOBI metadata: Strip XML unsafe unicode characters when reading metadata from MOBI files. Fixes #1175965 (E-book Viewer: 'NoneType' object is not iterable) --- src/calibre/ebooks/mobi/reader/headers.py | 26 +++++++++++------------ src/calibre/utils/cleantext.py | 8 ++++++- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py index 8e96475e36..b5b55b2ba0 100644 --- a/src/calibre/ebooks/mobi/reader/headers.py +++ b/src/calibre/ebooks/mobi/reader/headers.py @@ -13,12 +13,12 @@ from calibre.utils.date import parse_date from calibre.ebooks.mobi import MobiError from calibre.ebooks.metadata import MetaInformation, check_isbn from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana -from calibre.utils.cleantext import clean_ascii_chars +from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars from calibre.utils.localization import canonicalize_lang NULL_INDEX = 0xffffffff -class EXTHHeader(object): # {{{ +class EXTHHeader(object): # {{{ def __init__(self, raw, codec, title): self.doctype = raw[:4] @@ -62,7 +62,7 @@ class EXTHHeader(object): # {{{ elif idx == 502: # last update time pass - elif idx == 503: # Long title + elif idx == 503: # Long title # Amazon seems to regard this as the definitive book title # rather than the title from the PDB header. 
In fact when # sending MOBI files through Amazon's email service if the @@ -72,7 +72,7 @@ class EXTHHeader(object): # {{{ title = self.decode(content) except: pass - elif idx == 524: # Lang code + elif idx == 524: # Lang code try: lang = content.decode(codec) lang = canonicalize_lang(lang) @@ -83,22 +83,22 @@ class EXTHHeader(object): # {{{ #else: # print 'unknown record', idx, repr(content) if title: - self.mi.title = replace_entities(clean_ascii_chars(title)) + self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title))) def process_metadata(self, idx, content, codec): if idx == 100: if self.mi.is_null('authors'): self.mi.authors = [] - au = self.decode(content).strip() + au = clean_xml_chars(self.decode(content).strip()) self.mi.authors.append(au) if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()): self.mi.author_sort = au.strip() elif idx == 101: - self.mi.publisher = self.decode(content).strip() + self.mi.publisher = clean_xml_chars(self.decode(content).strip()) if self.mi.publisher in {'Unknown', _('Unknown')}: self.mi.publisher = None elif idx == 103: - self.mi.comments = self.decode(content).strip() + self.mi.comments = clean_xml_chars(self.decode(content).strip()) elif idx == 104: raw = check_isbn(self.decode(content).strip().replace('-', '')) if raw: @@ -106,7 +106,7 @@ class EXTHHeader(object): # {{{ elif idx == 105: if not self.mi.tags: self.mi.tags = [] - self.mi.tags.extend([x.strip() for x in self.decode(content).split(';')]) + self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')]) self.mi.tags = list(set(self.mi.tags)) elif idx == 106: try: @@ -114,8 +114,8 @@ class EXTHHeader(object): # {{{ except: pass elif idx == 108: - self.mi.book_producer = self.decode(content).strip() - elif idx == 112: # dc:source set in some EBSP amazon samples + self.mi.book_producer = clean_xml_chars(self.decode(content).strip()) + elif idx == 112: # dc:source set in some EBSP amazon samples 
def clean_xml_chars(unicode_string):
    """Return *unicode_string* with every character XML 1.0 forbids removed.

    A character is kept only if its code point matches the XML 1.0 ``Char``
    production::

        #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]

    Everything else is dropped: NUL, the remaining C0 control characters,
    surrogate code points and the non-characters U+FFFE / U+FFFF.

    :param unicode_string: the unicode text to clean; may be empty.
    :return: a new unicode string containing only the allowed characters,
        in their original order.
    """
    # NOTE(review): on a narrow Python 2 build, characters above U+FFFF are
    # presumably stored as surrogate pairs and would therefore be removed
    # here, one surrogate at a time -- confirm this is acceptable.
    def allowed(ch):
        code = ord(ch)
        return (
            # Tab, line feed and carriage return are the only legal
            # characters below U+0020.
            code in (0x9, 0xA, 0xD) or
            # All bounds are inclusive: U+D7FF, U+E000, U+FFFD, U+10000 and
            # U+10FFFF are themselves valid XML characters.
            0x20 <= code <= 0xD7FF or
            0xE000 <= code <= 0xFFFD or
            0x10000 <= code <= 0x10FFFF
        )

    return u''.join(ch for ch in unicode_string if allowed(ch))