From 42989d47a3317a2b157fa3814c74b98ec93a94e9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 3 May 2013 18:35:49 +0530
Subject: [PATCH] MOBI metadata: Strip XML unsafe unicode characters when
 reading metadata from MOBI files. Fixes #1175965 (E-book Viewer: 'NoneType'
 object is not iterable)

---
 src/calibre/ebooks/mobi/reader/headers.py | 26 +++++++++++------------
 src/calibre/utils/cleantext.py            |  8 ++++++-
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py
index 8e96475e36..b5b55b2ba0 100644
--- a/src/calibre/ebooks/mobi/reader/headers.py
+++ b/src/calibre/ebooks/mobi/reader/headers.py
@@ -13,12 +13,12 @@ from calibre.utils.date import parse_date
 from calibre.ebooks.mobi import MobiError
 from calibre.ebooks.metadata import MetaInformation, check_isbn
 from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
-from calibre.utils.cleantext import clean_ascii_chars
+from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
 from calibre.utils.localization import canonicalize_lang
 
 NULL_INDEX = 0xffffffff
 
-class EXTHHeader(object): # {{{
+class EXTHHeader(object):  # {{{
 
     def __init__(self, raw, codec, title):
         self.doctype = raw[:4]
@@ -62,7 +62,7 @@ class EXTHHeader(object): # {{{
             elif idx == 502:
                 # last update time
                 pass
-            elif idx == 503: # Long title
+            elif idx == 503:  # Long title
                 # Amazon seems to regard this as the definitive book title
                 # rather than the title from the PDB header. In fact when
                 # sending MOBI files through Amazon's email service if the
@@ -72,7 +72,7 @@ class EXTHHeader(object): # {{{
                     title = self.decode(content)
                 except:
                     pass
-            elif idx == 524: # Lang code
+            elif idx == 524:  # Lang code
                 try:
                     lang = content.decode(codec)
                     lang = canonicalize_lang(lang)
@@ -83,22 +83,22 @@ class EXTHHeader(object): # {{{
             #else:
             #    print 'unknown record', idx, repr(content)
         if title:
-            self.mi.title = replace_entities(clean_ascii_chars(title))
+            self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))
 
     def process_metadata(self, idx, content, codec):
         if idx == 100:
             if self.mi.is_null('authors'):
                 self.mi.authors = []
-            au = self.decode(content).strip()
+            au = clean_xml_chars(self.decode(content).strip())
             self.mi.authors.append(au)
             if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
                 self.mi.author_sort = au.strip()
         elif idx == 101:
-            self.mi.publisher = self.decode(content).strip()
+            self.mi.publisher = clean_xml_chars(self.decode(content).strip())
             if self.mi.publisher in {'Unknown', _('Unknown')}:
                 self.mi.publisher = None
         elif idx == 103:
-            self.mi.comments  = self.decode(content).strip()
+            self.mi.comments  = clean_xml_chars(self.decode(content).strip())
         elif idx == 104:
             raw = check_isbn(self.decode(content).strip().replace('-', ''))
             if raw:
@@ -106,7 +106,7 @@ class EXTHHeader(object): # {{{
         elif idx == 105:
             if not self.mi.tags:
                 self.mi.tags = []
-            self.mi.tags.extend([x.strip() for x in self.decode(content).split(';')])
+            self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
             self.mi.tags = list(set(self.mi.tags))
         elif idx == 106:
             try:
@@ -114,8 +114,8 @@ class EXTHHeader(object): # {{{
             except:
                 pass
         elif idx == 108:
-            self.mi.book_producer = self.decode(content).strip()
-        elif idx == 112: # dc:source set in some EBSP amazon samples
+            self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
+        elif idx == 112:  # dc:source set in some EBSP amazon samples
             try:
                 content = content.decode(codec).strip()
                 isig = 'urn:isbn:'
@@ -131,7 +131,7 @@ class EXTHHeader(object): # {{{
                         self.mi.application_id = self.mi.uuid = cid
             except:
                 pass
-        elif idx == 113: # ASIN or other id
+        elif idx == 113:  # ASIN or other id
             try:
                 self.uuid = content.decode('ascii')
                 self.mi.set_identifier('mobi-asin', self.uuid)
@@ -242,7 +242,7 @@ class BookHeader(object):
                 # if cnt is 1 or less, fdst section number can be garbage
                 if self.fdstcnt <= 1:
                     self.fdstidx = NULL_INDEX
-            else: # Null values
+            else:  # Null values
                 self.skelidx = self.dividx = self.othidx = self.fdstidx = \
                         NULL_INDEX
 
diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py
index 27e667612e..219199815e 100644
--- a/src/calibre/utils/cleantext.py
+++ b/src/calibre/utils/cleantext.py
@@ -1,9 +1,9 @@
-from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2010, sengian <sengian1@gmail.com>'
 __docformat__ = 'restructuredtext en'
 
 import re, htmlentitydefs
+from future_builtins import map
 
 _ascii_pat = None
 
@@ -28,6 +28,12 @@ def clean_ascii_chars(txt, charlist=None):
         pat = re.compile(u'|'.join(map(unichr, charlist)))
     return pat.sub('', txt)
 
+def clean_xml_chars(unicode_string):  # Return unicode_string with every code point not allowed by the XML 1.0 'Char' production removed
+    def allowed(x):  # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+        x = ord(x)  # bounds below are inclusive; C0 controls other than TAB/LF/CR are rejected
+        return x in (0x9, 0xa, 0xd) or (0x20 <= x <= 0xd7ff) or (0xe000 <= x <= 0xfffd) or (0x10000 <= x <= 0x10ffff)
+    return u''.join(filter(allowed, unicode_string))
+
 ##
 # Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
 # Removes HTML or XML character references and entities from a text string.