MOBI metadata: Strip XML-unsafe Unicode characters when reading metadata from MOBI files. Fixes #1175965 (E-book Viewer: 'NoneType' object is not iterable)

2025-11-05 12:03:03 -05:00 · 2013-05-03 18:35:49 +05:30 · 2013-05-03 18:35:49 +05:30 · 42989d47a3
commit 42989d47a3
parent 8e8efd61a9
2 changed files with 20 additions and 14 deletions
--- a/src/calibre/ebooks/mobi/reader/headers.py
+++ b/src/calibre/ebooks/mobi/reader/headers.py
@ -13,7 +13,7 @@ from calibre.utils.date import parse_date
 from calibre.ebooks.mobi import MobiError
 from calibre.ebooks.metadata import MetaInformation, check_isbn
 from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
-from calibre.utils.cleantext import clean_ascii_chars
+from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
 from calibre.utils.localization import canonicalize_lang
 NULL_INDEX = 0xffffffff
@ -83,22 +83,22 @@ class EXTHHeader(object): # {{{
            #else:
            #    print 'unknown record', idx, repr(content)
        if title:
-            self.mi.title = replace_entities(clean_ascii_chars(title))
+            self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))
    def process_metadata(self, idx, content, codec):
        if idx == 100:
            if self.mi.is_null('authors'):
                self.mi.authors = []
-            au = self.decode(content).strip()
+            au = clean_xml_chars(self.decode(content).strip())
            self.mi.authors.append(au)
            if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
                self.mi.author_sort = au.strip()
        elif idx == 101:
-            self.mi.publisher = self.decode(content).strip()
+            self.mi.publisher = clean_xml_chars(self.decode(content).strip())
            if self.mi.publisher in {'Unknown', _('Unknown')}:
                self.mi.publisher = None
        elif idx == 103:
-            self.mi.comments  = self.decode(content).strip()
+            self.mi.comments  = clean_xml_chars(self.decode(content).strip())
        elif idx == 104:
            raw = check_isbn(self.decode(content).strip().replace('-', ''))
            if raw:
@ -106,7 +106,7 @@ class EXTHHeader(object): # {{{
        elif idx == 105:
            if not self.mi.tags:
                self.mi.tags = []
-            self.mi.tags.extend([x.strip() for x in self.decode(content).split(';')])
+            self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
            self.mi.tags = list(set(self.mi.tags))
        elif idx == 106:
            try:
@ -114,7 +114,7 @@ class EXTHHeader(object): # {{{
            except:
                pass
        elif idx == 108:
-            self.mi.book_producer = self.decode(content).strip()
+            self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
        elif idx == 112:  # dc:source set in some EBSP amazon samples
            try:
                content = content.decode(codec).strip()
--- a/src/calibre/utils/cleantext.py
+++ b/src/calibre/utils/cleantext.py
@ -1,9 +1,9 @@
 from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2010, sengian <sengian1@gmail.com>'
 __docformat__ = 'restructuredtext en'
 import re, htmlentitydefs
 from future_builtins import map
 _ascii_pat = None
@ -28,6 +28,12 @@ def clean_ascii_chars(txt, charlist=None):
        pat = re.compile(u'|'.join(map(unichr, charlist)))
    return pat.sub('', txt)
 def clean_xml_chars(unicode_string):
     '''
     Remove from ``unicode_string`` every character that XML 1.0 does not
     allow in a document and return the result as a new unicode string.

     The allowed set is the ``Char`` production of the XML 1.0
     specification: TAB, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD and
     U+10000-U+10FFFF, with every range bound inclusive. NUL, the other C0
     control characters, surrogate code points, U+FFFE and U+FFFF are
     dropped.

     :param unicode_string: The text to clean; it is examined one character
                            at a time.
     :return: A unicode string holding only the allowed characters, in
              their original order.
     '''
     def allowed(x):
         x = ord(x)
         return (
             # The only C0 control characters XML permits are TAB, LF and CR
             x in (0x9, 0xa, 0xd) or
             0x20 <= x <= 0xd7ff or
             # 0xd800-0xdfff are surrogates and are never valid on their own
             0xe000 <= x <= 0xfffd or
             0x10000 <= x <= 0x10ffff
         )
     return u''.join(ch for ch in unicode_string if allowed(ch))
 ##
 # Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
 # Removes HTML or XML character references and entities from a text string.