MOBI metadata: Strip XML unsafe unicode characters when reading metadata from MOBI files. Fixes #1175965 (E-book Viewer: 'NoneType' object is not iterable)

This commit is contained in:
Kovid Goyal 2013-05-03 18:35:49 +05:30
parent 8e8efd61a9
commit 42989d47a3
2 changed files with 20 additions and 14 deletions

View File

@ -13,7 +13,7 @@ from calibre.utils.date import parse_date
from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi import MobiError
from calibre.ebooks.metadata import MetaInformation, check_isbn from calibre.ebooks.metadata import MetaInformation, check_isbn
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from calibre.utils.localization import canonicalize_lang from calibre.utils.localization import canonicalize_lang
NULL_INDEX = 0xffffffff NULL_INDEX = 0xffffffff
@ -83,22 +83,22 @@ class EXTHHeader(object): # {{{
#else: #else:
# print 'unknown record', idx, repr(content) # print 'unknown record', idx, repr(content)
if title: if title:
self.mi.title = replace_entities(clean_ascii_chars(title)) self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))
def process_metadata(self, idx, content, codec): def process_metadata(self, idx, content, codec):
if idx == 100: if idx == 100:
if self.mi.is_null('authors'): if self.mi.is_null('authors'):
self.mi.authors = [] self.mi.authors = []
au = self.decode(content).strip() au = clean_xml_chars(self.decode(content).strip())
self.mi.authors.append(au) self.mi.authors.append(au)
if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()): if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
self.mi.author_sort = au.strip() self.mi.author_sort = au.strip()
elif idx == 101: elif idx == 101:
self.mi.publisher = self.decode(content).strip() self.mi.publisher = clean_xml_chars(self.decode(content).strip())
if self.mi.publisher in {'Unknown', _('Unknown')}: if self.mi.publisher in {'Unknown', _('Unknown')}:
self.mi.publisher = None self.mi.publisher = None
elif idx == 103: elif idx == 103:
self.mi.comments = self.decode(content).strip() self.mi.comments = clean_xml_chars(self.decode(content).strip())
elif idx == 104: elif idx == 104:
raw = check_isbn(self.decode(content).strip().replace('-', '')) raw = check_isbn(self.decode(content).strip().replace('-', ''))
if raw: if raw:
@ -106,7 +106,7 @@ class EXTHHeader(object): # {{{
elif idx == 105: elif idx == 105:
if not self.mi.tags: if not self.mi.tags:
self.mi.tags = [] self.mi.tags = []
self.mi.tags.extend([x.strip() for x in self.decode(content).split(';')]) self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
self.mi.tags = list(set(self.mi.tags)) self.mi.tags = list(set(self.mi.tags))
elif idx == 106: elif idx == 106:
try: try:
@ -114,7 +114,7 @@ class EXTHHeader(object): # {{{
except: except:
pass pass
elif idx == 108: elif idx == 108:
self.mi.book_producer = self.decode(content).strip() self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
elif idx == 112: # dc:source set in some EBSP amazon samples elif idx == 112: # dc:source set in some EBSP amazon samples
try: try:
content = content.decode(codec).strip() content = content.decode(codec).strip()

View File

@ -1,9 +1,9 @@
from __future__ import with_statement
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2010, sengian <sengian1@gmail.com>' __copyright__ = '2010, sengian <sengian1@gmail.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re, htmlentitydefs import re, htmlentitydefs
from future_builtins import map
_ascii_pat = None _ascii_pat = None
@ -28,6 +28,12 @@ def clean_ascii_chars(txt, charlist=None):
pat = re.compile(u'|'.join(map(unichr, charlist))) pat = re.compile(u'|'.join(map(unichr, charlist)))
return pat.sub('', txt) return pat.sub('', txt)
def clean_xml_chars(unicode_string):
    """Return ``unicode_string`` with all characters illegal in XML 1.0 removed.

    A character is kept only if it matches the XML 1.0 ``Char`` production:
    tab (U+0009), line feed (U+000A), carriage return (U+000D), or a code
    point in one of the inclusive ranges U+0020-U+D7FF, U+E000-U+FFFD,
    U+10000-U+10FFFF. Everything else (NUL and the other C0 control
    characters, surrogate code units, U+FFFE and U+FFFF) is dropped.

    :param unicode_string: the unicode text to filter; it is iterated one
        character at a time.
    :return: a new unicode string containing only the allowed characters,
        in their original order.
    """
    def allowed(ch):
        code = ord(ch)
        return (
            # The only three C0 control characters XML 1.0 permits.
            code in (0x9, 0xA, 0xD)
            # Range ends are inclusive: U+D7FF, U+E000, U+FFFD, U+10000 and
            # U+10FFFF are all legal XML characters and must be preserved.
            or 0x20 <= code <= 0xD7FF
            or 0xE000 <= code <= 0xFFFD
            or 0x10000 <= code <= 0x10FFFF
        )

    return u''.join(ch for ch in unicode_string if allowed(ch))
## ##
# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html # Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
# Removes HTML or XML character references and entities from a text string. # Removes HTML or XML character references and entities from a text string.