MOBI metadata: Strip XML unsafe unicode characters when reading metadata from MOBI files. Fixes #1175965 (E-book Viewer: 'NoneType' object is not iterable)

This commit is contained in:
Kovid Goyal 2013-05-03 18:35:49 +05:30
parent 8e8efd61a9
commit 42989d47a3
2 changed files with 20 additions and 14 deletions

View File

@ -13,12 +13,12 @@ from calibre.utils.date import parse_date
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.metadata import MetaInformation, check_isbn
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from calibre.utils.localization import canonicalize_lang
NULL_INDEX = 0xffffffff
class EXTHHeader(object): # {{{
class EXTHHeader(object): # {{{
def __init__(self, raw, codec, title):
self.doctype = raw[:4]
@ -62,7 +62,7 @@ class EXTHHeader(object): # {{{
elif idx == 502:
# last update time
pass
elif idx == 503: # Long title
elif idx == 503: # Long title
# Amazon seems to regard this as the definitive book title
# rather than the title from the PDB header. In fact when
# sending MOBI files through Amazon's email service if the
@ -72,7 +72,7 @@ class EXTHHeader(object): # {{{
title = self.decode(content)
except:
pass
elif idx == 524: # Lang code
elif idx == 524: # Lang code
try:
lang = content.decode(codec)
lang = canonicalize_lang(lang)
@ -83,22 +83,22 @@ class EXTHHeader(object): # {{{
#else:
# print 'unknown record', idx, repr(content)
if title:
self.mi.title = replace_entities(clean_ascii_chars(title))
self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))
def process_metadata(self, idx, content, codec):
if idx == 100:
if self.mi.is_null('authors'):
self.mi.authors = []
au = self.decode(content).strip()
au = clean_xml_chars(self.decode(content).strip())
self.mi.authors.append(au)
if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
self.mi.author_sort = au.strip()
elif idx == 101:
self.mi.publisher = self.decode(content).strip()
self.mi.publisher = clean_xml_chars(self.decode(content).strip())
if self.mi.publisher in {'Unknown', _('Unknown')}:
self.mi.publisher = None
elif idx == 103:
self.mi.comments = self.decode(content).strip()
self.mi.comments = clean_xml_chars(self.decode(content).strip())
elif idx == 104:
raw = check_isbn(self.decode(content).strip().replace('-', ''))
if raw:
@ -106,7 +106,7 @@ class EXTHHeader(object): # {{{
elif idx == 105:
if not self.mi.tags:
self.mi.tags = []
self.mi.tags.extend([x.strip() for x in self.decode(content).split(';')])
self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
self.mi.tags = list(set(self.mi.tags))
elif idx == 106:
try:
@ -114,8 +114,8 @@ class EXTHHeader(object): # {{{
except:
pass
elif idx == 108:
self.mi.book_producer = self.decode(content).strip()
elif idx == 112: # dc:source set in some EBSP amazon samples
self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
elif idx == 112: # dc:source set in some EBSP amazon samples
try:
content = content.decode(codec).strip()
isig = 'urn:isbn:'
@ -131,7 +131,7 @@ class EXTHHeader(object): # {{{
self.mi.application_id = self.mi.uuid = cid
except:
pass
elif idx == 113: # ASIN or other id
elif idx == 113: # ASIN or other id
try:
self.uuid = content.decode('ascii')
self.mi.set_identifier('mobi-asin', self.uuid)
@ -242,7 +242,7 @@ class BookHeader(object):
# if cnt is 1 or less, fdst section number can be garbage
if self.fdstcnt <= 1:
self.fdstidx = NULL_INDEX
else: # Null values
else: # Null values
self.skelidx = self.dividx = self.othidx = self.fdstidx = \
NULL_INDEX

View File

@ -1,9 +1,9 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2010, sengian <sengian1@gmail.com>'
__docformat__ = 'restructuredtext en'
import re, htmlentitydefs
from future_builtins import map
_ascii_pat = None
@ -28,6 +28,12 @@ def clean_ascii_chars(txt, charlist=None):
pat = re.compile(u'|'.join(map(unichr, charlist)))
return pat.sub('', txt)
def clean_xml_chars(unicode_string):
    """Return *unicode_string* with all characters illegal in XML 1.0 removed.

    A character is kept only if it matches the XML 1.0 ``Char`` production:
    tab, line feed, carriage return, U+0020-U+D7FF, U+E000-U+FFFD or
    U+10000-U+10FFFF (all ranges inclusive). Everything else -- NUL, the
    remaining C0 control characters, surrogates, U+FFFE and U+FFFF -- is
    dropped.

    :param unicode_string: the text to filter
    :return: a unicode string containing only the permitted characters,
        in their original order
    """
    def allowed(ch):
        code = ord(ch)
        return (
            # The only control characters XML 1.0 permits.
            code in (0x9, 0xA, 0xD)
            # Range endpoints are themselves legal, hence inclusive bounds.
            or 0x20 <= code <= 0xD7FF
            or 0xE000 <= code <= 0xFFFD
            or 0x10000 <= code <= 0x10FFFF
        )
    return u''.join(ch for ch in unicode_string if allowed(ch))
##
# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
# Removes HTML or XML character references and entities from a text string.