mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
MOBI metadata: Strip XML unsafe unicode characters when reading metadata from MOBI files. Fixes #1175965 (E-book Viewer: 'NoneType' object is not iterable)
This commit is contained in:
parent
8e8efd61a9
commit
42989d47a3
@ -13,7 +13,7 @@ from calibre.utils.date import parse_date
|
|||||||
from calibre.ebooks.mobi import MobiError
|
from calibre.ebooks.mobi import MobiError
|
||||||
from calibre.ebooks.metadata import MetaInformation, check_isbn
|
from calibre.ebooks.metadata import MetaInformation, check_isbn
|
||||||
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
|
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
|
||||||
from calibre.utils.localization import canonicalize_lang
|
from calibre.utils.localization import canonicalize_lang
|
||||||
|
|
||||||
NULL_INDEX = 0xffffffff
|
NULL_INDEX = 0xffffffff
|
||||||
@ -83,22 +83,22 @@ class EXTHHeader(object): # {{{
|
|||||||
#else:
|
#else:
|
||||||
# print 'unknown record', idx, repr(content)
|
# print 'unknown record', idx, repr(content)
|
||||||
if title:
|
if title:
|
||||||
self.mi.title = replace_entities(clean_ascii_chars(title))
|
self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))
|
||||||
|
|
||||||
def process_metadata(self, idx, content, codec):
|
def process_metadata(self, idx, content, codec):
|
||||||
if idx == 100:
|
if idx == 100:
|
||||||
if self.mi.is_null('authors'):
|
if self.mi.is_null('authors'):
|
||||||
self.mi.authors = []
|
self.mi.authors = []
|
||||||
au = self.decode(content).strip()
|
au = clean_xml_chars(self.decode(content).strip())
|
||||||
self.mi.authors.append(au)
|
self.mi.authors.append(au)
|
||||||
if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
|
if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
|
||||||
self.mi.author_sort = au.strip()
|
self.mi.author_sort = au.strip()
|
||||||
elif idx == 101:
|
elif idx == 101:
|
||||||
self.mi.publisher = self.decode(content).strip()
|
self.mi.publisher = clean_xml_chars(self.decode(content).strip())
|
||||||
if self.mi.publisher in {'Unknown', _('Unknown')}:
|
if self.mi.publisher in {'Unknown', _('Unknown')}:
|
||||||
self.mi.publisher = None
|
self.mi.publisher = None
|
||||||
elif idx == 103:
|
elif idx == 103:
|
||||||
self.mi.comments = self.decode(content).strip()
|
self.mi.comments = clean_xml_chars(self.decode(content).strip())
|
||||||
elif idx == 104:
|
elif idx == 104:
|
||||||
raw = check_isbn(self.decode(content).strip().replace('-', ''))
|
raw = check_isbn(self.decode(content).strip().replace('-', ''))
|
||||||
if raw:
|
if raw:
|
||||||
@ -106,7 +106,7 @@ class EXTHHeader(object): # {{{
|
|||||||
elif idx == 105:
|
elif idx == 105:
|
||||||
if not self.mi.tags:
|
if not self.mi.tags:
|
||||||
self.mi.tags = []
|
self.mi.tags = []
|
||||||
self.mi.tags.extend([x.strip() for x in self.decode(content).split(';')])
|
self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
|
||||||
self.mi.tags = list(set(self.mi.tags))
|
self.mi.tags = list(set(self.mi.tags))
|
||||||
elif idx == 106:
|
elif idx == 106:
|
||||||
try:
|
try:
|
||||||
@ -114,7 +114,7 @@ class EXTHHeader(object): # {{{
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
elif idx == 108:
|
elif idx == 108:
|
||||||
self.mi.book_producer = self.decode(content).strip()
|
self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
|
||||||
elif idx == 112: # dc:source set in some EBSP amazon samples
|
elif idx == 112: # dc:source set in some EBSP amazon samples
|
||||||
try:
|
try:
|
||||||
content = content.decode(codec).strip()
|
content = content.decode(codec).strip()
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
from __future__ import with_statement
|
|
||||||
__license__ = 'GPL 3'
|
__license__ = 'GPL 3'
|
||||||
__copyright__ = '2010, sengian <sengian1@gmail.com>'
|
__copyright__ = '2010, sengian <sengian1@gmail.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re, htmlentitydefs
|
import re, htmlentitydefs
|
||||||
|
from future_builtins import map
|
||||||
|
|
||||||
_ascii_pat = None
|
_ascii_pat = None
|
||||||
|
|
||||||
@ -28,6 +28,12 @@ def clean_ascii_chars(txt, charlist=None):
|
|||||||
pat = re.compile(u'|'.join(map(unichr, charlist)))
|
pat = re.compile(u'|'.join(map(unichr, charlist)))
|
||||||
return pat.sub('', txt)
|
return pat.sub('', txt)
|
||||||
|
|
||||||
|
def clean_xml_chars(unicode_string):
    '''
    Return a copy of ``unicode_string`` with every character that is not
    permitted in an XML 1.0 document removed.

    The set of permitted characters follows the ``Char`` production of the
    XML 1.0 specification::

        #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]

    All bounds are inclusive. In particular the C0 control characters other
    than tab, line feed and carriage return are stripped, as are surrogate
    code points and the non-characters U+FFFE and U+FFFF.

    :param unicode_string: The unicode text to clean. May be empty.
    :return: A unicode string containing only XML-safe characters, in their
        original order.
    '''
    def allowed(ch):
        code = ord(ch)
        return (
            # Tab, LF and CR are the only C0 controls XML 1.0 accepts
            code in (0x9, 0xa, 0xd) or
            0x20 <= code <= 0xd7ff or
            # Skips the surrogate block 0xD800-0xDFFF
            0xe000 <= code <= 0xfffd or
            0x10000 <= code <= 0x10ffff
        )

    # A generator expression (rather than filter()) gives the same result
    # whether filter() returns a string, a list or an iterator.
    return u''.join(ch for ch in unicode_string if allowed(ch))
|
||||||
|
|
||||||
##
|
##
|
||||||
# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
|
# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
|
||||||
# Removes HTML or XML character references and entities from a text string.
|
# Removes HTML or XML character references and entities from a text string.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user