Strip XML-illegal characters from OPF metadata attributes.

metadata_to_opf() now retries makeelement() with every attribute value
passed through clean_xml_chars() when the first attempt raises ValueError.

In cleantext.py, allowed() is hoisted to module level. It now rejects the
C0 control characters other than tab, LF and CR, rejects DEL, and treats
the end points of each accepted range as inclusive so that U+D7FF, U+E000,
U+FFFD, U+10000 and U+10FFFF are no longer stripped.

NOTE(review): this patch was recovered from a paste whose line breaks and
indentation had been collapsed. Leading whitespace on context lines is a
reconstruction; check it against the tree, or apply with
"git apply --ignore-whitespace". The post-image blob id on the cleantext.py
index line predates the inclusive-bounds change and is therefore stale.

diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py
index 395fe0c484..fb80cc8bfe 100644
--- a/src/calibre/ebooks/metadata/opf2.py
+++ b/src/calibre/ebooks/metadata/opf2.py
@@ -21,7 +21,7 @@ from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.date import parse_date, isoformat
 from calibre.utils.localization import get_lang, canonicalize_lang
 from calibre import prints, guess_type
-from calibre.utils.cleantext import clean_ascii_chars
+from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
 from calibre.utils.config import tweaks
 
 class Resource(object): # {{{
@@ -1436,7 +1436,10 @@ def metadata_to_opf(mi, as_string=True, default_lang=None):
             attrib['name'] = name
         if content:
             attrib['content'] = content
-        elem = metadata.makeelement(tag, attrib=attrib)
+        try:
+            elem = metadata.makeelement(tag, attrib=attrib)
+        except ValueError:
+            elem = metadata.makeelement(tag, attrib={k:clean_xml_chars(v) for k, v in attrib.iteritems()})
         elem.tail = '\n'+(' '*8)
         if text:
             try:
diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py
index 219199815e..81614077bd 100644
--- a/src/calibre/utils/cleantext.py
+++ b/src/calibre/utils/cleantext.py
@@ -28,13 +28,22 @@ def clean_ascii_chars(txt, charlist=None):
     pat = re.compile(u'|'.join(map(unichr, charlist)))
     return pat.sub('', txt)
 
+def allowed(x):
+    '''
+    Return True if the single character ``x`` may be kept in XML text.
+
+    Accepts tab, LF, CR and the code point ranges 0x20-0xD7FF,
+    0xE000-0xFFFD and 0x10000-0x10FFFF (the XML 1.0 ``Char`` production,
+    end points inclusive), and additionally rejects DEL (0x7F).
+    '''
+    x = ord(x)
+    return (x != 127 and (31 < x <= 0xd7ff or x in (9, 10, 13))) or (0xe000 <= x <= 0xfffd) or (0x10000 <= x <= 0x10ffff)
+
 def clean_xml_chars(unicode_string):
-    def allowed(x):
-        x = ord(x)
-        return (0x0001 < x < 0xd7ff) or (0xe000 < x < 0xfffd) or (0x10000 < x < 0x10ffff)
+    ''' Return ``unicode_string`` with every character rejected by allowed() removed. '''
     return u''.join(filter(allowed, unicode_string))
 
-##
+
 # Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
 # Removes HTML or XML character references and entities from a text string.
 #
@@ -60,6 +69,7 @@ def unescape(text, rm=False, rchar=u''):
             except KeyError:
                 pass
         if rm:
-            return rchar #replace by char
-        return text # leave as is
+            return rchar  # replace by char
+        return text  # leave as is
     return re.sub("&#?\w+;", fixup, text)
+