When backing up metadata, automatically remove invalid XML characters instead of erroring out

This commit is contained in:
Kovid Goyal 2013-05-23 09:31:47 +05:30
parent 83810d655d
commit 621c641396
2 changed files with 13 additions and 8 deletions

View File

@ -21,7 +21,7 @@ from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.date import parse_date, isoformat from calibre.utils.date import parse_date, isoformat
from calibre.utils.localization import get_lang, canonicalize_lang from calibre.utils.localization import get_lang, canonicalize_lang
from calibre import prints, guess_type from calibre import prints, guess_type
from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from calibre.utils.config import tweaks from calibre.utils.config import tweaks
class Resource(object): # {{{ class Resource(object): # {{{
@ -1436,7 +1436,10 @@ def metadata_to_opf(mi, as_string=True, default_lang=None):
attrib['name'] = name attrib['name'] = name
if content: if content:
attrib['content'] = content attrib['content'] = content
try:
elem = metadata.makeelement(tag, attrib=attrib) elem = metadata.makeelement(tag, attrib=attrib)
except ValueError:
elem = metadata.makeelement(tag, attrib={k:clean_xml_chars(v) for k, v in attrib.iteritems()})
elem.tail = '\n'+(' '*8) elem.tail = '\n'+(' '*8)
if text: if text:
try: try:

View File

@ -28,13 +28,14 @@ def clean_ascii_chars(txt, charlist=None):
pat = re.compile(u'|'.join(map(unichr, charlist))) pat = re.compile(u'|'.join(map(unichr, charlist)))
return pat.sub('', txt) return pat.sub('', txt)
def clean_xml_chars(unicode_string): def allowed(x):
def allowed(x):
x = ord(x) x = ord(x)
return (0x0001 < x < 0xd7ff) or (0xe000 < x < 0xfffd) or (0x10000 < x < 0x10ffff) return (x != 127 and (31 < x < 0xd7ff or x in (9, 10, 13))) or (0xe000 < x < 0xfffd) or (0x10000 < x < 0x10ffff)
def clean_xml_chars(unicode_string):
return u''.join(filter(allowed, unicode_string)) return u''.join(filter(allowed, unicode_string))
##
# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html # Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
# Removes HTML or XML character references and entities from a text string. # Removes HTML or XML character references and entities from a text string.
# #
@ -60,6 +61,7 @@ def unescape(text, rm=False, rchar=u''):
except KeyError: except KeyError:
pass pass
if rm: if rm:
return rchar #replace by char return rchar # replace by char
return text # leave as is return text # leave as is
return re.sub("&#?\w+;", fixup, text) return re.sub("&#?\w+;", fixup, text)