Patch summary (free text before the first "diff --git" header):
  * calibre/__init__.py gains xml_entity_to_unicode, a functools.partial of
    entity_to_unicode whose result_exceptions map the five XML special
    characters back to their predefined entities.
  * chardet.substitute_entites and MobiReader use that shared helper instead
    of building their own partial.
  * oeb.base _parse_xml always calls xml_to_unicode with assume_utf8=True and
    resolve_entities=True, and drops the XMLSyntaxError retry path.
  * oeb.transforms.split formats the page-break id into the XPath with %r.

diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index 2babb9182b..97a3842f1b 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -7,6 +7,7 @@ import sys, os, re, logging, time, mimetypes, \
 __builtin__.__dict__['dynamic_property'] = lambda(func): func(None)
 from htmlentitydefs import name2codepoint
 from math import floor
+from functools import partial
 
 warnings.simplefilter('ignore', DeprecationWarning)
 
@@ -446,6 +447,12 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
         return '&'+ent+';'
 
 _ent_pat = re.compile(r'&(\S+?);')
+xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions = {
+    '"' : '&quot;',
+    "'" : '&apos;',
+    '<' : '&lt;',
+    '>' : '&gt;',
+    '&' : '&amp;'})
 
 def prepare_string_for_xml(raw, attribute=False):
     raw = _ent_pat.sub(entity_to_unicode, raw)
diff --git a/src/calibre/ebooks/chardet/__init__.py b/src/calibre/ebooks/chardet/__init__.py
index 25341b120a..3afa6ce1f4 100644
--- a/src/calibre/ebooks/chardet/__init__.py
+++ b/src/calibre/ebooks/chardet/__init__.py
@@ -43,11 +43,8 @@ def strip_encoding_declarations(raw):
     return raw
 
 def substitute_entites(raw):
-    from calibre import entity_to_unicode
-    from functools import partial
-    f = partial(entity_to_unicode, exceptions=
-            ['amp', 'apos', 'quot', 'lt', 'gt'])
-    return ENTITY_PATTERN.sub(f, raw)
+    from calibre import xml_entity_to_unicode
+    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
 
 _CHARSET_ALIASES = { "macintosh" : "mac-roman",
                         "x-sjis" : "shift-jis" }
diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py
index 1b266740d7..15e6391812 100644
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal '
 Read data from .mobi files
 '''
 
-import functools, shutil, os, re, struct, textwrap, cStringIO, sys
+import shutil, os, re, struct, textwrap, cStringIO, sys
 
 try:
     from PIL import Image as PILImage
@@ -14,7 +14,7 @@ except ImportError:
 
 from lxml import html, etree
 
-from calibre import entity_to_unicode, CurrentDir
+from calibre import xml_entity_to_unicode, CurrentDir, entity_to_unicode
 from calibre.utils.filenames import ascii_filename
 from calibre.utils.date import parse_date
 from calibre.ptempfile import TemporaryDirectory
@@ -302,14 +302,7 @@ class MobiReader(object):
 
         for pat in ENCODING_PATS:
             self.processed_html = pat.sub('', self.processed_html)
-        e2u = functools.partial(entity_to_unicode,
-                result_exceptions={
-                    '<' : u'&lt;',
-                    '>' : u'&gt;',
-                    '&' : u'&amp;',
-                    '"' : u'&quot;',
-                    "'" : u'&apos;'})
-        self.processed_html = re.sub(r'&(\S+?);', e2u,
+        self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
                 self.processed_html)
         self.extract_images(processed_records, output_dir)
         self.replace_page_breaks()
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index fc0a832528..f770622952 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -771,18 +771,12 @@ class Manifest(object):
                     % (self.id, self.href, self.media_type)
 
         def _parse_xml(self, data):
-            data = xml_to_unicode(data, strip_encoding_pats=True)[0]
+            data = xml_to_unicode(data, strip_encoding_pats=True,
+                    assume_utf8=True, resolve_entities=True)[0]
             if not data:
                 return None
             parser = etree.XMLParser(recover=True)
-            try:
-                return etree.fromstring(data, parser=parser)
-            except etree.XMLSyntaxError, err:
-                if getattr(err, 'code', 0) == 26 or str(err).startswith('Entity'):
-                    data = xml_to_unicode(data, strip_encoding_pats=True,
-                            resolve_entities=True)[0]
-                    return etree.fromstring(data)
-                raise
+            return etree.fromstring(data, parser=parser)
 
         def _parse_xhtml(self, data):
             self.oeb.log.debug('Parsing', self.href, '...')
diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py
index d8ba3e5b77..d62c6353ea 100644
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@@ -115,7 +115,7 @@ class Split(object):
         for i, x in enumerate(page_breaks):
             x.set('id', x.get('id', 'calibre_pb_%d'%i))
             id = x.get('id')
-            page_breaks_.append((XPath('//*[@id="%s"]'%id),
+            page_breaks_.append((XPath('//*[@id=%r]'%id),
                 x.get('pb_before', False)))
             page_break_ids.append(id)
 