Conversion pipeline: When decoding XML (but not XHTML), if no encoding is specified, assume UTF-8. Make entity conversion more robust. When splitting HTML, correctly handle ids with quotes in them.

2025-12-11 23:55:44 -05:00 · 2010-04-24 07:30:50 -06:00 · 2010-04-24 07:30:50 -06:00 · b2f590b6b3
commit b2f590b6b3
parent 8daf5561a3
5 changed files with 16 additions and 25 deletions
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -7,6 +7,7 @@ import sys, os, re, logging, time, mimetypes, \
 __builtin__.__dict__['dynamic_property'] = lambda(func): func(None)
 from htmlentitydefs import name2codepoint
 from math import floor
+from functools import partial

 warnings.simplefilter('ignore', DeprecationWarning)

@@ -446,6 +447,12 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
        return '&'+ent+';'

 _ent_pat = re.compile(r'&(\S+?);')
+xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions = {
+    '"' : '&quot;',
+    "'" : '&apos;',
+    '<' : '&lt;',
+    '>' : '&gt;',
+    '&' : '&amp;'})

 def prepare_string_for_xml(raw, attribute=False):
    raw = _ent_pat.sub(entity_to_unicode, raw)
--- a/src/calibre/ebooks/chardet/__init__.py
+++ b/src/calibre/ebooks/chardet/__init__.py
@@ -43,11 +43,8 @@ def strip_encoding_declarations(raw):
    return raw

 def substitute_entites(raw):
-    from calibre import entity_to_unicode
-    from functools import partial
-    f = partial(entity_to_unicode, exceptions=
-                ['amp', 'apos', 'quot', 'lt', 'gt'])
-    return ENTITY_PATTERN.sub(f, raw)
+    from calibre import xml_entity_to_unicode
+    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)

 _CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 Read data from .mobi files
 '''

-import functools, shutil, os, re, struct, textwrap, cStringIO, sys
+import shutil, os, re, struct, textwrap, cStringIO, sys

 try:
    from PIL import Image as PILImage
@@ -14,7 +14,7 @@ except ImportError:

 from lxml import html, etree

-from calibre import entity_to_unicode, CurrentDir
+from calibre import xml_entity_to_unicode, CurrentDir, entity_to_unicode
 from calibre.utils.filenames import ascii_filename
 from calibre.utils.date import parse_date
 from calibre.ptempfile import TemporaryDirectory
@@ -302,14 +302,7 @@ class MobiReader(object):

        for pat in ENCODING_PATS:
            self.processed_html = pat.sub('', self.processed_html)
-        e2u = functools.partial(entity_to_unicode,
-            result_exceptions={
-                '<' : u'&lt;',
-                '>' : u'&gt;',
-                '&' : u'&amp;',
-                '"' : u'&quot;',
-                "'" : u'&apos;'})
-        self.processed_html = re.sub(r'&(\S+?);', e2u,
+        self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
            self.processed_html)
        self.extract_images(processed_records, output_dir)
        self.replace_page_breaks()
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -771,18 +771,12 @@ class Manifest(object):
                % (self.id, self.href, self.media_type)

        def _parse_xml(self, data):
-            data = xml_to_unicode(data, strip_encoding_pats=True)[0]
+            data = xml_to_unicode(data, strip_encoding_pats=True,
+                    assume_utf8=True, resolve_entities=True)[0]
            if not data:
                return None
            parser = etree.XMLParser(recover=True)
-            try:
            return etree.fromstring(data, parser=parser)
-            except etree.XMLSyntaxError, err:
-                if getattr(err, 'code', 0) == 26 or str(err).startswith('Entity'):
-                    data = xml_to_unicode(data, strip_encoding_pats=True,
-                            resolve_entities=True)[0]
-                    return etree.fromstring(data)
-                raise

        def _parse_xhtml(self, data):
            self.oeb.log.debug('Parsing', self.href, '...')
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@@ -115,7 +115,7 @@ class Split(object):
        for i, x in enumerate(page_breaks):
            x.set('id', x.get('id', 'calibre_pb_%d'%i))
            id = x.get('id')
-            page_breaks_.append((XPath('//*[@id="%s"]'%id),
+            page_breaks_.append((XPath('//*[@id=%r]'%id),
                x.get('pb_before', False)))
            page_break_ids.append(id)