Fix handling of private entities

Conversion: Fix private entities that use the same name as an HTML entity not being handled correctly. Viewer: Fix HTML files with private entities displaying an artifact at the top. Editor: Check Book: Show an error for HTML files with private entities. Editor: Fix HTML: Automatically resolve private entities. Fixes #1772157 [Private Named Entities problems](https://bugs.launchpad.net/calibre/+bug/1772157)
2025-08-30 23:00:21 -04:00 · 2018-05-19 18:02:46 +05:30 · 2018-05-19 18:02:46 +05:30 · a0e845be91
commit a0e845be91
parent 8ed67769dd
5 changed files with 51 additions and 11 deletions
--- a/src/calibre/ebooks/oeb/display/webview.py
+++ b/src/calibre/ebooks/oeb/display/webview.py
@ -40,8 +40,11 @@ def cleanup_html(html):
    return html
 xml_detect_pat = re.compile(r'<!(?:\[CDATA\[|ENTITY)')
 def load_as_html(html):
-    return re.search(r'<[a-zA-Z0-9-]+:svg', html) is None and '<![CDATA[' not in html
+    return re.search(r'<[a-zA-Z0-9-]+:svg', html) is None and xml_detect_pat.search(html) is None
 def load_html(path, view, codec='utf-8', mime_type=None,
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@ -180,12 +180,6 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
            data = xml_to_unicode(data)[0]
    data = strip_encoding_declarations(data)
    if preprocessor is not None:
        data = preprocessor(data)
    # There could be null bytes in data if it had &#0; entities in it
    data = data.replace('\0', '')
    # Remove DOCTYPE declaration as it messes up parsing
    # In particular, it causes tostring to insert xmlns
    # declarations, which messes up the coercing logic
@ -198,10 +192,11 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
        pre = data[:idx]
        data = data[idx:]
        if '<!DOCTYPE' in pre:  # Handle user defined entities
            has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
            # kindlegen produces invalid xhtml with uppercase attribute names
            # if fed HTML 4 with uppercase attribute names, so try to detect
            # and compensate for that.
            has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
            # Process private entities
            user_entities = {}
            for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                val = match.group(2)
@ -212,6 +207,11 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
                pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
                data = pat.sub(lambda m:user_entities[m.group(1)], data)
    if preprocessor is not None:
        data = preprocessor(data)
    # There could be null bytes in data if it had &#0; entities in it
    data = data.replace('\0', '')
    data = raw = clean_word_doc(data, log)
    # Setting huge_tree=True causes crashes in windows with large files
--- a/src/calibre/ebooks/oeb/polish/check/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/check/parsing.py
@ -24,7 +24,7 @@ XML_ENTITIES = {'lt', 'gt', 'amp', 'apos', 'quot'}
 ALL_ENTITIES = HTML_ENTITTIES | XML_ENTITIES
 replace_pat = re.compile('&(%s);' % '|'.join(re.escape(x) for x in sorted((HTML_ENTITTIES - XML_ENTITIES))))
-mismatch_pat = re.compile('tag mismatch:.+?line (\d+).+?line \d+')
+mismatch_pat = re.compile(r'tag mismatch:.+?line (\d+).+?line \d+')
 class EmptyFile(BaseError):
@ -80,6 +80,13 @@ class HTMLParseError(XMLParseError):
             ' however, automatic fixing can sometimes "do the wrong thing".')
 class PrivateEntities(XMLParseError):
    # Error returned by check_xml_parsing() when the raw file declares
    # private entities in its DOCTYPE. HELP is the translated explanation
    # attached to the error.
    HELP = _(
        'This HTML file uses private entities. These are not supported.'
        ' You can try running "Fix HTML" from the Tools menu, which will'
        ' try to automatically resolve the private entities.'
    )
 class NamedEntities(BaseError):
    level = WARN
@ -255,9 +262,16 @@ def check_encoding_declarations(name, container):
    return errors
 def check_for_private_entities(name, raw):
    """Report whether *raw* declares private entities inside its DOCTYPE.

    :param name: Name of the file being checked. Not used by the check
        itself; kept so the signature stays unchanged for callers.
    :param raw: The undecoded file contents, as a bytestring.
    :return: True if a ``<!DOCTYPE`` is followed by at least one
        ``<!ENTITY`` declaration and a closing ``]>``, False otherwise.
    """
    # DOTALL is needed because the internal DTD subset normally spans
    # several lines. Always return a real bool instead of falling off the
    # end of the function with an implicit None.
    return re.search(
        br'<!DOCTYPE\s+.+?<!ENTITY\s+.+?]>', raw, flags=re.DOTALL) is not None
 def check_xml_parsing(name, mt, raw):
    if not raw:
        return [EmptyFile(name)]
    if check_for_private_entities(name, raw):
        return [PrivateEntities(_('Private entities found'), name)]
    raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
    # Get rid of entities as named entities trip up the XML parser
    eproc = EntitityProcessor(mt)
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@ -953,8 +953,8 @@ class Container(ContainerBase):  # {{{
            for child in mdata:
                child.tail = '\n    '
                try:
-                    if (child.get('name', '').startswith('calibre:') and
+                    if (child.get('name', '').startswith('calibre:'
-                        child.get('content', '').strip() in {'{}', ''}):
+                        ) and child.get('content', '').strip() in {'{}', ''}):
                        remove.add(child)
                except AttributeError:
                    continue  # Happens for XML comments
--- a/src/calibre/ebooks/oeb/polish/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/parsing.py
@ -44,9 +44,32 @@ def strip_encoding_declarations(raw):
    return raw
 def handle_private_entities(data):
    """Expand private (user defined) entities declared in the DOCTYPE.

    Everything before the first ``<html``/``<HTML`` tag is treated as the
    prolog. If the prolog contains a DOCTYPE with ``<!ENTITY name value>``
    declarations, every ``&name;`` reference in the rest of the document is
    replaced by the declared value.

    :param data: The document as text.
    :return: The document starting at the ``<html`` tag (the prolog is
        discarded) with private entities expanded. If no ``<html`` tag is
        found, *data* is returned unchanged.
    """
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    if idx == -1:
        # No root tag, so there is no prolog to inspect
        return data
    pre, data = data[:idx], data[idx:]
    if '<!DOCTYPE' not in pre:
        return data
    user_entities = {}
    for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
        # The value may be followed by whitespace before the closing > and
        # may be delimited by either kind of quote
        val = match.group(2).strip()
        if len(val) > 1 and val[0] == val[-1] and val[0] in '"\'':
            val = val[1:-1]
        user_entities[match.group(1)] = val
    if user_entities:
        # Entity names can contain regex metacharacters such as "." so they
        # must be escaped. Otherwise the pattern can match a reference that
        # was never declared, and the lookup below raises KeyError.
        pat = re.compile(
            r'&(%s);' % '|'.join(re.escape(ent) for ent in user_entities))
        data = pat.sub(lambda m: user_entities[m.group(1)], data)
    return data
 def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = handle_private_entities(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')