From a0e845be912281dc9cf6cb6d349a3f20243a6d8b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 May 2018 18:02:46 +0530 Subject: [PATCH] Fix handling of private entities Conversion: Fix private entities that use the same name as an HTML entity not being handled correctly Viewer: Fix HTML files with private entities displaying an artifact at the top Editor: Check Book: Show an error for HTML files with private entities Editor: Fix HTML: Automatically resolve private entities Fixes #1772157 [Private Named Entities problems](https://bugs.launchpad.net/calibre/+bug/1772157) --- src/calibre/ebooks/oeb/display/webview.py | 5 +++- src/calibre/ebooks/oeb/parse_utils.py | 14 +++++------ .../ebooks/oeb/polish/check/parsing.py | 16 ++++++++++++- src/calibre/ebooks/oeb/polish/container.py | 4 ++-- src/calibre/ebooks/oeb/polish/parsing.py | 23 +++++++++++++++++++ 5 files changed, 51 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/oeb/display/webview.py b/src/calibre/ebooks/oeb/display/webview.py index 9dfc43c9c4..c4403e7b41 100644 --- a/src/calibre/ebooks/oeb/display/webview.py +++ b/src/calibre/ebooks/oeb/display/webview.py @@ -40,8 +40,11 @@ def cleanup_html(html): return html +xml_detect_pat = re.compile(r']+HTML\s+4.0[^.]+>', pre) is not None # kindlegen produces invalid xhtml with uppercase attribute names # if fed HTML 4 with uppercase attribute names, so try to detect # and compensate for that. 
+ has_html4_doctype = re.search(r']+HTML\s+4.0[^.]+>', pre) is not None + # Process private entities user_entities = {} for match in re.finditer(r']+)', pre): val = match.group(2) @@ -212,6 +207,11 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys()))) data = pat.sub(lambda m:user_entities[m.group(1)], data) + if preprocessor is not None: + data = preprocessor(data) + + # There could be null bytes in data if it had &#0; entities in it + data = data.replace('\0', '') data = raw = clean_word_doc(data, log) # Setting huge_tree=True causes crashes in windows with large files diff --git a/src/calibre/ebooks/oeb/polish/check/parsing.py b/src/calibre/ebooks/oeb/polish/check/parsing.py index f80776faf6..3cc48e94f7 100644 --- a/src/calibre/ebooks/oeb/polish/check/parsing.py +++ b/src/calibre/ebooks/oeb/polish/check/parsing.py @@ -24,7 +24,7 @@ XML_ENTITIES = {'lt', 'gt', 'amp', 'apos', 'quot'} ALL_ENTITIES = HTML_ENTITTIES | XML_ENTITIES replace_pat = re.compile('&(%s);' % '|'.join(re.escape(x) for x in sorted((HTML_ENTITTIES - XML_ENTITIES)))) -mismatch_pat = re.compile('tag mismatch:.+?line (\d+).+?line \d+') +mismatch_pat = re.compile(r'tag mismatch:.+?line (\d+).+?line \d+') class EmptyFile(BaseError): @@ -80,6 +80,13 @@ class HTMLParseError(XMLParseError): ' however, automatic fixing can sometimes "do the wrong thing".') +class PrivateEntities(XMLParseError): + + HELP = _('This HTML file uses private entities.' + ' These are not supported. 
You can try running "Fix HTML" from the Tools menu,' + ' which will try to automatically resolve the private entities.') + + class NamedEntities(BaseError): level = WARN @@ -255,9 +262,16 @@ def check_encoding_declarations(name, container): return errors +def check_for_private_entities(name, raw): + if re.search(br'', raw, flags=re.DOTALL) is not None: + return True + + def check_xml_parsing(name, mt, raw): if not raw: return [EmptyFile(name)] + if check_for_private_entities(name, raw): + return [PrivateEntities(_('Private entities found'), name)] raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n') # Get rid of entities as named entities trip up the XML parser eproc = EntitityProcessor(mt) diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py index 194aff4750..4c0bda3fd6 100644 --- a/src/calibre/ebooks/oeb/polish/container.py +++ b/src/calibre/ebooks/oeb/polish/container.py @@ -953,8 +953,8 @@ class Container(ContainerBase): # {{{ for child in mdata: child.tail = '\n ' try: - if (child.get('name', '').startswith('calibre:') and - child.get('content', '').strip() in {'{}', ''}): + if (child.get('name', '').startswith('calibre:' + ) and child.get('content', '').strip() in {'{}', ''}): remove.add(child) except AttributeError: continue # Happens for XML comments diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py index efab19a0e5..20f4065839 100644 --- a/src/calibre/ebooks/oeb/polish/parsing.py +++ b/src/calibre/ebooks/oeb/polish/parsing.py @@ -44,9 +44,32 @@ def strip_encoding_declarations(raw): return raw +def handle_private_entities(data): + # Process private entities + pre = '' + idx = data.find(' -1: + pre = data[:idx] + data = data[idx:] + if ']+)', pre): + val = match.group(2) + if val.startswith('"') and val.endswith('"'): + val = val[1:-1] + user_entities[match.group(1)] = val + if user_entities: + pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys()))) + data 
= pat.sub(lambda m:user_entities[m.group(1)], data) + return data + + def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False): if isinstance(raw, bytes): raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) + raw = handle_private_entities(raw) if replace_entities: raw = xml_replace_entities(raw).replace('\0', '') # Handle &#0; raw = raw.replace('\r\n', '\n').replace('\r', '\n')