Fix handling of private entities

Conversion: Fix private entities that use the same name as an HTML entity not being handled correctly

Viewer: Fix HTML files with private entities displaying an artifact at
the top

Editor: Check Book: Show an error for HTML files with private entities

Editor: Fix HTML: Automatically resolve private entities

Fixes #1772157 [Private Named Entities problems](https://bugs.launchpad.net/calibre/+bug/1772157)
This commit is contained in:
Kovid Goyal 2018-05-19 18:02:46 +05:30
parent 8ed67769dd
commit a0e845be91
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 51 additions and 11 deletions

View File

@ -40,8 +40,11 @@ def cleanup_html(html):
return html return html
xml_detect_pat = re.compile(r'<!(?:\[CDATA\[|ENTITY)')
def load_as_html(html): def load_as_html(html):
return re.search(r'<[a-zA-Z0-9-]+:svg', html) is None and '<![CDATA[' not in html return re.search(r'<[a-zA-Z0-9-]+:svg', html) is None and xml_detect_pat.search(html) is None
def load_html(path, view, codec='utf-8', mime_type=None, def load_html(path, view, codec='utf-8', mime_type=None,

View File

@ -180,12 +180,6 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
data = xml_to_unicode(data)[0] data = xml_to_unicode(data)[0]
data = strip_encoding_declarations(data) data = strip_encoding_declarations(data)
if preprocessor is not None:
data = preprocessor(data)
# There could be null bytes in data if it had &#0; entities in it
data = data.replace('\0', '')
# Remove DOCTYPE declaration as it messes up parsing # Remove DOCTYPE declaration as it messes up parsing
# In particular, it causes tostring to insert xmlns # In particular, it causes tostring to insert xmlns
# declarations, which messes up the coercing logic # declarations, which messes up the coercing logic
@ -198,10 +192,11 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
pre = data[:idx] pre = data[:idx]
data = data[idx:] data = data[idx:]
if '<!DOCTYPE' in pre: # Handle user defined entities if '<!DOCTYPE' in pre: # Handle user defined entities
has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
# kindlegen produces invalid xhtml with uppercase attribute names # kindlegen produces invalid xhtml with uppercase attribute names
# if fed HTML 4 with uppercase attribute names, so try to detect # if fed HTML 4 with uppercase attribute names, so try to detect
# and compensate for that. # and compensate for that.
has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
# Process private entities
user_entities = {} user_entities = {}
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre): for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
val = match.group(2) val = match.group(2)
@ -212,6 +207,11 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys()))) pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
data = pat.sub(lambda m:user_entities[m.group(1)], data) data = pat.sub(lambda m:user_entities[m.group(1)], data)
if preprocessor is not None:
data = preprocessor(data)
# There could be null bytes in data if it had &#0; entities in it
data = data.replace('\0', '')
data = raw = clean_word_doc(data, log) data = raw = clean_word_doc(data, log)
# Setting huge_tree=True causes crashes in windows with large files # Setting huge_tree=True causes crashes in windows with large files

View File

@ -24,7 +24,7 @@ XML_ENTITIES = {'lt', 'gt', 'amp', 'apos', 'quot'}
ALL_ENTITIES = HTML_ENTITTIES | XML_ENTITIES ALL_ENTITIES = HTML_ENTITTIES | XML_ENTITIES
replace_pat = re.compile('&(%s);' % '|'.join(re.escape(x) for x in sorted((HTML_ENTITTIES - XML_ENTITIES)))) replace_pat = re.compile('&(%s);' % '|'.join(re.escape(x) for x in sorted((HTML_ENTITTIES - XML_ENTITIES))))
mismatch_pat = re.compile('tag mismatch:.+?line (\d+).+?line \d+') mismatch_pat = re.compile(r'tag mismatch:.+?line (\d+).+?line \d+')
class EmptyFile(BaseError): class EmptyFile(BaseError):
@ -80,6 +80,13 @@ class HTMLParseError(XMLParseError):
' however, automatic fixing can sometimes "do the wrong thing".') ' however, automatic fixing can sometimes "do the wrong thing".')
class PrivateEntities(XMLParseError):
    # Reported for HTML files whose DOCTYPE declares private (user defined)
    # entities; the help text points the user at the "Fix HTML" tool.

    HELP = _(
        'This HTML file uses private entities.'
        ' These are not supported. You can try running "Fix HTML" from the Tools menu,'
        ' which will try to automatically resolve the private entities.'
    )
class NamedEntities(BaseError): class NamedEntities(BaseError):
level = WARN level = WARN
@ -255,9 +262,16 @@ def check_encoding_declarations(name, container):
return errors return errors
def check_for_private_entities(name, raw):
    """Report whether ``raw`` declares private entities in its DOCTYPE.

    :param name: Name of the file being checked (not used by the check
        itself; kept so the signature matches the other checkers' call style).
    :param raw: The undecoded file contents as bytes.
    :return: ``True`` if a ``<!DOCTYPE ...`` containing at least one
        ``<!ENTITY ...`` declaration and a closing ``]>`` is found,
        ``False`` otherwise.
    """
    # DOTALL because the internal DTD subset normally spans several lines.
    pat = br'<!DOCTYPE\s+.+?<!ENTITY\s+.+?]>'
    return re.search(pat, raw, flags=re.DOTALL) is not None
def check_xml_parsing(name, mt, raw): def check_xml_parsing(name, mt, raw):
if not raw: if not raw:
return [EmptyFile(name)] return [EmptyFile(name)]
if check_for_private_entities(name, raw):
return [PrivateEntities(_('Private entities found'), name)]
raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n') raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
# Get rid of entities as named entities trip up the XML parser # Get rid of entities as named entities trip up the XML parser
eproc = EntitityProcessor(mt) eproc = EntitityProcessor(mt)

View File

@ -953,8 +953,8 @@ class Container(ContainerBase): # {{{
for child in mdata: for child in mdata:
child.tail = '\n ' child.tail = '\n '
try: try:
if (child.get('name', '').startswith('calibre:') and if (child.get('name', '').startswith('calibre:'
child.get('content', '').strip() in {'{}', ''}): ) and child.get('content', '').strip() in {'{}', ''}):
remove.add(child) remove.add(child)
except AttributeError: except AttributeError:
continue # Happens for XML comments continue # Happens for XML comments

View File

@ -44,9 +44,32 @@ def strip_encoding_declarations(raw):
return raw return raw
def handle_private_entities(data):
    """Resolve private (user defined) entities declared in the DOCTYPE.

    Everything before the first ``<html`` (or ``<HTML``) tag is treated as
    the prolog and is removed from the returned text. If the prolog contains
    a DOCTYPE with ``<!ENTITY name "value">`` declarations, every ``&name;``
    reference in the remaining markup is replaced by the declared value.

    :param data: The markup as text.
    :return: The markup from the opening html tag onwards with private
        entities expanded. If no html tag is found, ``data`` is returned
        unchanged.
    """
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    if idx == -1:
        return data
    pre, data = data[:idx], data[idx:]
    if '<!DOCTYPE' not in pre:
        return data

    user_entities = {}
    for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
        # Whitespace is allowed between the closing quote and the '>'
        val = match.group(2).strip()
        # Entity values may be delimited by either double or single quotes
        if len(val) >= 2 and val[0] == val[-1] and val[0] in '"\'':
            val = val[1:-1]
        user_entities[match.group(1)] = val

    if user_entities:
        # Entity names can legally contain regex metacharacters such as '.',
        # so they must be escaped; otherwise the pattern can match a name
        # that is not in user_entities and the lookup below raises KeyError.
        pat = re.compile(r'&(%s);' % '|'.join(map(re.escape, user_entities)))
        data = pat.sub(lambda m: user_entities[m.group(1)], data)
    return data
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False): def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
if isinstance(raw, bytes): if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
raw = handle_private_entities(raw)
if replace_entities: if replace_entities:
raw = xml_replace_entities(raw).replace('\0', '') # Handle &#0; raw = xml_replace_entities(raw).replace('\0', '') # Handle &#0;
raw = raw.replace('\r\n', '\n').replace('\r', '\n') raw = raw.replace('\r\n', '\n').replace('\r', '\n')