mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix handling of private entities
Conversion: Fix private entities that use the same name as an HTML entity not being handled correctly. Viewer: Fix HTML files with private entities displaying an artifact at the top. Editor: Check Book: Show an error for HTML files with private entities. Editor: Fix HTML: Automatically resolve private entities. Fixes #1772157 [Private Named Entities problems](https://bugs.launchpad.net/calibre/+bug/1772157)
This commit is contained in:
parent
8ed67769dd
commit
a0e845be91
@ -40,8 +40,11 @@ def cleanup_html(html):
|
|||||||
return html
|
return html
|
||||||
|
|
||||||
|
|
||||||
|
xml_detect_pat = re.compile(r'<!(?:\[CDATA\[|ENTITY)')
|
||||||
|
|
||||||
|
|
||||||
def load_as_html(html):
|
def load_as_html(html):
|
||||||
return re.search(r'<[a-zA-Z0-9-]+:svg', html) is None and '<![CDATA[' not in html
|
return re.search(r'<[a-zA-Z0-9-]+:svg', html) is None and xml_detect_pat.search(html) is None
|
||||||
|
|
||||||
|
|
||||||
def load_html(path, view, codec='utf-8', mime_type=None,
|
def load_html(path, view, codec='utf-8', mime_type=None,
|
||||||
|
@ -180,12 +180,6 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
data = xml_to_unicode(data)[0]
|
data = xml_to_unicode(data)[0]
|
||||||
|
|
||||||
data = strip_encoding_declarations(data)
|
data = strip_encoding_declarations(data)
|
||||||
if preprocessor is not None:
|
|
||||||
data = preprocessor(data)
|
|
||||||
|
|
||||||
# There could be null bytes in data if it had &#0; entities in it
|
|
||||||
data = data.replace('\0', '')
|
|
||||||
|
|
||||||
# Remove DOCTYPE declaration as it messes up parsing
|
# Remove DOCTYPE declaration as it messes up parsing
|
||||||
# In particular, it causes tostring to insert xmlns
|
# In particular, it causes tostring to insert xmlns
|
||||||
# declarations, which messes up the coercing logic
|
# declarations, which messes up the coercing logic
|
||||||
@ -198,10 +192,11 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
pre = data[:idx]
|
pre = data[:idx]
|
||||||
data = data[idx:]
|
data = data[idx:]
|
||||||
if '<!DOCTYPE' in pre: # Handle user defined entities
|
if '<!DOCTYPE' in pre: # Handle user defined entities
|
||||||
has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
|
|
||||||
# kindlegen produces invalid xhtml with uppercase attribute names
|
# kindlegen produces invalid xhtml with uppercase attribute names
|
||||||
# if fed HTML 4 with uppercase attribute names, so try to detect
|
# if fed HTML 4 with uppercase attribute names, so try to detect
|
||||||
# and compensate for that.
|
# and compensate for that.
|
||||||
|
has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
|
||||||
|
# Process private entities
|
||||||
user_entities = {}
|
user_entities = {}
|
||||||
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
|
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
|
||||||
val = match.group(2)
|
val = match.group(2)
|
||||||
@ -212,6 +207,11 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
|
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
|
||||||
data = pat.sub(lambda m:user_entities[m.group(1)], data)
|
data = pat.sub(lambda m:user_entities[m.group(1)], data)
|
||||||
|
|
||||||
|
if preprocessor is not None:
|
||||||
|
data = preprocessor(data)
|
||||||
|
|
||||||
|
# There could be null bytes in data if it had &#0; entities in it
|
||||||
|
data = data.replace('\0', '')
|
||||||
data = raw = clean_word_doc(data, log)
|
data = raw = clean_word_doc(data, log)
|
||||||
|
|
||||||
# Setting huge_tree=True causes crashes in windows with large files
|
# Setting huge_tree=True causes crashes in windows with large files
|
||||||
|
@ -24,7 +24,7 @@ XML_ENTITIES = {'lt', 'gt', 'amp', 'apos', 'quot'}
|
|||||||
ALL_ENTITIES = HTML_ENTITTIES | XML_ENTITIES
|
ALL_ENTITIES = HTML_ENTITTIES | XML_ENTITIES
|
||||||
|
|
||||||
replace_pat = re.compile('&(%s);' % '|'.join(re.escape(x) for x in sorted((HTML_ENTITTIES - XML_ENTITIES))))
|
replace_pat = re.compile('&(%s);' % '|'.join(re.escape(x) for x in sorted((HTML_ENTITTIES - XML_ENTITIES))))
|
||||||
mismatch_pat = re.compile('tag mismatch:.+?line (\d+).+?line \d+')
|
mismatch_pat = re.compile(r'tag mismatch:.+?line (\d+).+?line \d+')
|
||||||
|
|
||||||
|
|
||||||
class EmptyFile(BaseError):
|
class EmptyFile(BaseError):
|
||||||
@ -80,6 +80,13 @@ class HTMLParseError(XMLParseError):
|
|||||||
' however, automatic fixing can sometimes "do the wrong thing".')
|
' however, automatic fixing can sometimes "do the wrong thing".')
|
||||||
|
|
||||||
|
|
||||||
|
class PrivateEntities(XMLParseError):
|
||||||
|
|
||||||
|
HELP = _('This HTML file uses private entities.'
|
||||||
|
' These are not supported. You can try running "Fix HTML" from the Tools menu,'
|
||||||
|
' which will try to automatically resolve the private entities.')
|
||||||
|
|
||||||
|
|
||||||
class NamedEntities(BaseError):
|
class NamedEntities(BaseError):
|
||||||
|
|
||||||
level = WARN
|
level = WARN
|
||||||
@ -255,9 +262,16 @@ def check_encoding_declarations(name, container):
|
|||||||
return errors
|
return errors
|
||||||
|
|
||||||
|
|
||||||
|
def check_for_private_entities(name, raw):
    """Report whether *raw* declares private entities inside its DOCTYPE.

    :param name: Name of the file being checked. Not used by the check
        itself; kept so the signature matches the other ``check_*`` helpers.
    :param raw: The undecoded file contents as ``bytes``.
    :return: ``True`` if a ``<!DOCTYPE ...`` followed by an ``<!ENTITY ...``
        declaration and a closing ``]>`` is found, ``False`` otherwise.
    """
    # DOTALL because the internal DTD subset usually spans several lines.
    # Always return a real bool instead of falling through to None.
    return re.search(br'<!DOCTYPE\s+.+?<!ENTITY\s+.+?]>', raw, flags=re.DOTALL) is not None
|
||||||
|
|
||||||
|
|
||||||
def check_xml_parsing(name, mt, raw):
|
def check_xml_parsing(name, mt, raw):
|
||||||
if not raw:
|
if not raw:
|
||||||
return [EmptyFile(name)]
|
return [EmptyFile(name)]
|
||||||
|
if check_for_private_entities(name, raw):
|
||||||
|
return [PrivateEntities(_('Private entities found'), name)]
|
||||||
raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
|
raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
|
||||||
# Get rid of entities as named entities trip up the XML parser
|
# Get rid of entities as named entities trip up the XML parser
|
||||||
eproc = EntitityProcessor(mt)
|
eproc = EntitityProcessor(mt)
|
||||||
|
@ -953,8 +953,8 @@ class Container(ContainerBase): # {{{
|
|||||||
for child in mdata:
|
for child in mdata:
|
||||||
child.tail = '\n '
|
child.tail = '\n '
|
||||||
try:
|
try:
|
||||||
if (child.get('name', '').startswith('calibre:') and
|
if (child.get('name', '').startswith('calibre:'
|
||||||
child.get('content', '').strip() in {'{}', ''}):
|
) and child.get('content', '').strip() in {'{}', ''}):
|
||||||
remove.add(child)
|
remove.add(child)
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
continue # Happens for XML comments
|
continue # Happens for XML comments
|
||||||
|
@ -44,9 +44,32 @@ def strip_encoding_declarations(raw):
|
|||||||
return raw
|
return raw
|
||||||
|
|
||||||
|
|
||||||
|
def handle_private_entities(data):
    """Resolve private (user defined) entities declared in the DOCTYPE.

    The text before the first ``<html`` (or ``<HTML``) tag is treated as the
    prolog. It is scanned for ``<!ENTITY name value>`` declarations and then
    dropped from the returned text; every ``&name;`` reference in the rest of
    the document is replaced by the declared value.

    :param data: The document as decoded text.
    :return: The document starting at its ``<html`` tag with private
        entities substituted, or *data* unchanged when no ``<html`` tag
        is present.
    """
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    if idx == -1:
        return data

    pre, data = data[:idx], data[idx:]
    if '<!DOCTYPE' not in pre:
        return data

    user_entities = {}
    for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
        name = match.group(1)
        if name == '%':
            # Parameter entity declaration (<!ENTITY % name ...>): it can
            # never be referenced as &name; in the document body.
            continue
        # The value may be followed by whitespace before the closing '>'
        # and may be delimited by either kind of quote.
        val = match.group(2).strip()
        if len(val) > 1 and val[0] == val[-1] and val[0] in '"\'':
            val = val[1:-1]
        user_entities[name] = val

    if user_entities:
        # Entity names may legally contain regex metacharacters such as '.',
        # so escape them; otherwise the pattern can match an undeclared
        # reference and the lookup below raises KeyError.
        pat = re.compile(r'&(%s);' % '|'.join(map(re.escape, user_entities)))
        data = pat.sub(lambda m: user_entities[m.group(1)], data)
    return data
|
||||||
|
|
||||||
|
|
||||||
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
|
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
|
||||||
if isinstance(raw, bytes):
|
if isinstance(raw, bytes):
|
||||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||||
|
raw = handle_private_entities(raw)
|
||||||
if replace_entities:
|
if replace_entities:
|
||||||
raw = xml_replace_entities(raw).replace('\0', '') # Handle �
|
raw = xml_replace_entities(raw).replace('\0', '') # Handle �
|
||||||
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user