mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Special case entities when checking for XML well-formedness
This commit is contained in:
parent
66d462d28c
commit
33086f2061
@ -52,7 +52,10 @@ def fix_errors(container, errors):
|
|||||||
|
|
||||||
for err in errors:
|
for err in errors:
|
||||||
if err.INDIVIDUAL_FIX:
|
if err.INDIVIDUAL_FIX:
|
||||||
if err(container):
|
if err(container) is not False:
|
||||||
|
# Assume changed unless fixer explicitly says no change (this
|
||||||
|
# is because sometimes I forget to return True, and it is
|
||||||
|
# better to have a false positive than a false negative)
|
||||||
changed = True
|
changed = True
|
||||||
return changed
|
return changed
|
||||||
|
|
||||||
|
@ -6,11 +6,21 @@ from __future__ import (unicode_literals, division, absolute_import,
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
from lxml.etree import XMLParser, fromstring, XMLSyntaxError
|
from lxml.etree import XMLParser, fromstring, XMLSyntaxError
|
||||||
|
|
||||||
from calibre.ebooks.oeb.polish.check.base import BaseError
|
from calibre.ebooks.html_entities import html5_entities
|
||||||
|
from calibre.ebooks.oeb.polish.utils import PositionFinder
|
||||||
|
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN
|
||||||
from calibre.ebooks.oeb.base import OEB_DOCS
|
from calibre.ebooks.oeb.base import OEB_DOCS
|
||||||
|
|
||||||
|
HTML_ENTITTIES = frozenset(html5_entities)
|
||||||
|
XML_ENTITIES = {'lt', 'gt', 'amp', 'apos', 'quot'}
|
||||||
|
ALL_ENTITIES = HTML_ENTITTIES | XML_ENTITIES
|
||||||
|
|
||||||
|
replace_pat = re.compile('&(%s);' % '|'.join(re.escape(x) for x in sorted((HTML_ENTITTIES - XML_ENTITIES))))
|
||||||
|
|
||||||
class XMLParseError(BaseError):
|
class XMLParseError(BaseError):
|
||||||
|
|
||||||
is_parsing_error = True
|
is_parsing_error = True
|
||||||
@ -30,19 +40,94 @@ class HTMLParseError(XMLParseError):
|
|||||||
' incorrect display of content. These errors can usually be fixed automatically,'
|
' incorrect display of content. These errors can usually be fixed automatically,'
|
||||||
' however, automatic fixing can sometimes "do the wrong thing".')
|
' however, automatic fixing can sometimes "do the wrong thing".')
|
||||||
|
|
||||||
|
class NamedEntities(BaseError):
    """Warning reported for a file that contains known named entities.

    Calling an instance with a container applies the automatic fix: every
    entity matched by ``replace_pat`` is replaced by the value stored for it
    in ``html5_entities`` and the file is written back encoded as UTF-8.
    """

    level = WARN
    INDIVIDUAL_FIX = _('Replace all named entities with their character equivalents in this file')
    HELP = _('Named entities are often only incompletely supported by various book reading software.'
             ' Therefore, it is best to not use them, replacing them with the actual characters they'
             ' represent. This can be done automatically.')

    def __init__(self, name):
        """Create the error for the file called ``name``."""
        BaseError.__init__(self, _('Named entities present'), name)

    def __call__(self, container):
        """Rewrite the file in ``container`` without named entities.

        Always returns True to signal that the file was changed.
        """
        # NOTE(review): self.name is presumably set by BaseError.__init__
        # from the ``name`` argument -- confirm against BaseError.
        def as_character(match):
            # group(1) is the entity name without the surrounding & and ;
            return html5_entities[match.group(1)]

        original = container.raw_data(self.name)
        converted = replace_pat.sub(as_character, original)
        with container.open(self.name, 'wb') as f:
            f.write(converted.encode('utf-8'))
        return True
|
||||||
|
|
||||||
|
class BadEntity(BaseError):
    """Error reported for an entity reference that is not recognized."""

    HELP = _('This is an invalid (unrecognized) entity. Replace it with whatever'
             ' text it is supposed to have represented.')

    def __init__(self, ent, name, lnum, col):
        """Create the error.

        :param ent: The offending entity text, interpolated into the message
        :param name: Passed through to BaseError (the file name, judging by
            how the caller constructs this error)
        :param lnum: Line number passed through to BaseError
        :param col: Column passed through to BaseError
        """
        message = _('Invalid entity: %s') % ent
        BaseError.__init__(self, message, name, lnum, col)
|
||||||
|
|
||||||
|
|
||||||
|
class EntitityProcessor(object):
    """Substitution callback that classifies entity references.

    Instances are meant to be passed as the replacement function to a
    bytes regular expression whose first group is the entity name (without
    the leading ``&`` and trailing ``;``). While substituting, it records:

    ``ok_named_entities``
        start offsets of recognized named entities
    ``bad_entities``
        ``(start offset, matched text)`` pairs for unrecognized named
        entities and malformed numeric entities

    XML's own entities and well-formed numeric entities are returned
    unchanged. Everything else is replaced by a run of spaces of the same
    length as the match, so the length of the text is preserved.
    """

    def __init__(self, mt):
        """Choose the set of acceptable named entities from media type ``mt``."""
        if mt in OEB_DOCS:
            self.entities = ALL_ENTITIES
        else:
            self.entities = XML_ENTITIES
        self.ok_named_entities = []
        self.bad_entities = []

    def __call__(self, m):
        """Return the replacement bytes for the entity match ``m``."""
        matched = m.group()
        name = m.group(1).decode('ascii')

        # Leave XML entities alone
        if name in XML_ENTITIES:
            return matched

        if name.startswith('#'):
            # Numeric character reference: only check that the digits parse
            # in the indicated base, the resulting number itself is unused.
            digits = name[1:]
            if digits.startswith('x'):
                digits, base = digits[1:], 16
            else:
                base = 10
            try:
                int(digits, base)
            except ValueError:
                # Invalid numerical entity
                self.bad_entities.append((m.start(), matched))
                return b' ' * len(matched)
            return matched

        if name not in self.entities:
            self.bad_entities.append((m.start(), matched))
        else:
            # Known named entity, report it
            self.ok_named_entities.append(m.start())
        return b' ' * len(matched)
|
||||||
|
|
||||||
|
entity_pat = re.compile(br'&(#{0,1}[a-zA-Z0-9]{1,8});')
|
||||||
|
|
||||||
def check_xml_parsing(name, mt, raw):
|
def check_xml_parsing(name, mt, raw):
|
||||||
|
raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
|
||||||
|
# Get rid of entities as named entities trip up the XML parser
|
||||||
|
eproc = EntitityProcessor(mt)
|
||||||
|
eraw = entity_pat.sub(eproc, raw)
|
||||||
parser = XMLParser(recover=False)
|
parser = XMLParser(recover=False)
|
||||||
errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError
|
errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError
|
||||||
|
errors = []
|
||||||
|
if eproc.ok_named_entities:
|
||||||
|
errors.append(NamedEntities(name))
|
||||||
|
if eproc.bad_entities:
|
||||||
|
position = PositionFinder(raw)
|
||||||
|
for offset, ent in eproc.bad_entities:
|
||||||
|
lnum, col = position(offset)
|
||||||
|
errors.append(BadEntity(ent, name, lnum, col))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
fromstring(raw, parser=parser)
|
fromstring(eraw, parser=parser)
|
||||||
except XMLSyntaxError as err:
|
except XMLSyntaxError as err:
|
||||||
try:
|
try:
|
||||||
line, col = err.position
|
line, col = err.position
|
||||||
except:
|
except:
|
||||||
line = col = None
|
line = col = None
|
||||||
return [errcls(err.message, name, line, col)]
|
return errors + [errcls(err.message, name, line, col)]
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
return [errcls(err.message, name)]
|
return errors + [errcls(err.message, name)]
|
||||||
return []
|
|
||||||
|
return errors
|
||||||
|
|
||||||
|
@ -9,7 +9,6 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
import os, logging, sys, hashlib, uuid, re, shutil
|
import os, logging, sys, hashlib, uuid, re, shutil
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from bisect import bisect
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
from future_builtins import zip
|
from future_builtins import zip
|
||||||
@ -32,6 +31,7 @@ from calibre.ebooks.oeb.base import (
|
|||||||
rewrite_links, iterlinks, itercsslinks, urlquote, urlunquote)
|
rewrite_links, iterlinks, itercsslinks, urlquote, urlunquote)
|
||||||
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
|
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
|
||||||
from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak
|
from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak
|
||||||
|
from calibre.ebooks.oeb.polish.utils import PositionFinder
|
||||||
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
|
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
|
||||||
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
|
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
|
||||||
from calibre.utils.filenames import nlinks_file, hardlink_file
|
from calibre.utils.filenames import nlinks_file, hardlink_file
|
||||||
@ -293,15 +293,8 @@ class Container(object): # {{{
|
|||||||
elif media_type.lower() in OEB_STYLES:
|
elif media_type.lower() in OEB_STYLES:
|
||||||
if get_line_numbers:
|
if get_line_numbers:
|
||||||
with self.open(name) as f:
|
with self.open(name) as f:
|
||||||
raw = self.decode(f.read())
|
raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
|
||||||
new_lines = tuple(m.start() + 1 for m in re.finditer(r'\n', raw))
|
position = PositionFinder(raw)
|
||||||
def position(pos):
|
|
||||||
lnum = bisect(new_lines, pos)
|
|
||||||
try:
|
|
||||||
offset = abs(pos - new_lines[lnum - 1])
|
|
||||||
except IndexError:
|
|
||||||
offset = pos
|
|
||||||
return (lnum + 1, offset)
|
|
||||||
for link, offset in itercsslinks(raw):
|
for link, offset in itercsslinks(raw):
|
||||||
lnum, col = position(offset)
|
lnum, col = position(offset)
|
||||||
yield link, lnum, col
|
yield link, lnum, col
|
||||||
|
24
src/calibre/ebooks/oeb/polish/utils.py
Normal file
24
src/calibre/ebooks/oeb/polish/utils.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
|
import re
|
||||||
|
from bisect import bisect
|
||||||
|
|
||||||
|
class PositionFinder(object):
    """Convert an offset into ``raw`` to a ``(line number, column)`` pair.

    The start offset of every line after the first is computed once, when
    the object is created, so that each lookup is a binary search.
    """

    def __init__(self, raw):
        """Index the line starts of ``raw``, which may be bytes or text."""
        # The pattern has to be of the same type as the data it searches
        pat = br'\n' if isinstance(raw, bytes) else r'\n'
        # Offset of the first character following each newline
        self.new_lines = tuple(m.start() + 1 for m in re.finditer(pat, raw))

    def __call__(self, pos):
        """Return ``(lnum, offset)`` for the offset ``pos``.

        ``lnum`` is the 1-based line number and ``offset`` is the distance
        of ``pos`` from the start of that line.
        """
        # Number of recorded line starts that are <= pos, i.e. the 0-based
        # index of the line containing pos
        lnum = bisect(self.new_lines, pos)
        if lnum == 0:
            # pos is on the first line, which starts at offset 0. This must
            # be handled explicitly: indexing with lnum - 1 == -1 does not
            # raise IndexError when there are newlines, it silently wraps
            # around to the start of the last line.
            offset = pos
        else:
            offset = pos - self.new_lines[lnum - 1]
        return (lnum + 1, offset)
|
Loading…
x
Reference in New Issue
Block a user