Special case entities when checking for XML well-formedness

This commit is contained in:
Kovid Goyal 2013-12-10 11:31:14 +05:30
parent 66d462d28c
commit 33086f2061
4 changed files with 121 additions and 16 deletions

View File

@ -52,7 +52,10 @@ def fix_errors(container, errors):
for err in errors: for err in errors:
if err.INDIVIDUAL_FIX: if err.INDIVIDUAL_FIX:
if err(container): if err(container) is not False:
# Assume changed unless fixer explicitly says no change (this
# is because sometimes I forget to return True, and it is
# better to have a false positive than a false negative)
changed = True changed = True
return changed return changed

View File

@ -6,11 +6,21 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from lxml.etree import XMLParser, fromstring, XMLSyntaxError from lxml.etree import XMLParser, fromstring, XMLSyntaxError
from calibre.ebooks.oeb.polish.check.base import BaseError from calibre.ebooks.html_entities import html5_entities
from calibre.ebooks.oeb.polish.utils import PositionFinder
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN
from calibre.ebooks.oeb.base import OEB_DOCS from calibre.ebooks.oeb.base import OEB_DOCS
# Keys of html5_entities, i.e. every entity name that table can resolve.
HTML_ENTITTIES = frozenset(html5_entities)
# Entity names treated as XML entities.
XML_ENTITIES = {'lt', 'gt', 'amp', 'apos', 'quot'}
# Every entity name known from either of the two sets above.
ALL_ENTITIES = HTML_ENTITTIES.union(XML_ENTITIES)
# Matches '&name;' for any name that is in HTML_ENTITTIES but not in
# XML_ENTITIES; group 1 is the bare name.
replace_pat = re.compile(
    '&(%s);' % '|'.join(map(re.escape, sorted(HTML_ENTITTIES.difference(XML_ENTITIES)))))
class XMLParseError(BaseError): class XMLParseError(BaseError):
is_parsing_error = True is_parsing_error = True
@ -30,19 +40,94 @@ class HTMLParseError(XMLParseError):
' incorrect display of content. These errors can usually be fixed automatically,' ' incorrect display of content. These errors can usually be fixed automatically,'
' however, automatic fixing can sometimes "do the wrong thing".') ' however, automatic fixing can sometimes "do the wrong thing".')
class NamedEntities(BaseError):
    """Warning-level check result for a file that contains named entities.

    Calling the instance with a container rewrites the file, substituting
    every entity matched by ``replace_pat`` with its value from
    ``html5_entities``.
    """

    level = WARN
    INDIVIDUAL_FIX = _('Replace all named entities with their character equivalents in this file')
    HELP = _('Named entities are often only incompletely supported by various book reading software.'
             ' Therefore, it is best to not use them, replacing them with the actual characters they'
             ' represent. This can be done automatically.')

    def __init__(self, name):
        BaseError.__init__(self, _('Named entities present'), name)

    def __call__(self, container):
        """Rewrite ``self.name`` in ``container`` and return True."""

        def as_character(match):
            # Group 1 is the entity name without the surrounding '&' and ';'.
            return html5_entities[match.group(1)]

        original = container.raw_data(self.name)
        converted = replace_pat.sub(as_character, original)
        with container.open(self.name, 'wb') as stream:
            stream.write(converted.encode('utf-8'))
        return True
class BadEntity(BaseError):
    """Check result for an entity reference that was not recognized."""

    HELP = _('This is an invalid (unrecognized) entity. Replace it with whatever'
             ' text it is supposed to have represented.')

    def __init__(self, ent, name, lnum, col):
        """Record the offending entity text ``ent`` and where it was found."""
        message = _('Invalid entity: %s') % ent
        BaseError.__init__(self, message, name, lnum, col)
class EntitityProcessor(object):
    """Substitution callback that classifies entity references in bytes.

    Intended for use as the replacement function of a regex whose group 1
    is the text between '&' and ';'. While substituting it records:

    * ``ok_named_entities`` -- start offsets of recognized named entities
    * ``bad_entities`` -- ``(start offset, matched text)`` for anything
      unrecognized or numerically malformed

    XML entities and well-formed numeric references are returned unchanged;
    everything else is replaced by spaces of equal length, so offsets in the
    result still line up with the input.
    """

    def __init__(self, mt):
        # HTML documents may use the full entity table, anything else only
        # the XML entities.
        self.entities = ALL_ENTITIES if mt in OEB_DOCS else XML_ENTITIES
        self.ok_named_entities = []
        self.bad_entities = []

    def __call__(self, m):
        matched = m.group()
        val = m.group(1).decode('ascii')

        # Leave XML entities alone
        if val in XML_ENTITIES:
            return matched

        blank = b' ' * len(matched)
        where = m.start()

        if val.startswith('#'):
            digits = val[1:]
            base = 10
            if digits.startswith('x'):
                digits, base = digits[1:], 16
            try:
                int(digits, base)
            except ValueError:
                # Invalid numerical entity
                self.bad_entities.append((where, matched))
                return blank
            return matched

        if val in self.entities:
            # Known named entity, report it
            self.ok_named_entities.append(where)
        else:
            self.bad_entities.append((where, matched))
        return blank
# Bytes pattern for a candidate entity reference: '&', an optional '#',
# one to eight ASCII letters/digits, then ';'. Group 1 is the text between
# '&' and ';'.
# NOTE(review): names longer than eight characters are not matched by this
# pattern -- confirm that is intended.
entity_pat = re.compile(br'&(#{0,1}[a-zA-Z0-9]{1,8});')
def check_xml_parsing(name, mt, raw): def check_xml_parsing(name, mt, raw):
raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
# Get rid of entities as named entities trip up the XML parser
eproc = EntitityProcessor(mt)
eraw = entity_pat.sub(eproc, raw)
parser = XMLParser(recover=False) parser = XMLParser(recover=False)
errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError
errors = []
if eproc.ok_named_entities:
errors.append(NamedEntities(name))
if eproc.bad_entities:
position = PositionFinder(raw)
for offset, ent in eproc.bad_entities:
lnum, col = position(offset)
errors.append(BadEntity(ent, name, lnum, col))
try: try:
fromstring(raw, parser=parser) fromstring(eraw, parser=parser)
except XMLSyntaxError as err: except XMLSyntaxError as err:
try: try:
line, col = err.position line, col = err.position
except: except:
line = col = None line = col = None
return [errcls(err.message, name, line, col)] return errors + [errcls(err.message, name, line, col)]
except Exception as err: except Exception as err:
return [errcls(err.message, name)] return errors + [errcls(err.message, name)]
return []
return errors

View File

@ -9,7 +9,6 @@ __docformat__ = 'restructuredtext en'
import os, logging, sys, hashlib, uuid, re, shutil import os, logging, sys, hashlib, uuid, re, shutil
from collections import defaultdict from collections import defaultdict
from bisect import bisect
from io import BytesIO from io import BytesIO
from urlparse import urlparse from urlparse import urlparse
from future_builtins import zip from future_builtins import zip
@ -32,6 +31,7 @@ from calibre.ebooks.oeb.base import (
rewrite_links, iterlinks, itercsslinks, urlquote, urlunquote) rewrite_links, iterlinks, itercsslinks, urlquote, urlunquote)
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak
from calibre.ebooks.oeb.polish.utils import PositionFinder
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
from calibre.utils.filenames import nlinks_file, hardlink_file from calibre.utils.filenames import nlinks_file, hardlink_file
@ -293,15 +293,8 @@ class Container(object): # {{{
elif media_type.lower() in OEB_STYLES: elif media_type.lower() in OEB_STYLES:
if get_line_numbers: if get_line_numbers:
with self.open(name) as f: with self.open(name) as f:
raw = self.decode(f.read()) raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
new_lines = tuple(m.start() + 1 for m in re.finditer(r'\n', raw)) position = PositionFinder(raw)
def position(pos):
lnum = bisect(new_lines, pos)
try:
offset = abs(pos - new_lines[lnum - 1])
except IndexError:
offset = pos
return (lnum + 1, offset)
for link, offset in itercsslinks(raw): for link, offset in itercsslinks(raw):
lnum, col = position(offset) lnum, col = position(offset)
yield link, lnum, col yield link, lnum, col

View File

@ -0,0 +1,24 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from bisect import bisect
class PositionFinder(object):
    """Translate an offset into ``raw`` into a ``(line, column)`` pair.

    ``raw`` may be bytes or text. Only LF characters are counted as line
    breaks. Line numbers are 1-based; the column is the 0-based distance of
    the offset from the start of its line.
    """

    def __init__(self, raw):
        pat = br'\n' if isinstance(raw, bytes) else r'\n'
        # Offset of the first character of every line after the first one.
        self.new_lines = tuple(m.start() + 1 for m in re.finditer(pat, raw))

    def __call__(self, pos):
        """Return ``(line number, column)`` for the offset ``pos``."""
        lnum = bisect(self.new_lines, pos)
        if lnum == 0:
            # Still on the first line. Indexing with lnum - 1 == -1 here
            # would silently wrap around to the *last* line start and give
            # a bogus column, so handle this case explicitly.
            offset = pos
        else:
            # bisect guarantees new_lines[lnum - 1] <= pos.
            offset = pos - self.new_lines[lnum - 1]
        return (lnum + 1, offset)