From 33086f206129bc03ab041c8003a59de3760c0ab0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 10 Dec 2013 11:31:14 +0530 Subject: [PATCH] Special case entities when checking for XML well-formedness --- src/calibre/ebooks/oeb/polish/check/main.py | 5 +- .../ebooks/oeb/polish/check/parsing.py | 95 ++++++++++++++++++- src/calibre/ebooks/oeb/polish/container.py | 13 +-- src/calibre/ebooks/oeb/polish/utils.py | 24 +++++ 4 files changed, 121 insertions(+), 16 deletions(-) create mode 100644 src/calibre/ebooks/oeb/polish/utils.py diff --git a/src/calibre/ebooks/oeb/polish/check/main.py b/src/calibre/ebooks/oeb/polish/check/main.py index f6db7d2e7c..ad2c33d55d 100644 --- a/src/calibre/ebooks/oeb/polish/check/main.py +++ b/src/calibre/ebooks/oeb/polish/check/main.py @@ -52,7 +52,10 @@ def fix_errors(container, errors): for err in errors: if err.INDIVIDUAL_FIX: - if err(container): + if err(container) is not False: + # Assume changed unless fixer explicitly says no change (this + # is because sometimes I forget to return True, and it is + # better to have a false positive than a false negative) changed = True return changed diff --git a/src/calibre/ebooks/oeb/polish/check/parsing.py b/src/calibre/ebooks/oeb/polish/check/parsing.py index c34609f1ab..bd351b5e86 100644 --- a/src/calibre/ebooks/oeb/polish/check/parsing.py +++ b/src/calibre/ebooks/oeb/polish/check/parsing.py @@ -6,11 +6,21 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' +import re + from lxml.etree import XMLParser, fromstring, XMLSyntaxError -from calibre.ebooks.oeb.polish.check.base import BaseError +from calibre.ebooks.html_entities import html5_entities +from calibre.ebooks.oeb.polish.utils import PositionFinder +from calibre.ebooks.oeb.polish.check.base import BaseError, WARN from calibre.ebooks.oeb.base import OEB_DOCS +HTML_ENTITTIES = frozenset(html5_entities) +XML_ENTITIES = {'lt', 'gt', 'amp', 'apos', 'quot'} 
ALL_ENTITIES = HTML_ENTITTIES | XML_ENTITIES

# Matches only the HTML named entities that are *not* one of the five
# predefined XML entities, so the automatic fixer never expands &lt;, &amp;
# and friends (expanding those would change the meaning of the markup).
replace_pat = re.compile('&(%s);' % '|'.join(re.escape(x) for x in sorted((HTML_ENTITTIES - XML_ENTITIES))))


class NamedEntities(BaseError):
    """Warning reported once per file that contains known named entities.

    Calling the instance with a container applies the automatic fix: every
    entity matched by ``replace_pat`` is replaced by the corresponding value
    from ``html5_entities`` and the file is written back as UTF-8.
    """

    level = WARN
    INDIVIDUAL_FIX = _('Replace all named entities with their character equivalents in this file')
    HELP = _('Named entities are often only incompletely supported by various book reading software.'
             ' Therefore, it is best to not use them, replacing them with the actual characters they'
             ' represent. This can be done automatically.')

    def __init__(self, name):
        BaseError.__init__(self, _('Named entities present'), name)

    def __call__(self, container):
        """Replace named entities in the file ``self.name``.

        Returns True if the file was rewritten and False if there was
        nothing to replace. The caller treats anything other than an
        explicit False as "changed", so reporting False here avoids a
        spurious modification when the fix is applied to an already clean
        file.
        """
        # NOTE(review): raw_data() presumably returns decoded text (the
        # result is .encode()d below) -- confirm against the container API.
        raw = container.raw_data(self.name)
        nraw = replace_pat.sub(lambda m:html5_entities[m.group(1)], raw)
        if nraw == raw:
            # Nothing was substituted, do not touch the file
            return False
        with container.open(self.name, 'wb') as f:
            f.write(nraw.encode('utf-8'))
        return True


class BadEntity(BaseError):
    """Error reported for each entity reference that is not recognized."""

    HELP = _('This is an invalid (unrecognized) entity. Replace it with whatever'
             ' text it is supposed to have represented.')

    def __init__(self, ent, name, lnum, col):
        BaseError.__init__(self, _('Invalid entity: %s') % ent, name, lnum, col)


class EntitityProcessor(object):
    """Substitution callback for ``entity_pat`` that classifies entities.

    An instance is passed as the replacement function to
    ``entity_pat.sub()`` over the raw bytes of a file. While substituting
    it records:

    ``ok_named_entities``
        start offsets of known named (non XML) entities
    ``bad_entities``
        ``(start offset, matched bytes)`` for unknown named entities and
        for malformed numeric entities

    Recorded entities are replaced by the same number of spaces, so that
    every offset in the substituted data still corresponds to the same
    offset in the original data.
    """

    def __init__(self, mt):
        # HTML named entities are only acceptable in OEB (HTML) documents,
        # every other XML file may only use the predefined XML entities.
        self.entities = ALL_ENTITIES if mt in OEB_DOCS else XML_ENTITIES
        self.ok_named_entities = []
        self.bad_entities = []

    def __call__(self, m):
        val = m.group(1).decode('ascii')
        if val in XML_ENTITIES:
            # Leave XML entities alone
            return m.group()

        if val.startswith('#'):
            nval = val[1:]
            try:
                if nval.startswith('x'):
                    int(nval[1:], 16)
                else:
                    int(nval, 10)
            except ValueError:
                # Invalid numerical entity
                self.bad_entities.append((m.start(), m.group()))
                return b' ' * len(m.group())
            # Well formed numeric entity, the XML parser can handle it
            return m.group()

        if val in self.entities:
            # Known named entity, report it
            self.ok_named_entities.append(m.start())
        else:
            self.bad_entities.append((m.start(), m.group()))
        # Named entities (known or not) are blanked out, as the XML parser
        # would otherwise fail on them
        return b' ' * len(m.group())

entity_pat = re.compile(br'&(#{0,1}[a-zA-Z0-9]{1,8});')


def check_xml_parsing(name, mt, raw):
    """Check the bytes ``raw`` of the file ``name`` for XML well-formedness.

    :param name: name of the file, stored in the returned error objects
    :param mt: media type of the file, selects the allowed entities and
        the error class used for parse failures
    :param raw: the file contents as bytes
    :return: a (possibly empty) list of errors: at most one
        ``NamedEntities``, one ``BadEntity`` per bad entity and at most
        one parse error.
    """
    # Normalize line endings, so that offsets in raw agree with the line
    # accounting of PositionFinder, which only looks for \n
    raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
    # Get rid of entities as named entities trip up the XML parser
    eproc = EntitityProcessor(mt)
    eraw = entity_pat.sub(eproc, raw)
    parser = XMLParser(recover=False)
    errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError
    errors = []
    if eproc.ok_named_entities:
        errors.append(NamedEntities(name))
    if eproc.bad_entities:
        position = PositionFinder(raw)
        for offset, ent in eproc.bad_entities:
            lnum, col = position(offset)
            errors.append(BadEntity(ent, name, lnum, col))

    try:
        fromstring(eraw, parser=parser)
    except XMLSyntaxError as err:
        try:
            line, col = err.position
        except Exception:
            # Position information is best effort only. Do not use a bare
            # except here, it would also swallow KeyboardInterrupt and
            # SystemExit.
            line = col = None
        return errors + [errcls(err.message, name, line, col)]
    except Exception as err:
        return errors + [errcls(err.message, name)]

    return errors
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal '

import re
from bisect import bisect


class PositionFinder(object):
    """Convert absolute offsets into ``raw`` to (line number, column) pairs.

    ``raw`` may be either bytes or text. Only ``\\n`` is treated as a line
    separator, so callers should normalize line endings first.
    """

    def __init__(self, raw):
        pat = br'\n' if isinstance(raw, bytes) else r'\n'
        # Offsets of the first character of every line except the first
        self.new_lines = tuple(m.start() + 1 for m in re.finditer(pat, raw))

    def __call__(self, pos):
        """Return ``(lnum, offset)`` for the absolute offset ``pos``.

        ``lnum`` is the 1-based line number and ``offset`` is the 0-based
        distance of ``pos`` from the start of that line.
        """
        # Number of line starts at or before pos, i.e. the 0-based line index
        lnum = bisect(self.new_lines, pos)
        if lnum > 0:
            offset = pos - self.new_lines[lnum - 1]
        else:
            # pos is on the first line. This must be special cased, as
            # new_lines[-1] does not raise IndexError for a non-empty tuple,
            # it silently returns the start of the *last* line.
            offset = pos
        return (lnum + 1, offset)