mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Edit book: Fix errors when working on HTML files that parse as valid XML, but do not have the correct XHTML namespace.
This commit is contained in:
parent
3918e3d679
commit
e1901121ae
@ -11,12 +11,12 @@ import re
|
||||
from lxml.etree import XMLParser, fromstring, XMLSyntaxError
|
||||
import cssutils
|
||||
|
||||
from calibre import force_unicode, human_readable
|
||||
from calibre import force_unicode, human_readable, prepare_string_for_xml
|
||||
from calibre.ebooks.html_entities import html5_entities
|
||||
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
|
||||
from calibre.ebooks.oeb.polish.utils import PositionFinder
|
||||
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR, INFO
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_NS
|
||||
|
||||
HTML_ENTITTIES = frozenset(html5_entities)
|
||||
XML_ENTITIES = {'lt', 'gt', 'amp', 'apos', 'quot'}
|
||||
@ -79,6 +79,24 @@ class BadEntity(BaseError):
|
||||
def __init__(self, ent, name, lnum, col):
|
||||
BaseError.__init__(self, _('Invalid entity: %s') % ent, name, lnum, col)
|
||||
|
||||
class BadNamespace(BaseError):
    """Error reported for a file whose root element is not in the XHTML namespace.

    The error carries no line/column information; it is attached to the file
    as a whole via ``name``.
    """

    # Text describing the one-click fix offered for this individual error.
    INDIVIDUAL_FIX = _(
        'Run fix HTML on this file, which will automatically insert the correct namespace')

    def __init__(self, name, namespace):
        """Build the error for the file ``name``.

        :param name: Name of the offending file, passed through to BaseError.
        :param namespace: The namespace found on the root element, or a falsy
            value (``None``/empty) when the root element has no namespace.
        """
        BaseError.__init__(self, _('Invalid or missing namespace'), name)
        # The help text contains literal <html> markup, so the fully formatted
        # message is escaped for XML before being stored.
        self.HELP = prepare_string_for_xml(_(
            'This file has {0}. Its namespace must be {1}. Set the namespace by defining the xmlns'
            ' attribute on the <html> element, like this <html xmlns="{1}">').format(
                (_('incorrect namespace %s') % namespace) if namespace else _('no namespace'),
                XHTML_NS))

    def __call__(self, container):
        """Apply the automatic fix to this file in ``container``.

        Asks the container for the parsed form of the file and then marks the
        file dirty. Always returns True.
        """
        # NOTE(review): presumably parsing through the container is what
        # inserts the correct namespace (as INDIVIDUAL_FIX describes) - confirm
        # against the container implementation.
        container.parsed(self.name)
        container.dirty(self.name)
        return True
|
||||
|
||||
|
||||
class EntitityProcessor(object):
|
||||
|
||||
@ -138,7 +156,7 @@ def check_xml_parsing(name, mt, raw):
|
||||
errors.append(BadEntity(ent, name, lnum, col))
|
||||
|
||||
try:
|
||||
fromstring(eraw, parser=parser)
|
||||
root = fromstring(eraw, parser=parser)
|
||||
except XMLSyntaxError as err:
|
||||
try:
|
||||
line, col = err.position
|
||||
@ -148,6 +166,9 @@ def check_xml_parsing(name, mt, raw):
|
||||
except Exception as err:
|
||||
return errors + [errcls(err.message, name)]
|
||||
|
||||
if mt in OEB_DOCS and root.nsmap.get(root.prefix, None) != XHTML_NS:
|
||||
errors.append(BadNamespace(name, root.nsmap.get(root.prefix, None)))
|
||||
|
||||
return errors
|
||||
|
||||
class CSSError(BaseError):
|
||||
|
@ -629,6 +629,8 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
|
||||
try:
|
||||
parser = XMLParser(no_network=True)
|
||||
ans = fromstring(raw, parser=parser)
|
||||
if ans.tag != '{%s}html' % html_ns:
|
||||
raise ValueError('Root tag is not <html> in the XHTML namespace')
|
||||
if linenumber_attribute:
|
||||
for elem in ans.iter(LxmlElement):
|
||||
if elem.sourceline is not None:
|
||||
|
Loading…
x
Reference in New Issue
Block a user