mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Edit book: Fix errors when working on HTML files that parse as valid XML, but do not have the correct XHTML namespace.
This commit is contained in:
parent
3918e3d679
commit
e1901121ae
@ -11,12 +11,12 @@ import re
|
|||||||
from lxml.etree import XMLParser, fromstring, XMLSyntaxError
|
from lxml.etree import XMLParser, fromstring, XMLSyntaxError
|
||||||
import cssutils
|
import cssutils
|
||||||
|
|
||||||
from calibre import force_unicode, human_readable
|
from calibre import force_unicode, human_readable, prepare_string_for_xml
|
||||||
from calibre.ebooks.html_entities import html5_entities
|
from calibre.ebooks.html_entities import html5_entities
|
||||||
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
|
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
|
||||||
from calibre.ebooks.oeb.polish.utils import PositionFinder
|
from calibre.ebooks.oeb.polish.utils import PositionFinder
|
||||||
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR, INFO
|
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR, INFO
|
||||||
from calibre.ebooks.oeb.base import OEB_DOCS
|
from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_NS
|
||||||
|
|
||||||
HTML_ENTITTIES = frozenset(html5_entities)
|
HTML_ENTITTIES = frozenset(html5_entities)
|
||||||
XML_ENTITIES = {'lt', 'gt', 'amp', 'apos', 'quot'}
|
XML_ENTITIES = {'lt', 'gt', 'amp', 'apos', 'quot'}
|
||||||
@ -79,6 +79,24 @@ class BadEntity(BaseError):
|
|||||||
def __init__(self, ent, name, lnum, col):
|
def __init__(self, ent, name, lnum, col):
|
||||||
BaseError.__init__(self, _('Invalid entity: %s') % ent, name, lnum, col)
|
BaseError.__init__(self, _('Invalid entity: %s') % ent, name, lnum, col)
|
||||||
|
|
||||||
|
class BadNamespace(BaseError):
|
||||||
|
|
||||||
|
INDIVIDUAL_FIX = _(
|
||||||
|
'Run fix HTML on this file, which will automatically insert the correct namespace')
|
||||||
|
|
||||||
|
def __init__(self, name, namespace):
|
||||||
|
BaseError.__init__(self, _('Invalid or missing namespace'), name)
|
||||||
|
self.HELP = prepare_string_for_xml(_(
|
||||||
|
'This file has {0}. Its namespace must be {1}. Set the namespace by defining the xmlns'
|
||||||
|
' attribute on the <html> element, like this <html xmlns="{1}">').format(
|
||||||
|
(_('incorrect namespace %s') % namespace) if namespace else _('no namespace'),
|
||||||
|
XHTML_NS))
|
||||||
|
|
||||||
|
def __call__(self, container):
|
||||||
|
container.parsed(self.name)
|
||||||
|
container.dirty(self.name)
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
class EntitityProcessor(object):
|
class EntitityProcessor(object):
|
||||||
|
|
||||||
@ -138,7 +156,7 @@ def check_xml_parsing(name, mt, raw):
|
|||||||
errors.append(BadEntity(ent, name, lnum, col))
|
errors.append(BadEntity(ent, name, lnum, col))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
fromstring(eraw, parser=parser)
|
root = fromstring(eraw, parser=parser)
|
||||||
except XMLSyntaxError as err:
|
except XMLSyntaxError as err:
|
||||||
try:
|
try:
|
||||||
line, col = err.position
|
line, col = err.position
|
||||||
@ -148,6 +166,9 @@ def check_xml_parsing(name, mt, raw):
|
|||||||
except Exception as err:
|
except Exception as err:
|
||||||
return errors + [errcls(err.message, name)]
|
return errors + [errcls(err.message, name)]
|
||||||
|
|
||||||
|
if mt in OEB_DOCS and root.nsmap.get(root.prefix, None) != XHTML_NS:
|
||||||
|
errors.append(BadNamespace(name, root.nsmap.get(root.prefix, None)))
|
||||||
|
|
||||||
return errors
|
return errors
|
||||||
|
|
||||||
class CSSError(BaseError):
|
class CSSError(BaseError):
|
||||||
|
@ -629,6 +629,8 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
|
|||||||
try:
|
try:
|
||||||
parser = XMLParser(no_network=True)
|
parser = XMLParser(no_network=True)
|
||||||
ans = fromstring(raw, parser=parser)
|
ans = fromstring(raw, parser=parser)
|
||||||
|
if ans.tag != '{%s}html' % html_ns:
|
||||||
|
raise ValueError('Root tag is not <html> in the XHTML namespace')
|
||||||
if linenumber_attribute:
|
if linenumber_attribute:
|
||||||
for elem in ans.iter(LxmlElement):
|
for elem in ans.iter(LxmlElement):
|
||||||
if elem.sourceline is not None:
|
if elem.sourceline is not None:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user