mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
Edit book: Fix incorrect position syncing in HTML files that contain <meta> tags whose charset encoding declarations span more than a single line
This commit is contained in:
parent
5ba7de5fc7
commit
d72cfadb6f
@ -18,7 +18,7 @@ from html5lib.ihatexml import InfosetFilter, DataLossWarning
|
|||||||
from html5lib.html5parser import HTMLParser
|
from html5lib.html5parser import HTMLParser
|
||||||
|
|
||||||
from calibre import xml_replace_entities
|
from calibre import xml_replace_entities
|
||||||
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
from calibre.ebooks.chardet import xml_to_unicode, ENCODING_PATS
|
||||||
from calibre.ebooks.oeb.parse_utils import fix_self_closing_cdata_tags
|
from calibre.ebooks.oeb.parse_utils import fix_self_closing_cdata_tags
|
||||||
from calibre.utils.cleantext import clean_xml_chars
|
from calibre.utils.cleantext import clean_xml_chars
|
||||||
|
|
||||||
@ -599,6 +599,16 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numb
|
|||||||
raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
|
raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
|
||||||
return root
|
return root
|
||||||
|
|
||||||
|
def strip_encoding_declarations(raw):
    """Remove encoding declarations from *raw* while keeping line numbers.

    Each pattern in ``ENCODING_PATS`` is applied only to the leading
    ``10 * 1024`` characters of the text. Every match is replaced by as many
    newline characters as the matched text contained, so the line on which
    any later content sits does not change.

    Returns the text with the matched declarations blanked out.
    """
    scan_limit = 10 * 1024

    def _newlines_only(match):
        # Keep just the line breaks of the removed declaration so a
        # declaration spread over several lines does not shift line numbers.
        return '\n' * match.group().count('\n')

    for pattern in ENCODING_PATS:
        # Re-split on every pass: an earlier substitution may have shortened
        # the text, which moves the boundary of the scanned region.
        head, tail = raw[:scan_limit], raw[scan_limit:]
        raw = pattern.sub(_newlines_only, head) + tail
    return raw
|
||||||
|
|
||||||
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True):
|
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True):
|
||||||
if isinstance(raw, bytes):
|
if isinstance(raw, bytes):
|
||||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user