Edit book: Fix incorrect position syncing in HTML files that contain <meta> tags whose charset encoding declarations span more than one line

2025-11-23 23:13:02 -05:00 · 2013-12-23 09:57:59 +05:30 · 2013-12-23 09:57:59 +05:30 · d72cfadb6f
commit d72cfadb6f
parent 5ba7de5fc7
1 changed files with 11 additions and 1 deletions
--- a/src/calibre/ebooks/oeb/polish/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/parsing.py
@ -18,7 +18,7 @@ from html5lib.ihatexml import InfosetFilter, DataLossWarning
 from html5lib.html5parser import HTMLParser
 from calibre import xml_replace_entities
-from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
+from calibre.ebooks.chardet import xml_to_unicode, ENCODING_PATS
 from calibre.ebooks.oeb.parse_utils import fix_self_closing_cdata_tags
 from calibre.utils.cleantext import clean_xml_chars
@ -599,6 +599,16 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numb
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
 def strip_encoding_declarations(raw, limit=10*1024, patterns=None):
    """Blank out encoding declarations at the start of *raw*, preserving line numbers.

    A custom encoding stripper: every match of an encoding-declaration
    pattern is replaced by exactly as many newlines as the matched text
    contained, so a declaration that spans several lines does not shift
    the line numbers of the content that follows it.

    :param raw: The markup to process.
    :param limit: Only the first ``limit`` characters of ``raw`` are
        searched; everything after that is returned untouched.
    :param patterns: Iterable of compiled regular expressions to strip.
        Defaults to ``ENCODING_PATS`` from ``calibre.ebooks.chardet``.
    :return: ``raw`` with the matched declarations replaced by newlines.
    """
    if patterns is None:
        patterns = ENCODING_PATS

    def keep_newlines(match):
        # Replace the declaration with only the line breaks it contained.
        return '\n' * match.group().count('\n')

    # Split once, up front. Re-slicing after every substitution would let
    # text from beyond ``limit`` drift into the searched window whenever an
    # earlier pattern shortened the head.
    head, tail = raw[:limit], raw[limit:]
    for pat in patterns:
        head = pat.sub(keep_newlines, head)
    return head + tail
 def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)