def strip_encoding_declarations(raw, limit=10*1024, patterns=None):
    '''
    Blank out encoding declarations near the start of ``raw`` while keeping
    its line count unchanged.

    Every match is replaced by as many newlines as the matched text
    contained, so a declaration that spans more than one line does not shift
    the line numbers of whatever follows it. This was introduced to fix
    incorrect position syncing in Edit book for HTML files with such
    multi-line declarations.

    :param raw: The markup text to clean.
    :param limit: Number of leading characters of ``raw`` that are searched.
        Everything after that point is returned untouched.
    :param patterns: Iterable of compiled regular expressions to strip.
        When ``None`` the module level ``ENCODING_PATS`` is used.
    :return: ``raw`` with every match inside the leading window replaced by
        the newlines it contained.
    '''
    if patterns is None:
        patterns = ENCODING_PATS

    def blank(match):
        # Drop the declaration but keep its line breaks
        return '\n' * match.group().count('\n')

    # Split exactly once: re-slicing after each substitution would let the
    # search window creep past ``limit`` as the text gets shorter.
    head, tail = raw[:limit], raw[limit:]
    for pat in patterns:
        head = pat.sub(blank, head)
    return head + tail