From d72cfadb6f08f43537c829cede04bd7db4eed139 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 23 Dec 2013 09:57:59 +0530
Subject: [PATCH] Edit book: Fix incorrect position syncing in HTML files
 that contain <meta> tags with charset encoding declarations spanning more
 than a single line

---
 src/calibre/ebooks/oeb/polish/parsing.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py
index 47dac638bf..71f26ed71c 100644
--- a/src/calibre/ebooks/oeb/polish/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/parsing.py
@@ -18,7 +18,7 @@ from html5lib.ihatexml import InfosetFilter, DataLossWarning
 from html5lib.html5parser import HTMLParser
 
 from calibre import xml_replace_entities
-from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
+from calibre.ebooks.chardet import xml_to_unicode, ENCODING_PATS
 from calibre.ebooks.oeb.parse_utils import fix_self_closing_cdata_tags
 from calibre.utils.cleantext import clean_xml_chars
 
@@ -599,6 +599,16 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numb
         raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
     return root
 
+def strip_encoding_declarations(raw):
+    # Blank out ENCODING_PATS matches in the leading slice of raw, keeping each match's newlines so line numbers are preserved
+    def newlines_only(match):
+        return '\n' * match.group().count('\n')
+    head_size = 10*1024
+    for pattern in ENCODING_PATS:
+        head, tail = raw[:head_size], raw[head_size:]
+        raw = pattern.sub(newlines_only, head) + tail
+    return raw
+
 def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True):
     if isinstance(raw, bytes):
         raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)