Add replace_encoding_declarations() to calibre.ebooks.chardet and use it in
the Tweak Book editor.

strip_encoding_declarations() gains a limit parameter (default 50*1024) and
now splits raw into prefix and suffix once, instead of once per pattern. The
new replace_encoding_declarations() rewrites the encoding named by each
declaration instead of removing the declaration, and reports whether
anything was changed. The editor's data getter calls it with enc='utf-8' and
limit=4*1024 before returning the text encoded as UTF-8.

Note: the added lines in chardet.py were revised after this patch was
generated (docstrings, and the slice offsets in sub()), so the post-image
blob id on that file's index line is no longer exact.

diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py
index f634f0a77e..0785252c4d 100644
--- a/src/calibre/ebooks/chardet.py
+++ b/src/calibre/ebooks/chardet.py
@@ -19,15 +19,51 @@ ENCODING_PATS = [
 ]
 ENTITY_PATTERN = re.compile(r'&(\S+?);')
 
-def strip_encoding_declarations(raw):
-    limit = 50*1024
+def strip_encoding_declarations(raw, limit=50*1024):
+    '''
+    Remove all encoding declarations matched by ENCODING_PATS from raw.
+
+    Only the first ``limit`` characters of raw are searched; the remainder
+    is appended unchanged.
+    '''
+    prefix = raw[:limit]
+    suffix = raw[limit:]
     for pat in ENCODING_PATS:
-        prefix = raw[:limit]
-        suffix = raw[limit:]
         prefix = pat.sub('', prefix)
-        raw = prefix + suffix
+    raw = prefix + suffix
     return raw
 
+def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
+    '''
+    Change the encoding named in every encoding declaration matched by
+    ENCODING_PATS to enc.
+
+    Only the first ``limit`` characters of raw are searched; the remainder
+    is appended unchanged. Encoding names are compared case-insensitively,
+    so a declaration that already names enc is left untouched.
+
+    Returns a tuple (new_raw, changed), where changed is True if at least
+    one declaration was rewritten.
+    '''
+    prefix = raw[:limit]
+    suffix = raw[limit:]
+    changed = [False]
+    def sub(m):
+        ans = m.group()
+        if m.group(1).lower() != enc.lower():
+            changed[0] = True
+            # Both offsets are relative to the start of the match. Using
+            # m.end(1) - m.end(0) would give 0 when group 1 runs to the end
+            # of the match, and ans[0:] would duplicate the whole match.
+            start, end = m.start(1) - m.start(0), m.end(1) - m.start(0)
+            ans = ans[:start] + enc + ans[end:]
+        return ans
+
+    for pat in ENCODING_PATS:
+        prefix = pat.sub(sub, prefix)
+    raw = prefix + suffix
+    return raw, changed[0]
+
 def substitute_entites(raw):
     from calibre import xml_entity_to_unicode
     return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
diff --git a/src/calibre/gui2/tweak_book/editor/widget.py b/src/calibre/gui2/tweak_book/editor/widget.py
index cc50c280c8..87f8da80d8 100644
--- a/src/calibre/gui2/tweak_book/editor/widget.py
+++ b/src/calibre/gui2/tweak_book/editor/widget.py
@@ -15,6 +15,7 @@ from PyQt4.Qt import (
 
 from calibre import prints
 from calibre.constants import DEBUG
+from calibre.ebooks.chardet import replace_encoding_declarations
 from calibre.gui2 import error_dialog
 from calibre.gui2.tweak_book import actions, current_container, tprefs, dictionaries, editor_toolbar_actions
 from calibre.gui2.tweak_book.editor import SPELL_PROPERTY
@@ -136,6 +137,9 @@ class Editor(QMainWindow):
     def data(self):
         def fget(self):
             ans = self.get_raw_data()
+            ans, changed = replace_encoding_declarations(ans, enc='utf-8', limit=4*1024)
+            if changed:
+                self.data = ans
             return ans.encode('utf-8')
         def fset(self, val):
             self.editor.load_text(val, syntax=self.syntax)