From 01d2a6df56fc14f2dca3831ab2ae5b0a83aac185 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 22 Jun 2014 12:23:17 +0530
Subject: [PATCH] Edit Book: When editing HTML files that have charset encoding
 declarations, automatically change the declared encoding (if any) to UTF-8
 on save, since the editor always saves files in the UTF-8 encoding. This
 prevents a mismatch between the declared encoding and the actual encoding if
 the HTML file was originally in an encoding other than UTF-8.

---
 src/calibre/ebooks/chardet.py                | 26 ++++++++++++++++----
 src/calibre/gui2/tweak_book/editor/widget.py |  4 +++
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py
index f634f0a77e..0785252c4d 100644
--- a/src/calibre/ebooks/chardet.py
+++ b/src/calibre/ebooks/chardet.py
@@ -19,15 +19,31 @@ ENCODING_PATS = [
 ]
 ENTITY_PATTERN = re.compile(r'&(\S+?);')
 
-def strip_encoding_declarations(raw):
-    limit = 50*1024
+def strip_encoding_declarations(raw, limit=50*1024):
+    prefix = raw[:limit]
+    suffix = raw[limit:]
     for pat in ENCODING_PATS:
-        prefix = raw[:limit]
-        suffix = raw[limit:]
         prefix = pat.sub('', prefix)
-        raw = prefix + suffix
+    raw = prefix + suffix
     return raw
 
+def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
+    prefix = raw[:limit]
+    suffix = raw[limit:]
+    changed = [False]
+    def sub(m):
+        ans = m.group()
+        if m.group(1).lower() != enc.lower():
+            changed[0] = True
+            start, end = m.start(1) - m.start(0), m.end(1) - m.end(0)
+            ans = ans[:start] + enc + ans[end:]
+        return ans
+
+    for pat in ENCODING_PATS:
+        prefix = pat.sub(sub, prefix)
+    raw = prefix + suffix
+    return raw, changed[0]
+
 def substitute_entites(raw):
     from calibre import xml_entity_to_unicode
     return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
diff --git a/src/calibre/gui2/tweak_book/editor/widget.py b/src/calibre/gui2/tweak_book/editor/widget.py
index cc50c280c8..87f8da80d8 100644
--- a/src/calibre/gui2/tweak_book/editor/widget.py
+++ b/src/calibre/gui2/tweak_book/editor/widget.py
@@ -15,6 +15,7 @@ from PyQt4.Qt import (
 
 from calibre import prints
 from calibre.constants import DEBUG
+from calibre.ebooks.chardet import replace_encoding_declarations
 from calibre.gui2 import error_dialog
 from calibre.gui2.tweak_book import actions, current_container, tprefs, dictionaries, editor_toolbar_actions
 from calibre.gui2.tweak_book.editor import SPELL_PROPERTY
@@ -136,6 +137,9 @@ class Editor(QMainWindow):
     def data(self):
         def fget(self):
             ans = self.get_raw_data()
+            ans, changed = replace_encoding_declarations(ans, enc='utf-8', limit=4*1024)
+            if changed:
+                self.data = ans
             return ans.encode('utf-8')
         def fset(self, val):
             self.editor.load_text(val, syntax=self.syntax)