Edit Book: When editing HTML files that have charset encoding declarations, automatically change the declared encoding (if any) to UTF-8 on save, since the editor always saves files in the UTF-8 encoding.

This prevents a mismatch between the declared encoding and the actual
encoding if the HTML file was originally in an encoding other than
UTF-8.
This commit is contained in:
Kovid Goyal 2014-06-22 12:23:17 +05:30
parent 78be882aff
commit 01d2a6df56
2 changed files with 25 additions and 5 deletions

View File

@ -19,15 +19,31 @@ ENCODING_PATS = [
]
# Matches an entity-style reference: '&', then the shortest possible run of
# non-whitespace characters (captured as group 1), then ';'.
ENTITY_PATTERN = re.compile(r'&(\S+?);')
def strip_encoding_declarations(raw, limit=50*1024):
    """Remove encoding declarations from the leading part of *raw*.

    Only the first ``limit`` characters are searched. Every match of each
    pattern in ``ENCODING_PATS`` found in that slice is deleted; the rest of
    *raw* is passed through untouched. A declaration that straddles the
    ``limit`` boundary is therefore not seen by the patterns.

    :param raw: The markup text to clean.
    :param limit: Length of the leading slice that is scanned
        (default ``50*1024``).
    :return: *raw* with the matched declarations removed.
    """
    prefix = raw[:limit]
    suffix = raw[limit:]
    for pat in ENCODING_PATS:
        prefix = pat.sub('', prefix)
    return prefix + suffix
def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
    """Rewrite the encoding named in declarations found in *raw* to *enc*.

    Only the first ``limit`` characters are searched. For every match of each
    pattern in ``ENCODING_PATS``, the text captured by group 1 (the declared
    encoding name) is replaced with *enc* unless it already equals *enc*,
    compared case-insensitively. The rest of the matched declaration and the
    remainder of *raw* are left untouched.

    :param raw: The markup text to process.
    :param enc: Encoding name to write into the declarations.
    :param limit: Length of the leading slice that is scanned
        (default ``50*1024``).
    :return: A ``(text, changed)`` tuple; ``changed`` is True if at least one
        declaration was rewritten.
    """
    prefix = raw[:limit]
    suffix = raw[limit:]
    # One-element list so the nested callback can flip the flag without
    # ``nonlocal`` (keeps the code usable on Python 2).
    changed = [False]
    wanted = enc.lower()

    def sub(m):
        whole = m.group()
        declared = m.group(1)
        # Group 1 did not participate, or the declaration already names the
        # requested encoding: leave the match exactly as it was.
        if declared is None or declared.lower() == wanted:
            return whole
        changed[0] = True
        # Offsets of group 1 measured from the start of the match. Measuring
        # the end offset from the end of the match would yield 0 when the
        # group is the last thing matched, and ``whole[0:]`` would then
        # re-append the entire match instead of an empty tail.
        base = m.start(0)
        start, end = m.start(1) - base, m.end(1) - base
        return whole[:start] + enc + whole[end:]

    for pat in ENCODING_PATS:
        prefix = pat.sub(sub, prefix)
    return prefix + suffix, changed[0]
def substitute_entites(raw):
    """Return *raw* with every ``ENTITY_PATTERN`` match substituted.

    Each match is handed to ``calibre.xml_entity_to_unicode``, whose return
    value replaces the matched text.
    """
    from calibre import xml_entity_to_unicode as convert_entity
    converted = ENTITY_PATTERN.sub(convert_entity, raw)
    return converted

View File

@ -15,6 +15,7 @@ from PyQt4.Qt import (
from calibre import prints
from calibre.constants import DEBUG
from calibre.ebooks.chardet import replace_encoding_declarations
from calibre.gui2 import error_dialog
from calibre.gui2.tweak_book import actions, current_container, tprefs, dictionaries, editor_toolbar_actions
from calibre.gui2.tweak_book.editor import SPELL_PROPERTY
@ -136,6 +137,9 @@ class Editor(QMainWindow):
def data(self):
def fget(self):
ans = self.get_raw_data()
ans, changed = replace_encoding_declarations(ans, enc='utf-8', limit=4*1024)
if changed:
self.data = ans
return ans.encode('utf-8')
def fset(self, val):
self.editor.load_text(val, syntax=self.syntax)