mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Edit Book: When editing HTML files that have charset encoding declarations, automatically change the declared encoding (if any) to UTF-8 on save, since the editor always saves files in the UTF-8 encoding.
This prevents a mismatch between the declared encoding and the actual encoding if the HTML file was originally in an encoding other than UTF-8.
This commit is contained in:
parent
78be882aff
commit
01d2a6df56
@ -19,15 +19,31 @@ ENCODING_PATS = [
|
|||||||
]
|
]
|
||||||
ENTITY_PATTERN = re.compile(r'&(\S+?);')
|
ENTITY_PATTERN = re.compile(r'&(\S+?);')
|
||||||
|
|
||||||
def strip_encoding_declarations(raw):
|
def strip_encoding_declarations(raw, limit=50*1024):
|
||||||
limit = 50*1024
|
prefix = raw[:limit]
|
||||||
|
suffix = raw[limit:]
|
||||||
for pat in ENCODING_PATS:
|
for pat in ENCODING_PATS:
|
||||||
prefix = raw[:limit]
|
|
||||||
suffix = raw[limit:]
|
|
||||||
prefix = pat.sub('', prefix)
|
prefix = pat.sub('', prefix)
|
||||||
raw = prefix + suffix
|
raw = prefix + suffix
|
||||||
return raw
|
return raw
|
||||||
|
|
||||||
|
def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
    '''
    Rewrite the encoding named by encoding declarations near the start of
    ``raw`` so that they name ``enc`` instead.

    Only the first ``limit`` characters of ``raw`` are searched, using every
    pattern in ENCODING_PATS; the remainder is appended back unchanged. Group
    1 of each pattern is treated as the declared encoding name. Declarations
    that already name ``enc`` (compared case-insensitively) are left exactly
    as they are.

    :param raw: The markup to process.
    :param enc: The encoding name to write into each declaration.
    :param limit: How many leading characters of ``raw`` are searched.
    :return: A tuple ``(new_raw, changed)`` where ``changed`` is True if at
             least one declaration was rewritten.
    '''
    prefix, suffix = raw[:limit], raw[limit:]
    # One-element list so the nested callback can record a change by mutation
    # (works without ``nonlocal``).
    changed = [False]

    def sub(m):
        ans = m.group()
        if m.group(1).lower() != enc.lower():
            changed[0] = True
            # Offsets of group 1 inside the matched text, both measured from
            # the start of the match. Measuring the end offset from the end
            # of the match instead would give 0 when the encoding name is the
            # last thing matched, and ans[0:] would then re-append the whole
            # match after the new encoding name.
            start = m.start(1) - m.start(0)
            end = m.end(1) - m.start(0)
            ans = ans[:start] + enc + ans[end:]
        return ans

    for pat in ENCODING_PATS:
        prefix = pat.sub(sub, prefix)
    return prefix + suffix, changed[0]
|
||||||
|
|
||||||
def substitute_entites(raw):
|
def substitute_entites(raw):
|
||||||
from calibre import xml_entity_to_unicode
|
from calibre import xml_entity_to_unicode
|
||||||
return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
|
return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
|
||||||
|
@ -15,6 +15,7 @@ from PyQt4.Qt import (
|
|||||||
|
|
||||||
from calibre import prints
|
from calibre import prints
|
||||||
from calibre.constants import DEBUG
|
from calibre.constants import DEBUG
|
||||||
|
from calibre.ebooks.chardet import replace_encoding_declarations
|
||||||
from calibre.gui2 import error_dialog
|
from calibre.gui2 import error_dialog
|
||||||
from calibre.gui2.tweak_book import actions, current_container, tprefs, dictionaries, editor_toolbar_actions
|
from calibre.gui2.tweak_book import actions, current_container, tprefs, dictionaries, editor_toolbar_actions
|
||||||
from calibre.gui2.tweak_book.editor import SPELL_PROPERTY
|
from calibre.gui2.tweak_book.editor import SPELL_PROPERTY
|
||||||
@ -136,6 +137,9 @@ class Editor(QMainWindow):
|
|||||||
def data(self):
|
def data(self):
|
||||||
def fget(self):
|
def fget(self):
|
||||||
ans = self.get_raw_data()
|
ans = self.get_raw_data()
|
||||||
|
ans, changed = replace_encoding_declarations(ans, enc='utf-8', limit=4*1024)
|
||||||
|
if changed:
|
||||||
|
self.data = ans
|
||||||
return ans.encode('utf-8')
|
return ans.encode('utf-8')
|
||||||
def fset(self, val):
|
def fset(self, val):
|
||||||
self.editor.load_text(val, syntax=self.syntax)
|
self.editor.load_text(val, syntax=self.syntax)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user