mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
KF8 Input: Ignore encoding declarations inside the HTML markup, as they are sometimes incorrect. Fixes #1022933 (Ebook Viewer shows random Chinese words)
This commit is contained in:
parent
32a52b9e05
commit
95190b45ad
@ -10,10 +10,15 @@ __docformat__ = 'restructuredtext en'
|
|||||||
import re, codecs
|
import re, codecs
|
||||||
|
|
||||||
ENCODING_PATS = [
|
ENCODING_PATS = [
|
||||||
|
# XML declaration
|
||||||
re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
|
re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
|
||||||
re.IGNORECASE),
|
re.IGNORECASE),
|
||||||
|
# HTML 4 Pragma directive
|
||||||
re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
|
re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
|
||||||
re.IGNORECASE),
|
re.IGNORECASE),
|
||||||
|
# HTML 5 charset
|
||||||
|
re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>''',
|
||||||
|
re.IGNORECASE),
|
||||||
]
|
]
|
||||||
ENTITY_PATTERN = re.compile(r'&(\S+?);')
|
ENTITY_PATTERN = re.compile(r'&(\S+?);')
|
||||||
|
|
||||||
|
@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
import re, os
|
import re, os
|
||||||
|
|
||||||
|
from calibre.ebooks.chardet import strip_encoding_declarations
|
||||||
|
|
||||||
def update_internal_links(mobi8_reader):
|
def update_internal_links(mobi8_reader):
|
||||||
# need to update all links that are internal which
|
# need to update all links that are internal which
|
||||||
# are based on positions within the xhtml files **BEFORE**
|
# are based on positions within the xhtml files **BEFORE**
|
||||||
@ -324,6 +326,8 @@ def expand_mobi8_markup(mobi8_reader, resource_map, log):
|
|||||||
for i, part in enumerate(parts):
|
for i, part in enumerate(parts):
|
||||||
pi = mobi8_reader.partinfo[i]
|
pi = mobi8_reader.partinfo[i]
|
||||||
with open(os.path.join(pi.type, pi.filename), 'wb') as f:
|
with open(os.path.join(pi.type, pi.filename), 'wb') as f:
|
||||||
|
part = strip_encoding_declarations(part)
|
||||||
|
part = part.replace('<head>', '<head><meta charset="UTF-8"/>')
|
||||||
f.write(part.encode('utf-8'))
|
f.write(part.encode('utf-8'))
|
||||||
spine.append(f.name)
|
spine.append(f.name)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user