From 95190b45ad92b49a32ffb5ec45d07a648e5cc7e4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 10 Jul 2012 23:19:45 +0530
Subject: [PATCH] KF8 Input: Ignore encoding declarations inside the html
 markup, as they are sometimes incorrect. Fixes #1022933 (Ebook Viewer shows
 random Chinese words)

---
 src/calibre/ebooks/chardet.py            | 5 +++++
 src/calibre/ebooks/mobi/reader/markup.py | 4 ++++
 2 files changed, 9 insertions(+)
diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py
index 864d09108b..158b15fe49 100644
--- a/src/calibre/ebooks/chardet.py
+++ b/src/calibre/ebooks/chardet.py
@@ -10,10 +10,15 @@ __docformat__ = 'restructuredtext en'
 import re, codecs
 
 ENCODING_PATS = [
+                 # XML declaration
                  re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
                             re.IGNORECASE),
+                 # HTML 4 Pragma directive
                  re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
                             re.IGNORECASE),
+                 # HTML 5 charset
+                 re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>''',
+                     re.IGNORECASE),
                  ]
 ENTITY_PATTERN = re.compile(r'&(\S+?);')
 
diff --git a/src/calibre/ebooks/mobi/reader/markup.py b/src/calibre/ebooks/mobi/reader/markup.py
index 079eb90590..de06899852 100644
--- a/src/calibre/ebooks/mobi/reader/markup.py
+++ b/src/calibre/ebooks/mobi/reader/markup.py
@@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en'
 
 import re, os
 
+from calibre.ebooks.chardet import strip_encoding_declarations
+
 def update_internal_links(mobi8_reader):
     # need to update all links that are internal which
     # are based on positions within the xhtml files **BEFORE**
@@ -324,6 +326,8 @@ def expand_mobi8_markup(mobi8_reader, resource_map, log):
     for i, part in enumerate(parts):
         pi = mobi8_reader.partinfo[i]
         with open(os.path.join(pi.type, pi.filename), 'wb') as f:
+            part = strip_encoding_declarations(part)
+            part = part.replace('<head>', '<head><meta charset="UTF-8"/>')
             f.write(part.encode('utf-8'))
             spine.append(f.name)