From 4f3abc5614e3678471f70368423b688f1eff44b3 Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 15 Apr 2011 08:07:38 -0400 Subject: [PATCH 1/2] HTMLZ Input: Detect or use input encoding. --- src/calibre/ebooks/htmlz/input.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/htmlz/input.py b/src/calibre/ebooks/htmlz/input.py index 6822f91b68..f0f1c29021 100644 --- a/src/calibre/ebooks/htmlz/input.py +++ b/src/calibre/ebooks/htmlz/input.py @@ -10,6 +10,7 @@ import os from calibre import walk from calibre.customize.conversion import InputFormatPlugin +from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.zipfile import ZipFile class HTMLZInput(InputFormatPlugin): @@ -34,6 +35,13 @@ class HTMLZInput(InputFormatPlugin): html = tf.read() break + # Encoding + if options.input_encoding: + ienc = options.input_encoding + else: + ienc = xml_to_unicode(html[:4096])[-1] + html = html.decode(ienc, 'replace') + # Run the HTML through the html processing plugin. from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') @@ -48,7 +56,7 @@ class HTMLZInput(InputFormatPlugin): fname = 'index%d.html'%c htmlfile = open(fname, 'wb') with htmlfile: - htmlfile.write(html.encode('utf-8')) + htmlfile.write(html.encode('utf-8', 'replace')) odi = options.debug_pipeline options.debug_pipeline = None # Generate oeb from html conversion. From d819df99bdf50199aa2a6efffa703dd9afcafd37 Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 15 Apr 2011 08:09:04 -0400 Subject: [PATCH 2/2] ... --- src/calibre/ebooks/htmlz/input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/htmlz/input.py b/src/calibre/ebooks/htmlz/input.py index f0f1c29021..dcf2ed0ed3 100644 --- a/src/calibre/ebooks/htmlz/input.py +++ b/src/calibre/ebooks/htmlz/input.py @@ -56,7 +56,7 @@ class HTMLZInput(InputFormatPlugin): fname = 'index%d.html'%c htmlfile = open(fname, 'wb') with htmlfile: - htmlfile.write(html.encode('utf-8', 'replace')) + htmlfile.write(html.encode('utf-8')) odi = options.debug_pipeline options.debug_pipeline = None # Generate oeb from html conversion.