HTMLZ input: Fix handling of HTML files encoded in an encoding other than UTF-8

This commit is contained in:
Kovid Goyal 2011-04-15 07:55:50 -06:00
commit 19f35f55bc

View File

@ -10,6 +10,7 @@ import os
from calibre import walk from calibre import walk
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
class HTMLZInput(InputFormatPlugin): class HTMLZInput(InputFormatPlugin):
@ -34,6 +35,13 @@ class HTMLZInput(InputFormatPlugin):
html = tf.read() html = tf.read()
break break
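# Encoding: use the user-specified input encoding if given, otherwise detect it from the first 4096 bytes, then decode (undecodable bytes are replaced)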
if options.input_encoding:
ienc = options.input_encoding
else:
ienc = xml_to_unicode(html[:4096])[-1]
html = html.decode(ienc, 'replace')
# Run the HTML through the html processing plugin. # Run the HTML through the html processing plugin.
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html') html_input = plugin_for_input_format('html')