diff --git a/src/calibre/ebooks/chardet/__init__.py b/src/calibre/ebooks/chardet/__init__.py index 55257e1962..7b2b89a5fa 100644 --- a/src/calibre/ebooks/chardet/__init__.py +++ b/src/calibre/ebooks/chardet/__init__.py @@ -63,4 +63,6 @@ def xml_to_unicode(raw, verbose=False): encoding = encoding.lower() if CHARSET_ALIASES.has_key(encoding): encoding = CHARSET_ALIASES[encoding] + if encoding == 'ascii': + encoding = 'utf-8' return raw.decode(encoding, 'ignore'), encoding diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index aeb856624c..174e568909 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -1,7 +1,8 @@ +from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' -import os, sys, logging, re, shutil +import os, sys, logging, re, shutil, tempfile from lxml import html from lxml.etree import XPath get_text = XPath("//text()") @@ -36,12 +37,30 @@ class HTMLProcessor(PreProcessor, LoggingInterface): self.root.rewrite_links(self.rewrite_links, resolve_base_href=False) + if opts.verbose > 2: + self.debug_tree('parsed') + self.extract_css() + if opts.verbose > 2: + self.debug_tree('nocss') + self.collect_font_statistics() self.split() + def debug_tree(self, name): + ''' + Dump source tree for later debugging. + ''' + tdir = tempfile.gettempdir() + if not os.path.exists(tdir): + os.makedirs(tdir) + with open(os.path.join(tdir, 'html2epub-%s-%s.html'%\ + (os.path.basename(self.htmlfile.path), name)), 'wb') as f: + f.write(html.tostring(self.root, encoding='utf-8')) + self.log_debug(_('Written processed HTML to ')+f.name) + def parse_html(self): ''' Create lxml ElementTree from HTML ''' self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:])) diff --git a/src/calibre/ebooks/lrf/html/demo/demo.html b/src/calibre/ebooks/lrf/html/demo/demo.html index 9fa73bcfac..8a43b50708 100644 --- a/src/calibre/ebooks/lrf/html/demo/demo.html +++ b/src/calibre/ebooks/lrf/html/demo/demo.html @@ -11,7 +11,7 @@
- This document contains a demonstration of the capabilities of html2lrf, the HTML to LRF converter from libprs500. To obtain libprs500 visit
https://libprs500.kovidgoyal.net
+ This document contains a demonstration of the capabilities of html2lrf, the HTML to LRF converter from calibre. To obtain calibre visit
http://calibre.kovidgoyal.net