This commit is contained in:
Kovid Goyal 2008-08-21 22:55:18 -07:00
parent 6bd47906d4
commit 416f49f4c4
3 changed files with 23 additions and 2 deletions

View File

@ -63,4 +63,6 @@ def xml_to_unicode(raw, verbose=False):
encoding = encoding.lower() encoding = encoding.lower()
if CHARSET_ALIASES.has_key(encoding): if CHARSET_ALIASES.has_key(encoding):
encoding = CHARSET_ALIASES[encoding] encoding = CHARSET_ALIASES[encoding]
if encoding == 'ascii':
encoding = 'utf-8'
return raw.decode(encoding, 'ignore'), encoding return raw.decode(encoding, 'ignore'), encoding

View File

@ -1,7 +1,8 @@
from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, sys, logging, re, shutil import os, sys, logging, re, shutil, tempfile
from lxml import html from lxml import html
from lxml.etree import XPath from lxml.etree import XPath
get_text = XPath("//text()") get_text = XPath("//text()")
@ -36,12 +37,30 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False) self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
if opts.verbose > 2:
self.debug_tree('parsed')
self.extract_css() self.extract_css()
if opts.verbose > 2:
self.debug_tree('nocss')
self.collect_font_statistics() self.collect_font_statistics()
self.split() self.split()
def debug_tree(self, name):
    '''
    Save a snapshot of the current HTML tree for later inspection.

    ``self.root`` is serialized as UTF-8 and written to a file called
    ``html2epub-<basename of self.htmlfile.path>-<name>.html`` inside the
    system temporary directory. The location of the written file is
    reported through ``self.log_debug``.
    '''
    dump_dir = tempfile.gettempdir()
    if not os.path.exists(dump_dir):
        os.makedirs(dump_dir)
    source_name = os.path.basename(self.htmlfile.path)
    dump_name = 'html2epub-%s-%s.html' % (source_name, name)
    dump_path = os.path.join(dump_dir, dump_name)
    with open(dump_path, 'wb') as out:
        # Serialize only after the file has been opened, as the original
        # code did, so the order of side effects stays the same.
        serialized = html.tostring(self.root, encoding='utf-8')
        out.write(serialized)
        self.log_debug(_('Written processed HTML to ') + out.name)
def parse_html(self): def parse_html(self):
''' Create lxml ElementTree from HTML ''' ''' Create lxml ElementTree from HTML '''
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:])) self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))

View File

@ -11,7 +11,7 @@
</head> </head>
<h1>Demo of <span style='font-family:monospace'>html2lrf</span></h1> <h1>Demo of <span style='font-family:monospace'>html2lrf</span></h1>
<p> <p>
This document contains a demonstration of the capabilities of <span style='font-family:monospace'>html2lrf</span>, the HTML to LRF converter from <em>libprs500.</em> To obtain libprs500 visit<br/><span style='font:sans-serif'>https://libprs500.kovidgoyal.net</span> This document contains a demonstration of the capabilities of <span style='font-family:monospace'>html2lrf</span>, the HTML to LRF converter from <em>calibre.</em> To obtain calibre visit<br/><span style='font:sans-serif'>http://calibre.kovidgoyal.net</span>
</p> </p>
<br/> <br/>
<h2 id="toc">Table of Contents</h2> <h2 id="toc">Table of Contents</h2>