This commit is contained in:
Kovid Goyal 2008-08-21 22:55:18 -07:00
parent 6bd47906d4
commit 416f49f4c4
3 changed files with 23 additions and 2 deletions

View File

@ -63,4 +63,6 @@ def xml_to_unicode(raw, verbose=False):
encoding = encoding.lower()
if CHARSET_ALIASES.has_key(encoding):
encoding = CHARSET_ALIASES[encoding]
if encoding == 'ascii':
encoding = 'utf-8'
return raw.decode(encoding, 'ignore'), encoding

View File

@ -1,7 +1,8 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
import os, sys, logging, re, shutil
import os, sys, logging, re, shutil, tempfile
from lxml import html
from lxml.etree import XPath
get_text = XPath("//text()")
@ -36,12 +37,30 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
if opts.verbose > 2:
self.debug_tree('parsed')
self.extract_css()
if opts.verbose > 2:
self.debug_tree('nocss')
self.collect_font_statistics()
self.split()
def debug_tree(self, name):
    '''
    Serialize the current tree (``self.root``) to a file in the system
    temporary directory so it can be inspected later.

    The file is named ``html2epub-<source file name>-<name>.html`` and is
    written as UTF-8 encoded bytes. Its path is reported via ``log_debug``.

    :param name: Label for this dump; becomes the last component of the
                 file name.
    '''
    dump_dir = tempfile.gettempdir()
    if not os.path.exists(dump_dir):
        os.makedirs(dump_dir)
    source_name = os.path.basename(self.htmlfile.path)
    dump_path = os.path.join(dump_dir,
                             'html2epub-%s-%s.html'%(source_name, name))
    with open(dump_path, 'wb') as f:
        # Serialize only after the file has been opened, as before.
        serialized = html.tostring(self.root, encoding='utf-8')
        f.write(serialized)
        self.log_debug(_('Written processed HTML to ')+f.name)
def parse_html(self):
''' Create lxml ElementTree from HTML '''
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))

View File

@ -11,7 +11,7 @@
</head>
<h1>Demo of <span style='font-family:monospace'>html2lrf</span></h1>
<p>
This document contains a demonstration of the capabilities of <span style='font-family:monospace'>html2lrf</span>, the HTML to LRF converter from <em>libprs500.</em> To obtain libprs500 visit<br/><span style='font:sans-serif'>https://libprs500.kovidgoyal.net</span>
This document contains a demonstration of the capabilities of <span style='font-family:monospace'>html2lrf</span>, the HTML to LRF converter from <em>calibre.</em> To obtain calibre visit<br/><span style='font:sans-serif'>http://calibre.kovidgoyal.net</span>
</p>
<br/>
<h2 id="toc">Table of Contents</h2>