CHM Input: Support hierarchical table of contents. Do not generate an inline table of contents when a metadata table of contents is present. Also correctly decode the text in the table of contents.

This commit is contained in:
Kovid Goyal 2013-01-18 19:33:45 +05:30
parent 8134629e59
commit 1f2daebce6
3 changed files with 93 additions and 131 deletions

View File

@ -100,7 +100,7 @@ class CHMReader(CHMFile):
def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False): def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False):
html_files = set([]) html_files = set([])
try: try:
x = self.GetEncoding() x = self.get_encoding()
codecs.lookup(x) codecs.lookup(x)
enc = x enc = x
except: except:

View File

@ -7,8 +7,6 @@ import os
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
from calibre.constants import filesystem_encoding from calibre.constants import filesystem_encoding
class CHMInput(InputFormatPlugin): class CHMInput(InputFormatPlugin):
@ -57,6 +55,7 @@ class CHMInput(InputFormatPlugin):
mainpath = os.path.join(tdir, mainname) mainpath = os.path.join(tdir, mainname)
metadata = get_metadata_from_reader(self._chm_reader) metadata = get_metadata_from_reader(self._chm_reader)
encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
self._chm_reader.CloseCHM() self._chm_reader.CloseCHM()
# print tdir, mainpath # print tdir, mainpath
# from calibre import ipython # from calibre import ipython
@ -64,15 +63,31 @@ class CHMInput(InputFormatPlugin):
options.debug_pipeline = None options.debug_pipeline = None
options.input_encoding = 'utf-8' options.input_encoding = 'utf-8'
# try a custom conversion: htmlpath, toc = self._create_html_root(mainpath, log, encoding)
#oeb = self._create_oebbook(mainpath, tdir, options, log, metadata)
# try using html converter:
htmlpath = self._create_html_root(mainpath, log)
oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata) oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
options.debug_pipeline = odi options.debug_pipeline = odi
#log.debug('DEBUG: Not removing tempdir %s' % tdir) if toc.count() > 1:
oeb.toc = self.parse_html_toc(oeb.spine[0])
oeb.manifest.remove(oeb.spine[0])
oeb.auto_generated_toc = False
return oeb return oeb
def parse_html_toc(self, item):
    '''Build a TOC object from the nested div/a markup in ``item.data``.

    The first ``h:div[1]`` match in the document is taken as the root.
    Every direct child div of a div becomes one TOC entry under the entry
    of its parent div; the entry's text and href are read from the first
    direct ``a`` child of that div.

    Returns the populated TOC.
    '''
    from calibre.ebooks.oeb.base import TOC, XPath
    child_divs = XPath('./h:div')
    first_link = XPath('./h:a[1]')

    def walk(toc_node, container):
        # One entry per direct child div, then recurse so that nested
        # divs end up as children of the entry that was just added.
        for entry in child_divs(container):
            link = first_link(entry)[0]
            walk(toc_node.add(link.text, link.attrib['href']), entry)

    result = TOC()
    walk(result, XPath('//h:div[1]')(item.data)[0])
    return result
def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi): def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
# use HTMLInput plugin to generate book # use HTMLInput plugin to generate book
from calibre.customize.builtins import HTMLInput from calibre.customize.builtins import HTMLInput
@ -81,78 +96,22 @@ class CHMInput(InputFormatPlugin):
oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi) oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
return oeb return oeb
def _create_html_root(self, hhcpath, log, encoding):
def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
import uuid
from lxml import html
from calibre.ebooks.conversion.plumber import create_oebbook
from calibre.ebooks.oeb.base import DirContainer
oeb = create_oebbook(log, None, opts,
encoding=opts.input_encoding, populate=False)
self.oeb = oeb
metadata = oeb.metadata
if mi.title:
metadata.add('title', mi.title)
if mi.authors:
for a in mi.authors:
metadata.add('creator', a, attrib={'role':'aut'})
if mi.publisher:
metadata.add('publisher', mi.publisher)
if mi.isbn:
metadata.add('identifier', mi.isbn, attrib={'scheme':'ISBN'})
if not metadata.language:
oeb.logger.warn(u'Language not specified')
metadata.add('language', get_lang().replace('_', '-'))
if not metadata.creator:
oeb.logger.warn('Creator not specified')
metadata.add('creator', _('Unknown'))
if not metadata.title:
oeb.logger.warn('Title not specified')
metadata.add('title', _('Unknown'))
bookid = str(uuid.uuid4())
metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
for ident in metadata.identifier:
if 'id' in ident.attrib:
self.oeb.uid = metadata.identifier[0]
break
hhcdata = self._read_file(hhcpath)
hhcroot = html.fromstring(hhcdata)
chapters = self._process_nodes(hhcroot)
#print "============================="
#print "Printing hhcroot"
#print etree.tostring(hhcroot, pretty_print=True)
#print "============================="
log.debug('Found %d section nodes' % len(chapters))
if len(chapters) > 0:
path0 = chapters[0][1]
subpath = os.path.dirname(path0)
htmlpath = os.path.join(basedir, subpath)
oeb.container = DirContainer(htmlpath, log)
for chapter in chapters:
title = chapter[0]
basename = os.path.basename(chapter[1])
self._add_item(oeb, title, basename)
oeb.container = DirContainer(htmlpath, oeb.log)
return oeb
def _create_html_root(self, hhcpath, log):
from lxml import html from lxml import html
from urllib import unquote as _unquote from urllib import unquote as _unquote
from calibre.ebooks.oeb.base import urlquote from calibre.ebooks.oeb.base import urlquote
from calibre.ebooks.chardet import xml_to_unicode
hhcdata = self._read_file(hhcpath) hhcdata = self._read_file(hhcpath)
hhcdata = hhcdata.decode(encoding)
hhcdata = xml_to_unicode(hhcdata, verbose=True,
strip_encoding_pats=True, resolve_entities=True)[0]
hhcroot = html.fromstring(hhcdata) hhcroot = html.fromstring(hhcdata)
chapters = self._process_nodes(hhcroot) toc = self._process_nodes(hhcroot)
#print "=============================" #print "============================="
#print "Printing hhcroot" #print "Printing hhcroot"
#print etree.tostring(hhcroot, pretty_print=True) #print etree.tostring(hhcroot, pretty_print=True)
#print "=============================" #print "============================="
log.debug('Found %d section nodes' % len(chapters)) log.debug('Found %d section nodes' % toc.count())
htmlpath = os.path.splitext(hhcpath)[0] + ".html" htmlpath = os.path.splitext(hhcpath)[0] + ".html"
base = os.path.dirname(os.path.abspath(htmlpath)) base = os.path.dirname(os.path.abspath(htmlpath))
@ -168,37 +127,40 @@ class CHMInput(InputFormatPlugin):
x = y x = y
return x return x
with open(htmlpath, 'wb') as f: def donode(item, parent, base, subpath):
if chapters: for child in item:
f.write('<html><head><meta http-equiv="Content-type"' title = child.title
' content="text/html;charset=UTF-8" /></head><body>\n') if not title: continue
path0 = chapters[0][1] raw = unquote_path(child.href or '')
path0 = unquote_path(path0)
subpath = os.path.dirname(path0)
base = os.path.dirname(f.name)
for chapter in chapters:
title = chapter[0]
raw = unquote_path(chapter[1])
rsrcname = os.path.basename(raw) rsrcname = os.path.basename(raw)
rsrcpath = os.path.join(subpath, rsrcname) rsrcpath = os.path.join(subpath, rsrcname)
if (not os.path.exists(os.path.join(base, rsrcpath)) and if (not os.path.exists(os.path.join(base, rsrcpath)) and
os.path.exists(os.path.join(base, raw))): os.path.exists(os.path.join(base, raw))):
rsrcpath = raw rsrcpath = raw
# title should already be url encoded
if '%' not in rsrcpath: if '%' not in rsrcpath:
rsrcpath = urlquote(rsrcpath) rsrcpath = urlquote(rsrcpath)
url = "<br /><a href=" + rsrcpath + ">" + title + " </a>\n" if not raw:
if isinstance(url, unicode): rsrcpath = ''
url = url.encode('utf-8') c = DIV(A(title, href=rsrcpath))
f.write(url) donode(child, c, base, subpath)
parent.append(c)
f.write("</body></html>") with open(htmlpath, 'wb') as f:
if toc.count() > 1:
from lxml.html.builder import HTML, BODY, DIV, A
path0 = toc[0].href
path0 = unquote_path(path0)
subpath = os.path.dirname(path0)
base = os.path.dirname(f.name)
root = DIV()
donode(toc, root, base, subpath)
raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
pretty_print=True)
f.write(raw)
else: else:
f.write(hhcdata) f.write(hhcdata)
return htmlpath return htmlpath, toc
def _read_file(self, name): def _read_file(self, name):
f = open(name, 'rb') f = open(name, 'rb')
@ -206,41 +168,27 @@ class CHMInput(InputFormatPlugin):
f.close() f.close()
return data return data
def _visit_node(self, node, chapters, depth): def add_node(self, node, toc, ancestor_map):
# check that node is a normal node (not a comment, DOCTYPE, etc.)
# (normal nodes have string tags)
if isinstance(node.tag, basestring):
from calibre.ebooks.chm.reader import match_string from calibre.ebooks.chm.reader import match_string
if match_string(node.attrib['type'], 'text/sitemap'):
chapter_path = None p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'): parent = p[0] if p else None
chapter_title = None toc = ancestor_map.get(parent, toc)
for child in node: title = href = u''
if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'): for param in node.xpath('./param'):
chapter_title = child.attrib['value'] if match_string(param.attrib['name'], 'name'):
if match_string(child.tag,'param') and match_string(child.attrib['name'],'local'): title = param.attrib['value']
chapter_path = child.attrib['value'] elif match_string(param.attrib['name'], 'local'):
if chapter_title is not None and chapter_path is not None: href = param.attrib['value']
chapter = [chapter_title, chapter_path, depth] child = toc.add(title or _('Unknown'), href)
chapters.append(chapter) ancestor_map[node] = child
if node.tag=="UL":
depth = depth + 1
if node.tag=="/UL":
depth = depth - 1
def _process_nodes(self, root): def _process_nodes(self, root):
chapters = [] from calibre.ebooks.oeb.base import TOC
depth = 0 toc = TOC()
for node in root.iter(): ancestor_map = {}
self._visit_node(node, chapters, depth) for node in root.xpath('//object'):
return chapters self.add_node(node, toc, ancestor_map)
return toc
def _add_item(self, oeb, title, path):
bname = os.path.basename(path)
id, href = oeb.manifest.generate(id='html',
href=ascii_filename(bname))
item = oeb.manifest.add(id, href, 'text/html')
item.html_input_href = bname
oeb.spine.add(item, True)
oeb.toc.add(title, item.href)

View File

@ -28,6 +28,7 @@
import array import array
import string import string
import sys import sys
import codecs
import calibre.utils.chm.chmlib as chmlib import calibre.utils.chm.chmlib as chmlib
from calibre.constants import plugins from calibre.constants import plugins
@ -184,7 +185,7 @@ locale_table = {
0x0420 : ('iso8859_6', "Urdu", "Arabic"), 0x0420 : ('iso8859_6', "Urdu", "Arabic"),
0x0443 : ('iso8859_9', "Uzbek_Latin", "Turkish"), 0x0443 : ('iso8859_9', "Uzbek_Latin", "Turkish"),
0x0843 : ('cp1251', "Uzbek_Cyrillic", "Cyrillic"), 0x0843 : ('cp1251', "Uzbek_Cyrillic", "Cyrillic"),
0x042a : (None, "Vietnamese", "Vietnamese") 0x042a : ('cp1258', "Vietnamese", "Vietnamese")
} }
class CHMFile: class CHMFile:
@ -434,6 +435,19 @@ class CHMFile:
else: else:
return None return None
def get_encoding(self):
    '''Return the name of a codec that Python can use for this file.

    The value of GetEncoding() is preferred. When that is None, the first
    element of the GetLCID() result (if any) is used instead. A non-empty
    candidate is validated with codecs.lookup(); if the lookup fails the
    result is None. A falsy candidate is returned as-is, unvalidated.
    '''
    ans = self.GetEncoding()
    if ans is None:
        lcid = self.GetLCID()
        if lcid is not None:
            ans = lcid[0]
    if ans:
        try:
            codecs.lookup(ans)
        except Exception:
            # Best effort: any failure to resolve the name means the codec
            # is unusable. Exception (not a bare except) is caught so that
            # KeyboardInterrupt and SystemExit still propagate.
            ans = None
    return ans
def GetDWORD(self, buff, idx=0): def GetDWORD(self, buff, idx=0):
'''Internal method. '''Internal method.
Reads a double word (4 bytes) from a buffer. Reads a double word (4 bytes) from a buffer.