CHM Input: Support hierarchical table of contents. Do not generate an inline table of contents when a metadata table of contents is present. Also correctly decode the text in the table of contents

This commit is contained in:
Kovid Goyal 2013-01-18 19:33:45 +05:30
parent 8134629e59
commit 1f2daebce6
3 changed files with 93 additions and 131 deletions

View File

@ -100,7 +100,7 @@ class CHMReader(CHMFile):
def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False):
html_files = set([])
try:
x = self.GetEncoding()
x = self.get_encoding()
codecs.lookup(x)
enc = x
except:

View File

@ -7,8 +7,6 @@ import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
from calibre.constants import filesystem_encoding
class CHMInput(InputFormatPlugin):
@ -57,6 +55,7 @@ class CHMInput(InputFormatPlugin):
mainpath = os.path.join(tdir, mainname)
metadata = get_metadata_from_reader(self._chm_reader)
encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
self._chm_reader.CloseCHM()
# print tdir, mainpath
# from calibre import ipython
@ -64,15 +63,31 @@ class CHMInput(InputFormatPlugin):
options.debug_pipeline = None
options.input_encoding = 'utf-8'
# try a custom conversion:
#oeb = self._create_oebbook(mainpath, tdir, options, log, metadata)
# try using html converter:
htmlpath = self._create_html_root(mainpath, log)
htmlpath, toc = self._create_html_root(mainpath, log, encoding)
oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
options.debug_pipeline = odi
#log.debug('DEBUG: Not removing tempdir %s' % tdir)
if toc.count() > 1:
oeb.toc = self.parse_html_toc(oeb.spine[0])
oeb.manifest.remove(oeb.spine[0])
oeb.auto_generated_toc = False
return oeb
def parse_html_toc(self, item):
from calibre.ebooks.oeb.base import TOC, XPath
dx = XPath('./h:div')
ax = XPath('./h:a[1]')
def do_node(parent, div):
for child in dx(div):
a = ax(child)[0]
c = parent.add(a.text, a.attrib['href'])
do_node(c, child)
toc = TOC()
root = XPath('//h:div[1]')(item.data)[0]
do_node(toc, root)
return toc
def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
# use HTMLInput plugin to generate book
from calibre.customize.builtins import HTMLInput
@ -81,78 +96,22 @@ class CHMInput(InputFormatPlugin):
oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
return oeb
def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
import uuid
from lxml import html
from calibre.ebooks.conversion.plumber import create_oebbook
from calibre.ebooks.oeb.base import DirContainer
oeb = create_oebbook(log, None, opts,
encoding=opts.input_encoding, populate=False)
self.oeb = oeb
metadata = oeb.metadata
if mi.title:
metadata.add('title', mi.title)
if mi.authors:
for a in mi.authors:
metadata.add('creator', a, attrib={'role':'aut'})
if mi.publisher:
metadata.add('publisher', mi.publisher)
if mi.isbn:
metadata.add('identifier', mi.isbn, attrib={'scheme':'ISBN'})
if not metadata.language:
oeb.logger.warn(u'Language not specified')
metadata.add('language', get_lang().replace('_', '-'))
if not metadata.creator:
oeb.logger.warn('Creator not specified')
metadata.add('creator', _('Unknown'))
if not metadata.title:
oeb.logger.warn('Title not specified')
metadata.add('title', _('Unknown'))
bookid = str(uuid.uuid4())
metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
for ident in metadata.identifier:
if 'id' in ident.attrib:
self.oeb.uid = metadata.identifier[0]
break
hhcdata = self._read_file(hhcpath)
hhcroot = html.fromstring(hhcdata)
chapters = self._process_nodes(hhcroot)
#print "============================="
#print "Printing hhcroot"
#print etree.tostring(hhcroot, pretty_print=True)
#print "============================="
log.debug('Found %d section nodes' % len(chapters))
if len(chapters) > 0:
path0 = chapters[0][1]
subpath = os.path.dirname(path0)
htmlpath = os.path.join(basedir, subpath)
oeb.container = DirContainer(htmlpath, log)
for chapter in chapters:
title = chapter[0]
basename = os.path.basename(chapter[1])
self._add_item(oeb, title, basename)
oeb.container = DirContainer(htmlpath, oeb.log)
return oeb
def _create_html_root(self, hhcpath, log):
def _create_html_root(self, hhcpath, log, encoding):
from lxml import html
from urllib import unquote as _unquote
from calibre.ebooks.oeb.base import urlquote
from calibre.ebooks.chardet import xml_to_unicode
hhcdata = self._read_file(hhcpath)
hhcdata = hhcdata.decode(encoding)
hhcdata = xml_to_unicode(hhcdata, verbose=True,
strip_encoding_pats=True, resolve_entities=True)[0]
hhcroot = html.fromstring(hhcdata)
chapters = self._process_nodes(hhcroot)
toc = self._process_nodes(hhcroot)
#print "============================="
#print "Printing hhcroot"
#print etree.tostring(hhcroot, pretty_print=True)
#print "============================="
log.debug('Found %d section nodes' % len(chapters))
log.debug('Found %d section nodes' % toc.count())
htmlpath = os.path.splitext(hhcpath)[0] + ".html"
base = os.path.dirname(os.path.abspath(htmlpath))
@ -168,37 +127,40 @@ class CHMInput(InputFormatPlugin):
x = y
return x
def donode(item, parent, base, subpath):
for child in item:
title = child.title
if not title: continue
raw = unquote_path(child.href or '')
rsrcname = os.path.basename(raw)
rsrcpath = os.path.join(subpath, rsrcname)
if (not os.path.exists(os.path.join(base, rsrcpath)) and
os.path.exists(os.path.join(base, raw))):
rsrcpath = raw
if '%' not in rsrcpath:
rsrcpath = urlquote(rsrcpath)
if not raw:
rsrcpath = ''
c = DIV(A(title, href=rsrcpath))
donode(child, c, base, subpath)
parent.append(c)
with open(htmlpath, 'wb') as f:
if chapters:
f.write('<html><head><meta http-equiv="Content-type"'
' content="text/html;charset=UTF-8" /></head><body>\n')
path0 = chapters[0][1]
if toc.count() > 1:
from lxml.html.builder import HTML, BODY, DIV, A
path0 = toc[0].href
path0 = unquote_path(path0)
subpath = os.path.dirname(path0)
base = os.path.dirname(f.name)
for chapter in chapters:
title = chapter[0]
raw = unquote_path(chapter[1])
rsrcname = os.path.basename(raw)
rsrcpath = os.path.join(subpath, rsrcname)
if (not os.path.exists(os.path.join(base, rsrcpath)) and
os.path.exists(os.path.join(base, raw))):
rsrcpath = raw
# title should already be url encoded
if '%' not in rsrcpath:
rsrcpath = urlquote(rsrcpath)
url = "<br /><a href=" + rsrcpath + ">" + title + " </a>\n"
if isinstance(url, unicode):
url = url.encode('utf-8')
f.write(url)
f.write("</body></html>")
root = DIV()
donode(toc, root, base, subpath)
raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
pretty_print=True)
f.write(raw)
else:
f.write(hhcdata)
return htmlpath
return htmlpath, toc
def _read_file(self, name):
f = open(name, 'rb')
@ -206,41 +168,27 @@ class CHMInput(InputFormatPlugin):
f.close()
return data
def _visit_node(self, node, chapters, depth):
# check that node is a normal node (not a comment, DOCTYPE, etc.)
# (normal nodes have string tags)
if isinstance(node.tag, basestring):
from calibre.ebooks.chm.reader import match_string
chapter_path = None
if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'):
chapter_title = None
for child in node:
if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'):
chapter_title = child.attrib['value']
if match_string(child.tag,'param') and match_string(child.attrib['name'],'local'):
chapter_path = child.attrib['value']
if chapter_title is not None and chapter_path is not None:
chapter = [chapter_title, chapter_path, depth]
chapters.append(chapter)
if node.tag=="UL":
depth = depth + 1
if node.tag=="/UL":
depth = depth - 1
def add_node(self, node, toc, ancestor_map):
from calibre.ebooks.chm.reader import match_string
if match_string(node.attrib['type'], 'text/sitemap'):
p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
parent = p[0] if p else None
toc = ancestor_map.get(parent, toc)
title = href = u''
for param in node.xpath('./param'):
if match_string(param.attrib['name'], 'name'):
title = param.attrib['value']
elif match_string(param.attrib['name'], 'local'):
href = param.attrib['value']
child = toc.add(title or _('Unknown'), href)
ancestor_map[node] = child
def _process_nodes(self, root):
chapters = []
depth = 0
for node in root.iter():
self._visit_node(node, chapters, depth)
return chapters
from calibre.ebooks.oeb.base import TOC
toc = TOC()
ancestor_map = {}
for node in root.xpath('//object'):
self.add_node(node, toc, ancestor_map)
return toc
def _add_item(self, oeb, title, path):
bname = os.path.basename(path)
id, href = oeb.manifest.generate(id='html',
href=ascii_filename(bname))
item = oeb.manifest.add(id, href, 'text/html')
item.html_input_href = bname
oeb.spine.add(item, True)
oeb.toc.add(title, item.href)

View File

@ -28,6 +28,7 @@
import array
import string
import sys
import codecs
import calibre.utils.chm.chmlib as chmlib
from calibre.constants import plugins
@ -184,7 +185,7 @@ locale_table = {
0x0420 : ('iso8859_6', "Urdu", "Arabic"),
0x0443 : ('iso8859_9', "Uzbek_Latin", "Turkish"),
0x0843 : ('cp1251', "Uzbek_Cyrillic", "Cyrillic"),
0x042a : (None, "Vietnamese", "Vietnamese")
0x042a : ('cp1258', "Vietnamese", "Vietnamese")
}
class CHMFile:
@ -434,6 +435,19 @@ class CHMFile:
else:
return None
def get_encoding(self):
ans = self.GetEncoding()
if ans is None:
lcid = self.GetLCID()
if lcid is not None:
ans = lcid[0]
if ans:
try:
codecs.lookup(ans)
except:
ans = None
return ans
def GetDWORD(self, buff, idx=0):
'''Internal method.
Reads a double word (4 bytes) from a buffer.