mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
CHM Input: Support hierarchical table of contents. Do not generate an inline table of contents when a metadata table of contents is present. Also correctly decode the text in the table of contents
This commit is contained in:
parent
8134629e59
commit
1f2daebce6
@ -100,7 +100,7 @@ class CHMReader(CHMFile):
|
|||||||
def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False):
|
def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False):
|
||||||
html_files = set([])
|
html_files = set([])
|
||||||
try:
|
try:
|
||||||
x = self.GetEncoding()
|
x = self.get_encoding()
|
||||||
codecs.lookup(x)
|
codecs.lookup(x)
|
||||||
enc = x
|
enc = x
|
||||||
except:
|
except:
|
||||||
|
@ -7,8 +7,6 @@ import os
|
|||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ptempfile import TemporaryDirectory
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
from calibre.utils.localization import get_lang
|
|
||||||
from calibre.utils.filenames import ascii_filename
|
|
||||||
from calibre.constants import filesystem_encoding
|
from calibre.constants import filesystem_encoding
|
||||||
|
|
||||||
class CHMInput(InputFormatPlugin):
|
class CHMInput(InputFormatPlugin):
|
||||||
@ -57,6 +55,7 @@ class CHMInput(InputFormatPlugin):
|
|||||||
mainpath = os.path.join(tdir, mainname)
|
mainpath = os.path.join(tdir, mainname)
|
||||||
|
|
||||||
metadata = get_metadata_from_reader(self._chm_reader)
|
metadata = get_metadata_from_reader(self._chm_reader)
|
||||||
|
encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
|
||||||
self._chm_reader.CloseCHM()
|
self._chm_reader.CloseCHM()
|
||||||
# print tdir, mainpath
|
# print tdir, mainpath
|
||||||
# from calibre import ipython
|
# from calibre import ipython
|
||||||
@ -64,15 +63,31 @@ class CHMInput(InputFormatPlugin):
|
|||||||
|
|
||||||
options.debug_pipeline = None
|
options.debug_pipeline = None
|
||||||
options.input_encoding = 'utf-8'
|
options.input_encoding = 'utf-8'
|
||||||
# try a custom conversion:
|
htmlpath, toc = self._create_html_root(mainpath, log, encoding)
|
||||||
#oeb = self._create_oebbook(mainpath, tdir, options, log, metadata)
|
|
||||||
# try using html converter:
|
|
||||||
htmlpath = self._create_html_root(mainpath, log)
|
|
||||||
oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
|
oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
|
||||||
options.debug_pipeline = odi
|
options.debug_pipeline = odi
|
||||||
#log.debug('DEBUG: Not removing tempdir %s' % tdir)
|
if toc.count() > 1:
|
||||||
|
oeb.toc = self.parse_html_toc(oeb.spine[0])
|
||||||
|
oeb.manifest.remove(oeb.spine[0])
|
||||||
|
oeb.auto_generated_toc = False
|
||||||
return oeb
|
return oeb
|
||||||
|
|
||||||
|
def parse_html_toc(self, item):
|
||||||
|
from calibre.ebooks.oeb.base import TOC, XPath
|
||||||
|
dx = XPath('./h:div')
|
||||||
|
ax = XPath('./h:a[1]')
|
||||||
|
|
||||||
|
def do_node(parent, div):
|
||||||
|
for child in dx(div):
|
||||||
|
a = ax(child)[0]
|
||||||
|
c = parent.add(a.text, a.attrib['href'])
|
||||||
|
do_node(c, child)
|
||||||
|
|
||||||
|
toc = TOC()
|
||||||
|
root = XPath('//h:div[1]')(item.data)[0]
|
||||||
|
do_node(toc, root)
|
||||||
|
return toc
|
||||||
|
|
||||||
def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
|
def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
|
||||||
# use HTMLInput plugin to generate book
|
# use HTMLInput plugin to generate book
|
||||||
from calibre.customize.builtins import HTMLInput
|
from calibre.customize.builtins import HTMLInput
|
||||||
@ -81,78 +96,22 @@ class CHMInput(InputFormatPlugin):
|
|||||||
oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
|
oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
|
||||||
return oeb
|
return oeb
|
||||||
|
|
||||||
|
def _create_html_root(self, hhcpath, log, encoding):
|
||||||
def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
|
|
||||||
import uuid
|
|
||||||
from lxml import html
|
|
||||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
|
||||||
from calibre.ebooks.oeb.base import DirContainer
|
|
||||||
oeb = create_oebbook(log, None, opts,
|
|
||||||
encoding=opts.input_encoding, populate=False)
|
|
||||||
self.oeb = oeb
|
|
||||||
|
|
||||||
metadata = oeb.metadata
|
|
||||||
if mi.title:
|
|
||||||
metadata.add('title', mi.title)
|
|
||||||
if mi.authors:
|
|
||||||
for a in mi.authors:
|
|
||||||
metadata.add('creator', a, attrib={'role':'aut'})
|
|
||||||
if mi.publisher:
|
|
||||||
metadata.add('publisher', mi.publisher)
|
|
||||||
if mi.isbn:
|
|
||||||
metadata.add('identifier', mi.isbn, attrib={'scheme':'ISBN'})
|
|
||||||
if not metadata.language:
|
|
||||||
oeb.logger.warn(u'Language not specified')
|
|
||||||
metadata.add('language', get_lang().replace('_', '-'))
|
|
||||||
if not metadata.creator:
|
|
||||||
oeb.logger.warn('Creator not specified')
|
|
||||||
metadata.add('creator', _('Unknown'))
|
|
||||||
if not metadata.title:
|
|
||||||
oeb.logger.warn('Title not specified')
|
|
||||||
metadata.add('title', _('Unknown'))
|
|
||||||
|
|
||||||
bookid = str(uuid.uuid4())
|
|
||||||
metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
|
|
||||||
for ident in metadata.identifier:
|
|
||||||
if 'id' in ident.attrib:
|
|
||||||
self.oeb.uid = metadata.identifier[0]
|
|
||||||
break
|
|
||||||
|
|
||||||
hhcdata = self._read_file(hhcpath)
|
|
||||||
hhcroot = html.fromstring(hhcdata)
|
|
||||||
chapters = self._process_nodes(hhcroot)
|
|
||||||
#print "============================="
|
|
||||||
#print "Printing hhcroot"
|
|
||||||
#print etree.tostring(hhcroot, pretty_print=True)
|
|
||||||
#print "============================="
|
|
||||||
log.debug('Found %d section nodes' % len(chapters))
|
|
||||||
|
|
||||||
if len(chapters) > 0:
|
|
||||||
path0 = chapters[0][1]
|
|
||||||
subpath = os.path.dirname(path0)
|
|
||||||
htmlpath = os.path.join(basedir, subpath)
|
|
||||||
|
|
||||||
oeb.container = DirContainer(htmlpath, log)
|
|
||||||
for chapter in chapters:
|
|
||||||
title = chapter[0]
|
|
||||||
basename = os.path.basename(chapter[1])
|
|
||||||
self._add_item(oeb, title, basename)
|
|
||||||
|
|
||||||
oeb.container = DirContainer(htmlpath, oeb.log)
|
|
||||||
return oeb
|
|
||||||
|
|
||||||
def _create_html_root(self, hhcpath, log):
|
|
||||||
from lxml import html
|
from lxml import html
|
||||||
from urllib import unquote as _unquote
|
from urllib import unquote as _unquote
|
||||||
from calibre.ebooks.oeb.base import urlquote
|
from calibre.ebooks.oeb.base import urlquote
|
||||||
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
hhcdata = self._read_file(hhcpath)
|
hhcdata = self._read_file(hhcpath)
|
||||||
|
hhcdata = hhcdata.decode(encoding)
|
||||||
|
hhcdata = xml_to_unicode(hhcdata, verbose=True,
|
||||||
|
strip_encoding_pats=True, resolve_entities=True)[0]
|
||||||
hhcroot = html.fromstring(hhcdata)
|
hhcroot = html.fromstring(hhcdata)
|
||||||
chapters = self._process_nodes(hhcroot)
|
toc = self._process_nodes(hhcroot)
|
||||||
#print "============================="
|
#print "============================="
|
||||||
#print "Printing hhcroot"
|
#print "Printing hhcroot"
|
||||||
#print etree.tostring(hhcroot, pretty_print=True)
|
#print etree.tostring(hhcroot, pretty_print=True)
|
||||||
#print "============================="
|
#print "============================="
|
||||||
log.debug('Found %d section nodes' % len(chapters))
|
log.debug('Found %d section nodes' % toc.count())
|
||||||
htmlpath = os.path.splitext(hhcpath)[0] + ".html"
|
htmlpath = os.path.splitext(hhcpath)[0] + ".html"
|
||||||
base = os.path.dirname(os.path.abspath(htmlpath))
|
base = os.path.dirname(os.path.abspath(htmlpath))
|
||||||
|
|
||||||
@ -168,37 +127,40 @@ class CHMInput(InputFormatPlugin):
|
|||||||
x = y
|
x = y
|
||||||
return x
|
return x
|
||||||
|
|
||||||
with open(htmlpath, 'wb') as f:
|
def donode(item, parent, base, subpath):
|
||||||
if chapters:
|
for child in item:
|
||||||
f.write('<html><head><meta http-equiv="Content-type"'
|
title = child.title
|
||||||
' content="text/html;charset=UTF-8" /></head><body>\n')
|
if not title: continue
|
||||||
path0 = chapters[0][1]
|
raw = unquote_path(child.href or '')
|
||||||
path0 = unquote_path(path0)
|
|
||||||
subpath = os.path.dirname(path0)
|
|
||||||
base = os.path.dirname(f.name)
|
|
||||||
|
|
||||||
for chapter in chapters:
|
|
||||||
title = chapter[0]
|
|
||||||
raw = unquote_path(chapter[1])
|
|
||||||
rsrcname = os.path.basename(raw)
|
rsrcname = os.path.basename(raw)
|
||||||
rsrcpath = os.path.join(subpath, rsrcname)
|
rsrcpath = os.path.join(subpath, rsrcname)
|
||||||
if (not os.path.exists(os.path.join(base, rsrcpath)) and
|
if (not os.path.exists(os.path.join(base, rsrcpath)) and
|
||||||
os.path.exists(os.path.join(base, raw))):
|
os.path.exists(os.path.join(base, raw))):
|
||||||
rsrcpath = raw
|
rsrcpath = raw
|
||||||
|
|
||||||
# title should already be url encoded
|
|
||||||
if '%' not in rsrcpath:
|
if '%' not in rsrcpath:
|
||||||
rsrcpath = urlquote(rsrcpath)
|
rsrcpath = urlquote(rsrcpath)
|
||||||
url = "<br /><a href=" + rsrcpath + ">" + title + " </a>\n"
|
if not raw:
|
||||||
if isinstance(url, unicode):
|
rsrcpath = ''
|
||||||
url = url.encode('utf-8')
|
c = DIV(A(title, href=rsrcpath))
|
||||||
f.write(url)
|
donode(child, c, base, subpath)
|
||||||
|
parent.append(c)
|
||||||
|
|
||||||
f.write("</body></html>")
|
with open(htmlpath, 'wb') as f:
|
||||||
|
if toc.count() > 1:
|
||||||
|
from lxml.html.builder import HTML, BODY, DIV, A
|
||||||
|
path0 = toc[0].href
|
||||||
|
path0 = unquote_path(path0)
|
||||||
|
subpath = os.path.dirname(path0)
|
||||||
|
base = os.path.dirname(f.name)
|
||||||
|
root = DIV()
|
||||||
|
donode(toc, root, base, subpath)
|
||||||
|
raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
|
||||||
|
pretty_print=True)
|
||||||
|
f.write(raw)
|
||||||
else:
|
else:
|
||||||
f.write(hhcdata)
|
f.write(hhcdata)
|
||||||
return htmlpath
|
return htmlpath, toc
|
||||||
|
|
||||||
|
|
||||||
def _read_file(self, name):
|
def _read_file(self, name):
|
||||||
f = open(name, 'rb')
|
f = open(name, 'rb')
|
||||||
@ -206,41 +168,27 @@ class CHMInput(InputFormatPlugin):
|
|||||||
f.close()
|
f.close()
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def _visit_node(self, node, chapters, depth):
|
def add_node(self, node, toc, ancestor_map):
|
||||||
# check that node is a normal node (not a comment, DOCTYPE, etc.)
|
|
||||||
# (normal nodes have string tags)
|
|
||||||
if isinstance(node.tag, basestring):
|
|
||||||
from calibre.ebooks.chm.reader import match_string
|
from calibre.ebooks.chm.reader import match_string
|
||||||
|
if match_string(node.attrib['type'], 'text/sitemap'):
|
||||||
chapter_path = None
|
p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
|
||||||
if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'):
|
parent = p[0] if p else None
|
||||||
chapter_title = None
|
toc = ancestor_map.get(parent, toc)
|
||||||
for child in node:
|
title = href = u''
|
||||||
if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'):
|
for param in node.xpath('./param'):
|
||||||
chapter_title = child.attrib['value']
|
if match_string(param.attrib['name'], 'name'):
|
||||||
if match_string(child.tag,'param') and match_string(child.attrib['name'],'local'):
|
title = param.attrib['value']
|
||||||
chapter_path = child.attrib['value']
|
elif match_string(param.attrib['name'], 'local'):
|
||||||
if chapter_title is not None and chapter_path is not None:
|
href = param.attrib['value']
|
||||||
chapter = [chapter_title, chapter_path, depth]
|
child = toc.add(title or _('Unknown'), href)
|
||||||
chapters.append(chapter)
|
ancestor_map[node] = child
|
||||||
if node.tag=="UL":
|
|
||||||
depth = depth + 1
|
|
||||||
if node.tag=="/UL":
|
|
||||||
depth = depth - 1
|
|
||||||
|
|
||||||
def _process_nodes(self, root):
|
def _process_nodes(self, root):
|
||||||
chapters = []
|
from calibre.ebooks.oeb.base import TOC
|
||||||
depth = 0
|
toc = TOC()
|
||||||
for node in root.iter():
|
ancestor_map = {}
|
||||||
self._visit_node(node, chapters, depth)
|
for node in root.xpath('//object'):
|
||||||
return chapters
|
self.add_node(node, toc, ancestor_map)
|
||||||
|
return toc
|
||||||
|
|
||||||
def _add_item(self, oeb, title, path):
|
|
||||||
bname = os.path.basename(path)
|
|
||||||
id, href = oeb.manifest.generate(id='html',
|
|
||||||
href=ascii_filename(bname))
|
|
||||||
item = oeb.manifest.add(id, href, 'text/html')
|
|
||||||
item.html_input_href = bname
|
|
||||||
oeb.spine.add(item, True)
|
|
||||||
oeb.toc.add(title, item.href)
|
|
||||||
|
|
||||||
|
@ -28,6 +28,7 @@
|
|||||||
import array
|
import array
|
||||||
import string
|
import string
|
||||||
import sys
|
import sys
|
||||||
|
import codecs
|
||||||
|
|
||||||
import calibre.utils.chm.chmlib as chmlib
|
import calibre.utils.chm.chmlib as chmlib
|
||||||
from calibre.constants import plugins
|
from calibre.constants import plugins
|
||||||
@ -184,7 +185,7 @@ locale_table = {
|
|||||||
0x0420 : ('iso8859_6', "Urdu", "Arabic"),
|
0x0420 : ('iso8859_6', "Urdu", "Arabic"),
|
||||||
0x0443 : ('iso8859_9', "Uzbek_Latin", "Turkish"),
|
0x0443 : ('iso8859_9', "Uzbek_Latin", "Turkish"),
|
||||||
0x0843 : ('cp1251', "Uzbek_Cyrillic", "Cyrillic"),
|
0x0843 : ('cp1251', "Uzbek_Cyrillic", "Cyrillic"),
|
||||||
0x042a : (None, "Vietnamese", "Vietnamese")
|
0x042a : ('cp1258', "Vietnamese", "Vietnamese")
|
||||||
}
|
}
|
||||||
|
|
||||||
class CHMFile:
|
class CHMFile:
|
||||||
@ -434,6 +435,19 @@ class CHMFile:
|
|||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def get_encoding(self):
|
||||||
|
ans = self.GetEncoding()
|
||||||
|
if ans is None:
|
||||||
|
lcid = self.GetLCID()
|
||||||
|
if lcid is not None:
|
||||||
|
ans = lcid[0]
|
||||||
|
if ans:
|
||||||
|
try:
|
||||||
|
codecs.lookup(ans)
|
||||||
|
except:
|
||||||
|
ans = None
|
||||||
|
return ans
|
||||||
|
|
||||||
def GetDWORD(self, buff, idx=0):
|
def GetDWORD(self, buff, idx=0):
|
||||||
'''Internal method.
|
'''Internal method.
|
||||||
Reads a double word (4 bytes) from a buffer.
|
Reads a double word (4 bytes) from a buffer.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user