Implement #4825 (CHM format)

2026-04-23 09:34:51 -04:00 · 2010-03-05 10:49:28 -07:00 · 2010-03-05 10:49:28 -07:00 · eae90e2ef4
commit eae90e2ef4
parent c7e8c889a4
5 changed files with 172 additions and 7 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -110,7 +110,7 @@ class CHMMetadataReader(MetadataReaderPlugin):
    description = _('Read metadata from %s files') % 'CHM'

    def get_metadata(self, stream, ftype):
-        from calibre.ebooks.metadata.chm import get_metadata
+        from calibre.ebooks.chm.metadata import get_metadata
        return get_metadata(stream)


--- a/src/calibre/ebooks/chm/input.py
+++ b/src/calibre/ebooks/chm/input.py
@ -25,15 +25,16 @@ class CHMInput(InputFormatPlugin):
        rdr = CHMReader(chm_path, log)
        log.debug('Extracting CHM to %s' % output_dir)
        rdr.extract_content(output_dir)
+        self._chm_reader = rdr
        return rdr.hhc_path


    def convert(self, stream, options, file_ext, log, accelerators):
-        from calibre.ebooks.metadata.chm import get_metadata_
+        from calibre.ebooks.chm.metadata import get_metadata_from_reader
        from calibre.customize.ui import plugin_for_input_format

        log.debug('Processing CHM...')
-        with TemporaryDirectory('chm2oeb') as tdir:
+        with TemporaryDirectory('_chm2oeb') as tdir:
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
@ -48,8 +49,9 @@ class CHMInput(InputFormatPlugin):
            log.debug('stream.name=%s' % stream.name)
            mainname = self._chmtohtml(tdir, chm_name, no_images, log)
            mainpath = os.path.join(tdir, mainname)
+            #raw_input()

-            metadata = get_metadata_(tdir)
+            metadata = get_metadata_from_reader(self._chm_reader)

            odi = options.debug_pipeline
            options.debug_pipeline = None
@ -170,6 +172,7 @@ class CHMInput(InputFormatPlugin):
        if isinstance(node.tag, basestring):
            from calibre.ebooks.chm.reader import match_string

+            chapter_path = None
            if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'):
                for child in node:
                    if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'):
--- a/src/calibre/ebooks/chm/metadata.py
+++ b/src/calibre/ebooks/chm/metadata.py
@ -0,0 +1,157 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.metadata import string_to_authors, MetaInformation
+from calibre.utils.logging import default_log
+from calibre.ptempfile import TemporaryFile
+
+def _clean(s):
+    '''Return ``s`` with each non-breaking space (U+00A0) turned into a plain space.'''
+    nbsp = u'\u00a0'
+    return s.replace(nbsp, u' ')
+
+def _detag(tag):
+    '''
+    Return the text held by ``tag`` with all markup removed.
+
+    ``tag`` is iterated; every element that exposes a ``contents``
+    attribute is treated as a nested tag and flattened recursively, any
+    other element is treated as text and passed through ``_clean``.
+    An empty ``tag`` yields an empty unicode string.
+    '''
+    # Collect the pieces and join them once instead of growing a string
+    # inside the loop; the accumulator is deliberately not called ``str``
+    # so that the builtin of that name is not shadowed.
+    parts = []
+    for elem in tag:
+        if hasattr(elem, "contents"):
+            parts.append(_detag(elem))
+        else:
+            parts.append(_clean(elem))
+    return u"".join(parts)
+
+
+def _metadata_from_table(soup, searchfor):
+    '''
+    Look up a metadata value that is laid out in an HTML table.
+
+    :param soup: parsed page to search.
+    :param searchfor: regular expression (matched case-insensitively)
+                      identifying the label text inside a ``<td>``.
+    :return: the value as stripped text, or None when no matching label
+             exists or the label has no cell holding its value.
+    '''
+    label = soup.find('td', text=re.compile(searchfor, flags=re.I))
+    if label is None:
+        return None
+    # ``find(..., text=...)`` hands back the matching text; the element
+    # we want to work with is its parent.
+    cell = label.parent
+    # There appear to be multiple ways of structuring the metadata
+    # on the home page. Cue some nasty special-case hacks...
+    if re.match(r'^\s*'+searchfor+r'\s*$', cell.renderContents(), flags=re.I):
+        # The cell contains nothing but the label, so the value is
+        # expected in the following cell.
+        value_cell = cell.findNextSibling('td')
+        if value_cell is None:
+            # A label without a value cell: report the field as missing
+            # instead of failing while trying to iterate over None.
+            return None
+        meta = _detag(value_cell)
+        return re.sub('^:', '', meta).strip()
+    # Label and value share a single cell: drop everything up to and
+    # including the first colon.
+    meta = _detag(cell)
+    return re.sub(r'^[^:]+:', '', meta).strip()
+
+def _metadata_from_span(soup, searchfor):
+    '''
+    Look up a metadata value stored in a ``<span>``.
+
+    The first span whose ``class`` attribute matches the regular
+    expression ``searchfor`` (case-insensitively) is rendered, stripped
+    and passed through ``_detag``. Returns None when no span matches.
+    '''
+    class_pattern = re.compile(searchfor, flags=re.I)
+    span = soup.find('span', {'class': class_pattern})
+    if span is not None:
+        # this metadata might need some cleaning up still :/
+        rendered = span.renderContents().strip()
+        return _detag(rendered)
+    return None
+
+def _get_authors(soup):
+    '''
+    Return the list of authors found on the home page.
+
+    A span whose class matches ``author`` is tried first, then a table
+    cell starting with "by". When neither lookup produces a value other
+    than None, a single translated 'Unknown' entry is returned.
+    '''
+    aut = _metadata_from_span(soup, r'author')
+    if not aut:
+        aut = _metadata_from_table(soup, r'^\s*by\s*:?\s+')
+    if aut is None:
+        return [_('Unknown')]
+    return string_to_authors(aut)
+
+def _get_publisher(soup):
+    '''
+    Return the publisher scraped from the home page.
+
+    A span whose class matches ``imprint`` wins; otherwise the result of
+    the table lookup for ``publisher`` is returned (possibly None).
+    '''
+    publisher = _metadata_from_span(soup, 'imprint')
+    if publisher:
+        return publisher
+    return _metadata_from_table(soup, 'publisher')
+
+def _get_isbn(soup):
+    '''
+    Return the ISBN scraped from the home page.
+
+    A span whose class matches ``isbn`` wins; otherwise the result of
+    the table lookup for ``isbn`` is returned (possibly None).
+    '''
+    isbn = _metadata_from_span(soup, 'isbn')
+    if isbn:
+        return isbn
+    return _metadata_from_table(soup, 'isbn')
+
+def _get_comments(soup):
+    '''
+    Build a short comment from the publication date and the page count.
+
+    :return: ``u'Published <date>, <n> pages.'`` or None when either the
+             date or the page count could not be found on the page.
+    '''
+    date = (_metadata_from_span(soup, 'cwdate')
+        or _metadata_from_table(soup, 'pub date'))
+    pages = ( _metadata_from_span(soup, 'pages')
+        or _metadata_from_table(soup, 'pages'))
+    # Test for the missing pieces explicitly rather than relying on a
+    # bare ``except``, which would also hide unrelated errors and
+    # swallow KeyboardInterrupt/SystemExit.
+    if date is None or pages is None:
+        return None
+    # The date span can have copyright symbols in it...
+    date = date.replace(u'\u00a9', '').strip()
+    # ...and pages often comes as '(\d+ pages)'.
+    page_count = re.search(r'\d+', pages)
+    if page_count is None:
+        return None
+    return u'Published %s, %s pages.' % (date, page_count.group(0))
+
+def _get_cover(soup, rdr):
+    '''
+    Locate the cover image referenced by the home page.
+
+    :param soup: parsed home page.
+    :param rdr: reader used to fetch the image; ``rdr.GetFile(path)`` is
+                tried with the path as found in the page and then with
+                the path prefixed by ``rdr.root``.
+    :return: the image re-encoded as JPEG data, or None when no cover
+             could be identified, fetched or decoded.
+    '''
+    ans = None
+    try:
+        ans = soup.find('img', alt=re.compile('cover', flags=re.I))['src']
+    except (TypeError, KeyError):
+        # Either no image has a handy alt text mentioning "cover"
+        # (TypeError from indexing None) or that image has no src
+        # (KeyError), so try some hackery.
+        # The basic idea is that, in general, the cover image has a
+        # height:width ratio of ~1.25, whereas most of the nav buttons
+        # are decidedly less than that. For every image we work out that
+        # ratio, take 1.25 off it and keep the absolute value. The image
+        # with the smallest value is most likely the cover, hopefully.
+        candidates = {}
+        for img in soup('img'):
+            src = img.get('src')
+            if not src:
+                # An image without a source cannot be fetched anyway.
+                continue
+            try:
+                score = abs(float(img['height'])/float(img['width'])-1.25)
+            except KeyError:
+                # interestingly, occasionally the only image without height
+                # or width attrs is the cover...
+                score = 0
+            except (TypeError, ValueError, ZeroDivisionError):
+                # Unusable height/width (e.g. '100%' or a zero width):
+                # ignore this image instead of aborting the whole search.
+                continue
+            candidates[score] = src
+        # A page without any usable image simply has no cover.
+        if candidates:
+            ans = candidates[min(candidates)]
+    # this link comes from the internal html, which is in a subdir
+    if ans is not None:
+        try:
+            ans = rdr.GetFile(ans)
+        except Exception:
+            # Best effort: retry relative to the reader's root, and give
+            # up quietly if that fails as well.
+            try:
+                ans = rdr.GetFile(rdr.root + "/" + ans)
+            except Exception:
+                ans = None
+        if ans is not None:
+            from PIL import Image
+            from cStringIO import StringIO
+            buf = StringIO()
+            try:
+                Image.open(StringIO(ans)).convert('RGB').save(buf, 'JPEG')
+                ans = buf.getvalue()
+            except Exception:
+                # Data that cannot be decoded as an image is not a cover.
+                ans = None
+    return ans
+
+
+def get_metadata_from_reader(rdr):
+    '''
+    Build a MetaInformation object from an open CHM reader.
+
+    The title is taken from ``rdr.title``; authors, publisher, ISBN,
+    comments and cover are scraped from the page ``rdr.home``. Optional
+    fields are only set when a value was found.
+    '''
+    raw = rdr.GetFile(rdr.home)
+    markup = xml_to_unicode(raw, strip_encoding_pats=True,
+        resolve_entities=True)[0]
+    home = BeautifulSoup(markup)
+
+    mi = MetaInformation(rdr.title, _get_authors(home))
+
+    # Same order as the attributes are looked up on the page; a field is
+    # left untouched when its getter returns something falsy.
+    optional_fields = (
+        ('publisher', _get_publisher),
+        ('isbn', _get_isbn),
+        ('comments', _get_comments),
+    )
+    for field, getter in optional_fields:
+        value = getter(home)
+        if value:
+            setattr(mi, field, value)
+
+    cover = _get_cover(home, rdr)
+    if cover is not None:
+        mi.cover_data = ('jpg', cover)
+
+    return mi
+
+def get_metadata(stream):
+    '''
+    Read metadata from a CHM file supplied as a readable stream.
+
+    The stream's contents are copied into a temporary ``.chm`` file, which
+    is opened with CHMReader and handed to ``get_metadata_from_reader``.
+    '''
+    with TemporaryFile('_chm_metadata.chm') as fname:
+        with open(fname, 'wb') as out:
+            data = stream.read()
+            out.write(data)
+        # Imported here, as in the rest of this module's callers, rather
+        # than at module level.
+        from calibre.ebooks.chm.reader import CHMReader
+        reader = CHMReader(fname, default_log)
+        return get_metadata_from_reader(reader)
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@ -135,8 +135,13 @@ class CHMReader(CHMFile):
                if guess_mimetype(path)[0] == ('text/html'):
                    data = self._reformat(data)
                f.write(data)
-        #subprocess.call(['extract_chmLib.exe', self._sourcechm, output_dir])
        self._extracted = True
+        files = os.listdir(output_dir)
+        if self.hhc_path not in files:
+            for f in files:
+                if f.lower() == self.hhc_path.lower():
+                    self.hhc_path = f
+                    break

    def _reformat(self, data):
        try:
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@ -20,7 +20,7 @@ What formats does |app| support conversion to/from?
 |app| supports the conversion of many input formats to many output formats.
 It can convert every input format in the following list, to every output format.

-*Input Formats:* CBZ, CBR, CBC, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT
+*Input Formats:* CBZ, CBR, CBC, CHM, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT

 *Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PML, RB, PDF, TCR, TXT

@ -191,7 +191,7 @@ Library Management

 What formats does |app| read metadata from?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-|app| reads metadata from the following formats: LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI
+|app| reads metadata from the following formats: CHM, LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI

 Where are the book files stored?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~