Implement #4825 (CHM format)

2025-07-09 03:04:10 -04:00 · 2010-03-05 10:49:28 -07:00 · 2010-03-05 10:49:28 -07:00 · eae90e2ef4
commit eae90e2ef4
parent c7e8c889a4
5 changed files with 172 additions and 7 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -110,7 +110,7 @@ class CHMMetadataReader(MetadataReaderPlugin):
    description = _('Read metadata from %s files') % 'CHM'
    def get_metadata(self, stream, ftype):
-        from calibre.ebooks.metadata.chm import get_metadata
+        from calibre.ebooks.chm.metadata import get_metadata
        return get_metadata(stream)
--- a/src/calibre/ebooks/chm/input.py
+++ b/src/calibre/ebooks/chm/input.py
@ -25,15 +25,16 @@ class CHMInput(InputFormatPlugin):
        rdr = CHMReader(chm_path, log)
        log.debug('Extracting CHM to %s' % output_dir)
        rdr.extract_content(output_dir)
        self._chm_reader = rdr
        return rdr.hhc_path
    def convert(self, stream, options, file_ext, log, accelerators):
-        from calibre.ebooks.metadata.chm import get_metadata_
+        from calibre.ebooks.chm.metadata import get_metadata_from_reader
        from calibre.customize.ui import plugin_for_input_format
        log.debug('Processing CHM...')
-        with TemporaryDirectory('chm2oeb') as tdir:
+        with TemporaryDirectory('_chm2oeb') as tdir:
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
@ -48,8 +49,9 @@ class CHMInput(InputFormatPlugin):
            log.debug('stream.name=%s' % stream.name)
            mainname = self._chmtohtml(tdir, chm_name, no_images, log)
            mainpath = os.path.join(tdir, mainname)
            #raw_input()
-            metadata = get_metadata_(tdir)
+            metadata = get_metadata_from_reader(self._chm_reader)
            odi = options.debug_pipeline
            options.debug_pipeline = None
@ -170,6 +172,7 @@ class CHMInput(InputFormatPlugin):
        if isinstance(node.tag, basestring):
            from calibre.ebooks.chm.reader import match_string
            chapter_path = None
            if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'):
                for child in node:
                    if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'):
--- a/src/calibre/ebooks/chm/metadata.py
+++ b/src/calibre/ebooks/chm/metadata.py
@ -0,0 +1,157 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import re
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.metadata import string_to_authors, MetaInformation
 from calibre.utils.logging import default_log
 from calibre.ptempfile import TemporaryFile
 def _clean(s):
    """Return *s* with every non-breaking space (U+00A0) replaced by a space."""
    nbsp, plain_space = u'\u00a0', u' '
    return s.replace(nbsp, plain_space)
 def _detag(tag):
    """Return the text of *tag* with the markup stripped out.

    Iterates over *tag*; any element that has a ``contents`` attribute is
    flattened recursively, every other element is passed through
    ``_clean``.  The pieces are concatenated into one unicode string (an
    empty iterable gives ``u""``).
    """
    # Collect the pieces and join once, rather than growing a string with
    # ``+=`` under a local name that shadows the ``str`` builtin.
    pieces = []
    for elem in tag:
        if hasattr(elem, "contents"):
            pieces.append(_detag(elem))
        else:
            pieces.append(_clean(elem))
    return u"".join(pieces)
 def _metadata_from_table(soup, searchfor):
    """Look up a labelled value in the table cells of the parsed home page.

    *searchfor* is a regular expression fragment for the label; it is
    matched case-insensitively.  Returns the stripped value text, or
    ``None`` when no matching label is found or when the label has no
    following ``<td>`` to read the value from.
    """
    label = soup.find('td', text=re.compile(searchfor, flags=re.I))
    if label is None:
        return None
    cell = label.parent
    # There appear to be multiple ways of structuring the metadata
    # on the home page, hence the special-case handling below.
    if re.match(r'^\s*'+searchfor+r'\s*$', cell.renderContents(), flags=re.I):
        # The cell contains nothing but the label, so the value is
        # expected in the next cell of the row.
        value_cell = cell.findNextSibling('td')
        if value_cell is None:
            # Label sits in the last cell: there is no value to extract,
            # and iterating None in _detag would raise TypeError.
            return None
        meta = _detag(value_cell)
        return re.sub('^:', '', meta).strip()
    # Label and value share one cell: drop everything up to and
    # including the first colon.
    meta = _detag(cell)
    return re.sub(r'^[^:]+:', '', meta).strip()
 def _metadata_from_span(soup, searchfor):
    """Return the content of the first ``<span>`` whose class matches *searchfor*.

    The class attribute is matched case-insensitively against the regular
    expression *searchfor*.  The span's rendered contents are stripped and
    passed through ``_detag``; ``None`` is returned when no span matches.
    """
    class_pattern = re.compile(searchfor, flags=re.I)
    found = soup.find('span', {'class': class_pattern})
    if found is not None:
        # this metadata might need some cleaning up still :/
        rendered = found.renderContents()
        return _detag(rendered.strip())
    return None
 def _get_authors(soup):
    """Return the list of authors named on the home page.

    An 'author' span is tried first, then a table cell labelled 'by'.
    When neither lookup yields a value (the result is ``None``) the
    single-entry list ``[_('Unknown')]`` is returned; otherwise the text
    is split with ``string_to_authors``.
    """
    fallback = [_('Unknown')]
    raw = _metadata_from_span(soup, r'author')
    if not raw:
        raw = _metadata_from_table(soup, r'^\s*by\s*:?\s+')
    if raw is None:
        return fallback
    return string_to_authors(raw)
 def _get_publisher(soup):
    """Return the publisher: an 'imprint' span if truthy, else a 'publisher' table cell."""
    from_span = _metadata_from_span(soup, 'imprint')
    if from_span:
        return from_span
    return _metadata_from_table(soup, 'publisher')
 def _get_isbn(soup):
    """Return the ISBN: an 'isbn' span if truthy, else an 'isbn' table cell."""
    from_span = _metadata_from_span(soup, 'isbn')
    if from_span:
        return from_span
    return _metadata_from_table(soup, 'isbn')
 def _get_comments(soup):
    """Build a short comment from the publication date and page count.

    Both values are looked up first as a span ('cwdate' / 'pages') and
    then as a table cell ('pub date' / 'pages').  Returns
    ``u'Published <date>, <n> pages.'``, or ``None`` when either value is
    missing or the page text contains no digits.
    """
    date = (_metadata_from_span(soup, 'cwdate')
        or _metadata_from_table(soup, 'pub date'))
    pages = ( _metadata_from_span(soup, 'pages')
        or _metadata_from_table(soup, 'pages'))
    # Test for the missing-data cases explicitly instead of relying on a
    # bare ``except`` to swallow whatever they happen to raise.
    if date is None or pages is None:
        return None
    # date span can have copyright symbols in it...
    date = date.replace(u'\u00a9', '').strip()
    # and pages often comes as '(\d+ pages)'
    page_count = re.search(r'\d+', pages)
    if page_count is None:
        return None
    return u'Published %s, %s pages.' % (date, page_count.group(0))
 def _guess_cover_src(soup):
    """Guess which ``<img>`` is the cover and return its ``src`` (or None).

    The basic idea is that in general the cover image has a height:width
    ratio of ~1.25, whereas most of the nav buttons are decidedly less
    than that.  Each image is scored by how far its ratio is from 1.25
    and the image with the smallest score wins.
    """
    candidates = {}
    for img in soup('img'):
        try:
            src = img['src']
        except KeyError:
            # an image without a source can never be used as the cover
            continue
        try:
            score = abs(float(img['height'])/float(img['width'])-1.25)
        except KeyError:
            # interestingly, occasionally the only image without height
            # or width attrs is the cover...
            score = 0
        except (TypeError, ValueError, ZeroDivisionError):
            # unusable height/width values (e.g. '100%' or '0'): ignore
            # this image rather than abort the whole metadata read
            continue
        candidates[score] = src
    if not candidates:
        # no usable image on the page at all
        return None
    return candidates[min(candidates)]

 def _get_cover(soup, rdr):
    """Return the cover image of the CHM as JPEG data, or ``None``.

    An ``<img>`` whose ``alt`` text contains 'cover' is preferred; failing
    that the cover is guessed from the image proportions.  The image is
    fetched with ``rdr.GetFile`` (retrying below ``rdr.root``) and
    re-encoded as an RGB JPEG with PIL.  ``None`` is returned when no
    candidate is found, the file cannot be fetched, or the conversion
    fails.
    """
    ans = None
    tagged = soup.find('img', alt=re.compile('cover', flags=re.I))
    if tagged is not None:
        try:
            ans = tagged['src']
        except KeyError:
            ans = None
    if ans is None:
        # meeehh, no handy alt-tag goodness, try some hackery
        ans = _guess_cover_src(soup)
    if ans is None:
        return None
    # this link comes from the internal html, which is in a subdir
    try:
        data = rdr.GetFile(ans)
    except Exception:
        try:
            data = rdr.GetFile(rdr.root + "/" + ans)
        except Exception:
            data = None
    if data is None:
        return None
    from PIL import Image
    from cStringIO import StringIO
    buf = StringIO()
    try:
        Image.open(StringIO(data)).convert('RGB').save(buf, 'JPEG')
    except Exception:
        # unreadable or unsupported image data: treat as "no cover"
        return None
    return buf.getvalue()
 def get_metadata_from_reader(rdr):
    """Build a ``MetaInformation`` object from an opened CHM reader.

    The title comes from ``rdr.title``; everything else is scraped from
    the home page (``rdr.home``), which is decoded with ``xml_to_unicode``
    and parsed with BeautifulSoup.  Publisher, ISBN and comments are set
    only when a truthy value was found, the cover only when it is not
    ``None``.
    """
    decoded = xml_to_unicode(rdr.GetFile(rdr.home), strip_encoding_pats=True,
        resolve_entities=True)[0]
    home = BeautifulSoup(decoded)
    mi = MetaInformation(rdr.title, _get_authors(home))
    optional_fields = (
        ('publisher', _get_publisher),
        ('isbn', _get_isbn),
        ('comments', _get_comments),
    )
    for field, getter in optional_fields:
        value = getter(home)
        if value:
            setattr(mi, field, value)
    cover = _get_cover(home, rdr)
    if cover is not None:
        mi.cover_data = ('jpg', cover)
    return mi
 def get_metadata(stream):
    """Read metadata from a CHM file supplied as the file-like *stream*.

    The stream's contents are written to a temporary file, which is then
    opened with ``CHMReader`` and handed to ``get_metadata_from_reader``.
    """
    with TemporaryFile('_chm_metadata.chm') as chm_path:
        with open(chm_path, 'wb') as tmp:
            tmp.write(stream.read())
        from calibre.ebooks.chm.reader import CHMReader
        reader = CHMReader(chm_path, default_log)
        return get_metadata_from_reader(reader)
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@ -135,8 +135,13 @@ class CHMReader(CHMFile):
                if guess_mimetype(path)[0] == ('text/html'):
                    data = self._reformat(data)
                f.write(data)
        #subprocess.call(['extract_chmLib.exe', self._sourcechm, output_dir])
        self._extracted = True
        files = os.listdir(output_dir)
        if self.hhc_path not in files:
            for f in files:
                if f.lower() == self.hhc_path.lower():
                    self.hhc_path = f
                    break
    def _reformat(self, data):
        try:
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@ -20,7 +20,7 @@ What formats does |app| support conversion to/from?
 |app| supports the conversion of many input formats to many output formats.
 It can convert every input format in the following list, to every output format.
-*Input Formats:* CBZ, CBR, CBC, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT
+*Input Formats:* CBZ, CBR, CBC, CHM, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT
 *Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PML, RB, PDF, TCR, TXT
@ -191,7 +191,7 @@ Library Management
 What formats does |app| read metadata from?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-|app| reads metadata from the following formats: LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI
+|app| reads metadata from the following formats: CHM, LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI
 Where are the book files stored?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~