From eae90e2ef409e1a8fffa9bd904beb9764edffc85 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 5 Mar 2010 10:49:28 -0700
Subject: [PATCH] Implement #4825 (CHM format)

---
 src/calibre/customize/builtins.py  |   2 +-
 src/calibre/ebooks/chm/input.py    |   9 +-
 src/calibre/ebooks/chm/metadata.py | 157 +++++++++++++++++++++++++++++
 src/calibre/ebooks/chm/reader.py   |   7 +-
 src/calibre/manual/faq.rst         |   4 +-
 5 files changed, 172 insertions(+), 7 deletions(-)
 create mode 100644 src/calibre/ebooks/chm/metadata.py

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 0ba197fac3..391b7d22e6 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -110,7 +110,7 @@ class CHMMetadataReader(MetadataReaderPlugin):
     description = _('Read metadata from %s files') % 'CHM'
 
     def get_metadata(self, stream, ftype):
-        from calibre.ebooks.metadata.chm import get_metadata
+        from calibre.ebooks.chm.metadata import get_metadata
         return get_metadata(stream)
 
 
diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py
index e2a270f2b8..3f0aa21f08 100644
--- a/src/calibre/ebooks/chm/input.py
+++ b/src/calibre/ebooks/chm/input.py
@@ -25,15 +25,16 @@ class CHMInput(InputFormatPlugin):
         rdr = CHMReader(chm_path, log)
         log.debug('Extracting CHM to %s' % output_dir)
         rdr.extract_content(output_dir)
+        self._chm_reader = rdr
         return rdr.hhc_path
 
 
     def convert(self, stream, options, file_ext, log, accelerators):
-        from calibre.ebooks.metadata.chm import get_metadata_
+        from calibre.ebooks.chm.metadata import get_metadata_from_reader
         from calibre.customize.ui import plugin_for_input_format
 
         log.debug('Processing CHM...')
-        with TemporaryDirectory('chm2oeb') as tdir:
+        with TemporaryDirectory('_chm2oeb') as tdir:
             html_input = plugin_for_input_format('html')
             for opt in html_input.options:
                 setattr(options, opt.option.name, opt.recommended_value)
@@ -48,8 +49,9 @@ class CHMInput(InputFormatPlugin):
             log.debug('stream.name=%s' % stream.name)
             mainname = self._chmtohtml(tdir, chm_name, no_images, log)
             mainpath = os.path.join(tdir, mainname)
+            # Metadata is read through the CHM reader, not from tdir.
 
-            metadata = get_metadata_(tdir)
+            metadata = get_metadata_from_reader(self._chm_reader)
 
             odi = options.debug_pipeline
             options.debug_pipeline = None
@@ -170,6 +172,7 @@ class CHMInput(InputFormatPlugin):
         if isinstance(node.tag, basestring):
             from calibre.ebooks.chm.reader import match_string
 
+            chapter_path = None
             if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'):
                 for child in node:
                     if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'):
diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py
new file mode 100644
index 0000000000..7386d54658
--- /dev/null
+++ b/src/calibre/ebooks/chm/metadata.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.metadata import string_to_authors, MetaInformation
+from calibre.utils.logging import default_log
+from calibre.ptempfile import TemporaryFile
+
+def _clean(s):  # turn non-breaking spaces (U+00A0) into plain spaces
+    return u' '.join(s.split(u'\u00a0'))
+
+def _detag(tag):
+    # Flatten a node to text: recurse into children that have ``contents``
+    # and pass every other child through _clean.
+    parts = []
+    for elem in tag:
+        parts.append(_detag(elem) if hasattr(elem, "contents")
+                     else _clean(elem))
+    return u"".join(parts)
+
+
+def _metadata_from_table(soup, searchfor):
+    # Look up `searchfor` (case-insensitively) on the page and return the
+    # value stored with it, or None when no usable value is found.
+    td = soup.find('td', text=re.compile(searchfor, flags=re.I))
+    if td is None:
+        return None
+    td = td.parent
+    # Label alone in its element: the value is in the next <td>, if any.
+    if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(), flags=re.I):
+        td = td.findNextSibling('td')
+        return None if td is None else re.sub('^:', '', _detag(td)).strip()
+    # Otherwise label and value share the element: drop up to the first ':'.
+    return re.sub(r'^[^:]+:', '', _detag(td)).strip()
+
+def _metadata_from_span(soup, searchfor):
+    # Text of the first <span> whose class matches `searchfor`, else None.
+    span = soup.find('span', {'class': re.compile(searchfor, flags=re.I)})
+    if span is None:
+        return None
+    return _detag(span).strip()  # walk the node so nested markup is dropped
+
+def _get_authors(soup):
+    # Authors from the home page, or [_('Unknown')] when none are found.
+    aut = (_metadata_from_span(soup, r'author')
+        or _metadata_from_table(soup, r'^\s*by\s*:?\s+'))
+    if not aut:  # None or empty text: keep the placeholder author
+        return [_('Unknown')]
+    return string_to_authors(aut)
+
+def _get_publisher(soup):  # 'imprint' span first, then the table lookup
+    found = _metadata_from_span(soup, 'imprint')
+    return found or _metadata_from_table(soup, 'publisher')
+
+def _get_isbn(soup):  # 'isbn' span first, then the table lookup
+    found = _metadata_from_span(soup, 'isbn')
+    return found or _metadata_from_table(soup, 'isbn')
+
+def _get_comments(soup):
+    # Build a 'Published <date>, <n> pages.' note from the home page, or
+    # return None unless both a date and a page count were found.
+    date = (_metadata_from_span(soup, 'cwdate')
+        or _metadata_from_table(soup, 'pub date'))
+    pages = ( _metadata_from_span(soup, 'pages')
+        or _metadata_from_table(soup, 'pages'))
+    # pages often comes as '(\d+ pages)', so keep only the first number
+    count = re.search(r'\d+', pages) if date and pages else None
+    if count is None:
+        return None
+    # date span can have copyright symbols in it...
+    date = date.replace(u'\u00a9', '').strip()
+    return u'Published %s, %s pages.' % (date, count.group(0))
+
+def _get_cover(soup, rdr):
+    # JPEG data of the likeliest home-page cover image, or None on any failure.
+    img = soup.find('img', alt=re.compile('cover', flags=re.I))
+    if img is not None:
+        ans = img.get('src')
+    else:
+        # meeehh, no handy alt-tag goodness, try some hackery: the cover
+        # usually has a height:width ratio of ~1.25, nav buttons decidedly
+        # less. Key each image by abs(ratio - 1.25); the smallest key is
+        # most likely the cover image, hopefully.
+        r = {}
+        for img in soup('img'):
+            src = img.get('src')
+            if not src:
+                continue
+            try:
+                r[abs(float(img['height'])/float(img['width'])-1.25)] = src
+            except KeyError:
+                # occasionally the only image without height/width is the cover
+                r[0] = src
+            except (ValueError, ZeroDivisionError):
+                pass  # unusable dimensions: not a cover candidate
+        ans = r[min(r)] if r else None
+    if not ans:
+        return None
+    # this link comes from the internal html, which is in a subdir
+    try:
+        data = rdr.GetFile(ans)
+    except Exception:
+        try:
+            data = rdr.GetFile(rdr.root + "/" + ans)
+        except Exception:
+            data = None
+    if data is None:
+        return None
+    from PIL import Image
+    from cStringIO import StringIO
+    buf = StringIO()
+    try:
+        Image.open(StringIO(data)).convert('RGB').save(buf, 'JPEG')
+    except Exception:
+        return None
+    return buf.getvalue()
+
+
+def get_metadata_from_reader(rdr):
+    # Build a MetaInformation from a CHM reader: the title comes from the
+    # reader itself, the other fields are scraped from its home page.
+    raw = rdr.GetFile(rdr.home)
+    markup = xml_to_unicode(raw, strip_encoding_pats=True,
+        resolve_entities=True)[0]
+    home = BeautifulSoup(markup)
+
+    mi = MetaInformation(rdr.title, _get_authors(home))
+
+    # Optional text fields are only set when a non-empty value was found.
+    for field, getter in (('publisher', _get_publisher),
+                          ('isbn', _get_isbn),
+                          ('comments', _get_comments)):
+        value = getter(home)
+        if value:
+            setattr(mi, field, value)
+
+    cover = _get_cover(home, rdr)
+    if cover is not None:
+        mi.cover_data = ('jpg', cover)
+
+    return mi
+
+def get_metadata(stream):
+    # Copy the stream to a temporary .chm file and read it with CHMReader.
+    with TemporaryFile('_chm_metadata.chm') as fname:
+        with open(fname, 'wb') as out:
+            out.write(stream.read())
+        from calibre.ebooks.chm.reader import CHMReader
+        return get_metadata_from_reader(CHMReader(fname, default_log))
diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py
index 33272e9695..412ca94d8a 100644
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@@ -135,8 +135,13 @@ class CHMReader(CHMFile):
                 if guess_mimetype(path)[0] == ('text/html'):
                     data = self._reformat(data)
                 f.write(data)
-        #subprocess.call(['extract_chmLib.exe', self._sourcechm, output_dir])
         self._extracted = True
+        files = os.listdir(output_dir)
+        if self.hhc_path not in files:
+            for f in files:
+                if f.lower() == self.hhc_path.lower():
+                    self.hhc_path = f
+                    break
 
     def _reformat(self, data):
         try:
diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index 043c8d7041..eff65fdb7b 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -20,7 +20,7 @@ What formats does |app| support conversion to/from?
 |app| supports the conversion of many input formats to many output formats.
 It can convert every input format in the following list, to every output format.
 
-*Input Formats:* CBZ, CBR, CBC, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT
+*Input Formats:* CBZ, CBR, CBC, CHM, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT
 
 *Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PML, RB, PDF, TCR, TXT
 
@@ -191,7 +191,7 @@ Library Management
 
 What formats does |app| read metadata from?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-|app| reads metadata from the following formats: LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI
+|app| reads metadata from the following formats: CHM, LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI
 
 Where are the book files stored?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~