Integrate patched pychm into calibre build process

2025-07-09 03:04:10 -04:00 · 2010-02-27 22:15:25 -07:00 · 2010-02-27 22:15:25 -07:00 · 2f437907a3
commit 2f437907a3
parent 95f655585b
16 changed files with 3377 additions and 235 deletions
--- a/setup/build_environment.py
+++ b/setup/build_environment.py
@ -89,6 +89,7 @@ fc_inc = '/usr/include/fontconfig'
 fc_lib = '/usr/lib'
 podofo_inc = '/usr/include/podofo'
 podofo_lib = '/usr/lib'
 chmlib_inc_dirs = chmlib_lib_dirs = []
 if iswindows:
    prefix  = r'C:\cygwin\home\kovid\sw'
@ -96,6 +97,10 @@ if iswindows:
    sw_lib_dir  = os.path.join(prefix, 'lib')
    fc_inc = os.path.join(sw_inc_dir, 'fontconfig')
    fc_lib = sw_lib_dir
    chmlib_inc_dirs = consolidate('CHMLIB_INC_DIR', os.path.join(prefix,
        'build', 'chmlib-0.40', 'src'))
    chmlib_lib_dirs = consolidate('CHMLIB_LIB_DIR', os.path.join(prefix,
        'build', 'chmlib-0.40', 'src', 'Release'))
    png_inc_dirs = [sw_inc_dir]
    png_lib_dirs = [sw_lib_dir]
    png_libs = ['png12']
--- a/setup/extensions.py
+++ b/setup/extensions.py
@ -12,12 +12,13 @@ from distutils import sysconfig
 from PyQt4.pyqtconfig import QtGuiModuleMakefile
 from setup import Command, islinux, isfreebsd, isosx, SRC, iswindows
-from setup.build_environment import fc_inc, fc_lib, \
+from setup.build_environment import fc_inc, fc_lib, chmlib_inc_dirs, \
        fc_error, poppler_libs, poppler_lib_dirs, poppler_inc_dirs, podofo_inc, \
        podofo_lib, podofo_error, poppler_error, pyqt, OSX_SDK, NMAKE, \
        QMAKE, msvc, MT, win_inc, win_lib, png_inc_dirs, win_ddk, \
        magick_inc_dirs, magick_lib_dirs, png_lib_dirs, png_libs, \
-        magick_error, magick_libs, ft_lib_dirs, ft_libs, jpg_libs, jpg_lib_dirs
+        magick_error, magick_libs, ft_lib_dirs, ft_libs, jpg_libs, \
        jpg_lib_dirs, chmlib_lib_dirs
 MT
 isunix = islinux or isosx or isfreebsd
@ -56,6 +57,22 @@ if iswindows:
    pdfreflow_libs = ['advapi32', 'User32', 'Gdi32']
 extensions = [
    Extension('chmlib',
            ['calibre/utils/chm/swig_chm.c'],
            libraries=['ChmLib' if iswindows else 'chm'],
            inc_dirs=chmlib_inc_dirs,
            lib_dirs=chmlib_lib_dirs,
            cflags=["-DSWIG_COBJECT_TYPES"]),
    Extension('chm_extra',
            ['calibre/utils/chm/extra.c'],
            libraries=['ChmLib' if iswindows else 'chm'],
            inc_dirs=chmlib_inc_dirs,
            lib_dirs=chmlib_lib_dirs,
            cflags=["-D__PYTHON__"]),
    Extension('pdfreflow',
                reflow_sources,
                headers=reflow_headers,
--- a/setup/installer/linux/freeze.py
+++ b/setup/installer/linux/freeze.py
@ -42,6 +42,7 @@ class LinuxFreeze(Command):
                        '/usr/lib/liblcms.so.1',
                        '/tmp/calibre-mount-helper',
                        '/usr/lib/libunrar.so',
                        '/usr/lib/libchm.so.0',
                        '/usr/lib/libsqlite3.so.0',
                        '/usr/lib/libsqlite3.so.0',
                        '/usr/lib/libmng.so.1',
--- a/setup/installer/osx/app/main.py
+++ b/setup/installer/osx/app/main.py
@ -459,7 +459,7 @@ class Py2App(object):
    @flush
    def add_misc_libraries(self):
-        for x in ('usb', 'unrar', 'readline.6.0', 'wmflite-0.2.7'):
+        for x in ('usb', 'unrar', 'readline.6.0', 'wmflite-0.2.7', 'chm.0'):
            info('\nAdding', x)
            x = 'lib%s.dylib'%x
            shutil.copy2(join(SW, 'lib', x), self.frameworks_dir)
--- a/src/calibre/constants.py
+++ b/src/calibre/constants.py
@ -57,7 +57,8 @@ if plugins is None:
        sys.path.insert(0, plugin_path)
        for plugin in ['pictureflow', 'lzx', 'msdes', 'podofo', 'cPalmdoc',
-            'fontconfig', 'pdfreflow', 'progress_indicator'] + \
+            'fontconfig', 'pdfreflow', 'progress_indicator', 'chmlib',
            'chm_extra'] + \
                    (['winutil'] if iswindows else []) + \
                    (['usbobserver'] if isosx else []):
            try:
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -103,6 +103,17 @@ class ComicMetadataReader(MetadataReaderPlugin):
            mi.cover_data = (ext.lower(), data)
        return mi
 class CHMMetadataReader(MetadataReaderPlugin):
    """Metadata reader plugin declared for CHM files."""
    # Display name of the plugin.
    name        = 'Read CHM metadata'
    # File extensions this reader is declared for.
    file_types  = set(['chm'])
    # Translated template; the format name is substituted after translation.
    description = _('Read metadata from %s files') % 'CHM'
    def get_metadata(self, stream, ftype):
        """Return whatever the CHM ``get_metadata`` helper yields for ``stream``.

        ``ftype`` is accepted but not used by this method.
        """
        # Imported at call time rather than when this module is imported.
        from calibre.ebooks.metadata.chm import get_metadata
        return get_metadata(stream)
 class EPUBMetadataReader(MetadataReaderPlugin):
    name        = 'Read EPUB metadata'
@ -384,6 +395,7 @@ from calibre.ebooks.rtf.input import RTFInput
 from calibre.ebooks.tcr.input import TCRInput
 from calibre.ebooks.txt.input import TXTInput
 from calibre.ebooks.lrf.input import LRFInput
 from calibre.ebooks.chm.input import CHMInput
 from calibre.ebooks.epub.output import EPUBOutput
 from calibre.ebooks.fb2.output import FB2Output
@ -444,6 +456,7 @@ plugins += [
    TCRInput,
    TXTInput,
    LRFInput,
    CHMInput,
 ]
 plugins += [
    EPUBOutput,
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@ -15,7 +15,7 @@ class ANDROID(USBMS):
    supported_platforms = ['windows', 'osx', 'linux']
    # Ordered list of supported formats
-    FORMATS     = ['epub']
+    FORMATS     = ['epub', 'pdf']
    VENDOR_ID   = {
            0x0bb4 : { 0x0c02 : [0x100], 0x0c01 : [0x100]},
--- a/src/calibre/ebooks/chm/input.py
+++ b/src/calibre/ebooks/chm/input.py
@ -1,213 +1,18 @@
 from __future__ import with_statement
 ''' CHM File decoding support '''
 __license__ = 'GPL v3'
 __copyright__  = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
                 ' and Alex Bramley <a.bramley at gmail.com>.'
-import os, shutil, uuid, re
+import os, uuid
 from tempfile import mkdtemp
 from mimetypes import guess_type as guess_mimetype
 from BeautifulSoup import BeautifulSoup, NavigableString
 from lxml import html
 from pychm.chm import CHMFile
 from pychm.chmlib import (
  CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL,
  chm_enumerate,
 )
-from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.customize.conversion import InputFormatPlugin
-from calibre.utils.config import OptionParser
+from calibre.ebooks.chm.reader import CHMReader, match_string
-from calibre.ebooks.metadata.toc import TOC
+from calibre.ptempfile import TemporaryDirectory
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
 def match_string(s1, s2_already_lowered):
    if s1 is not None and s2_already_lowered is not None:
        if s1.lower()==s2_already_lowered:
            return True
    return False
 def check_all_prev_empty(tag):
    if tag is None:
        return True
    if tag.__class__ == NavigableString and not check_empty(tag):
        return False
    return check_all_prev_empty(tag.previousSibling)
 def check_empty(s, rex = re.compile(r'\S')):
    return rex.search(s) is None
 def option_parser():
    parser = OptionParser(usage=_('%prog [options] mybook.chm'))
    parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output')
    parser.add_option('--verbose', default=False, action='store_true', dest='verbose')
    parser.add_option("-t", "--title", action="store", type="string", \
                    dest="title", help=_("Set the book title"))
    parser.add_option('--title-sort', action='store', type='string', default=None,
                      dest='title_sort', help=_('Set sort key for the title'))
    parser.add_option("-a", "--author", action="store", type="string", \
                    dest="author", help=_("Set the author"))
    parser.add_option('--author-sort', action='store', type='string', default=None,
                      dest='author_sort', help=_('Set sort key for the author'))
    parser.add_option("-c", "--category", action="store", type="string", \
                    dest="category", help=_("The category this book belongs"
                    " to. E.g.: History"))
    parser.add_option("--thumbnail", action="store", type="string", \
                    dest="thumbnail", help=_("Path to a graphic that will be"
                    " set as this files' thumbnail"))
    parser.add_option("--comment", action="store", type="string", \
                    dest="freetext", help=_("Path to a txt file containing a comment."))
    parser.add_option("--get-thumbnail", action="store_true", \
                    dest="get_thumbnail", default=False, \
                    help=_("Extract thumbnail from LRF file"))
    parser.add_option('--publisher', default=None, help=_('Set the publisher'))
    parser.add_option('--classification', default=None, help=_('Set the book classification'))
    parser.add_option('--creator', default=None, help=_('Set the book creator'))
    parser.add_option('--producer', default=None, help=_('Set the book producer'))
    parser.add_option('--get-cover', action='store_true', default=False,
                      help=_('Extract cover from LRF file. Note that the LRF format has no defined cover, so we use some heuristics to guess the cover.'))
    parser.add_option('--bookid', action='store', type='string', default=None,
                      dest='book_id', help=_('Set book ID'))
    parser.add_option('--font-delta', action='store', type='int', default=0,
                      dest='font_delta', help=_('Set font delta'))
    return parser
 class CHMError(Exception):
    pass
 class CHMReader(CHMFile):
    def __init__(self, input, log):
        CHMFile.__init__(self)
        if not self.LoadCHM(input):
            raise CHMError("Unable to open CHM file '%s'"%(input,))
        self.log = log
        self._sourcechm = input
        self._contents = None
        self._playorder = 0
        self._metadata = False
        self._extracted = False
        # location of '.hhc' file, which is the CHM TOC.
        self.root, ext = os.path.splitext(self.topics.lstrip('/'))
        self.hhc_path = self.root + ".hhc"
    def _parse_toc(self, ul, basedir=os.getcwdu()):
        toc = TOC(play_order=self._playorder, base_path=basedir, text='')
        self._playorder += 1
        for li in ul('li', recursive=False):
            href = li.object('param', {'name': 'Local'})[0]['value']
            if href.count('#'):
                href, frag = href.split('#')
            else:
                frag = None
            name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
            #print "========>", name
            toc.add_item(href, frag, name, play_order=self._playorder)
            self._playorder += 1
            if li.ul:
               child = self._parse_toc(li.ul)
               child.parent = toc
               toc.append(child)
        #print toc
        return toc
    def GetFile(self, path):
        # have to have abs paths for ResolveObject, but Contents() deliberately
        # makes them relative. So we don't have to worry, re-add the leading /.
        # note this path refers to the internal CHM structure
        if path[0] != '/':
            path = '/' + path
        res, ui = self.ResolveObject(path)
        if res != CHM_RESOLVE_SUCCESS:
            raise CHMError("Unable to locate '%s' within CHM file '%s'"%(path, self.filename))
        size, data = self.RetrieveObject(ui)
        if size == 0:
            raise CHMError("'%s' is zero bytes in length!"%(path,))
        return data
    def ExtractFiles(self, output_dir=os.getcwdu()):
        for path in self.Contents():
            lpath = os.path.join(output_dir, path)
            self._ensure_dir(lpath)
            data = self.GetFile(path)
            with open(lpath, 'wb') as f:
                if guess_mimetype(path)[0] == ('text/html'):
                    data = self._reformat(data)
                f.write(data)
        #subprocess.call(['extract_chmLib.exe', self._sourcechm, output_dir])
        self._extracted = True
    def _reformat(self, data):
        try:
            soup = BeautifulSoup(data)
        except UnicodeEncodeError:
            # hit some strange encoding problems...
            print "Unable to parse html for cleaning, leaving it :("
            return data
        # nuke javascript...
        [s.extract() for s in soup('script')]
        # remove forward and back nav bars from the top/bottom of each page
        # cos they really fuck with the flow of things and generally waste space
        # since we can't use [a,b] syntax to select arbitrary items from a list
        # we'll have to do this manually...
        t = soup('table')
        if t:
            if (t[0].previousSibling is None
              or t[0].previousSibling.previousSibling is None):
                t[0].extract()
            if (t[-1].nextSibling is None
              or t[-1].nextSibling.nextSibling is None):
                t[-1].extract()
        # for some very odd reason each page's content appears to be in a table
        # too. and this table has sub-tables for random asides... grr.
        # remove br at top of page if present after nav bars removed
        br = soup('br')
        if br:
            if check_all_prev_empty(br[0].previousSibling):
                br[0].extract()
        # some images seem to be broken in some chm's :/
        for img in soup('img'):
            try:
                # some are supposedly "relative"... lies.
                while img['src'].startswith('../'): img['src'] = img['src'][3:]
                # some have ";<junk>" at the end.
                img['src'] = img['src'].split(';')[0]
            except KeyError:
                # and some don't even have a src= ?!
                pass
        # now give back some pretty html.
        return soup.prettify()
    def Contents(self):
        if self._contents is not None:
            return self._contents
        paths = []
        def get_paths(chm, ui, ctx):
            # skip directories
            # note this path refers to the internal CHM structure
            if ui.path[-1] != '/':
                # and make paths relative
                paths.append(ui.path.lstrip('/'))
        chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None)
        self._contents = paths
        return self._contents
    def _ensure_dir(self, path):
        dir = os.path.dirname(path)
        if not os.path.isdir(dir):
            os.makedirs(dir)
    def extract_content(self, output_dir=os.getcwdu()):
        self.ExtractFiles(output_dir=output_dir)
 class CHMInput(InputFormatPlugin):
    name        = 'CHM Input'
@ -215,11 +20,6 @@ class CHMInput(InputFormatPlugin):
    description = 'Convert CHM files to OEB'
    file_types  = set(['chm'])
    options = set([
        OptionRecommendation(name='dummy_option', recommended_value=False,
            help=_('dummy option until real options are determined.')),
    ])
    def _chmtohtml(self, output_dir, chm_path, no_images, log):
        log.debug('Opening CHM file')
        rdr = CHMReader(chm_path, log)
@ -230,37 +30,36 @@ class CHMInput(InputFormatPlugin):
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.metadata.chm import get_metadata_
        from calibre.customize.ui import plugin_for_input_format
        log.debug('Processing CHM...')
-        tdir = mkdtemp(prefix='chm2oeb_')
+        with TemporaryDirectory('chm2oeb') as tdir:
-        from calibre.customize.ui import plugin_for_input_format
+            html_input = plugin_for_input_format('html')
-        html_input = plugin_for_input_format('html')
+            for opt in html_input.options:
-        for opt in html_input.options:
+                setattr(options, opt.option.name, opt.recommended_value)
-            setattr(options, opt.option.name, opt.recommended_value)
+            options.input_encoding = 'utf-8'
-        options.input_encoding = 'utf-8'
+            no_images = False #options.no_images
-        no_images = False #options.no_images
+            chm_name = stream.name
-        chm_name = stream.name
+            #chm_data = stream.read()
        #chm_data = stream.read()
-        #closing stream so CHM can be opened by external library
+            #closing stream so CHM can be opened by external library
-        stream.close()
+            stream.close()
-        log.debug('tdir=%s' % tdir)
+            log.debug('tdir=%s' % tdir)
-        log.debug('stream.name=%s' % stream.name)
+            log.debug('stream.name=%s' % stream.name)
-        mainname = self._chmtohtml(tdir, chm_name, no_images, log)
+            mainname = self._chmtohtml(tdir, chm_name, no_images, log)
-        mainpath = os.path.join(tdir, mainname)
+            mainpath = os.path.join(tdir, mainname)
-        metadata = get_metadata_(tdir)
+            metadata = get_metadata_(tdir)
-        odi = options.debug_pipeline
+            odi = options.debug_pipeline
-        options.debug_pipeline = None
+            options.debug_pipeline = None
-        # try a custom conversion:
+            # try a custom conversion:
-        #oeb = self._create_oebbook(mainpath, tdir, options, log, metadata)
+            #oeb = self._create_oebbook(mainpath, tdir, options, log, metadata)
-        # try using html converter:
+            # try using html converter:
-        htmlpath = self._create_html_root(mainpath, log)
+            htmlpath = self._create_html_root(mainpath, log)
-        oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
+            oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
-        options.debug_pipeline = odi
+            options.debug_pipeline = odi
-        #log.debug('DEBUG: Not removing tempdir %s' % tdir)
+            #log.debug('DEBUG: Not removing tempdir %s' % tdir)
        shutil.rmtree(tdir)
        return oeb
    def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@ -0,0 +1,207 @@
 from __future__ import with_statement
 ''' CHM File decoding support '''
 __license__ = 'GPL v3'
 __copyright__  = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
                 ' and Alex Bramley <a.bramley at gmail.com>.'
 import os, re
 from mimetypes import guess_type as guess_mimetype
 from BeautifulSoup import BeautifulSoup, NavigableString
 from calibre.utils.chm.chm import CHMFile
 from calibre.utils.chm.chmlib import (
  CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL,
  chm_enumerate,
 )
 from calibre.utils.config import OptionParser
 from calibre.ebooks.metadata.toc import TOC
 def match_string(s1, s2_already_lowered):
    """Case-insensitively compare ``s1`` with an already lower-cased string.

    Only ``s1`` is lower-cased here; ``s2_already_lowered`` is compared as
    given.  Returns False when either argument is None.
    """
    if s1 is None or s2_already_lowered is None:
        return False
    return True if s1.lower() == s2_already_lowered else False
 def check_all_prev_empty(tag):
    """Return True when ``tag`` and every sibling before it is blank.

    Walks backwards along the ``previousSibling`` chain starting at ``tag``.
    Only nodes whose class is exactly ``NavigableString`` are inspected; the
    first such node for which ``check_empty`` is false makes the result
    False.  All other nodes are skipped.  A ``None`` tag yields True.
    """
    # Iterate instead of recursing once per sibling, so a long flat run of
    # siblings cannot exhaust the interpreter's recursion limit.
    node = tag
    while node is not None:
        if node.__class__ == NavigableString and not check_empty(node):
            return False
        node = node.previousSibling
    return True
 def check_empty(s, rex = re.compile(r'\S')):
    """Return True when ``rex`` finds nothing in ``s``.

    With the default pattern this means ``s`` contains no non-whitespace
    character.  The pattern is compiled once, when the function is defined.
    """
    found = rex.search(s)
    return found is None
 def option_parser():
    """Build and return the command-line ``OptionParser`` for CHM handling.

    The usage string is ``%prog [options] mybook.chm``.  Every help text is
    passed through ``_()`` for translation.  The parser is only constructed
    here; nothing is parsed.
    """
    parser = OptionParser(usage=_('%prog [options] mybook.chm'))
    # Output location and verbosity.
    parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output')
    parser.add_option('--verbose', default=False, action='store_true', dest='verbose')
    # Title / author values and their sort keys.
    parser.add_option("-t", "--title", action="store", type="string", \
                    dest="title", help=_("Set the book title"))
    parser.add_option('--title-sort', action='store', type='string', default=None,
                      dest='title_sort', help=_('Set sort key for the title'))
    parser.add_option("-a", "--author", action="store", type="string", \
                    dest="author", help=_("Set the author"))
    parser.add_option('--author-sort', action='store', type='string', default=None,
                      dest='author_sort', help=_('Set sort key for the author'))
    parser.add_option("-c", "--category", action="store", type="string", \
                    dest="category", help=_("The category this book belongs"
                    " to. E.g.: History"))
    parser.add_option("--thumbnail", action="store", type="string", \
                    dest="thumbnail", help=_("Path to a graphic that will be"
                    " set as this files' thumbnail"))
    # Note the destination name differs from the flag: --comment -> freetext.
    parser.add_option("--comment", action="store", type="string", \
                    dest="freetext", help=_("Path to a txt file containing a comment."))
    # NOTE(review): this help text and the --get-cover one below mention LRF,
    # not CHM; looks copied from an LRF tool -- confirm before rewording,
    # since the literals are translation keys.
    parser.add_option("--get-thumbnail", action="store_true", \
                    dest="get_thumbnail", default=False, \
                    help=_("Extract thumbnail from LRF file"))
    parser.add_option('--publisher', default=None, help=_('Set the publisher'))
    parser.add_option('--classification', default=None, help=_('Set the book classification'))
    parser.add_option('--creator', default=None, help=_('Set the book creator'))
    parser.add_option('--producer', default=None, help=_('Set the book producer'))
    parser.add_option('--get-cover', action='store_true', default=False,
                      help=_('Extract cover from LRF file. Note that the LRF format has no defined cover, so we use some heuristics to guess the cover.'))
    parser.add_option('--bookid', action='store', type='string', default=None,
                      dest='book_id', help=_('Set book ID'))
    # The only integer-typed option; defaults to 0.
    parser.add_option('--font-delta', action='store', type='int', default=0,
                      dest='font_delta', help=_('Set font delta'))
    return parser
 class CHMError(Exception):
    """Error raised for CHM files or CHM entries that cannot be read."""
 class CHMReader(CHMFile):
    def __init__(self, input, log):
        CHMFile.__init__(self)
        if not self.LoadCHM(input):
            raise CHMError("Unable to open CHM file '%s'"%(input,))
        self.log = log
        self._sourcechm = input
        self._contents = None
        self._playorder = 0
        self._metadata = False
        self._extracted = False
        # location of '.hhc' file, which is the CHM TOC.
        self.root, ext = os.path.splitext(self.topics.lstrip('/'))
        self.hhc_path = self.root + ".hhc"
    def _parse_toc(self, ul, basedir=os.getcwdu()):
        toc = TOC(play_order=self._playorder, base_path=basedir, text='')
        self._playorder += 1
        for li in ul('li', recursive=False):
            href = li.object('param', {'name': 'Local'})[0]['value']
            if href.count('#'):
                href, frag = href.split('#')
            else:
                frag = None
            name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
            #print "========>", name
            toc.add_item(href, frag, name, play_order=self._playorder)
            self._playorder += 1
            if li.ul:
               child = self._parse_toc(li.ul)
               child.parent = toc
               toc.append(child)
        #print toc
        return toc
    def GetFile(self, path):
        # have to have abs paths for ResolveObject, but Contents() deliberately
        # makes them relative. So we don't have to worry, re-add the leading /.
        # note this path refers to the internal CHM structure
        if path[0] != '/':
            path = '/' + path
        res, ui = self.ResolveObject(path)
        if res != CHM_RESOLVE_SUCCESS:
            raise CHMError("Unable to locate '%s' within CHM file '%s'"%(path, self.filename))
        size, data = self.RetrieveObject(ui)
        if size == 0:
            raise CHMError("'%s' is zero bytes in length!"%(path,))
        return data
    def ExtractFiles(self, output_dir=os.getcwdu()):
        for path in self.Contents():
            lpath = os.path.join(output_dir, path)
            self._ensure_dir(lpath)
            data = self.GetFile(path)
            with open(lpath, 'wb') as f:
                if guess_mimetype(path)[0] == ('text/html'):
                    data = self._reformat(data)
                f.write(data)
        #subprocess.call(['extract_chmLib.exe', self._sourcechm, output_dir])
        self._extracted = True
    def _reformat(self, data):
        try:
            soup = BeautifulSoup(data)
        except UnicodeEncodeError:
            # hit some strange encoding problems...
            print "Unable to parse html for cleaning, leaving it :("
            return data
        # nuke javascript...
        [s.extract() for s in soup('script')]
        # remove forward and back nav bars from the top/bottom of each page
        # cos they really fuck with the flow of things and generally waste space
        # since we can't use [a,b] syntax to select arbitrary items from a list
        # we'll have to do this manually...
        t = soup('table')
        if t:
            if (t[0].previousSibling is None
              or t[0].previousSibling.previousSibling is None):
                t[0].extract()
            if (t[-1].nextSibling is None
              or t[-1].nextSibling.nextSibling is None):
                t[-1].extract()
        # for some very odd reason each page's content appears to be in a table
        # too. and this table has sub-tables for random asides... grr.
        # remove br at top of page if present after nav bars removed
        br = soup('br')
        if br:
            if check_all_prev_empty(br[0].previousSibling):
                br[0].extract()
        # some images seem to be broken in some chm's :/
        for img in soup('img'):
            try:
                # some are supposedly "relative"... lies.
                while img['src'].startswith('../'): img['src'] = img['src'][3:]
                # some have ";<junk>" at the end.
                img['src'] = img['src'].split(';')[0]
            except KeyError:
                # and some don't even have a src= ?!
                pass
        # now give back some pretty html.
        return soup.prettify()
    def Contents(self):
        if self._contents is not None:
            return self._contents
        paths = []
        def get_paths(chm, ui, ctx):
            # skip directories
            # note this path refers to the internal CHM structure
            if ui.path[-1] != '/':
                # and make paths relative
                paths.append(ui.path.lstrip('/'))
        chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None)
        self._contents = paths
        return self._contents
    def _ensure_dir(self, path):
        dir = os.path.dirname(path)
        if not os.path.isdir(dir):
            os.makedirs(dir)
    def extract_content(self, output_dir=os.getcwdu()):
        self.ExtractFiles(output_dir=output_dir)
--- a/src/calibre/utils/chm/__init__.py
+++ b/src/calibre/utils/chm/__init__.py
@ -0,0 +1,34 @@
 ## Copyright (C) 2003-2006 Rubens Ramos <rubensr@users.sourceforge.net>
 ## pychm is free software; you can redistribute it and/or
 ## modify it under the terms of the GNU General Public License as
 ## published by the Free Software Foundation; either version 2 of the
 ## License, or (at your option) any later version.
 ## This program is distributed in the hope that it will be useful,
 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ## General Public License for more details.
 ## You should have received a copy of the GNU General Public
 ## License along with this program; see the file COPYING.  If not,
 ## write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 ## Boston, MA 02111-1307, USA
 ## $Id: __init__.py,v 1.8 2006/06/18 10:50:43 rubensr Exp $
 '''
   chm - A package to manipulate CHM files
   The chm package provides four modules: chm, chmlib, extra and
   _chmlib. _chmlib and chmlib are very low level libraries generated
   from  SWIG interface files, and are simple wrappers around the API
   defined by the C library chmlib.
   The extra module adds full-text search support.
   the chm module provides some higher level classes to simplify
   access to the CHM files information.
 '''
 __all__ = ["chm", "chmlib", "_chmlib", "extra"]
 __version__ = "0.8.4"
 __revision__ = "$Id: __init__.py,v 1.8 2006/06/18 10:50:43 rubensr Exp $"
--- a/src/calibre/utils/chm/chm.py
+++ b/src/calibre/utils/chm/chm.py
@ -0,0 +1,512 @@
 ## Copyright (C) 2003-2006 Rubens Ramos <rubensr@users.sourceforge.net>
 ## Based on code by:
 ## Copyright (C) 2003  Razvan Cojocaru <razvanco@gmx.net>
 ## pychm is free software; you can redistribute it and/or
 ## modify it under the terms of the GNU General Public License as
 ## published by the Free Software Foundation; either version 2 of the
 ## License, or (at your option) any later version.
 ## This program is distributed in the hope that it will be useful,
 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ## General Public License for more details.
 ## You should have received a copy of the GNU General Public
 ## License along with this program; see the file COPYING.  If not,
 ## write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 ## Boston, MA 02111-1307, USA
 ## $Id: chm.py,v 1.12 2006/08/07 12:31:51 rubensr Exp $
 '''
   chm - A high-level front end for the chmlib python module.
   The chm module provides high level access to the functionality
   included in chmlib. It encapsulates functions in the CHMFile class, and
   provides some additional features, such as the ability to obtain
   the contents tree of a CHM archive.
 '''
 import array
 import string
 import sys
 import calibre.utils.chm.chmlib as chmlib
 from calibre.constants import plugins
 # The 'chm_extra' plugin entry unpacks into the loaded module and an error
 # string; a non-empty error string means the module could not be loaded,
 # and this module is unusable without it, so fail at import time.
 extra, extra_err = plugins['chm_extra']
 if extra_err:
    raise RuntimeError('Failed to load chm.extra: '+extra_err)
 # Maps a numeric charset code (the constant named in each trailing comment)
 # to the name of the Python codec used to decode text in that charset.
 # A value of None means no codec is mapped for that charset.
 # CHMFile.GetEncoding looks codes up in this table.
 charset_table = {
    0   : 'iso8859_1',  # ANSI_CHARSET
    238 : 'iso8859_2',  # EASTEUROPE_CHARSET
    178 : 'iso8859_6',  # ARABIC_CHARSET
    161 : 'iso8859_7',  # GREEK_CHARSET
    177 : 'iso8859_8',  # HEBREW_CHARSET
    162 : 'iso8859_9',  # TURKISH_CHARSET
    222 : 'iso8859_11', # THAI_CHARSET - hmm not in python 2.2...
    186 : 'iso8859_13', # BALTIC_CHARSET
    204 : 'cp1251',     # RUSSIAN_CHARSET
    255 : 'cp437',      # OEM_CHARSET
    128 : 'cp932',      # SHIFTJIS_CHARSET
    134 : 'cp936',      # GB2312_CHARSET
    129 : 'cp949',      # HANGUL_CHARSET
    136 : 'cp950',      # CHINESEBIG5_CHARSET
    1   : None,         # DEFAULT_CHARSET
    2   : None,         # SYMBOL_CHARSET
    130 : None,         # JOHAB_CHARSET
    163 : None,         # VIETNAMESE_CHARSET
    77  : None,         # MAC_CHARSET
 }
 # Maps a numeric locale id (LCID) to a 3-tuple:
 #   (Python codec name or None, locale name, script/region group).
 # A codec of None means no codec is mapped for that locale.
 # CHMFile.GetLCID returns the tuple stored here for the archive's lcid.
 locale_table = {
    0x0436 : ('iso8859_1', "Afrikaans", "Western Europe & US"),
    0x041c : ('iso8859_2', "Albanian", "Central Europe"),
    0x0401 : ('iso8859_6', "Arabic_Saudi_Arabia", "Arabic"),
    0x0801 : ('iso8859_6', "Arabic_Iraq", "Arabic"),
    0x0c01 : ('iso8859_6', "Arabic_Egypt", "Arabic"),
    0x1001 : ('iso8859_6', "Arabic_Libya", "Arabic"),
    0x1401 : ('iso8859_6', "Arabic_Algeria", "Arabic"),
    0x1801 : ('iso8859_6', "Arabic_Morocco", "Arabic"),
    0x1c01 : ('iso8859_6', "Arabic_Tunisia", "Arabic"),
    0x2001 : ('iso8859_6', "Arabic_Oman", "Arabic"),
    0x2401 : ('iso8859_6', "Arabic_Yemen", "Arabic"),
    0x2801 : ('iso8859_6', "Arabic_Syria", "Arabic"),
    0x2c01 : ('iso8859_6', "Arabic_Jordan", "Arabic"),
    0x3001 : ('iso8859_6', "Arabic_Lebanon", "Arabic"),
    0x3401 : ('iso8859_6', "Arabic_Kuwait", "Arabic"),
    0x3801 : ('iso8859_6', "Arabic_UAE", "Arabic"),
    0x3c01 : ('iso8859_6', "Arabic_Bahrain", "Arabic"),
    0x4001 : ('iso8859_6', "Arabic_Qatar", "Arabic"),
    0x042b : (None,        "Armenian","Armenian"),
    0x042c : ('iso8859_9', "Azeri_Latin", "Turkish"),
    0x082c : ('cp1251',    "Azeri_Cyrillic", "Cyrillic"),
    0x042d : ('iso8859_1', "Basque", "Western Europe & US"),
    0x0423 : ('cp1251',    "Belarusian", "Cyrillic"),
    0x0402 : ('cp1251',    "Bulgarian", "Cyrillic"),
    0x0403 : ('iso8859_1', "Catalan", "Western Europe & US"),
    0x0404 : ('cp950',     "Chinese_Taiwan", "Traditional Chinese"),
    0x0804 : ('cp936',     "Chinese_PRC", "Simplified Chinese"),
    0x0c04 : ('cp950',     "Chinese_Hong_Kong", "Traditional Chinese"),
    0x1004 : ('cp936',     "Chinese_Singapore", "Simplified Chinese"),
    0x1404 : ('cp950',     "Chinese_Macau", "Traditional Chinese"),
    0x041a : ('iso8859_2', "Croatian", "Central Europe"),
    0x0405 : ('iso8859_2', "Czech", "Central Europe"),
    0x0406 : ('iso8859_1', "Danish", "Western Europe & US"),
    0x0413 : ('iso8859_1', "Dutch_Standard", "Western Europe & US"),
    0x0813 : ('iso8859_1', "Dutch_Belgian", "Western Europe & US"),
    0x0409 : ('iso8859_1', "English_United_States", "Western Europe & US"),
    0x0809 : ('iso8859_1', "English_United_Kingdom", "Western Europe & US"),
    0x0c09 : ('iso8859_1', "English_Australian", "Western Europe & US"),
    0x1009 : ('iso8859_1', "English_Canadian", "Western Europe & US"),
    0x1409 : ('iso8859_1', "English_New_Zealand", "Western Europe & US"),
    0x1809 : ('iso8859_1', "English_Irish", "Western Europe & US"),
    0x1c09 : ('iso8859_1', "English_South_Africa", "Western Europe & US"),
    0x2009 : ('iso8859_1', "English_Jamaica", "Western Europe & US"),
    0x2409 : ('iso8859_1', "English_Caribbean", "Western Europe & US"),
    0x2809 : ('iso8859_1', "English_Belize", "Western Europe & US"),
    0x2c09 : ('iso8859_1', "English_Trinidad", "Western Europe & US"),
    0x3009 : ('iso8859_1', "English_Zimbabwe", "Western Europe & US"),
    0x3409 : ('iso8859_1', "English_Philippines", "Western Europe & US"),
    0x0425 : ('iso8859_13',"Estonian", "Baltic",),
    0x0438 : ('iso8859_1', "Faeroese", "Western Europe & US"),
    0x0429 : ('iso8859_6', "Farsi", "Arabic"),
    0x040b : ('iso8859_1', "Finnish", "Western Europe & US"),
    0x040c : ('iso8859_1', "French_Standard", "Western Europe & US"),
    0x080c : ('iso8859_1', "French_Belgian", "Western Europe & US"),
    0x0c0c : ('iso8859_1', "French_Canadian", "Western Europe & US"),
    0x100c : ('iso8859_1', "French_Swiss", "Western Europe & US"),
    0x140c : ('iso8859_1', "French_Luxembourg", "Western Europe & US"),
    0x180c : ('iso8859_1', "French_Monaco", "Western Europe & US"),
    0x0437 : (None,        "Georgian", "Georgian"),
    0x0407 : ('iso8859_1', "German_Standard", "Western Europe & US"),
    0x0807 : ('iso8859_1', "German_Swiss", "Western Europe & US"),
    0x0c07 : ('iso8859_1', "German_Austrian", "Western Europe & US"),
    0x1007 : ('iso8859_1', "German_Luxembourg", "Western Europe & US"),
    0x1407 : ('iso8859_1', "German_Liechtenstein", "Western Europe & US"),
    0x0408 : ('iso8859_7', "Greek", "Greek"),
    0x040d : ('iso8859_8', "Hebrew", "Hebrew"),
    0x0439 : (None,        "Hindi", "Indic"),
    0x040e : ('iso8859_2', "Hungarian", "Central Europe"),
    0x040f : ('iso8859_1', "Icelandic", "Western Europe & US"),
    0x0421 : ('iso8859_1', "Indonesian", "Western Europe & US"),
    0x0410 : ('iso8859_1', "Italian_Standard", "Western Europe & US"),
    0x0810 : ('iso8859_1', "Italian_Swiss", "Western Europe & US"),
    0x0411 : ('cp932',     "Japanese", "Japanese"),
    0x043f : ('cp1251',    "Kazakh", "Cyrillic"),
    0x0457 : (None,        "Konkani", "Indic"),
    0x0412 : ('cp949',     "Korean", "Korean"),
    0x0426 : ('iso8859_13',"Latvian", "Baltic",),
    0x0427 : ('iso8859_13',"Lithuanian", "Baltic",),
    0x042f : ('cp1251',    "Macedonian", "Cyrillic"),
    0x043e : ('iso8859_1', "Malay_Malaysia", "Western Europe & US"),
    0x083e : ('iso8859_1', "Malay_Brunei_Darussalam", "Western Europe & US"),
    0x044e : (None,        "Marathi", "Indic"),
    0x0414 : ('iso8859_1', "Norwegian_Bokmal", "Western Europe & US"),
    0x0814 : ('iso8859_1', "Norwegian_Nynorsk", "Western Europe & US"),
    0x0415 : ('iso8859_2', "Polish", "Central Europe"),
    0x0416 : ('iso8859_1', "Portuguese_Brazilian", "Western Europe & US"),
    0x0816 : ('iso8859_1', "Portuguese_Standard", "Western Europe & US"),
    0x0418 : ('iso8859_2', "Romanian", "Central Europe"),
    0x0419 : ('cp1251',    "Russian", "Cyrillic"),
    0x044f : (None,        "Sanskrit", "Indic"),
    0x081a : ('iso8859_2', "Serbian_Latin", "Central Europe"),
    0x0c1a : ('cp1251',    "Serbian_Cyrillic", "Cyrillic"),
    0x041b : ('iso8859_2', "Slovak", "Central Europe"),
    0x0424 : ('iso8859_2', "Slovenian", "Central Europe"),
    0x040a : ('iso8859_1', "Spanish_Trad_Sort", "Western Europe & US"),
    0x080a : ('iso8859_1', "Spanish_Mexican", "Western Europe & US"),
    0x0c0a : ('iso8859_1', "Spanish_Modern_Sort", "Western Europe & US"),
    0x100a : ('iso8859_1', "Spanish_Guatemala", "Western Europe & US"),
    0x140a : ('iso8859_1', "Spanish_Costa_Rica", "Western Europe & US"),
    0x180a : ('iso8859_1', "Spanish_Panama", "Western Europe & US"),
    0x1c0a : ('iso8859_1', "Spanish_Dominican_Repub", "Western Europe & US"),
    0x200a : ('iso8859_1', "Spanish_Venezuela", "Western Europe & US"),
    0x240a : ('iso8859_1', "Spanish_Colombia", "Western Europe & US"),
    0x280a : ('iso8859_1', "Spanish_Peru", "Western Europe & US"),
    0x2c0a : ('iso8859_1', "Spanish_Argentina", "Western Europe & US"),
    0x300a : ('iso8859_1', "Spanish_Ecuador", "Western Europe & US"),
    0x340a : ('iso8859_1', "Spanish_Chile", "Western Europe & US"),
    0x380a : ('iso8859_1', "Spanish_Uruguay", "Western Europe & US"),
    0x3c0a : ('iso8859_1', "Spanish_Paraguay", "Western Europe & US"),
    0x400a : ('iso8859_1', "Spanish_Bolivia", "Western Europe & US"),
    0x440a : ('iso8859_1', "Spanish_El_Salvador", "Western Europe & US"),
    0x480a : ('iso8859_1', "Spanish_Honduras", "Western Europe & US"),
    0x4c0a : ('iso8859_1', "Spanish_Nicaragua", "Western Europe & US"),
    0x500a : ('iso8859_1', "Spanish_Puerto_Rico", "Western Europe & US"),
    0x0441 : ('iso8859_1', "Swahili", "Western Europe & US"),
    0x041d : ('iso8859_1', "Swedish", "Western Europe & US"),
    0x081d : ('iso8859_1', "Swedish_Finland", "Western Europe & US"),
    0x0449 : (None,        "Tamil", "Indic"),
    0x0444 : ('cp1251',    "Tatar", "Cyrillic"),
    0x041e : ('iso8859_11',"Thai", "Thai"),
    0x041f : ('iso8859_9', "Turkish", "Turkish"),
    0x0422 : ('cp1251',    "Ukrainian", "Cyrillic"),
    0x0420 : ('iso8859_6', "Urdu", "Arabic"),
    0x0443 : ('iso8859_9', "Uzbek_Latin", "Turkish"),
    0x0843 : ('cp1251',    "Uzbek_Cyrillic", "Cyrillic"),
    0x042a : (None,        "Vietnamese", "Vietnamese")
 }
 class CHMFile:
    "A class to manage access to CHM files."
    # Class-level defaults; instance attributes shadow them once an archive
    # has been loaded.
    filename = ""       # archive name given to LoadCHM ('' when closed)
    file = None         # handle returned by chmlib.chm_open (None when closed)
    title = ""
    home = "/"
    index = None
    topics = None
    encoding = None
    lcid = None
    binaryindex = None
    def __init__(self):
        self.searchable = 0
    def LoadCHM(self, archiveName):
        '''Loads a CHM archive.
        This function will also call GetArchiveInfo to obtain information
        such as the index file name and the topics file. It returns 1 on
        success, and 0 if it fails.
        '''
        # Test the handle rather than the name: ``filename`` defaults to ""
        # and is therefore never None, so a check on it is always true and
        # would "close" an archive that was never opened.
        if self.file is not None:
            self.CloseCHM()
        self.file = chmlib.chm_open(archiveName)
        if self.file is None:
            return 0
        self.filename = archiveName
        self.GetArchiveInfo()
        return 1
    def CloseCHM(self):
        '''Closes the CHM archive.
        This function will close the CHM file, if it is open. All variables
        are also reset.
        '''
        # Only hand a real handle to chmlib; resetting the fields is always
        # safe and keeps the object reusable.
        if self.file is not None:
            chmlib.chm_close(self.file)
        self.file = None
        self.filename = ''
        self.title = ""
        self.home = "/"
        self.index = None
        self.topics = None
        self.encoding = None
    def GetArchiveInfo(self):
        '''Obtains information on CHM archive.
        This function checks the /#SYSTEM file inside the CHM archive to
        obtain the index, home page, topics, encoding and title. It is called
        from LoadCHM.
        Returns 1 on success and 0 if /#SYSTEM is missing or empty.
        '''
        # extra.is_searchable(self.file) crashed, so full-text search is
        # always reported as unavailable.
        self.searchable = False
        self.lcid = None
        result, ui = chmlib.chm_resolve_object(self.file, '/#SYSTEM')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            sys.stderr.write('GetArchiveInfo: #SYSTEM does not exist\n')
            return 0
        # The first 4 bytes of /#SYSTEM are skipped; the offset is passed
        # as a long, like every other offset handed to chmlib here.
        size, text = chmlib.chm_retrieve_object(self.file, ui, long(4),
                                                ui.length)
        if size == 0:
            sys.stderr.write('GetArchiveInfo: file size = 0\n')
            return 0
        buff = array.array('B', text)
        # Never index past the bytes that were actually returned.
        size = min(size, len(buff))
        index = 0
        # The data is a sequence of entries: a 16-bit little-endian code,
        # a 16-bit little-endian payload length, then the payload.
        while index + 4 <= size:
            code = buff[index] + (buff[index+1] * 256)
            length = buff[index+2] + (buff[index+3] * 256)
            index += 4
            # String payloads: the last payload byte is dropped.
            value = text[index:index + max(length - 1, 0)]
            if code == 0:
                self.topics = '/' + value
            elif code == 1:
                self.index = '/' + value
            elif code == 2:
                self.home = '/' + value
            elif code == 3:
                self.title = value
            elif code == 4:
                # The locale id is the first 16 bits of the payload.
                if index + 2 <= size:
                    self.lcid = buff[index] + (buff[index+1] * 256)
            elif code == 6:
                # Base name from which the default topics (.hhc) and
                # index (.hhk) file names are derived when still unknown.
                if not self.topics:
                    tmp1 = '/' + value + '.hhc'
                    tmp2 = '/' + value + '.hhk'
                    res1, ui1 = chmlib.chm_resolve_object(self.file, tmp1)
                    res2, ui2 = chmlib.chm_resolve_object(self.file, tmp2)
                    if res1 == chmlib.CHM_RESOLVE_SUCCESS:
                        self.topics = tmp1
                    if (not self.index) and \
                           (res2 == chmlib.CHM_RESOLVE_SUCCESS):
                        self.index = tmp2
            elif code == 16:
                self.encoding = value
            # Unknown codes are skipped using their declared length.
            index += length
        self.GetWindowsInfo()
        if not self.lcid:
            self.lcid = extra.get_lcid(self.file)
        return 1
    def _ReadObject(self, path, caller):
        '''Internal method.
        Returns the whole contents of the object stored at path, or None
        when path is unset or empty, cannot be resolved, or is empty in the
        archive. caller is the method name used in the error message.
        '''
        if not path:
            return None
        res, ui = chmlib.chm_resolve_object(self.file, path)
        if res != chmlib.CHM_RESOLVE_SUCCESS:
            return None
        size, text = chmlib.chm_retrieve_object(self.file, ui, long(0),
                                                ui.length)
        if size == 0:
            sys.stderr.write(caller + ': file size = 0\n')
            return None
        return text
    def GetTopicsTree(self):
        '''Reads and returns the topics tree.
        This auxiliary function reads and returns the topics tree file
        contents for the CHM archive, or None if it is not available.
        '''
        return self._ReadObject(self.topics, 'GetTopicsTree')
    def GetIndex(self):
        '''Reads and returns the index tree.
        This auxiliary function reads and returns the index tree file
        contents for the CHM archive, or None if it is not available.
        '''
        return self._ReadObject(self.index, 'GetIndex')
    def ResolveObject(self, document):
        '''Tries to locate a document in the archive.
        This function tries to locate the document inside the archive. It
        returns a tuple where the first element is zero if the function
        was successful, and the second is the UnitInfo for that document.
        The UnitInfo is used to retrieve the document contents
        '''
        if self.file:
            return chmlib.chm_resolve_object(self.file, document)
        return (1, None)
    def RetrieveObject(self, ui, start = -1, length = -1):
        '''Retrieves the contents of a document.
        This function takes a UnitInfo and two optional arguments, the first
        being the start address and the second is the length. These define
        the amount of data to be read from the archive. A start of -1 means
        the beginning of the object and a length of -1 means ui.length.
        Returns (0, '') when no archive is open or ui is empty.
        '''
        if not (self.file and ui):
            return (0, '')
        if length == -1:
            length = ui.length
        if start == -1:
            start = 0
        return chmlib.chm_retrieve_object(self.file, ui, long(start), length)
    def Search(self, text, wholewords=0, titleonly=0):
        '''Performs full-text search on the archive.
        The first parameter is the word to look for, the second
        indicates if the search should be for whole words only, and
        the third parameter indicates if the search should be
        restricted to page titles.
        This method will return a tuple, the first item
        indicating if the search results were partial, and the second
        item being a dictionary containing the results.
        None is returned for an empty query or when no archive is open.'''
        if text and self.file:
            return extra.search(self.file, text, wholewords, titleonly)
        return None
    def IsSearchable(self):
        '''Indicates if the full-text search is available for this
        archive - this flag is updated when GetArchiveInfo is called'''
        return self.searchable
    def GetEncoding(self):
        '''Returns a string that can be used with the codecs python package
        to encode or decode the files in the chm archive. If an error is
        found, or if it is not possible to find the encoding, None is
        returned.'''
        if not self.encoding:
            return None
        vals = self.encoding.split(',')
        if len(vals) <= 2:
            return None
        try:
            code = int(vals[2])
        except ValueError:
            # A non-numeric charset field is an error like any other and
            # must yield None, as promised above.
            return None
        return charset_table.get(code)
    def GetLCID(self):
        '''Returns the locale_table entry for the archive Locale ID, or
        None if the id is unknown.'''
        return locale_table.get(self.lcid)
    def GetDWORD(self, buff, idx=0):
        '''Internal method.
        Reads a little-endian double word (4 bytes) from a buffer.
        The value 0xFFFFFFFF is reported as 0.
        '''
        result = buff[idx] + (buff[idx+1]<<8) + (buff[idx+2]<<16) + \
                 (buff[idx+3]<<24)
        if result == 0xFFFFFFFF:
            result = 0
        return result
    def GetString(self, text, idx):
        '''Internal method.
        Retrieves a string from the #STRINGS buffer: everything from idx
        up to, but not including, the next NUL character.
        '''
        end = text.find('\x00', idx)
        if end < 0:
            # No terminator: find() returned -1, and slicing up to -1
            # would silently drop the last character.
            return text[idx:]
        return text[idx:end]
    def GetWindowsInfo(self):
        '''Gets information from the #WINDOWS file.
        Checks the #WINDOWS file to see if it has any info that was
        not found in #SYSTEM (topics, index or default page).
        Returns a negative number identifying the step that failed, or
        None on success.
        '''
        result, ui = chmlib.chm_resolve_object(self.file, '/#WINDOWS')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            return -1
        # Header: number of entries and size of one entry, 4 bytes each.
        size, text = chmlib.chm_retrieve_object(self.file, ui, long(0), 8)
        if size < 8:
            return -2
        buff = array.array('B', text)
        num_entries = self.GetDWORD(buff, 0)
        entry_size = self.GetDWORD(buff, 4)
        if num_entries < 1:
            return -3
        # Only the first entry is examined.
        size, text = chmlib.chm_retrieve_object(self.file, ui, long(8),
                                                entry_size)
        buff = array.array('B', text)
        # The three offsets below end at byte 0x6c of the entry; a shorter
        # entry cannot be read safely.
        if size < entry_size or len(buff) < 0x6c:
            return -4
        toc_index = self.GetDWORD(buff, 0x60)
        idx_index = self.GetDWORD(buff, 0x64)
        dft_index = self.GetDWORD(buff, 0x68)
        result, ui = chmlib.chm_resolve_object(self.file, '/#STRINGS')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            return -5
        size, text = chmlib.chm_retrieve_object(self.file, ui, long(0),
                                                ui.length)
        if size == 0:
            return -6
        # Values already found in #SYSTEM win for topics and index; the
        # default page from #WINDOWS replaces home whenever it is set.
        if not self.topics:
            self.topics = self.GetString(text, toc_index)
            if not self.topics.startswith("/"):
                self.topics = "/" + self.topics
        if not self.index:
            self.index = self.GetString(text, idx_index)
            if not self.index.startswith("/"):
                self.index = "/" + self.index
        if dft_index != 0:
            self.home = self.GetString(text, dft_index)
            if not self.home.startswith("/"):
                self.home = "/" + self.home
--- a/src/calibre/utils/chm/chmlib.py
+++ b/src/calibre/utils/chm/chmlib.py
@ -0,0 +1,100 @@
 # This file was created automatically by SWIG.
 # Don't modify this file, modify the SWIG interface instead.
 # This file is compatible with both classic and new-style classes.
 from calibre.constants import plugins
 # The 'chmlib' plugin entry unpacks into the compiled wrapper module and an
 # error string; every name defined below is taken from that module, so a
 # non-empty error string is fatal at import time.
 _chmlib, chmlib_err = plugins['chmlib']
 if chmlib_err:
    raise RuntimeError('Failed to load chmlib: '+chmlib_err)
 def _swig_setattr(self,class_type,name,value):
    """Attribute setter shared by the SWIG proxy classes.

    Assigning another ``class_type`` instance to ``this`` adopts that
    instance's ``this`` value and takes over its ``thisown`` flag.  Any
    other name goes through the setter registered for it in
    ``class_type.__swig_setmethods__`` when there is one, and is otherwise
    stored directly in the instance dictionary.
    """
    attrs = self.__dict__
    if name == "this" and isinstance(value, class_type):
        attrs[name] = value.this
        if hasattr(value, "thisown"):
            attrs["thisown"] = value.thisown
        # Ownership moves to ``self``; the source must no longer claim it.
        del value.thisown
        return
    setter = class_type.__swig_setmethods__.get(name, None)
    if setter:
        return setter(self, value)
    attrs[name] = value
 def _swig_getattr(self,class_type,name):
    """Attribute getter shared by the SWIG proxy classes.

    Returns the result of the getter registered for ``name`` in
    ``class_type.__swig_getmethods__``; raises AttributeError carrying the
    attribute name when no getter is registered.
    """
    method = class_type.__swig_getmethods__.get(name,None)
    if method: return method(self)
    # Call form of raise: same exception as ``raise AttributeError, name``
    # but also accepted by Python 3.
    raise AttributeError(name)
 import types
 # Pick the base class for the proxy classes: types.ObjectType when the
 # interpreter provides it (new-style classes, _newclass = 1), otherwise an
 # empty classic class (_newclass = 0).
 try:
    _object = types.ObjectType
    _newclass = 1
 except AttributeError:
    class _object : pass
    _newclass = 0
 # Constants re-exported unchanged from the compiled _chmlib module.
 CHM_UNCOMPRESSED = _chmlib.CHM_UNCOMPRESSED
 CHM_COMPRESSED = _chmlib.CHM_COMPRESSED
 CHM_MAX_PATHLEN = _chmlib.CHM_MAX_PATHLEN
 class chmUnitInfo(_object):
    # Proxy for the chmUnitInfo object of the compiled _chmlib module.
    # Its four fields (start, length, space, path) are read and written
    # through the accessor functions registered in the two tables below;
    # when new-style classes are available they are also exposed as
    # properties.
    __swig_setmethods__ = {}
    __setattr__ = lambda self, name, value: _swig_setattr(self, chmUnitInfo, name, value)
    __swig_getmethods__ = {}
    __getattr__ = lambda self, name: _swig_getattr(self, chmUnitInfo, name)
    __swig_setmethods__["start"] = _chmlib.chmUnitInfo_start_set
    __swig_getmethods__["start"] = _chmlib.chmUnitInfo_start_get
    if _newclass:start = property(_chmlib.chmUnitInfo_start_get,_chmlib.chmUnitInfo_start_set)
    __swig_setmethods__["length"] = _chmlib.chmUnitInfo_length_set
    __swig_getmethods__["length"] = _chmlib.chmUnitInfo_length_get
    if _newclass:length = property(_chmlib.chmUnitInfo_length_get,_chmlib.chmUnitInfo_length_set)
    __swig_setmethods__["space"] = _chmlib.chmUnitInfo_space_set
    __swig_getmethods__["space"] = _chmlib.chmUnitInfo_space_get
    if _newclass:space = property(_chmlib.chmUnitInfo_space_get,_chmlib.chmUnitInfo_space_set)
    __swig_setmethods__["path"] = _chmlib.chmUnitInfo_path_set
    __swig_getmethods__["path"] = _chmlib.chmUnitInfo_path_get
    if _newclass:path = property(_chmlib.chmUnitInfo_path_get,_chmlib.chmUnitInfo_path_set)
    def __init__(self,*args):
        # Extended call syntax instead of the deprecated apply() builtin
        # (removed in Python 3); the call made is exactly the same.
        _swig_setattr(self, chmUnitInfo, 'this', _chmlib.new_chmUnitInfo(*args))
        # Mark this proxy as owner so that __del__ releases the object.
        _swig_setattr(self, chmUnitInfo, 'thisown', 1)
    def __del__(self, destroy= _chmlib.delete_chmUnitInfo):
        # ``destroy`` is bound as a default argument so it is still
        # reachable when the destructor runs. Errors are deliberately
        # ignored: a destructor has nowhere to report them.
        try:
            if self.thisown: destroy(self)
        except: pass
    def __repr__(self):
        return "<C chmUnitInfo instance at %s>" % (self.this,)
 class chmUnitInfoPtr(chmUnitInfo):
    # Wraps an already existing ``this`` value in a chmUnitInfo proxy
    # without creating a new underlying object.
    def __init__(self,this):
        _swig_setattr(self, chmUnitInfo, 'this', this)
        # Not the owner unless ownership was already recorded, so __del__
        # will not destroy the wrapped object.
        if not hasattr(self,"thisown"): _swig_setattr(self, chmUnitInfo, 'thisown', 0)
        # NOTE(review): the attribute *name* passed here is the class object
        # itself, so with _swig_setattr as defined in this file the value is
        # stored in the instance dict under that key. Presumably generator
        # boilerplate meant to make the instance act as a chmUnitInfo -
        # confirm before changing.
        _swig_setattr(self, chmUnitInfo,self.__class__,chmUnitInfo)
 # Register the pointer proxy class with the compiled module.
 _chmlib.chmUnitInfo_swigregister(chmUnitInfoPtr)
 # Everything below re-exports functions and constants of the compiled
 # _chmlib module under the same names.
 # Opening / closing an archive and tuning parameters.
 chm_open = _chmlib.chm_open
 chm_close = _chmlib.chm_close
 CHM_PARAM_MAX_BLOCKS_CACHED = _chmlib.CHM_PARAM_MAX_BLOCKS_CACHED
 chm_set_param = _chmlib.chm_set_param
 # Resolving a path inside the archive and reading the resolved object.
 CHM_RESOLVE_SUCCESS = _chmlib.CHM_RESOLVE_SUCCESS
 CHM_RESOLVE_FAILURE = _chmlib.CHM_RESOLVE_FAILURE
 chm_resolve_object = _chmlib.chm_resolve_object
 chm_retrieve_object = _chmlib.chm_retrieve_object
 # Enumeration: CHM_ENUMERATE_* select what is visited,
 # CHM_ENUMERATOR_* are the enumerator status codes.
 CHM_ENUMERATE_NORMAL = _chmlib.CHM_ENUMERATE_NORMAL
 CHM_ENUMERATE_META = _chmlib.CHM_ENUMERATE_META
 CHM_ENUMERATE_SPECIAL = _chmlib.CHM_ENUMERATE_SPECIAL
 CHM_ENUMERATE_FILES = _chmlib.CHM_ENUMERATE_FILES
 CHM_ENUMERATE_DIRS = _chmlib.CHM_ENUMERATE_DIRS
 CHM_ENUMERATE_ALL = _chmlib.CHM_ENUMERATE_ALL
 CHM_ENUMERATOR_FAILURE = _chmlib.CHM_ENUMERATOR_FAILURE
 CHM_ENUMERATOR_CONTINUE = _chmlib.CHM_ENUMERATOR_CONTINUE
 CHM_ENUMERATOR_SUCCESS = _chmlib.CHM_ENUMERATOR_SUCCESS
 chm_enumerate = _chmlib.chm_enumerate
 chm_enumerate_dir = _chmlib.chm_enumerate_dir
--- a/src/calibre/utils/chm/extra.c
+++ b/src/calibre/utils/chm/extra.c
@ -0,0 +1,759 @@
 /*
 * extra.c - full-text search support for pychm
 *
 * Copyright (C) 2004 Rubens Ramos <rubensr@users.sourceforge.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, 
 * Boston, MA 02111-1307, USA.
 *
 * Author: Rubens Ramos <rubensr@users.sourceforge.net>
 *
 * Heavily based on work done by:
 * Pabs <pabs@zip.to> - chmdeco
 * Razvan Cojocaru <razvanco@gmx.net> - xCHM
 *
 */
 #include "chm_lib.h"
 #ifdef __PYTHON__
 #include "Python.h"
 #else
 #include <stdio.h>
 #define PyObject void
 #endif
 #include <stdlib.h>
 #ifdef _MSC_VER
 #include "stdint.h"
 #define strcasecmp stricmp
 #define strncasecmp strnicmp
 #else
 #include <inttypes.h>
 #include <strings.h>
 #endif
 #if defined( _MSC_VER ) && !defined( __cplusplus )
 # define inline __inline
 #endif
 #if defined(_WIN32) || defined(__WIN32__)
 #       if defined(_MSC_VER)
 #               if defined(STATIC_LINKED)
 #                       define MODEXPORT(a) a
 #                       define MODIMPORT(a) extern a
 #               else
 #                       define MODEXPORT(a) __declspec(dllexport) a
 #                       define MODIMPORT(a) extern a
 #               endif
 #       else
 #               if defined(__BORLANDC__)
 #                       define MODEXPORT(a) a _export
 #                       define MODIMPORT(a) a _export
 #               else
 #                       define MODEXPORT(a) a
 #                       define MODIMPORT(a) a
 #               endif
 #       endif
 #else
 #       define MODEXPORT(a) a
 #       define MODIMPORT(a) a
 #endif
 /* Boolean values used as return codes in this file. */
 #define false 0
 #define true 1
 /* Fixed sizes, in bytes, of the structures and scratch buffer used below. */
 #define FTS_HEADER_LEN 0x32
 #define TOPICS_ENTRY_LEN 16
 #define COMMON_BUF_LEN 1025
 /*
  * Release x and reset it to NULL so a later FREE of the same variable is
  * harmless. Wrapped in do { } while (0) so the macro expands to exactly one
  * statement and stays correct as the body of an unbraced if/else.
  */
 #define FREE(x) do { free (x); (x) = NULL; } while (0)
 inline uint16_t 
 get_uint16 (uint8_t* b) {
  return b[0] |
    b[1]<<8;
 }
 inline uint32_t 
 get_uint32 (uint8_t* b) {
  return b[0] |
    b[1]<<8   |
    b[2]<<16  |
    b[3]<<24;
 }
 inline uint64_t 
 get_uint64 (uint8_t* b) {
  return b[0]           |
    b[1]<<8             |
    b[2]<<16            |
    b[3]<<24            |
    (uint64_t) b[4]<<32 |
    (uint64_t) b[5]<<40 |
    (uint64_t) b[6]<<48 |
    (uint64_t) b[7]<<56;
 }
 inline uint64_t 
 be_encint (unsigned char *buffer, size_t *length)
 {
  uint64_t result = 0;
  int shift=0;
  *length = 0;
  do {
    result |= ((*buffer) & 0x7f) << shift;
    shift += 7;
    *length = *length + 1;
  } while (*(buffer++) & 0x80);
  return result;
 }
 /*
  Finds the first unset bit in memory, starting at bit position *bit
  (7 = most significant, 0 = least significant) of *byte and moving towards
  less significant bits, then on to bit 7 of the following byte.
  Returns the number of set bits skipped before that unset bit.
  On return *bit is the position just after the unset bit and *length is the
  number of byte boundaries crossed (including the one crossed when the
  unset bit was bit 0).
  No buffer size is passed in and no bounds check is made: the caller must
  guarantee that an unset bit exists in the buffer.
 */
 inline int
 ffus (unsigned char* byte, int* bit, size_t *length) {
  int bits = 0;
  *length = 0;
  /* Skip over set bits, wrapping to the next byte after bit 0. */
  while(*byte & (1 << *bit)){
    if(*bit)
      --(*bit);
    else {
      ++byte;
      ++(*length);
      *bit = 7;
    }
    ++bits;
  }
  /* Step past the unset bit itself. */
  if(*bit)
    --(*bit);
  else {
    ++(*length);
    *bit = 7;
  }
  return bits;
 }
 /*
  * Decode one bit-packed integer starting at bit position *bit of *byte.
  *
  * The value is stored as a run of set bits ended by an unset bit (counted
  * by ffus), followed by n = r + (count ? count-1 : 0) data bits, where
  * count is the length of that run. When count is non-zero, an extra bit is
  * set just above the n data bits.
  *
  *   byte   - start of the encoded data
  *   bit    - in/out: current bit position, 7 (most significant) down to 0
  *   s      - must be 2; any other value is rejected
  *   r      - number of data bits read when the run of set bits is empty
  *   length - out: number of whole bytes consumed
  *
  * Returns the decoded value, or ~(uint64_t)0 if bit is NULL, *bit > 7 or
  * s != 2. The buffer is not bounds-checked.
  * NOTE(review): the name and the s/r parameters suggest a "scale and root"
  * encoding - confirm against the format description before relying on it.
  */
 inline uint64_t
 sr_int(unsigned char* byte, int* bit,
       unsigned char s, unsigned char r, size_t *length)
 {
  uint64_t ret;
  unsigned char mask;
  int n, n_bits, num_bits, base, count;
  size_t fflen;
  *length = 0;
  if(!bit || *bit > 7 || s != 2)
    return ~(uint64_t)0;
  ret = 0;
  /* Length of the leading run of set bits; also advances *bit. */
  count = ffus(byte, bit, &fflen);
  *length += fflen;
  byte += *length;
  n_bits = n = r + (count ? count-1 : 0) ;
  /* Read the n data bits, taking at most the rest of the current byte
     on each pass. */
  while (n > 0) {
    /* num_bits is one less than the number of bits taken this pass;
       base is the position of the lowest bit taken. */
    num_bits = n > *bit ? *bit : n-1;
    base = n > *bit ? 0 : *bit - (n-1);
    /* mask = (num_bits + 1) low bits set. */
    switch (num_bits){
    case 0:
      mask = 1;
      break;
    case 1:
      mask = 3;
      break;
    case 2:
      mask = 7;
      break;
    case 3:
      mask = 0xf;
      break;
    case 4:
      mask = 0x1f;
      break;
    case 5:
      mask = 0x3f;
      break;
    case 6:
      mask = 0x7f;
      break;
    case 7:
      mask = 0xff;
      break;
    default:
      mask = 0xff;
      break;
    }
    mask <<= base;
    /* Append the extracted bits below what has been read so far. */
    ret = (ret << (num_bits+1)) |
      (uint64_t)((*byte & mask) >> base);
    if( n > *bit ){
      /* Current byte exhausted: continue at bit 7 of the next one. */
      ++byte;
      ++(*length);
      n -= *bit+1;
      *bit = 7;
    } else {
      /* All remaining bits came from this byte. */
      *bit -= n;
      n = 0;
    }
  }
  /* A non-empty run of set bits adds one more bit above the data bits. */
  if(count)
    ret |= (uint64_t)1 << n_bits;
  return ret;
 }
 /*
  * Walk the index nodes of the full-text search tree, from the node at
  * initial_offset down towards the leaves, and return the offset of the
  * node that should contain text.
  *
  *   chmfile        - open archive
  *   text           - word being looked up (compared case-insensitively)
  *   initial_offset - offset of the first node to examine
  *   buff_size      - size in bytes of one node
  *   tree_depth     - depth of the tree; tree_depth - 1 levels are descended
  *   ui             - object that holds the tree
  *
  * Returns the offset found, or 0 when nothing is found, a node cannot be
  * read, memory runs out or a node is malformed.
  *
  * Node layout as parsed here: a 16-bit count of unused bytes at the end of
  * the node, then entries of the form
  *   [1 byte: word length + 1][1 byte: length of the prefix shared with the
  *   previous word][word length bytes: rest of the word][4 bytes: offset of
  *   the child node][2 bytes: skipped].
  */
 inline uint32_t
 get_leaf_node_offset(struct chmFile *chmfile,
                     const char *text,
                     uint32_t initial_offset,
                     uint32_t buff_size,
                     uint16_t tree_depth,
                     struct chmUnitInfo *ui)
 {
  unsigned char word_len;
  unsigned char pos;
  uint16_t free_space;
  uint32_t used;
  uint32_t i;
  int failed = 0;
  size_t cur_len = 0;
  char *grown;
  char *word = NULL;
  uint32_t test_offset = 0;
  unsigned char *buffer = (unsigned char *)malloc (buff_size);
  if (NULL == buffer)
    return 0;
  while (!failed && --tree_depth) {
    /* No progress since the previous level: the word is not in the tree. */
    if (initial_offset == test_offset) {
      failed = 1;
      break;
    }
    test_offset = initial_offset;
    if (buff_size < sizeof(uint16_t) ||
        chm_retrieve_object (chmfile, ui, buffer, 
                             initial_offset, buff_size) == 0) {
      failed = 1;
      break;
    }
    free_space = get_uint16 (buffer);
    /* A free-space count larger than the node would wrap the limit. */
    used = free_space < buff_size ? buff_size - free_space : 0;
    /* Every node is parsed from its first entry. */
    i = sizeof(uint16_t);
    while (i < used) {
      word_len = *(buffer + i);
      /* The entry must hold at least the terminator slot and must fit,
         up to the end of its child offset, inside the node. */
      if (word_len == 0 || (uint32_t)word_len + 5 > buff_size - i) {
        failed = 1;
        break;
      }
      pos = *(buffer + i + 1);
      /* The shared prefix cannot be longer than the previous word. */
      if (pos > cur_len) {
        failed = 1;
        break;
      }
      /* Keep the first pos characters of the previous word and append
         the word_len - 1 new ones plus the terminator. */
      grown = (char*)realloc (word, (size_t)pos + word_len);
      if (NULL == grown) {
        failed = 1;
        break;
      }
      word = grown;
      memcpy (word + pos, buffer + i + 2, word_len - 1);
      cur_len = (size_t)pos + word_len - 1;
      word[cur_len] = 0;
      if (strcasecmp (text, word) <= 0) {
        /* First entry not smaller than text: descend into its child. */
        initial_offset = get_uint32 (buffer + i + word_len + 1);
        break;
      }
      i += word_len + sizeof (unsigned char) + sizeof(uint32_t) + 
        sizeof(uint16_t);
    }
  }
  if(failed || initial_offset == test_offset)
    initial_offset = 0;
  free (word);
  free (buffer);
  return initial_offset;
 }
 /*
  * Decode the word-location-code (WLC) records of one index word and record
  * every referenced document as a topic -> url pair.
  *
  *   wlc_count, wlc_size, wlc_offset - number of records, byte size and
  *                                     offset of the WLC data inside uimain
  *   ds/dr, cs/cr, ls/lr             - parameter pairs handed to sr_int for
  *                                     the document index, the code count
  *                                     and the location codes
  *   uimain, uitbl, uistrings,
  *   topics, urlstr                  - units the individual lookups read from
  *   dict                            - Python dict receiving the pairs when
  *                                     built with __PYTHON__; otherwise the
  *                                     pairs are printed to stdout
  *
  * Returns true when all records were processed, false on any read,
  * allocation or dict-insertion failure (in the Python build a Python
  * exception may then be pending).
  */
 inline int 
 pychm_process_wlc (struct chmFile *chmfile,
                    uint64_t wlc_count, uint64_t wlc_size,
                    uint32_t wlc_offset, unsigned char ds,
                    unsigned char dr, unsigned char cs,
                    unsigned char cr, unsigned char ls,
                    unsigned char lr, struct chmUnitInfo *uimain,
                    struct chmUnitInfo* uitbl,
                    struct chmUnitInfo *uistrings,
                    struct chmUnitInfo* topics,
                    struct chmUnitInfo *urlstr,
                    PyObject *dict)
 {
   uint32_t stroff, urloff;
   uint64_t i, j, count;
   size_t length;
   int wlc_bit = 7;
   size_t off = 0;
   uint64_t index = 0;
   int ok = false;
   unsigned char entry[TOPICS_ENTRY_LEN];
   unsigned char combuf[COMMON_BUF_LEN];
   unsigned char *buffer = (unsigned char *)malloc (wlc_size);
   char *url = NULL;
   char *topic = NULL;
 #ifdef __PYTHON__
   PyObject *py_url;
   int status;
 #endif
   if (NULL == buffer)
     return false;
   if (chm_retrieve_object(chmfile, uimain, buffer, 
                           wlc_offset, wlc_size) == 0)
     goto done;
   for (i = 0; i < wlc_count; ++i) {
     /* Every record starts on a byte boundary. */
     if(wlc_bit != 7) {
       ++off;
       wlc_bit = 7;
     }
     /* Partial guard: only the first byte handed to sr_int is validated. */
     if (off >= wlc_size)
       goto done;
     /* The document index is stored as a delta to the previous record. */
     index += sr_int(buffer + off, &wlc_bit, ds, dr, &length);
     off += length;
     if(chm_retrieve_object(chmfile, topics, entry, 
                            index * 16, TOPICS_ENTRY_LEN) == 0)
       goto done;
     combuf[COMMON_BUF_LEN - 1] = 0;
     stroff = get_uint32 (entry + 4);
     FREE (topic);
     if (chm_retrieve_object (chmfile, uistrings, combuf, 
                              stroff, COMMON_BUF_LEN - 1) == 0) {
       topic = strdup ("Untitled in index");
     } else {
       combuf[COMMON_BUF_LEN - 1] = 0;
       topic = strdup ((char*)combuf);
     }
     urloff = get_uint32 (entry + 8);
     if(chm_retrieve_object (chmfile, uitbl, combuf, 
                             urloff, 12) == 0)
       goto done;
     urloff = get_uint32 (combuf + 8);
     if (chm_retrieve_object (chmfile, urlstr, combuf, 
                              urloff + 8, COMMON_BUF_LEN - 1) == 0)
       goto done;
     combuf[COMMON_BUF_LEN - 1] = 0;
     FREE (url);
     url = strdup ((char*)combuf);
     if (url && topic) {
 #ifdef __PYTHON__
       /* PyDict_SetItemString takes its own reference to the value, so the
        * reference created here has to be released again. */
       py_url = PyString_FromString (url);
       status = (py_url != NULL) ?
         PyDict_SetItemString (dict, topic, py_url) : -1;
       Py_XDECREF (py_url);
       if (status != 0)
         goto done;
 #else
       printf ("%s ==> %s\n", url, topic);
 #endif
     }
     if (off >= wlc_size)
       goto done;
     count = sr_int (buffer + off, &wlc_bit, cs, cr, &length);
     off += length;
     /* The location codes are decoded only to advance past them. */
     for (j = 0; j < count; ++j) {
       if (off >= wlc_size)
         goto done;
       sr_int (buffer + off, &wlc_bit, ls, lr, &length);
       off += length;
     }
   }
   ok = true;
  done:
   /* Single exit so that topic, url and buffer are released on every path. */
   FREE(topic);
   FREE(url);
   FREE(buffer);
   return ok;
 }
 /*
  * Full-text search of a CHM archive through its /$FIftiMain index.
  *
  *   text         - word (or prefix, when whole_words is 0) to look for
  *   whole_words  - non-zero: only an exact, case-insensitive match counts
  *   titles_only  - non-zero: skip index entries whose title flag is 0
  *   dict         - forwarded to pychm_process_wlc, which stores the hits
  *
  * Returns -1 when text is NULL and false when the required internal files
  * or the index header cannot be used.  For whole_words the result of
  * pychm_process_wlc for the matching word is returned (false when no word
  * matches); otherwise true is returned when at least one word starts with
  * text.
  */
 int 
 chm_search (struct chmFile *chmfile,
             const char *text, int whole_words, 
             int titles_only, PyObject *dict)
 {
   unsigned char header[FTS_HEADER_LEN];
   unsigned char doc_index_s;
   unsigned char doc_index_r;
   unsigned char code_count_s;
   unsigned char code_count_r;
   unsigned char loc_codes_s;
   unsigned char loc_codes_r;
   unsigned char word_len, pos;
   unsigned char *buffer;
   char *word = NULL;
   char *grown;
   uint32_t node_offset;
   uint32_t current;
   uint32_t node_len;
   uint16_t tree_depth;
   uint32_t i;
   uint16_t free_space;
   uint64_t wlc_count, wlc_size;
   uint32_t wlc_offset;
   unsigned char title;
   size_t encsz;
   size_t text_len;
   struct chmUnitInfo ui, uitopics, uiurltbl, uistrings, uiurlstr;
   int partial = false;
   /* Bytes in front of the first entry of a leaf node. */
   const uint32_t leaf_header = sizeof(uint32_t) + sizeof(uint16_t) +
     sizeof(uint16_t);
   if (NULL == text)
     return -1;
   if (chm_resolve_object (chmfile, "/$FIftiMain", &ui) !=
       CHM_RESOLVE_SUCCESS || 
       chm_resolve_object (chmfile, "/#TOPICS", &uitopics) !=
       CHM_RESOLVE_SUCCESS ||
       chm_resolve_object (chmfile, "/#STRINGS", &uistrings) !=
       CHM_RESOLVE_SUCCESS ||
       chm_resolve_object (chmfile, "/#URLTBL", &uiurltbl) !=
       CHM_RESOLVE_SUCCESS ||
       chm_resolve_object (chmfile, "/#URLSTR", &uiurlstr) !=
       CHM_RESOLVE_SUCCESS)
     return false;
   if(chm_retrieve_object(chmfile, &ui, header, 0, FTS_HEADER_LEN) == 0)
     return false;
   doc_index_s = header[0x1E];
   doc_index_r = header[0x1F];
   code_count_s = header[0x20];
   code_count_r = header[0x21];
   loc_codes_s = header[0x22];
   loc_codes_r = header[0x23];
   /* Only the value 2 is accepted for the three "s" parameters. */
   if(doc_index_s != 2 || code_count_s != 2 || loc_codes_s != 2) {
     return false;
   }
   node_offset = get_uint32 (header + 0x14);
   node_len = get_uint32 (header + 0x2e);
   tree_depth = get_uint16 (header + 0x18);
   /* A leaf that cannot hold its own header cannot be parsed. */
   if (node_len < leaf_header)
     return false;
   node_offset = get_leaf_node_offset (chmfile, text, node_offset, node_len,
                                       tree_depth, &ui);
   if (!node_offset)
     return false;
   buffer = (unsigned char*)malloc (node_len);
   if (NULL == buffer)
     return false;
   text_len = strlen (text);
   do {
     current = node_offset;
     if (chm_retrieve_object (chmfile, &ui, buffer, 
                              current, node_len) == 0) {
       partial = false;
       goto done;
     }
     /* Offset of the following leaf.  It is read once per node so that an
      * empty leaf cannot stall the loop; a leaf naming itself ends it. */
     node_offset = get_uint32 (buffer);
     if (node_offset == current)
       node_offset = 0;
     free_space = get_uint16 (buffer + 6);
     /* Avoid unsigned wrap-around in the loop bound below. */
     if (free_space > node_len)
       goto done;
     i = leaf_header;
     encsz = 0;
     while (i < node_len - free_space) {
       /* The two header bytes of the entry must lie inside the node. */
       if ((uint64_t)i + 2 > node_len)
         goto done;
       word_len = *(buffer + i);
       pos = *(buffer + i + 1);
       /* word_len - 1 bytes of text follow, then the title flag; zero
        * would underflow and the flag must still lie inside the node. */
       if (word_len == 0 || (uint64_t)i + 2 + word_len > node_len)
         goto done;
       /* The new text is written at index `pos` of the previous word, so
        * that word must exist and be at least `pos` characters long. */
       if (pos != 0 && (NULL == word || strlen (word) < pos))
         goto done;
       grown = (char *)realloc (word, (size_t)pos + word_len);
       if (NULL == grown)
         goto done;
       word = grown;
       memcpy (word + pos, buffer + i + 2, word_len - 1);
       word[pos + word_len - 1] = 0;
       i += 2 + word_len;
       title = *(buffer + i - 1);
       if (i >= node_len)
         goto done;
       wlc_count = be_encint (buffer + i, &encsz);
       i += encsz;
       /* The 32-bit offset, the 16-bit field behind it and at least one
        * byte of the encoded size must remain. */
       if ((uint64_t)i + sizeof(uint32_t) + sizeof(uint16_t) >= node_len)
         goto done;
       wlc_offset = get_uint32 (buffer + i);
       i += sizeof(uint32_t) + sizeof(uint16_t);
       wlc_size =  be_encint (buffer + i, &encsz);
       i += encsz;
       if (!title && titles_only)
         continue;
       if (whole_words && !strcasecmp(text, word)) {
         partial = pychm_process_wlc (chmfile, wlc_count, wlc_size, 
                                      wlc_offset, doc_index_s, 
                                      doc_index_r,code_count_s, 
                                      code_count_r, loc_codes_s, 
                                      loc_codes_r, &ui, &uiurltbl,
                                      &uistrings, &uitopics,
                                      &uiurlstr, dict);
         goto done;
       }
       if (!whole_words) {
         if (!strncasecmp (word, text, text_len)) {
           partial = true;
           pychm_process_wlc (chmfile, wlc_count, wlc_size, 
                              wlc_offset, doc_index_s, 
                              doc_index_r,code_count_s, 
                              code_count_r, loc_codes_s, 
                              loc_codes_r, &ui, &uiurltbl,
                              &uistrings, &uitopics,
                              &uiurlstr, dict);
         } else if (strncasecmp (text, word, text_len) < -1)
           /* NOTE(review): "< -1" is kept from the original; "< 0" looks
            * like the intended test - confirm before changing. */
           break;
       }
     }
     /* word stays NULL when no entry was decoded at all, so it must be
      * checked before strncmp looks at it. */
   } while (!whole_words && 
            word != NULL &&
            !strncmp (word, text, text_len) && 
            node_offset);
  done:
   /* Single exit so that word and buffer are released on every path. */
   FREE(word);
   FREE(buffer);
   return partial;
 }
 /* Pairs the path of an internal CHM object with a byte offset inside it. */
 typedef struct {
   const char *file;  /* path handed to chm_resolve_object */
   int offset;        /* offset handed to chm_retrieve_object */
 } Langrec;
 /* Places where chm_get_lcid looks for the LCID, tried in this order.
  * NOTE(review): the last two paths have no leading '/', unlike
  * "/$FIftiMain" - confirm that chm_resolve_object accepts them. */
 Langrec lang_files[] = {
   {"/$FIftiMain",               0x7E},
   {"$WWKeywordLinks/BTree",     0x34},
   {"$WWAssociativeLinks/BTree", 0x34}
 };
 /* Number of entries in lang_files. */
 #define LANG_FILES_SIZE (sizeof(lang_files)/sizeof(Langrec))
 /*
  * Return the LCID (locale id) stored in the archive, or -1 when none of
  * the objects listed in lang_files can be resolved and read.
  *
  * The candidates are tried in table order and the first one that yields a
  * complete 32-bit value wins.
  */
 int
 chm_get_lcid (struct chmFile *chmfile) {
   struct chmUnitInfo ui;
   unsigned char raw[sizeof(uint32_t)];
   int64_t got;
   size_t i;
   for (i = 0; i < LANG_FILES_SIZE; i++) {
     if (chm_resolve_object (chmfile, lang_files[i].file, &ui) != 
         CHM_RESOLVE_SUCCESS)
       continue;
     got = chm_retrieve_object (chmfile, &ui, raw, 
                                lang_files[i].offset, sizeof(raw));
     /* A short read would leave part of the value undefined, so only a
      * complete read is accepted.  The bytes are decoded with get_uint32,
      * like every other 32-bit field in this file, rather than being
      * reinterpreted in host byte order. */
     if (got == (int64_t) sizeof(raw))
       return (int) get_uint32 (raw);
   }
   return -1;
 }
 #ifdef __PYTHON__
 /*
  * Python: is_searchable(chmfile) -> int
  *
  * Returns 1 when every internal file that chm_search needs can be
  * resolved in the archive, 0 otherwise.  Raises TypeError when the
  * argument does not carry a chmFile handle.
  */
 static PyObject *
 is_searchable (PyObject *self, PyObject *args) {
   static const char *const required[] = {
     "/$FIftiMain", "/#TOPICS", "/#STRINGS", "/#URLTBL", "/#URLSTR"
   };
   struct chmFile *file;
   PyObject *obj0;
   struct chmUnitInfo ui;
   size_t k;
   if (!PyArg_ParseTuple (args, "O:is_searchable", &obj0)) {
     PyErr_SetString(PyExc_TypeError, "Expected chmfile (not CHMFile!)");
     return NULL;
   }
   /* PyCObject_AsVoidPtr returns NULL for anything that is not a C object
    * wrapper; chmlib must never be handed that NULL handle. */
   file = (struct chmFile *) PyCObject_AsVoidPtr(obj0);
   if (NULL == file) {
     PyErr_SetString(PyExc_TypeError, "Expected chmfile (not CHMFile!)");
     return NULL;
   }
   for (k = 0; k < sizeof(required) / sizeof(required[0]); k++) {
     if (chm_resolve_object (file, required[k], &ui) !=
         CHM_RESOLVE_SUCCESS)
       return Py_BuildValue ("i", 0);
   }
   return Py_BuildValue ("i", 1);
 }
 /*
  * Python: search(chmfile, text, whole_words, titles_only) -> (int, dict)
  *
  * Runs chm_search and returns its integer result together with the dict
  * that was filled with the hits.  Raises TypeError for bad arguments and
  * propagates any Python error raised while the dict was being filled.
  */
 static PyObject *
 search (PyObject *self, PyObject *args) {
   char *text;
   int whole_words;
   int titles_only;
   int partial;
   struct chmFile *file;
   PyObject *obj0;
   PyObject *dict;
   if (!PyArg_ParseTuple (args, "Osii:search", &obj0, &text, 
                          &whole_words, &titles_only)) {
     PyErr_SetString(PyExc_TypeError,
                     "Expected chmfile (not CHMFile!), string, int, int");
     return NULL;
   }
   /* PyCObject_AsVoidPtr returns NULL for anything that is not a C object
    * wrapper; chmlib must never be handed that NULL handle. */
   file = (struct chmFile *) PyCObject_AsVoidPtr(obj0);
   if (NULL == file) {
     PyErr_SetString(PyExc_TypeError,
                     "Expected chmfile (not CHMFile!), string, int, int");
     return NULL;
   }
   dict = PyDict_New();
   if (NULL == dict) {
     PyErr_NoMemory();
     return NULL;
   }
   partial = chm_search (file, 
                         text, whole_words, titles_only, dict);
   if (PyErr_Occurred ()) {
     Py_DECREF (dict);
     return NULL;
   }
   /* "N" hands our reference to the tuple; "O" would add a second one
    * that nothing ever releases, leaking the dict. */
   return Py_BuildValue ("(iN)", partial, dict);
 }
 /*
  * Python: get_lcid(chmfile) -> int or None
  *
  * Returns the value of chm_get_lcid, or None when it reports -1.
  * Raises TypeError when the argument does not carry a chmFile handle.
  */
 static PyObject *
 get_lcid (PyObject *self, PyObject *args) {
   int code;
   struct chmFile *file;
   PyObject *obj0;
   if (!PyArg_ParseTuple (args, "O:get_lcid", &obj0)) {
     PyErr_SetString(PyExc_TypeError,"Expected a chmfile (not a CHMFile!)");
     return NULL;
   }
   /* PyCObject_AsVoidPtr returns NULL for anything that is not a C object
    * wrapper; chmlib must never be handed that NULL handle. */
   file = (struct chmFile *) PyCObject_AsVoidPtr(obj0);
   if (NULL == file) {
     PyErr_SetString(PyExc_TypeError,"Expected a chmfile (not a CHMFile!)");
     return NULL;
   }
   code = chm_get_lcid (file);
   if (code == -1) {
     Py_INCREF(Py_None);
     return Py_None;
   }
   return Py_BuildValue ("i", code);
 }
 /* Method table registered by initchm_extra for the chm_extra module.
  * Each row is {Python name, C function, calling convention, docstring};
  * the all-NULL row terminates the table. */
 static PyMethodDef
 IndexMethods[] = {
   {"get_lcid", get_lcid, METH_VARARGS, 
    "Returns LCID (Locale ID) for archive."},
   {"search", search, METH_VARARGS, 
    "Perform Full-Text search."},
   {"is_searchable", is_searchable, METH_VARARGS, 
    "Return 1 if it is possible to search the archive, 0 otherwise."},
   {NULL, NULL, 0, NULL}
 };
 /* Module initialisation function: registers the functions listed in
  * IndexMethods under the module name "chm_extra".  It is given C linkage
  * when this file is compiled as C++. */
 #ifdef __cplusplus
 extern "C"
 #endif
 MODEXPORT(void)
 initchm_extra (void) {
   /* The return value of Py_InitModule is not checked here. */
   Py_InitModule ("chm_extra", IndexMethods);
 }
 #else
 /*
  * Stand-alone test driver (built when __PYTHON__ is not defined).
  *
  * Usage: <program> <filename>
  * Prints the LCID of the archive, then repeatedly reads
  * "<whole_words> <titles_only> <string>" from stdin and runs chm_search
  * until the input ends or a line cannot be parsed.
  *
  * Returns 0 on success or after printing the usage line, -1 when the
  * archive cannot be opened.
  */
 int
 main (int argc, char **argv) {
   struct chmFile *file;
   char text[255];
   int whole_words, titles_only;
   int partial;
   int lcid;
   if (argc != 2) {
     printf ("\n%s <filename>\n", argv[0]);
     return 0;
   }
   file = chm_open (argv[1]);
   if (!file)
     return -1;
   lcid = chm_get_lcid (file);
   printf ("\nLCID= %d (%08X)\n", lcid, (unsigned int) lcid);
   while (1) {
     printf ("\n<whole_words> <titles_only> <string>\n");
     printf ("> ");
     /* The field width keeps the word inside text[255].  scanf must
      * convert all three items: EOF (-1) and partial matches would
      * otherwise leave the variables uninitialised. */
     if (scanf ("%d %d %254s", &whole_words, &titles_only, text) != 3)
       break;
     partial = chm_search (file, 
                           text, whole_words, titles_only, NULL);
     printf ("Partial = %d\n", partial);
   }
   chm_close (file);
   return 0;
 }
 #endif
--- a/src/calibre/utils/chm/stdint.h
+++ b/src/calibre/utils/chm/stdint.h
@ -0,0 +1,247 @@
 // ISO C9x  compliant stdint.h for Microsoft Visual Studio
 // Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 
 // 
 //  Copyright (c) 2006-2008 Alexander Chemeris
 // 
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 // 
 //   1. Redistributions of source code must retain the above copyright notice,
 //      this list of conditions and the following disclaimer.
 // 
 //   2. Redistributions in binary form must reproduce the above copyright
 //      notice, this list of conditions and the following disclaimer in the
 //      documentation and/or other materials provided with the distribution.
 // 
 //   3. The name of the author may be used to endorse or promote products
 //      derived from this software without specific prior written permission.
 // 
 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 
 ///////////////////////////////////////////////////////////////////////////////
 #ifndef _MSC_VER // [
 #error "Use this header only with Microsoft Visual C++ compilers!"
 #endif // _MSC_VER ]
 #ifndef _MSC_STDINT_H_ // [
 #define _MSC_STDINT_H_
 #if _MSC_VER > 1000
 #pragma once
 #endif
 #include <limits.h>
 // For Visual Studio 6 in C++ mode, and for many Visual Studio versions when
 // compiling for ARM, we should wrap the <wchar.h> include with 'extern "C++" {}';
 // otherwise the compiler gives many errors like this:
 //   error C2733: second C linkage of overloaded function 'wmemchr' not allowed
 #ifdef __cplusplus
 extern "C" {
 #endif
 #  include <wchar.h>
 #ifdef __cplusplus
 }
 #endif
 // Define _W64 macros to mark types changing their size, like intptr_t.
 #ifndef _W64
 #  if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
 #     define _W64 __w64
 #  else
 #     define _W64
 #  endif
 #endif
 // 7.18.1 Integer types
 // 7.18.1.1 Exact-width integer types
 // Visual Studio 6 and Embedded Visual C++ 4 don't
 // realize that, e.g., char has the same size as __int8,
 // so we give up on __intX for them.
 #if (_MSC_VER < 1300)
   typedef signed char       int8_t;
   typedef signed short      int16_t;
   typedef signed int        int32_t;
   typedef unsigned char     uint8_t;
   typedef unsigned short    uint16_t;
   typedef unsigned int      uint32_t;
 #else
   typedef signed __int8     int8_t;
   typedef signed __int16    int16_t;
   typedef signed __int32    int32_t;
   typedef unsigned __int8   uint8_t;
   typedef unsigned __int16  uint16_t;
   typedef unsigned __int32  uint32_t;
 #endif
 typedef signed __int64       int64_t;
 typedef unsigned __int64     uint64_t;
 // 7.18.1.2 Minimum-width integer types
 typedef int8_t    int_least8_t;
 typedef int16_t   int_least16_t;
 typedef int32_t   int_least32_t;
 typedef int64_t   int_least64_t;
 typedef uint8_t   uint_least8_t;
 typedef uint16_t  uint_least16_t;
 typedef uint32_t  uint_least32_t;
 typedef uint64_t  uint_least64_t;
 // 7.18.1.3 Fastest minimum-width integer types
 typedef int8_t    int_fast8_t;
 typedef int16_t   int_fast16_t;
 typedef int32_t   int_fast32_t;
 typedef int64_t   int_fast64_t;
 typedef uint8_t   uint_fast8_t;
 typedef uint16_t  uint_fast16_t;
 typedef uint32_t  uint_fast32_t;
 typedef uint64_t  uint_fast64_t;
 // 7.18.1.4 Integer types capable of holding object pointers
 #ifdef _WIN64 // [
   typedef signed __int64    intptr_t;
   typedef unsigned __int64  uintptr_t;
 #else // _WIN64 ][
   typedef _W64 signed int   intptr_t;
   typedef _W64 unsigned int uintptr_t;
 #endif // _WIN64 ]
 // 7.18.1.5 Greatest-width integer types
 typedef int64_t   intmax_t;
 typedef uint64_t  uintmax_t;
 // 7.18.2 Limits of specified-width integer types
 #if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [   See footnote 220 at page 257 and footnote 221 at page 259
 // 7.18.2.1 Limits of exact-width integer types
 #define INT8_MIN     ((int8_t)_I8_MIN)
 #define INT8_MAX     _I8_MAX
 #define INT16_MIN    ((int16_t)_I16_MIN)
 #define INT16_MAX    _I16_MAX
 #define INT32_MIN    ((int32_t)_I32_MIN)
 #define INT32_MAX    _I32_MAX
 #define INT64_MIN    ((int64_t)_I64_MIN)
 #define INT64_MAX    _I64_MAX
 #define UINT8_MAX    _UI8_MAX
 #define UINT16_MAX   _UI16_MAX
 #define UINT32_MAX   _UI32_MAX
 #define UINT64_MAX   _UI64_MAX
 // 7.18.2.2 Limits of minimum-width integer types
 #define INT_LEAST8_MIN    INT8_MIN
 #define INT_LEAST8_MAX    INT8_MAX
 #define INT_LEAST16_MIN   INT16_MIN
 #define INT_LEAST16_MAX   INT16_MAX
 #define INT_LEAST32_MIN   INT32_MIN
 #define INT_LEAST32_MAX   INT32_MAX
 #define INT_LEAST64_MIN   INT64_MIN
 #define INT_LEAST64_MAX   INT64_MAX
 #define UINT_LEAST8_MAX   UINT8_MAX
 #define UINT_LEAST16_MAX  UINT16_MAX
 #define UINT_LEAST32_MAX  UINT32_MAX
 #define UINT_LEAST64_MAX  UINT64_MAX
 // 7.18.2.3 Limits of fastest minimum-width integer types
 #define INT_FAST8_MIN    INT8_MIN
 #define INT_FAST8_MAX    INT8_MAX
 #define INT_FAST16_MIN   INT16_MIN
 #define INT_FAST16_MAX   INT16_MAX
 #define INT_FAST32_MIN   INT32_MIN
 #define INT_FAST32_MAX   INT32_MAX
 #define INT_FAST64_MIN   INT64_MIN
 #define INT_FAST64_MAX   INT64_MAX
 #define UINT_FAST8_MAX   UINT8_MAX
 #define UINT_FAST16_MAX  UINT16_MAX
 #define UINT_FAST32_MAX  UINT32_MAX
 #define UINT_FAST64_MAX  UINT64_MAX
 // 7.18.2.4 Limits of integer types capable of holding object pointers
 #ifdef _WIN64 // [
 #  define INTPTR_MIN   INT64_MIN
 #  define INTPTR_MAX   INT64_MAX
 #  define UINTPTR_MAX  UINT64_MAX
 #else // _WIN64 ][
 #  define INTPTR_MIN   INT32_MIN
 #  define INTPTR_MAX   INT32_MAX
 #  define UINTPTR_MAX  UINT32_MAX
 #endif // _WIN64 ]
 // 7.18.2.5 Limits of greatest-width integer types
 #define INTMAX_MIN   INT64_MIN
 #define INTMAX_MAX   INT64_MAX
 #define UINTMAX_MAX  UINT64_MAX
 // 7.18.3 Limits of other integer types
 #ifdef _WIN64 // [
 #  define PTRDIFF_MIN  _I64_MIN
 #  define PTRDIFF_MAX  _I64_MAX
 #else  // _WIN64 ][
 #  define PTRDIFF_MIN  _I32_MIN
 #  define PTRDIFF_MAX  _I32_MAX
 #endif  // _WIN64 ]
 #define SIG_ATOMIC_MIN  INT_MIN
 #define SIG_ATOMIC_MAX  INT_MAX
 #ifndef SIZE_MAX // [
 #  ifdef _WIN64 // [
 #     define SIZE_MAX  _UI64_MAX
 #  else // _WIN64 ][
 #     define SIZE_MAX  _UI32_MAX
 #  endif // _WIN64 ]
 #endif // SIZE_MAX ]
 // WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
 #ifndef WCHAR_MIN // [
 #  define WCHAR_MIN  0
 #endif  // WCHAR_MIN ]
 #ifndef WCHAR_MAX // [
 #  define WCHAR_MAX  _UI16_MAX
 #endif  // WCHAR_MAX ]
 #define WINT_MIN  0
 #define WINT_MAX  _UI16_MAX
 #endif // __STDC_LIMIT_MACROS ]
 // 7.18.4 Macros for integer constants
 #if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [   See footnote 224 at page 260
 // 7.18.4.1 Macros for minimum-width integer constants
 #define INT8_C(val)  val##i8
 #define INT16_C(val) val##i16
 #define INT32_C(val) val##i32
 #define INT64_C(val) val##i64
 #define UINT8_C(val)  val##ui8
 #define UINT16_C(val) val##ui16
 #define UINT32_C(val) val##ui32
 #define UINT64_C(val) val##ui64
 // 7.18.4.2 Macros for greatest-width integer constants
 #define INTMAX_C   INT64_C
 #define UINTMAX_C  UINT64_C
 #endif // __STDC_CONSTANT_MACROS ]
 #endif // _MSC_STDINT_H_ ]
--- a/src/calibre/utils/chm/swig_chm.c
+++ b/src/calibre/utils/chm/swig_chm.c
--- a/src/calibre/utils/chm/swig_chm.i
+++ b/src/calibre/utils/chm/swig_chm.i
@ -0,0 +1,214 @@
 %module chmlib
 %include "typemaps.i"
 %include "cstring.i"
 %{
 /*
 Copyright (C) 2003 Rubens Ramos <rubensr@users.sourceforge.net>
 Based on code by:
 Copyright (C) 2003  Razvan Cojocaru <razvanco@gmx.net>
 pychm is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License as
 published by the Free Software Foundation; either version 2 of the
 License, or (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 General Public License for more details.
 You should have received a copy of the GNU General Public
 License along with this program; see the file COPYING.  If not,
 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 Boston, MA 02111-1307, USA
 $Id: swig_chm.i,v 1.1.1.1 2003/12/02 12:38:14 rubensr Exp $
 */
 #include "chm_lib.h"
 #include <stdio.h>
 static PyObject *my_callback = NULL;
 /* Remember `arg` as the Python callable that dummy_enumerator will call.
  * Returns None on success; sets TypeError and returns NULL when `arg` is
  * not callable.  `dummy` is not used. */
 static PyObject *
 my_set_callback(PyObject *dummy, PyObject *arg)
 {
    if (PyCallable_Check(arg) == 0) {
      PyErr_SetString(PyExc_TypeError, "parameter must be callable");
      return NULL;
    }
    /* Take a reference to the new callable before dropping the reference
     * held on the previous one. */
    Py_XINCREF(arg);
    Py_XDECREF(my_callback);
    my_callback = arg;
    Py_RETURN_NONE;
 }
 int dummy_enumerator (struct chmFile *h, 
                      struct chmUnitInfo *ui, 
                      void *context) {
    PyObject *arglist;
    PyObject *result;
    PyObject *py_h;
    PyObject *py_ui;
    PyObject *py_c;
    py_h  = SWIG_NewPointerObj((void *) h, SWIGTYPE_p_chmFile, 0);
    py_ui = SWIG_NewPointerObj((void *) ui, SWIGTYPE_p_chmUnitInfo, 0);
    py_c  = PyCObject_AsVoidPtr(context);
    /* Time to call the callback */
    arglist = Py_BuildValue("(OOO)", py_h, py_ui, py_c);
    if (arglist) {
      result = PyEval_CallObject(my_callback, arglist);
      Py_DECREF(arglist);
      Py_DECREF(result);
      Py_DECREF(py_h);
      Py_DECREF(py_ui);
      if (result == NULL) {
        return 0; /* Pass error back */
      } else {
        return 1;
      }
    } else
      return 0;
 }
 %}
 /* Input typemap for CHM_ENUMERATOR arguments: the Python argument is
  * stored through my_set_callback (the wrapper fails when that returns
  * NULL) and dummy_enumerator is passed to C as the actual callback. */
 %typemap(in) CHM_ENUMERATOR {
   if (!my_set_callback(self, $input)) goto fail;
   $1 = dummy_enumerator;
 }
 /* Input typemap for "void *context": wraps the Python argument pointer
  * in a new PyCObject and passes that object as the C context;
  * dummy_enumerator unwraps it again with PyCObject_AsVoidPtr.
  * NOTE(review): nothing visible here releases the new PyCObject. */
 %typemap(in) void *context {
   if (!($1 = PyCObject_FromVoidPtr($input, NULL))) goto fail;
 }
 /* "struct chmUnitInfo *OutValue" takes no Python argument (numinputs=0):
  * the wrapper passes a zero-initialised chmUnitInfo allocated with calloc.
  * NOTE(review): the calloc result is not checked for NULL. */
 %typemap(in, numinputs=0) struct chmUnitInfo *OutValue (struct chmUnitInfo *temp = (struct chmUnitInfo *) calloc(1, sizeof(struct chmUnitInfo))) {
   $1 = temp;
 }
 /* Output typemap for "struct chmUnitInfo *OutValue": wraps the structure
  * in a SWIG pointer object (created with flag 1) and adds it to the
  * wrapper's result.  When the result is empty or None the new object
  * becomes the result; otherwise the result is turned into a tuple if
  * needed and the new object is appended to it. */
 %typemap(argout) struct chmUnitInfo *OutValue {
   PyObject *o, *o2, *o3;
   o = SWIG_NewPointerObj((void *) $1, SWIGTYPE_p_chmUnitInfo, 1);
   if ((!$result) || ($result == Py_None)) {
     $result = o;
   } else {
     if (!PyTuple_Check($result)) {
       /* This o2 shadows the outer declaration for the scope of the if. */
       PyObject *o2 = $result;
       $result = PyTuple_New(1);
       PyTuple_SetItem($result,0,o2);
     }
     o3 = PyTuple_New(1);
     PyTuple_SetItem(o3,0,o);
     o2 = $result;
     $result = PySequence_Concat(o2,o3);
     Py_DECREF(o2);
     Py_DECREF(o3);
   }
 }
 /* Check typemap for "unsigned char *OUTPUT": allocates the output buffer
  * with a size taken from arg5, a name that is not defined in this
  * typemap (presumably the wrapper's local for the fifth argument -
  * confirm against the generated code).  The wrapper fails when the
  * allocation returns NULL. */
 %typemap(check) unsigned char *OUTPUT {
   /* nasty hack */
 #ifdef __cplusplus
    $1 = ($1_ltype) new char[arg5];
 #else
    $1 = ($1_ltype) malloc(arg5);
 #endif
    if ($1 == NULL) SWIG_fail;
 }
 /* Output typemap for "unsigned char *OUTPUT": copies arg5 bytes of the
  * buffer into a Python string, adds it to the result through
  * t_output_helper and releases the buffer.
  * NOTE(review): the string length is arg5, not the function's return
  * value - confirm that callers trim it to the bytes actually read. */
 %typemap(argout,fragment="t_output_helper") unsigned char *OUTPUT {
    PyObject *o;
    o = PyString_FromStringAndSize($1, arg5);
    $result = t_output_helper($result,o);
 #ifdef __cplusplus
    delete [] $1;
 #else
    free($1);
 #endif
 }
 #ifdef WIN32
 typedef unsigned __int64 LONGUINT64;
 typedef __int64          LONGINT64;
 #else
 typedef unsigned long long LONGUINT64;
 typedef long long          LONGINT64;
 #endif
 /* the two available spaces in a CHM file                      */
 /* N.B.: The format supports arbitrarily many spaces, but only */
 /*       two appear to be used at present.                     */
 #define CHM_UNCOMPRESSED (0)
 #define CHM_COMPRESSED   (1)
 /* structure representing an ITS (CHM) file stream             */
 struct chmFile;
 /* structure representing an element from an ITS file stream   */
 #define CHM_MAX_PATHLEN  256
 struct chmUnitInfo
 {
    LONGUINT64         start;   /* presumably where the element's data begins - confirm in chm_lib.h */
    LONGUINT64         length;  /* presumably the element's size in bytes - confirm in chm_lib.h */
    int                space;   /* presumably CHM_UNCOMPRESSED or CHM_COMPRESSED - confirm */
    char               path[CHM_MAX_PATHLEN+1];  /* room for CHM_MAX_PATHLEN characters plus one more byte */
 };
 /* open an ITS archive */
 struct chmFile* chm_open(const char *filename);
 /* close an ITS archive */
 void chm_close(struct chmFile *h);
 /* methods for setting tuning parameters for a particular file */
 #define CHM_PARAM_MAX_BLOCKS_CACHED 0
 void chm_set_param(struct chmFile *h,
                   int paramType,
                   int paramVal);
 /* resolve a particular object from the archive */
 #define CHM_RESOLVE_SUCCESS (0)
 #define CHM_RESOLVE_FAILURE (1)
 int chm_resolve_object(struct chmFile *h,
                       const char *objPath,
                       struct chmUnitInfo *OutValue);
 /* retrieve part of an object from the archive */
 LONGINT64 chm_retrieve_object(struct chmFile *h,
                              struct chmUnitInfo *ui,
                              unsigned char *OUTPUT,
                              LONGUINT64 addr,
                              LONGINT64 len);
 /* enumerate the objects in the .chm archive */
 typedef int (*CHM_ENUMERATOR)(struct chmFile *h,
                              struct chmUnitInfo *ui,
                              void *context);
 #define CHM_ENUMERATE_NORMAL    (1)
 #define CHM_ENUMERATE_META      (2)
 #define CHM_ENUMERATE_SPECIAL   (4)
 #define CHM_ENUMERATE_FILES     (8)
 #define CHM_ENUMERATE_DIRS      (16)
 #define CHM_ENUMERATE_ALL       (31)
 #define CHM_ENUMERATOR_FAILURE  (0)
 #define CHM_ENUMERATOR_CONTINUE (1)
 #define CHM_ENUMERATOR_SUCCESS  (2)
 int chm_enumerate(struct chmFile *h,
                  int what,
                  CHM_ENUMERATOR e,
                  void *context);
 int chm_enumerate_dir(struct chmFile *h,
                      const char *prefix,
                      int what,
                      CHM_ENUMERATOR e,
                      void *context);