Initial CHM changes

2026-05-30 02:32:33 -04:00 · 2010-02-07 22:43:15 -08:00
parent 647ad206c9
commit 7a74dc3410
7 changed files with 991 additions and 0 deletions
@@ -0,0 +1,8 @@
+#!/usr/bin/env  python
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+Used for chm input
+'''
@@ -0,0 +1,34 @@
+## Copyright (C) 2003-2006 Rubens Ramos <rubensr@users.sourceforge.net>
+
+## pychm is free software; you can redistribute it and/or
+## modify it under the terms of the GNU General Public License as
+## published by the Free Software Foundation; either version 2 of the
+## License, or (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+## General Public License for more details.
+
+## You should have received a copy of the GNU General Public
+## License along with this program; see the file COPYING.  If not,
+## write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+## Boston, MA 02111-1307, USA
+
+## $Id: __init__.py,v 1.8 2006/06/18 10:50:43 rubensr Exp $
+
+'''
+   chm - A package to manipulate CHM files
+
+   The chm package provides four modules: chm, chmlib, extra and
+   _chmlib. _chmlib and chmlib are very low level libraries generated
+   from  SWIG interface files, and are simple wrappers around the API
+   defined by the C library chmlib.
+   The extra module adds full-text search support.
+   The chm module provides some higher level classes to simplify
+   access to the CHM files information.
+'''
+__all__ = ["chm", "chmlib", "_chmlib", "extra"]
+__version__ = "0.8.4"
+__revision__ = "$Id: __init__.py,v 1.8 2006/06/18 10:50:43 rubensr Exp $"
+
@@ -0,0 +1,508 @@
+## Copyright (C) 2003-2006 Rubens Ramos <rubensr@users.sourceforge.net>
+
+## Based on code by:
+## Copyright (C) 2003  Razvan Cojocaru <razvanco@gmx.net>
+
+## pychm is free software; you can redistribute it and/or
+## modify it under the terms of the GNU General Public License as
+## published by the Free Software Foundation; either version 2 of the
+## License, or (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+## General Public License for more details.
+
+## You should have received a copy of the GNU General Public
+## License along with this program; see the file COPYING.  If not,
+## write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+## Boston, MA 02111-1307, USA
+
+## $Id: chm.py,v 1.12 2006/08/07 12:31:51 rubensr Exp $
+
+'''
+   chm - A high-level front end for the chmlib python module.
+
+   The chm module provides high level access to the functionality
+   included in chmlib. It encapsulates functions in the CHMFile class, and
+   provides some additional features, such as the ability to obtain
+   the contents tree of a CHM archive.
+   
+'''
+
+import chmlib
+import extra
+import array
+import string
+import os.path
+import sys
+
+# Maps a numeric charset id to the name of a Python codec.  The id is the
+# third comma-separated field of the archive's encoding string (see
+# CHMFile.GetEncoding).  The trailing comments give the name of each
+# charset constant; None means no codec is assigned for that charset.
+charset_table = { 
+    0   : 'iso8859_1',  # ANSI_CHARSET
+    238 : 'iso8859_2',  # EASTEUROPE_CHARSET
+    178 : 'iso8859_6',  # ARABIC_CHARSET
+    161 : 'iso8859_7',  # GREEK_CHARSET
+    177 : 'iso8859_8',  # HEBREW_CHARSET
+    162 : 'iso8859_9',  # TURKISH_CHARSET
+    222 : 'iso8859_11', # THAI_CHARSET - hmm not in python 2.2...
+    186 : 'iso8859_13', # BALTIC_CHARSET
+    204 : 'cp1251',     # RUSSIAN_CHARSET
+    255 : 'cp437',      # OEM_CHARSET
+    128 : 'cp932',      # SHIFTJIS_CHARSET
+    134 : 'cp936',      # GB2312_CHARSET
+    129 : 'cp949',      # HANGUL_CHARSET
+    136 : 'cp950',      # CHINESEBIG5_CHARSET
+    1   : None,         # DEFAULT_CHARSET
+    2   : None,         # SYMBOL_CHARSET    
+    130 : None,         # JOHAB_CHARSET     
+    163 : None,         # VIETNAMESE_CHARSET
+    77  : None,         # MAC_CHARSET       
+}
+
+locale_table = {
+    0x0436 : ('iso8859_1', "Afrikaans", "Western Europe & US"),
+    0x041c : ('iso8859_2', "Albanian", "Central Europe"),
+    0x0401 : ('iso8859_6', "Arabic_Saudi_Arabia", "Arabic"),
+    0x0801 : ('iso8859_6', "Arabic_Iraq", "Arabic"),
+    0x0c01 : ('iso8859_6', "Arabic_Egypt", "Arabic"),
+    0x1001 : ('iso8859_6', "Arabic_Libya", "Arabic"),
+    0x1401 : ('iso8859_6', "Arabic_Algeria", "Arabic"),
+    0x1801 : ('iso8859_6', "Arabic_Morocco", "Arabic"),
+    0x1c01 : ('iso8859_6', "Arabic_Tunisia", "Arabic"),
+    0x2001 : ('iso8859_6', "Arabic_Oman", "Arabic"),
+    0x2401 : ('iso8859_6', "Arabic_Yemen", "Arabic"),
+    0x2801 : ('iso8859_6', "Arabic_Syria", "Arabic"),
+    0x2c01 : ('iso8859_6', "Arabic_Jordan", "Arabic"),
+    0x3001 : ('iso8859_6', "Arabic_Lebanon", "Arabic"),
+    0x3401 : ('iso8859_6', "Arabic_Kuwait", "Arabic"),
+    0x3801 : ('iso8859_6', "Arabic_UAE", "Arabic"),
+    0x3c01 : ('iso8859_6', "Arabic_Bahrain", "Arabic"),
+    0x4001 : ('iso8859_6', "Arabic_Qatar", "Arabic"),
+    0x042b : (None,        "Armenian","Armenian"),
+    0x042c : ('iso8859_9', "Azeri_Latin", "Turkish"),
+    0x082c : ('cp1251',    "Azeri_Cyrillic", "Cyrillic"),
+    0x042d : ('iso8859_1', "Basque", "Western Europe & US"),
+    0x0423 : ('cp1251',    "Belarusian", "Cyrillic"),
+    0x0402 : ('cp1251',    "Bulgarian", "Cyrillic"),
+    0x0403 : ('iso8859_1', "Catalan", "Western Europe & US"),
+    0x0404 : ('cp950',     "Chinese_Taiwan", "Traditional Chinese"),
+    0x0804 : ('cp936',     "Chinese_PRC", "Simplified Chinese"),
+    0x0c04 : ('cp950',     "Chinese_Hong_Kong", "Traditional Chinese"),
+    0x1004 : ('cp936',     "Chinese_Singapore", "Simplified Chinese"),
+    0x1404 : ('cp950',     "Chinese_Macau", "Traditional Chinese"),
+    0x041a : ('iso8859_2', "Croatian", "Central Europe"),
+    0x0405 : ('iso8859_2', "Czech", "Central Europe"),
+    0x0406 : ('iso8859_1', "Danish", "Western Europe & US"),
+    0x0413 : ('iso8859_1', "Dutch_Standard", "Western Europe & US"),
+    0x0813 : ('iso8859_1', "Dutch_Belgian", "Western Europe & US"),
+    0x0409 : ('iso8859_1', "English_United_States", "Western Europe & US"),
+    0x0809 : ('iso8859_1', "English_United_Kingdom", "Western Europe & US"),
+    0x0c09 : ('iso8859_1', "English_Australian", "Western Europe & US"),
+    0x1009 : ('iso8859_1', "English_Canadian", "Western Europe & US"),
+    0x1409 : ('iso8859_1', "English_New_Zealand", "Western Europe & US"),
+    0x1809 : ('iso8859_1', "English_Irish", "Western Europe & US"),
+    0x1c09 : ('iso8859_1', "English_South_Africa", "Western Europe & US"),
+    0x2009 : ('iso8859_1', "English_Jamaica", "Western Europe & US"),
+    0x2409 : ('iso8859_1', "English_Caribbean", "Western Europe & US"),
+    0x2809 : ('iso8859_1', "English_Belize", "Western Europe & US"),
+    0x2c09 : ('iso8859_1', "English_Trinidad", "Western Europe & US"),
+    0x3009 : ('iso8859_1', "English_Zimbabwe", "Western Europe & US"),
+    0x3409 : ('iso8859_1', "English_Philippines", "Western Europe & US"),
+    0x0425 : ('iso8859_13',"Estonian", "Baltic",),
+    0x0438 : ('iso8859_1', "Faeroese", "Western Europe & US"),
+    0x0429 : ('iso8859_6', "Farsi", "Arabic"),
+    0x040b : ('iso8859_1', "Finnish", "Western Europe & US"),
+    0x040c : ('iso8859_1', "French_Standard", "Western Europe & US"),
+    0x080c : ('iso8859_1', "French_Belgian", "Western Europe & US"),
+    0x0c0c : ('iso8859_1', "French_Canadian", "Western Europe & US"),
+    0x100c : ('iso8859_1', "French_Swiss", "Western Europe & US"),
+    0x140c : ('iso8859_1', "French_Luxembourg", "Western Europe & US"),
+    0x180c : ('iso8859_1', "French_Monaco", "Western Europe & US"),
+    0x0437 : (None,        "Georgian", "Georgian"),
+    0x0407 : ('iso8859_1', "German_Standard", "Western Europe & US"),
+    0x0807 : ('iso8859_1', "German_Swiss", "Western Europe & US"),
+    0x0c07 : ('iso8859_1', "German_Austrian", "Western Europe & US"),
+    0x1007 : ('iso8859_1', "German_Luxembourg", "Western Europe & US"),
+    0x1407 : ('iso8859_1', "German_Liechtenstein", "Western Europe & US"),
+    0x0408 : ('iso8859_7', "Greek", "Greek"),
+    0x040d : ('iso8859_8', "Hebrew", "Hebrew"),
+    0x0439 : (None,        "Hindi", "Indic"),
+    0x040e : ('iso8859_2', "Hungarian", "Central Europe"),
+    0x040f : ('iso8859_1', "Icelandic", "Western Europe & US"),
+    0x0421 : ('iso8859_1', "Indonesian", "Western Europe & US"),
+    0x0410 : ('iso8859_1', "Italian_Standard", "Western Europe & US"),
+    0x0810 : ('iso8859_1', "Italian_Swiss", "Western Europe & US"),
+    0x0411 : ('cp932',     "Japanese", "Japanese"),
+    0x043f : ('cp1251',    "Kazakh", "Cyrillic"),
+    0x0457 : (None,        "Konkani", "Indic"),
+    0x0412 : ('cp949',     "Korean", "Korean"),
+    0x0426 : ('iso8859_13',"Latvian", "Baltic",),
+    0x0427 : ('iso8859_13',"Lithuanian", "Baltic",),
+    0x042f : ('cp1251',    "Macedonian", "Cyrillic"),
+    0x043e : ('iso8859_1', "Malay_Malaysia", "Western Europe & US"),
+    0x083e : ('iso8859_1', "Malay_Brunei_Darussalam", "Western Europe & US"),
+    0x044e : (None,        "Marathi", "Indic"),
+    0x0414 : ('iso8859_1', "Norwegian_Bokmal", "Western Europe & US"),
+    0x0814 : ('iso8859_1', "Norwegian_Nynorsk", "Western Europe & US"),
+    0x0415 : ('iso8859_2', "Polish", "Central Europe"),
+    0x0416 : ('iso8859_1', "Portuguese_Brazilian", "Western Europe & US"),
+    0x0816 : ('iso8859_1', "Portuguese_Standard", "Western Europe & US"),
+    0x0418 : ('iso8859_2', "Romanian", "Central Europe"),
+    0x0419 : ('cp1251',    "Russian", "Cyrillic"),
+    0x044f : (None,        "Sanskrit", "Indic"),
+    0x081a : ('iso8859_2', "Serbian_Latin", "Central Europe"),
+    0x0c1a : ('cp1251',    "Serbian_Cyrillic", "Cyrillic"),
+    0x041b : ('iso8859_2', "Slovak", "Central Europe"),
+    0x0424 : ('iso8859_2', "Slovenian", "Central Europe"),
+    0x040a : ('iso8859_1', "Spanish_Trad_Sort", "Western Europe & US"),
+    0x080a : ('iso8859_1', "Spanish_Mexican", "Western Europe & US"),
+    0x0c0a : ('iso8859_1', "Spanish_Modern_Sort", "Western Europe & US"),
+    0x100a : ('iso8859_1', "Spanish_Guatemala", "Western Europe & US"),
+    0x140a : ('iso8859_1', "Spanish_Costa_Rica", "Western Europe & US"),
+    0x180a : ('iso8859_1', "Spanish_Panama", "Western Europe & US"),
+    0x1c0a : ('iso8859_1', "Spanish_Dominican_Repub", "Western Europe & US"),
+    0x200a : ('iso8859_1', "Spanish_Venezuela", "Western Europe & US"),
+    0x240a : ('iso8859_1', "Spanish_Colombia", "Western Europe & US"),
+    0x280a : ('iso8859_1', "Spanish_Peru", "Western Europe & US"),
+    0x2c0a : ('iso8859_1', "Spanish_Argentina", "Western Europe & US"),
+    0x300a : ('iso8859_1', "Spanish_Ecuador", "Western Europe & US"),
+    0x340a : ('iso8859_1', "Spanish_Chile", "Western Europe & US"),
+    0x380a : ('iso8859_1', "Spanish_Uruguay", "Western Europe & US"),
+    0x3c0a : ('iso8859_1', "Spanish_Paraguay", "Western Europe & US"),
+    0x400a : ('iso8859_1', "Spanish_Bolivia", "Western Europe & US"),
+    0x440a : ('iso8859_1', "Spanish_El_Salvador", "Western Europe & US"),
+    0x480a : ('iso8859_1', "Spanish_Honduras", "Western Europe & US"),
+    0x4c0a : ('iso8859_1', "Spanish_Nicaragua", "Western Europe & US"),
+    0x500a : ('iso8859_1', "Spanish_Puerto_Rico", "Western Europe & US"),
+    0x0441 : ('iso8859_1', "Swahili", "Western Europe & US"),
+    0x041d : ('iso8859_1', "Swedish", "Western Europe & US"),
+    0x081d : ('iso8859_1', "Swedish_Finland", "Western Europe & US"),
+    0x0449 : (None,        "Tamil", "Indic"),
+    0x0444 : ('cp1251',    "Tatar", "Cyrillic"),
+    0x041e : ('iso8859_11',"Thai", "Thai"),
+    0x041f : ('iso8859_9', "Turkish", "Turkish"),
+    0x0422 : ('cp1251',    "Ukrainian", "Cyrillic"),
+    0x0420 : ('iso8859_6', "Urdu", "Arabic"),
+    0x0443 : ('iso8859_9', "Uzbek_Latin", "Turkish"),
+    0x0843 : ('cp1251',    "Uzbek_Cyrillic", "Cyrillic"),
+    0x042a : (None,        "Vietnamese", "Vietnamese")
+}
+
+class CHMFile:
+    "A class to manage access to CHM files."
+    filename = ""
+    file = None
+    title = ""
+    home = "/"
+    index = None
+    topics = None
+    encoding = None
+    lcid = None
+    binaryindex = None
+    
+    def __init__(self):
+        # Full-text search flag reported by IsSearchable(); it is assigned
+        # again (to False) every time GetArchiveInfo() runs.
+        self.searchable = 0
+    
+    def LoadCHM(self, archiveName):
+        '''Loads a CHM archive.
+
+        An archive that is already open is closed first.  On success the
+        file name is recorded and GetArchiveInfo is called to obtain
+        information such as the index file name and the topics file.
+
+        archiveName -- path of the CHM file, handed to chmlib.chm_open.
+
+        Returns 1 on success, and 0 if the archive could not be opened.
+        '''
+        # Test the handle, not the file name: filename defaults to '' and
+        # is never None, so a "filename != None" check always passes and
+        # would hand a None handle to chmlib.chm_close on the first load.
+        if self.file is not None:
+            self.CloseCHM()
+
+        self.file = chmlib.chm_open(archiveName)
+        if self.file is None:
+            return 0
+
+        self.filename = archiveName
+        self.GetArchiveInfo()
+
+        return 1
+
+    def CloseCHM(self):
+        '''Closes the CHM archive.
+
+        Releases the chmlib handle if an archive is open, then resets
+        file, filename, title, home, index, topics and encoding to their
+        initial values.  Calling it when nothing is open is harmless.
+        '''
+        # filename is '' (never None) when nothing is loaded, so only the
+        # handle can tell whether there is something to pass to chm_close.
+        if self.file is not None:
+            chmlib.chm_close(self.file)
+
+        self.file = None
+        self.filename = ''
+        self.title = ""
+        self.home = "/"
+        self.index = None
+        self.topics = None
+        self.encoding = None
+
+    def GetArchiveInfo(self):
+        '''Obtains information on CHM archive.
+        This function checks the /#SYSTEM file inside the CHM archive to
+        obtain the index, home page, topics, encoding and title.  It then
+        calls GetWindowsInfo and, if no locale id was found, asks
+        extra.get_lcid for one.  It is called from LoadCHM.
+
+        Returns 1 on success, and 0 (after writing a message to stderr)
+        if /#SYSTEM cannot be resolved or yields no data.
+        '''
+
+        #extra.is_searchable crashed...
+        #self.searchable = extra.is_searchable (self.file)
+        self.searchable = False
+        self.lcid = None
+
+        result, ui = chmlib.chm_resolve_object(self.file, '/#SYSTEM')
+        if (result != chmlib.CHM_RESOLVE_SUCCESS):
+            sys.stderr.write('GetArchiveInfo: #SYSTEM does not exist\n')
+            return 0
+
+        # Reading starts at offset 4: the first four bytes of /#SYSTEM are
+        # not part of the entry list parsed below.
+        size, text = chmlib.chm_retrieve_object(self.file, ui, 4l, ui.length)
+        if (size == 0):
+            sys.stderr.write('GetArchiveInfo: file size = 0\n')
+            return 0
+
+        buff = array.array('B', text)
+        # Never walk past the bytes that were actually returned.
+        limit = min(size, len(buff))
+
+        # Each entry is a 2-byte code and a 2-byte data length (both low
+        # byte first) followed by the data itself.  Requiring a complete
+        # 4-byte header keeps a truncated file from raising IndexError.
+        index = 0
+        while index + 4 <= limit:
+            code = buff[index] + (buff[index+1] * 256)
+            length = buff[index+2] + (buff[index+3] * 256)
+            index += 4
+            # The final byte of the data is dropped (presumably the
+            # terminating NUL of the stored string).
+            data = text[index:index+length-1]
+
+            if code == 0:
+                self.topics = '/' + data
+            elif code == 1:
+                self.index = '/' + data
+            elif code == 2:
+                self.home = '/' + data
+            elif code == 3:
+                self.title = data
+            elif code == 4:
+                # The locale id is read from the first two data bytes.
+                if index + 2 <= len(buff):
+                    self.lcid = buff[index] + (buff[index+1] * 256)
+            elif code == 6:
+                # The data is a base name; when no topics file is known
+                # yet, probe for <name>.hhc and <name>.hhk in the archive.
+                if not self.topics:
+                    hhc_path = '/' + data + '.hhc'
+                    hhk_path = '/' + data + '.hhk'
+                    res1, ui1 = chmlib.chm_resolve_object(self.file, hhc_path)
+                    res2, ui2 = chmlib.chm_resolve_object(self.file, hhk_path)
+                    if res1 == chmlib.CHM_RESOLVE_SUCCESS:
+                        self.topics = hhc_path
+                    if (not self.index) and \
+                           (res2 == chmlib.CHM_RESOLVE_SUCCESS):
+                        self.index = hhk_path
+            elif code == 16:
+                self.encoding = data
+            # Entries with any other code are skipped.
+
+            index += length
+
+        self.GetWindowsInfo()
+
+        if not self.lcid:
+            self.lcid = extra.get_lcid (self.file)
+
+        return 1
+
+    def GetTopicsTree(self):
+        '''Reads and returns the topics tree.
+        This auxiliary function reads and returns the topics tree file
+        contents for the CHM archive.
+
+        Returns None if no topics file is known, if it cannot be resolved
+        inside the archive, or if it is empty (the last case also writes a
+        message to stderr).
+        '''
+        # Covers both None and '': returning early only for None lets an
+        # empty name skip the lookup and then fail on the unbound 'ui'.
+        if not self.topics:
+            return None
+
+        res, ui = chmlib.chm_resolve_object(self.file, self.topics)
+        if (res != chmlib.CHM_RESOLVE_SUCCESS):
+            return None
+
+        size, text = chmlib.chm_retrieve_object(self.file, ui, 0l, ui.length)
+        if (size == 0):
+            sys.stderr.write('GetTopicsTree: file size = 0\n')
+            return None
+        return text
+
+    def GetIndex(self):
+        '''Reads and returns the index tree.
+        This auxiliary function reads and returns the index tree file
+        contents for the CHM archive.
+
+        Returns None if no index file is known, if it cannot be resolved
+        inside the archive, or if it is empty (the last case also writes a
+        message to stderr).
+        '''
+        # Covers both None and '': returning early only for None lets an
+        # empty name skip the lookup and then fail on the unbound 'ui'.
+        if not self.index:
+            return None
+
+        res, ui = chmlib.chm_resolve_object(self.file, self.index)
+        if (res != chmlib.CHM_RESOLVE_SUCCESS):
+            return None
+
+        size, text = chmlib.chm_retrieve_object(self.file, ui, 0l, ui.length)
+        if (size == 0):
+            sys.stderr.write('GetIndex: file size = 0\n')
+            return None
+        return text
+
+    def ResolveObject(self, document):
+        '''Tries to locate a document in the archive.
+
+        document -- path of the document; it is passed to chmlib
+        unchanged (it is not made absolute).
+
+        Returns whatever chmlib.chm_resolve_object returns: a tuple of a
+        status code and the UnitInfo for the document, which is what
+        RetrieveObject needs to read the contents.  When no archive is
+        open the result is (1, None).
+        '''
+        if not self.file:
+            return (1, None)
+        return chmlib.chm_resolve_object(self.file, document)
+
+    def RetrieveObject(self, ui, start = -1, length = -1):
+        '''Retrieves the contents of a document.
+
+        ui     -- UnitInfo of the document (see ResolveObject).
+        start  -- offset to start reading at; -1 means offset 0.
+        length -- number of bytes to read; -1 means ui.length.
+
+        Returns the result of chmlib.chm_retrieve_object, or (0, '') when
+        no archive is open or ui is empty.
+        '''
+        if not (self.file and ui):
+            return (0, '')
+
+        count = length
+        if length == -1:
+            count = ui.length
+
+        # The offset is always handed over as a long.
+        offset = long(start)
+        if start == -1:
+            offset = 0l
+
+        return chmlib.chm_retrieve_object(self.file, ui, offset, count)
+
+    def Search(self, text, wholewords=0, titleonly=0):
+        '''Performs full-text search on the archive.
+
+        text       -- the word to look for.
+        wholewords -- whether the search should match whole words only.
+        titleonly  -- whether the search should be restricted to page
+                      titles.
+
+        Returns the result of extra.search: a tuple whose first item
+        indicates if the search results were partial, and whose second
+        item is a dictionary containing the results.  Returns None when
+        text is empty or no archive is open.'''
+        if not text or text == '' or not self.file:
+            return None
+        return extra.search (self.file, text, wholewords, titleonly)
+
+    def IsSearchable(self):
+        '''Indicates if the full-text search is available for this
+        archive - this flag is updated when GetArchiveInfo is called'''
+        # NOTE: GetArchiveInfo currently always stores False here, because
+        # the extra.is_searchable call is commented out there.
+        return self.searchable
+
+    def GetEncoding(self):
+        '''Returns a string that can be used with the codecs python package
+        to encode or decode the files in the chm archive. If an error is
+        found, or if it is not possible to find the encoding, None is
+        returned.
+
+        The charset id is the third comma-separated field of the encoding
+        string read from the archive; it is looked up in charset_table.'''
+        if not self.encoding:
+            return None
+
+        vals = self.encoding.split(',')
+        if len(vals) <= 2:
+            return None
+
+        try:
+            charset = int(vals[2])
+        except ValueError:
+            # A non-numeric field is an error too: honour the documented
+            # "None on error" contract instead of letting it propagate.
+            return None
+
+        # Unknown ids give None, exactly like ids mapped to None.
+        return charset_table.get(charset)
+
+    def GetLCID(self):
+        '''Returns the locale_table entry for the archive Locale ID, or
+        None when the id has no entry in the table.'''
+        return locale_table.get(self.lcid)
+
+    def GetDWORD(self, buff, idx=0):
+        '''Internal method.
+        Combines the four values buff[idx] .. buff[idx+3] into one double
+        word, lowest byte first.  The value 0xFFFFFFFF is returned as 0.
+        '''
+        value = 0
+        for offset in range(4):
+            value += buff[idx + offset] << (8 * offset)
+
+        if value == 0xFFFFFFFF:
+            return 0
+        return value
+
+    def GetString(self, text, idx):
+        '''Internal method.
+        Retrieves a string from the #STRINGS buffer.
+
+        text -- the buffer to read from.
+        idx  -- offset at which the string starts.
+
+        Returns the characters from idx up to, but not including, the
+        next NUL character, or up to the end of the buffer if there is
+        no NUL after idx.
+        '''
+        end = text.find('\x00', idx)
+        if end == -1:
+            # No terminator: slicing with -1 would drop the last character.
+            return text[idx:]
+        return text[idx:end]
+    
+    def GetWindowsInfo(self):
+        '''Gets information from the #WINDOWS file.
+        Checks the #WINDOWS file to see if it has any info that was
+        not found in #SYSTEM (topics, index or default page).
+
+        The topics and index names are only filled in when they are still
+        unset; the home page is replaced whenever #WINDOWS names one.
+
+        Returns None on success, or a negative code on failure:
+          -1  /#WINDOWS cannot be resolved
+          -2  its 8-byte header cannot be read
+          -3  it contains no entries
+          -4  the first entry cannot be read in full or is too small
+          -5  /#STRINGS cannot be resolved
+          -6  /#STRINGS is empty
+        '''
+        result, ui = chmlib.chm_resolve_object(self.file, '/#WINDOWS')
+        if (result != chmlib.CHM_RESOLVE_SUCCESS):
+            return -1
+
+        # Header: two double words, the entry count and the entry size.
+        size, text = chmlib.chm_retrieve_object(self.file, ui, 0l, 8)
+        if (size < 8):
+            return -2
+
+        buff = array.array('B', text)
+        num_entries = self.GetDWORD(buff, 0)
+        entry_size = self.GetDWORD(buff, 4)
+
+        if num_entries < 1:
+            return -3
+
+        # Only the first entry (right after the 8-byte header) is used.
+        size, text = chmlib.chm_retrieve_object(self.file, ui, 8l, entry_size)
+        if (size < entry_size):
+            return -4
+
+        buff = array.array('B', text)
+        # The last field read below ends at 0x68 + 4; a smaller entry
+        # would make GetDWORD index past the end of the buffer.
+        if len(buff) < 0x68 + 4:
+            return -4
+
+        # These three values are offsets into /#STRINGS.
+        toc_index = self.GetDWORD(buff, 0x60)
+        idx_index = self.GetDWORD(buff, 0x64)
+        dft_index = self.GetDWORD(buff, 0x68)
+
+        result, ui = chmlib.chm_resolve_object(self.file, '/#STRINGS')
+        if (result != chmlib.CHM_RESOLVE_SUCCESS):
+            return -5
+
+        size, text = chmlib.chm_retrieve_object(self.file, ui, 0l, ui.length)
+        if (size == 0):
+            return -6
+
+        if (not self.topics):
+            self.topics = self.GetString(text, toc_index)
+            if not self.topics.startswith("/"):
+                self.topics = "/" + self.topics
+
+        if (not self.index):
+            self.index = self.GetString(text, idx_index)
+            if not self.index.startswith("/"):
+                self.index = "/" + self.index
+
+        # GetDWORD reports 0xFFFFFFFF as 0, so 0 covers both encodings.
+        if (dft_index != 0):
+            self.home = self.GetString(text, dft_index)
+            if not self.home.startswith("/"):
+                self.home = "/" + self.home
@@ -0,0 +1,93 @@
+# This file was created automatically by SWIG.
+# Don't modify this file, modify the SWIG interface instead.
+# This file is compatible with both classic and new-style classes.
+import _chmlib
+# Attribute setter used as __setattr__ by the proxy classes in this file.
+#   - name == "this" and value is a class_type instance: copy value.this
+#     (and value.thisown, if present) into self.__dict__, delete
+#     value.thisown and return.
+#   - otherwise: call the setter registered under name in
+#     class_type.__swig_setmethods__ if there is one, else store the value
+#     directly in self.__dict__.
+# NOTE(review): the "del value.thisown" is not covered by the hasattr check.
+def _swig_setattr(self,class_type,name,value):
+    if (name == "this"):
+        if isinstance(value, class_type):
+            self.__dict__[name] = value.this
+            if hasattr(value,"thisown"): self.__dict__["thisown"] = value.thisown
+            del value.thisown
+            return
+    method = class_type.__swig_setmethods__.get(name,None)
+    if method: return method(self,value)
+    self.__dict__[name] = value
+
+# Attribute getter used as __getattr__ by the proxy classes in this file:
+# returns the result of the getter registered under name in
+# class_type.__swig_getmethods__, or raises AttributeError(name) when no
+# getter is registered.
+def _swig_getattr(self,class_type,name):
+    method = class_type.__swig_getmethods__.get(name,None)
+    if method: return method(self)
+    raise AttributeError,name
+
+import types
+# Choose the base class for the proxy classes below: types.ObjectType when
+# the interpreter provides it (_newclass = 1), otherwise an empty class
+# defined here (_newclass = 0).  _newclass controls whether the proxy
+# classes also define property() attributes.
+try:
+    _object = types.ObjectType
+    _newclass = 1
+except AttributeError:
+    class _object : pass
+    _newclass = 0
+
+
+CHM_UNCOMPRESSED = _chmlib.CHM_UNCOMPRESSED
+CHM_COMPRESSED = _chmlib.CHM_COMPRESSED
+CHM_MAX_PATHLEN = _chmlib.CHM_MAX_PATHLEN
+# Proxy class for the chmUnitInfo object implemented in _chmlib.  The
+# attributes start, length, space and path are routed to the matching
+# _chmlib.chmUnitInfo_<name>_get / _set functions through the
+# __swig_getmethods__ / __swig_setmethods__ tables (and through property()
+# when _newclass is set).
+class chmUnitInfo(_object):
+    __swig_setmethods__ = {}
+    __setattr__ = lambda self, name, value: _swig_setattr(self, chmUnitInfo, name, value)
+    __swig_getmethods__ = {}
+    __getattr__ = lambda self, name: _swig_getattr(self, chmUnitInfo, name)
+    __swig_setmethods__["start"] = _chmlib.chmUnitInfo_start_set
+    __swig_getmethods__["start"] = _chmlib.chmUnitInfo_start_get
+    if _newclass:start = property(_chmlib.chmUnitInfo_start_get,_chmlib.chmUnitInfo_start_set)
+    __swig_setmethods__["length"] = _chmlib.chmUnitInfo_length_set
+    __swig_getmethods__["length"] = _chmlib.chmUnitInfo_length_get
+    if _newclass:length = property(_chmlib.chmUnitInfo_length_get,_chmlib.chmUnitInfo_length_set)
+    __swig_setmethods__["space"] = _chmlib.chmUnitInfo_space_set
+    __swig_getmethods__["space"] = _chmlib.chmUnitInfo_space_get
+    if _newclass:space = property(_chmlib.chmUnitInfo_space_get,_chmlib.chmUnitInfo_space_set)
+    __swig_setmethods__["path"] = _chmlib.chmUnitInfo_path_set
+    __swig_getmethods__["path"] = _chmlib.chmUnitInfo_path_get
+    if _newclass:path = property(_chmlib.chmUnitInfo_path_get,_chmlib.chmUnitInfo_path_set)
+    # Creates a new underlying object via _chmlib.new_chmUnitInfo and marks
+    # this proxy as its owner (thisown = 1).
+    def __init__(self,*args):
+        _swig_setattr(self, chmUnitInfo, 'this', apply(_chmlib.new_chmUnitInfo,args))
+        _swig_setattr(self, chmUnitInfo, 'thisown', 1)
+    # Calls the destroy function only when this proxy owns the object; any
+    # exception raised while doing so is ignored.
+    def __del__(self, destroy= _chmlib.delete_chmUnitInfo):
+        try:
+            if self.thisown: destroy(self)
+        except: pass
+    def __repr__(self):
+        return "<C chmUnitInfo instance at %s>" % (self.this,)
+
+# Wraps an already existing 'this' value as a chmUnitInfo.  Unless a
+# thisown attribute is already present it is set to 0, so __del__ will not
+# destroy the underlying object.
+class chmUnitInfoPtr(chmUnitInfo):
+    def __init__(self,this):
+        _swig_setattr(self, chmUnitInfo, 'this', this)
+        if not hasattr(self,"thisown"): _swig_setattr(self, chmUnitInfo, 'thisown', 0)
+        # NOTE(review): presumably meant to make the instance report
+        # chmUnitInfo as its class - confirm against the SWIG documentation.
+        _swig_setattr(self, chmUnitInfo,self.__class__,chmUnitInfo)
+_chmlib.chmUnitInfo_swigregister(chmUnitInfoPtr)
+
+chm_open = _chmlib.chm_open
+
+chm_close = _chmlib.chm_close
+
+CHM_PARAM_MAX_BLOCKS_CACHED = _chmlib.CHM_PARAM_MAX_BLOCKS_CACHED
+chm_set_param = _chmlib.chm_set_param
+
+CHM_RESOLVE_SUCCESS = _chmlib.CHM_RESOLVE_SUCCESS
+CHM_RESOLVE_FAILURE = _chmlib.CHM_RESOLVE_FAILURE
+chm_resolve_object = _chmlib.chm_resolve_object
+
+chm_retrieve_object = _chmlib.chm_retrieve_object
+
+CHM_ENUMERATE_NORMAL = _chmlib.CHM_ENUMERATE_NORMAL
+CHM_ENUMERATE_META = _chmlib.CHM_ENUMERATE_META
+CHM_ENUMERATE_SPECIAL = _chmlib.CHM_ENUMERATE_SPECIAL
+CHM_ENUMERATE_FILES = _chmlib.CHM_ENUMERATE_FILES
+CHM_ENUMERATE_DIRS = _chmlib.CHM_ENUMERATE_DIRS
+CHM_ENUMERATE_ALL = _chmlib.CHM_ENUMERATE_ALL
+CHM_ENUMERATOR_FAILURE = _chmlib.CHM_ENUMERATOR_FAILURE
+CHM_ENUMERATOR_CONTINUE = _chmlib.CHM_ENUMERATOR_CONTINUE
+CHM_ENUMERATOR_SUCCESS = _chmlib.CHM_ENUMERATOR_SUCCESS
+chm_enumerate = _chmlib.chm_enumerate
+
+chm_enumerate_dir = _chmlib.chm_enumerate_dir
+
+
@@ -0,0 +1,348 @@
+from __future__ import with_statement
+''' CHM File decoding support '''
+__license__ = 'GPL v3'
+__copyright__  = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
+                 ' and Alex Bramley <a.bramley at gmail.com>.'
+
+import sys, logging, os, re, shutil, subprocess, uuid
+from shutil import rmtree
+from tempfile import mkdtemp
+from mimetypes import guess_type as guess_mimetype
+from htmlentitydefs import name2codepoint
+from pprint import PrettyPrinter
+
+from BeautifulSoup import BeautifulSoup
+from lxml import html, etree
+from calibre.ebooks.chm.chm.chm import CHMFile
+from calibre.ebooks.chm.chm.chmlib import (
+  CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL,
+  chm_enumerate, chm_retrieve_object,
+)
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.utils.config import OptionParser
+from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.metadata.opf2 import OPFCreator, Guide
+from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
+from calibre.utils.localization import get_lang
+from calibre.utils.filenames import ascii_filename
+
+
+def match_string(s1, s2_already_lowered):
+    if s1 is not None and s2_already_lowered is not None:
+        if s1.lower()==s2_already_lowered:
+            return True
+    return False
+
+def option_parser():
+    parser = OptionParser(usage=_('%prog [options] mybook.chm'))
+    parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output')
+    parser.add_option('--verbose', default=False, action='store_true', dest='verbose')
+    parser.add_option("-t", "--title", action="store", type="string", \
+                    dest="title", help=_("Set the book title"))
+    parser.add_option('--title-sort', action='store', type='string', default=None,
+                      dest='title_sort', help=_('Set sort key for the title'))
+    parser.add_option("-a", "--author", action="store", type="string", \
+                    dest="author", help=_("Set the author"))
+    parser.add_option('--author-sort', action='store', type='string', default=None,
+                      dest='author_sort', help=_('Set sort key for the author'))
+    parser.add_option("-c", "--category", action="store", type="string", \
+                    dest="category", help=_("The category this book belongs"
+                    " to. E.g.: History"))
+    parser.add_option("--thumbnail", action="store", type="string", \
+                    dest="thumbnail", help=_("Path to a graphic that will be"
+                    " set as this files' thumbnail"))
+    parser.add_option("--comment", action="store", type="string", \
+                    dest="freetext", help=_("Path to a txt file containing a comment."))
+    parser.add_option("--get-thumbnail", action="store_true", \
+                    dest="get_thumbnail", default=False, \
+                    help=_("Extract thumbnail from LRF file"))
+    parser.add_option('--publisher', default=None, help=_('Set the publisher'))
+    parser.add_option('--classification', default=None, help=_('Set the book classification'))
+    parser.add_option('--creator', default=None, help=_('Set the book creator'))
+    parser.add_option('--producer', default=None, help=_('Set the book producer'))
+    parser.add_option('--get-cover', action='store_true', default=False,
+                      help=_('Extract cover from LRF file. Note that the LRF format has no defined cover, so we use some heuristics to guess the cover.'))
+    parser.add_option('--bookid', action='store', type='string', default=None,
+                      dest='book_id', help=_('Set book ID'))
+    parser.add_option('--font-delta', action='store', type='int', default=0,
+                      dest='font_delta', help=_('Set font delta'))
+    return parser
+
+class CHMError(Exception):
+    pass
+
+class CHMReader(CHMFile):
+    def __init__(self, input, log):
+        CHMFile.__init__(self)
+        if not self.LoadCHM(input):
+            raise CHMError("Unable to open CHM file '%s'"%(input,))
+        self.log = log
+        self._sourcechm = input
+        self._contents = None
+        self._playorder = 0
+        self._metadata = False
+        self._extracted = False
+
+        # location of '.hhc' file, which is the CHM TOC.
+        self.root, ext = os.path.splitext(self.topics.lstrip('/'))
+        self.hhc_path = self.root + ".hhc"
+
+
+    def _parse_toc(self, ul, basedir=os.getcwdu()):
+        toc = TOC(play_order=self._playorder, base_path=basedir, text='')
+        self._playorder += 1
+        for li in ul('li', recursive=False):
+            href = li.object('param', {'name': 'Local'})[0]['value']
+            if href.count('#'):
+                href, frag = href.split('#')
+            else:
+                frag = None
+            name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
+            #print "========>", name
+            toc.add_item(href, frag, name, play_order=self._playorder)
+            self._playorder += 1
+            if li.ul:
+               child = self._parse_toc(li.ul)
+               child.parent = toc
+               toc.append(child)
+        #print toc
+        return toc
+
+
+    def GetFile(self, path):
+        # have to have abs paths for ResolveObject, but Contents() deliberately
+        # makes them relative. So we don't have to worry, re-add the leading /.
+        # note this path refers to the internal CHM structure
+        if path[0] != '/':
+            path = '/' + path
+        res, ui = self.ResolveObject(path)
+        if res != CHM_RESOLVE_SUCCESS:
+            raise CHMError("Unable to locate '%s' within CHM file '%s'"%(path, self.filename))
+        size, data = self.RetrieveObject(ui)
+        if size == 0:
+            raise CHMError("'%s' is zero bytes in length!"%(path,))
+        return data
+
+    def ExtractFiles(self, output_dir=os.getcwdu()):
+        for path in self.Contents():
+            lpath = os.path.join(output_dir, path)
+            self._ensure_dir(lpath)
+            data = self.GetFile(path)
+            with open(lpath, 'wb') as f:
+                if guess_mimetype(path)[0] == ('text/html'):
+                    data = self._reformat(data)
+                f.write(data)
+        #subprocess.call(['extract_chmLib.exe', self._sourcechm, output_dir])
+        self._extracted = True
+
+    def _reformat(self, data):
+        try:
+            html = BeautifulSoup(data)
+        except UnicodeEncodeError:
+            # hit some strange encoding problems...
+            print "Unable to parse html for cleaning, leaving it :("
+            return data
+        # nuke javascript...
+        [s.extract() for s in html('script')]
+        # remove forward and back nav bars from the top/bottom of each page
+        # cos they really fuck with the flow of things and generally waste space
+        # since we can't use [a,b] syntax to select arbitrary items from a list
+        # we'll have to do this manually...
+        t = html('table')
+        if t:
+            if (t[0].previousSibling is None
+              or t[0].previousSibling.previousSibling is None):
+                t[0].extract()
+            if (t[-1].nextSibling is None
+              or t[-1].nextSibling.nextSibling is None):
+                t[-1].extract()
+        # for some very odd reason each page's content appears to be in a table
+        # too. and this table has sub-tables for random asides... grr.
+
+        # some images seem to be broken in some chm's :/
+        for img in html('img'):
+            try:
+                # some are supposedly "relative"... lies.
+                while img['src'].startswith('../'): img['src'] = img['src'][3:]
+                # some have ";<junk>" at the end.
+                img['src'] = img['src'].split(';')[0]
+            except KeyError:
+                # and some don't even have a src= ?!
+                pass
+        # now give back some pretty html.
+        return html.prettify()
+
+    def Contents(self):
+        if self._contents is not None:
+            return self._contents
+        paths = []
+        def get_paths(chm, ui, ctx):
+            # skip directories
+            # note this path refers to the internal CHM structure
+            if ui.path[-1] != '/':
+                # and make paths relative
+                paths.append(ui.path.lstrip('/'))
+        chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None)
+        self._contents = paths
+        return self._contents
+
+    def _ensure_dir(self, path):
+        dir = os.path.dirname(path)
+        if not os.path.isdir(dir):
+            os.makedirs(dir)
+
+    def extract_content(self, output_dir=os.getcwdu()):
+        self.ExtractFiles(output_dir=output_dir)
+
+
+class CHMInput(InputFormatPlugin):
+
+    name        = 'CHM Input'
+    author      = 'Kovid Goyal and Alex Bramley'
+    description = 'Convert CHM files to OEB'
+    file_types  = set(['chm'])
+
+    options = set([
+        OptionRecommendation(name='dummy_option', recommended_value=False,
+            help=_('dummy option until real options are determined.')),
+    ])
+
+    def _chmtohtml(self, output_dir, chm_path, no_images, log):
+        log.debug('Opening CHM file')
+        rdr = CHMReader(chm_path, log)
+        log.debug('Extracting CHM to %s' % output_dir)
+        rdr.extract_content(output_dir)
+        return rdr.hhc_path
+
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        from calibre.ebooks.metadata.chm import get_metadata_
+
+        log.debug('Processing CHM...')
+        tdir = mkdtemp(prefix='chm2oeb_')
+        from calibre.customize.ui import plugin_for_input_format
+        html_input = plugin_for_input_format('html')
+        for opt in html_input.options:
+            setattr(options, opt.option.name, opt.recommended_value)
+        options.input_encoding = 'utf-8'
+        no_images = False #options.no_images
+        chm_name = stream.name
+        #chm_data = stream.read()
+        
+        #closing stream so CHM can be opened by external library
+        stream.close()
+        log.debug('tdir=%s' % tdir)
+        log.debug('stream.name=%s' % stream.name)
+        mainname = self._chmtohtml(tdir, chm_name, no_images, log)
+        mainpath = os.path.join(tdir, mainname)
+
+        metadata = get_metadata_(tdir)
+
+        cwd = os.getcwdu()
+        odi = options.debug_pipeline
+        options.debug_pipeline = None
+        # try a custom conversion:
+        oeb = self._create_oebbook(mainpath, tdir, options, log, metadata)
+        options.debug_pipeline = odi
+        #log.debug('DEBUG: Not removing tempdir %s' % tdir)
+        shutil.rmtree(tdir)
+        return oeb
+
+    def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
+        from calibre.ebooks.conversion.plumber import create_oebbook
+        from calibre.ebooks.oeb.base import DirContainer, \
+            rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES, \
+            xpath
+        from calibre import guess_type
+        import cssutils
+        oeb = create_oebbook(log, None, opts, self,
+                encoding=opts.input_encoding, populate=False)
+        self.oeb = oeb
+
+        metadata = oeb.metadata
+        if mi.title:
+            metadata.add('title', mi.title)
+        if mi.authors:
+            for a in mi.authors:
+                metadata.add('creator', a, attrib={'role':'aut'})
+        if mi.publisher:
+            metadata.add('publisher', mi.publisher)
+        if mi.isbn:
+            metadata.add('identifier', mi.isbn, attrib={'scheme':'ISBN'})
+        if not metadata.language:
+            oeb.logger.warn(u'Language not specified')
+            metadata.add('language', get_lang())
+        if not metadata.creator:
+            oeb.logger.warn('Creator not specified')
+            metadata.add('creator', self.oeb.translate(__('Unknown')))
+        if not metadata.title:
+            oeb.logger.warn('Title not specified')
+            metadata.add('title', self.oeb.translate(__('Unknown')))
+
+        bookid = str(uuid.uuid4())
+        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
+        for ident in metadata.identifier:
+            if 'id' in ident.attrib:
+                self.oeb.uid = metadata.identifier[0]
+                break
+
+        hhcdata = self._read_file(hhcpath)
+        hhcroot = html.fromstring(hhcdata)
+        chapters = self._process_nodes(hhcroot)
+        #print "============================="
+        #print "Printing hhcroot"
+        #print etree.tostring(hhcroot, pretty_print=True)
+        #print "============================="
+        log.debug('Found %d section nodes' % len(chapters))
+        
+        if len(chapters) > 0:
+            path0 = chapters[0][1]
+            subpath = os.path.dirname(path0)
+            htmlpath = os.path.join(basedir, subpath)
+
+            oeb.container = DirContainer(htmlpath, log)
+            for chapter in chapters:
+                title = chapter[0]
+                basename = os.path.basename(chapter[1])
+                self._add_item(oeb, title, basename)
+
+            oeb.container = DirContainer(htmlpath, oeb.log)
+        return oeb
+
+    def _read_file(self, name):
+        f = open(name, 'rb')
+        data = f.read()
+        f.close()
+        return data
+
+    def _visit_node(self, node, chapters):
+        # check that node is a normal node (not a comment, DOCTYPE, etc.)
+        # (normal nodes have string tags)
+        if isinstance(node.tag, basestring):
+            if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'):
+                for child in node:
+                    if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'):
+                        chapter_title = child.attrib['value']
+                    if match_string(child.tag,'param') and match_string(child.attrib['name'],'local'):
+                        chapter_path = child.attrib['value']
+                if chapter_title is not None and chapter_path is not None:
+                    chapter = [chapter_title, chapter_path]
+                    chapters.append(chapter)
+
+    def _process_nodes(self, root):
+        chapters = []
+        for node in root.iter():
+            self._visit_node(node, chapters)
+        return chapters
+
+    def _add_item(self, oeb, title, path):
+        bname = os.path.basename(path)
+        id, href = oeb.manifest.generate(id='html',
+                href=ascii_filename(bname))
+        item = oeb.manifest.add(id, href, 'text/html')
+        item.html_input_href = bname
+        oeb.spine.add(item, True)
+        oeb.toc.add(title, item.href)
+