Initial CHM changes

This commit is contained in:
James Ralston 2010-02-07 22:43:15 -08:00
parent 647ad206c9
commit 7a74dc3410
7 changed files with 991 additions and 0 deletions

View File

@ -0,0 +1,8 @@
#!/usr/bin/env python
# Package-level metadata for the CHM input support code.
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
# NOTE: the string below comes after the assignments, so Python treats it as
# a plain expression statement, not as the module docstring (__doc__).
'''
Used for chm input
'''

View File

@ -0,0 +1,34 @@
## Copyright (C) 2003-2006 Rubens Ramos <rubensr@users.sourceforge.net>
## pychm is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
## You should have received a copy of the GNU General Public
## License along with this program; see the file COPYING. If not,
## write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
## Boston, MA 02111-1307, USA
## $Id: __init__.py,v 1.8 2006/06/18 10:50:43 rubensr Exp $
'''
chm - A package to manipulate CHM files.

The chm package provides four modules: chm, chmlib, extra and
_chmlib. _chmlib and chmlib are very low level libraries generated
from SWIG interface files, and are simple wrappers around the API
defined by the C library chmlib.

The extra module adds full-text search support.

The chm module provides some higher level classes to simplify
access to the information stored in CHM files.
'''
# Names exported by ``from chm import *``.
__all__ = ["chm", "chmlib", "_chmlib", "extra"]
# Version of the bundled pychm package.
__version__ = "0.8.4"
# CVS keyword string identifying the revision of this file.
__revision__ = "$Id: __init__.py,v 1.8 2006/06/18 10:50:43 rubensr Exp $"

Binary file not shown.

View File

@ -0,0 +1,508 @@
## Copyright (C) 2003-2006 Rubens Ramos <rubensr@users.sourceforge.net>
## Based on code by:
## Copyright (C) 2003 Razvan Cojocaru <razvanco@gmx.net>
## pychm is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
## You should have received a copy of the GNU General Public
## License along with this program; see the file COPYING. If not,
## write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
## Boston, MA 02111-1307, USA
## $Id: chm.py,v 1.12 2006/08/07 12:31:51 rubensr Exp $
'''
chm - A high-level front end for the chmlib python module.
The chm module provides high level access to the functionality
included in chmlib. It encapsulates functions in the CHMFile class, and
provides some additional features, such as the ability to obtain
the contents tree of a CHM archive.
'''
import chmlib
import extra
import array
import string
import os.path
import sys
# Maps a numeric charset identifier (named in the trailing comment of each
# entry) to the name of the Python codec used to decode text in that charset.
# A value of None marks a charset for which no codec is provided here.
# CHMFile.GetEncoding looks up the third comma-separated field of the
# archive's encoding string in this table.
charset_table = {
0 : 'iso8859_1', # ANSI_CHARSET
238 : 'iso8859_2', # EASTEUROPE_CHARSET
178 : 'iso8859_6', # ARABIC_CHARSET
161 : 'iso8859_7', # GREEK_CHARSET
177 : 'iso8859_8', # HEBREW_CHARSET
162 : 'iso8859_9', # TURKISH_CHARSET
222 : 'iso8859_11', # THAI_CHARSET - hmm not in python 2.2...
186 : 'iso8859_13', # BALTIC_CHARSET
204 : 'cp1251', # RUSSIAN_CHARSET
255 : 'cp437', # OEM_CHARSET
128 : 'cp932', # SHIFTJIS_CHARSET
134 : 'cp936', # GB2312_CHARSET
129 : 'cp949', # HANGUL_CHARSET
136 : 'cp950', # CHINESEBIG5_CHARSET
1 : None, # DEFAULT_CHARSET
2 : None, # SYMBOL_CHARSET
130 : None, # JOHAB_CHARSET
163 : None, # VIETNAMESE_CHARSET
77 : None, # MAC_CHARSET
}
# Maps a numeric locale ID (the value stored in CHMFile.lcid) to a 3-tuple:
#   (Python codec name or None, language name, script/region group).
# CHMFile.GetLCID returns the tuple for the archive's locale ID, or None
# when the ID is not listed here.
locale_table = {
0x0436 : ('iso8859_1', "Afrikaans", "Western Europe & US"),
0x041c : ('iso8859_2', "Albanian", "Central Europe"),
0x0401 : ('iso8859_6', "Arabic_Saudi_Arabia", "Arabic"),
0x0801 : ('iso8859_6', "Arabic_Iraq", "Arabic"),
0x0c01 : ('iso8859_6', "Arabic_Egypt", "Arabic"),
0x1001 : ('iso8859_6', "Arabic_Libya", "Arabic"),
0x1401 : ('iso8859_6', "Arabic_Algeria", "Arabic"),
0x1801 : ('iso8859_6', "Arabic_Morocco", "Arabic"),
0x1c01 : ('iso8859_6', "Arabic_Tunisia", "Arabic"),
0x2001 : ('iso8859_6', "Arabic_Oman", "Arabic"),
0x2401 : ('iso8859_6', "Arabic_Yemen", "Arabic"),
0x2801 : ('iso8859_6', "Arabic_Syria", "Arabic"),
0x2c01 : ('iso8859_6', "Arabic_Jordan", "Arabic"),
0x3001 : ('iso8859_6', "Arabic_Lebanon", "Arabic"),
0x3401 : ('iso8859_6', "Arabic_Kuwait", "Arabic"),
0x3801 : ('iso8859_6', "Arabic_UAE", "Arabic"),
0x3c01 : ('iso8859_6', "Arabic_Bahrain", "Arabic"),
0x4001 : ('iso8859_6', "Arabic_Qatar", "Arabic"),
0x042b : (None, "Armenian","Armenian"),
0x042c : ('iso8859_9', "Azeri_Latin", "Turkish"),
0x082c : ('cp1251', "Azeri_Cyrillic", "Cyrillic"),
0x042d : ('iso8859_1', "Basque", "Western Europe & US"),
0x0423 : ('cp1251', "Belarusian", "Cyrillic"),
0x0402 : ('cp1251', "Bulgarian", "Cyrillic"),
0x0403 : ('iso8859_1', "Catalan", "Western Europe & US"),
0x0404 : ('cp950', "Chinese_Taiwan", "Traditional Chinese"),
0x0804 : ('cp936', "Chinese_PRC", "Simplified Chinese"),
0x0c04 : ('cp950', "Chinese_Hong_Kong", "Traditional Chinese"),
0x1004 : ('cp936', "Chinese_Singapore", "Simplified Chinese"),
0x1404 : ('cp950', "Chinese_Macau", "Traditional Chinese"),
0x041a : ('iso8859_2', "Croatian", "Central Europe"),
0x0405 : ('iso8859_2', "Czech", "Central Europe"),
0x0406 : ('iso8859_1', "Danish", "Western Europe & US"),
0x0413 : ('iso8859_1', "Dutch_Standard", "Western Europe & US"),
0x0813 : ('iso8859_1', "Dutch_Belgian", "Western Europe & US"),
0x0409 : ('iso8859_1', "English_United_States", "Western Europe & US"),
0x0809 : ('iso8859_1', "English_United_Kingdom", "Western Europe & US"),
0x0c09 : ('iso8859_1', "English_Australian", "Western Europe & US"),
0x1009 : ('iso8859_1', "English_Canadian", "Western Europe & US"),
0x1409 : ('iso8859_1', "English_New_Zealand", "Western Europe & US"),
0x1809 : ('iso8859_1', "English_Irish", "Western Europe & US"),
0x1c09 : ('iso8859_1', "English_South_Africa", "Western Europe & US"),
0x2009 : ('iso8859_1', "English_Jamaica", "Western Europe & US"),
0x2409 : ('iso8859_1', "English_Caribbean", "Western Europe & US"),
0x2809 : ('iso8859_1', "English_Belize", "Western Europe & US"),
0x2c09 : ('iso8859_1', "English_Trinidad", "Western Europe & US"),
0x3009 : ('iso8859_1', "English_Zimbabwe", "Western Europe & US"),
0x3409 : ('iso8859_1', "English_Philippines", "Western Europe & US"),
0x0425 : ('iso8859_13',"Estonian", "Baltic",),
0x0438 : ('iso8859_1', "Faeroese", "Western Europe & US"),
0x0429 : ('iso8859_6', "Farsi", "Arabic"),
0x040b : ('iso8859_1', "Finnish", "Western Europe & US"),
0x040c : ('iso8859_1', "French_Standard", "Western Europe & US"),
0x080c : ('iso8859_1', "French_Belgian", "Western Europe & US"),
0x0c0c : ('iso8859_1', "French_Canadian", "Western Europe & US"),
0x100c : ('iso8859_1', "French_Swiss", "Western Europe & US"),
0x140c : ('iso8859_1', "French_Luxembourg", "Western Europe & US"),
0x180c : ('iso8859_1', "French_Monaco", "Western Europe & US"),
0x0437 : (None, "Georgian", "Georgian"),
0x0407 : ('iso8859_1', "German_Standard", "Western Europe & US"),
0x0807 : ('iso8859_1', "German_Swiss", "Western Europe & US"),
0x0c07 : ('iso8859_1', "German_Austrian", "Western Europe & US"),
0x1007 : ('iso8859_1', "German_Luxembourg", "Western Europe & US"),
0x1407 : ('iso8859_1', "German_Liechtenstein", "Western Europe & US"),
0x0408 : ('iso8859_7', "Greek", "Greek"),
0x040d : ('iso8859_8', "Hebrew", "Hebrew"),
0x0439 : (None, "Hindi", "Indic"),
0x040e : ('iso8859_2', "Hungarian", "Central Europe"),
0x040f : ('iso8859_1', "Icelandic", "Western Europe & US"),
0x0421 : ('iso8859_1', "Indonesian", "Western Europe & US"),
0x0410 : ('iso8859_1', "Italian_Standard", "Western Europe & US"),
0x0810 : ('iso8859_1', "Italian_Swiss", "Western Europe & US"),
0x0411 : ('cp932', "Japanese", "Japanese"),
0x043f : ('cp1251', "Kazakh", "Cyrillic"),
0x0457 : (None, "Konkani", "Indic"),
0x0412 : ('cp949', "Korean", "Korean"),
0x0426 : ('iso8859_13',"Latvian", "Baltic",),
0x0427 : ('iso8859_13',"Lithuanian", "Baltic",),
0x042f : ('cp1251', "Macedonian", "Cyrillic"),
0x043e : ('iso8859_1', "Malay_Malaysia", "Western Europe & US"),
0x083e : ('iso8859_1', "Malay_Brunei_Darussalam", "Western Europe & US"),
0x044e : (None, "Marathi", "Indic"),
0x0414 : ('iso8859_1', "Norwegian_Bokmal", "Western Europe & US"),
0x0814 : ('iso8859_1', "Norwegian_Nynorsk", "Western Europe & US"),
0x0415 : ('iso8859_2', "Polish", "Central Europe"),
0x0416 : ('iso8859_1', "Portuguese_Brazilian", "Western Europe & US"),
0x0816 : ('iso8859_1', "Portuguese_Standard", "Western Europe & US"),
0x0418 : ('iso8859_2', "Romanian", "Central Europe"),
0x0419 : ('cp1251', "Russian", "Cyrillic"),
0x044f : (None, "Sanskrit", "Indic"),
0x081a : ('iso8859_2', "Serbian_Latin", "Central Europe"),
0x0c1a : ('cp1251', "Serbian_Cyrillic", "Cyrillic"),
0x041b : ('iso8859_2', "Slovak", "Central Europe"),
0x0424 : ('iso8859_2', "Slovenian", "Central Europe"),
0x040a : ('iso8859_1', "Spanish_Trad_Sort", "Western Europe & US"),
0x080a : ('iso8859_1', "Spanish_Mexican", "Western Europe & US"),
0x0c0a : ('iso8859_1', "Spanish_Modern_Sort", "Western Europe & US"),
0x100a : ('iso8859_1', "Spanish_Guatemala", "Western Europe & US"),
0x140a : ('iso8859_1', "Spanish_Costa_Rica", "Western Europe & US"),
0x180a : ('iso8859_1', "Spanish_Panama", "Western Europe & US"),
0x1c0a : ('iso8859_1', "Spanish_Dominican_Repub", "Western Europe & US"),
0x200a : ('iso8859_1', "Spanish_Venezuela", "Western Europe & US"),
0x240a : ('iso8859_1', "Spanish_Colombia", "Western Europe & US"),
0x280a : ('iso8859_1', "Spanish_Peru", "Western Europe & US"),
0x2c0a : ('iso8859_1', "Spanish_Argentina", "Western Europe & US"),
0x300a : ('iso8859_1', "Spanish_Ecuador", "Western Europe & US"),
0x340a : ('iso8859_1', "Spanish_Chile", "Western Europe & US"),
0x380a : ('iso8859_1', "Spanish_Uruguay", "Western Europe & US"),
0x3c0a : ('iso8859_1', "Spanish_Paraguay", "Western Europe & US"),
0x400a : ('iso8859_1', "Spanish_Bolivia", "Western Europe & US"),
0x440a : ('iso8859_1', "Spanish_El_Salvador", "Western Europe & US"),
0x480a : ('iso8859_1', "Spanish_Honduras", "Western Europe & US"),
0x4c0a : ('iso8859_1', "Spanish_Nicaragua", "Western Europe & US"),
0x500a : ('iso8859_1', "Spanish_Puerto_Rico", "Western Europe & US"),
0x0441 : ('iso8859_1', "Swahili", "Western Europe & US"),
0x041d : ('iso8859_1', "Swedish", "Western Europe & US"),
0x081d : ('iso8859_1', "Swedish_Finland", "Western Europe & US"),
0x0449 : (None, "Tamil", "Indic"),
0x0444 : ('cp1251', "Tatar", "Cyrillic"),
0x041e : ('iso8859_11',"Thai", "Thai"),
0x041f : ('iso8859_9', "Turkish", "Turkish"),
0x0422 : ('cp1251', "Ukrainian", "Cyrillic"),
0x0420 : ('iso8859_6', "Urdu", "Arabic"),
0x0443 : ('iso8859_9', "Uzbek_Latin", "Turkish"),
0x0843 : ('cp1251', "Uzbek_Cyrillic", "Cyrillic"),
0x042a : (None, "Vietnamese", "Vietnamese")
}
class CHMFile:
    '''A class to manage access to CHM files.

    An instance wraps at most one open archive at a time. After a
    successful LoadCHM() the attributes below describe the archive:

    filename  -- path passed to LoadCHM ('' when nothing is open)
    file      -- chmlib handle of the open archive (None when closed)
    title     -- archive title taken from /#SYSTEM
    home      -- internal path of the default page
    index     -- internal path of the index (.hhk) file, or None
    topics    -- internal path of the topics/TOC (.hhc) file, or None
    encoding  -- raw encoding string taken from /#SYSTEM, or None
    lcid      -- numeric locale ID, or None
    '''

    filename = ""
    file = None
    title = ""
    home = "/"
    index = None
    topics = None
    encoding = None
    lcid = None
    binaryindex = None

    def __init__(self):
        self.searchable = 0

    def LoadCHM(self, archiveName):
        '''Loads a CHM archive.

        Any archive that is currently open is closed first. This function
        also calls GetArchiveInfo to obtain information such as the index
        file name and the topics file. It returns 1 on success, and 0 if
        the archive could not be opened.
        '''
        # CloseCHM is a no-op on the C side when nothing is open.
        self.CloseCHM()

        self.file = chmlib.chm_open(archiveName)
        if self.file is None:
            return 0

        self.filename = archiveName
        self.GetArchiveInfo()
        return 1

    def CloseCHM(self):
        '''Closes the CHM archive.

        This function will close the CHM file, if it is open. All
        variables describing the archive are also reset.
        '''
        # Only hand a real handle to chmlib; filename is '' (never None) on
        # a fresh instance, so it cannot be used to detect an open archive.
        if self.file is not None:
            chmlib.chm_close(self.file)
        self.file = None
        self.filename = ''
        self.title = ""
        self.home = "/"
        self.index = None
        self.topics = None
        self.encoding = None
        self.lcid = None

    def GetArchiveInfo(self):
        '''Obtains information on CHM archive.

        This function checks the /#SYSTEM file inside the CHM archive to
        obtain the index, home page, topics, encoding and title. It is
        called from LoadCHM. Returns 1 on success and 0 if /#SYSTEM is
        missing or empty.
        '''
        #extra.is_searchable crashed...
        #self.searchable = extra.is_searchable (self.file)
        self.searchable = False
        self.lcid = None

        result, ui = chmlib.chm_resolve_object(self.file, '/#SYSTEM')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            sys.stderr.write('GetArchiveInfo: #SYSTEM does not exist\n')
            return 0

        # The read starts at offset 4, i.e. the first four bytes of
        # /#SYSTEM are skipped.
        size, text = chmlib.chm_retrieve_object(self.file, ui, long(4),
                                                ui.length)
        if size == 0:
            sys.stderr.write('GetArchiveInfo: file size = 0\n')
            return 0

        buff = array.array('B', text)
        limit = min(size, len(buff))
        index = 0
        # Each entry is: 16-bit little-endian code, 16-bit little-endian
        # length, then `length` bytes of data. Stop as soon as a complete
        # 4-byte entry header no longer fits, so truncated data cannot
        # raise IndexError.
        while index + 4 <= limit:
            code = buff[index] + (buff[index + 1] * 256)
            length = buff[index + 2] + (buff[index + 3] * 256)
            index += 4
            # String payloads carry a trailing byte that is dropped.
            value = text[index:index + length - 1]

            if code == 0:
                self.topics = '/' + value
            elif code == 1:
                self.index = '/' + value
            elif code == 2:
                self.home = '/' + value
            elif code == 3:
                self.title = value
            elif code == 4:
                # Locale ID: low 16 bits of the payload.
                if index + 2 <= limit:
                    self.lcid = buff[index] + (buff[index + 1] * 256)
            elif code == 6:
                # Base name of the project: use it to guess the topics
                # and index files when no topics file is known yet.
                if not self.topics:
                    tmp1 = '/' + value + '.hhc'
                    tmp2 = '/' + value + '.hhk'
                    res1, ui1 = chmlib.chm_resolve_object(self.file, tmp1)
                    res2, ui2 = chmlib.chm_resolve_object(self.file, tmp2)
                    if (not self.topics) and \
                            (res1 == chmlib.CHM_RESOLVE_SUCCESS):
                        self.topics = tmp1
                    if (not self.index) and \
                            (res2 == chmlib.CHM_RESOLVE_SUCCESS):
                        self.index = tmp2
            elif code == 16:
                self.encoding = value
            # Unknown codes are skipped.
            index += length

        self.GetWindowsInfo()

        if not self.lcid:
            self.lcid = extra.get_lcid (self.file)

        return 1

    def _ReadWholeObject(self, path, caller):
        '''Internal method.

        Resolves `path` inside the archive and returns its full contents,
        or None if `path` is empty, cannot be resolved or is zero bytes
        long. `caller` is the method name used in the diagnostic message.
        '''
        if not path:
            return None

        res, ui = chmlib.chm_resolve_object(self.file, path)
        if res != chmlib.CHM_RESOLVE_SUCCESS:
            return None

        size, text = chmlib.chm_retrieve_object(self.file, ui, long(0),
                                                ui.length)
        if size == 0:
            sys.stderr.write('%s: file size = 0\n' % caller)
            return None
        return text

    def GetTopicsTree(self):
        '''Reads and returns the topics tree.

        This auxiliary function reads and returns the topics tree file
        contents for the CHM archive, or None if it is unavailable.
        '''
        return self._ReadWholeObject(self.topics, 'GetTopicsTree')

    def GetIndex(self):
        '''Reads and returns the index tree.

        This auxiliary function reads and returns the index tree file
        contents for the CHM archive, or None if it is unavailable.
        '''
        return self._ReadWholeObject(self.index, 'GetIndex')

    def ResolveObject(self, document):
        '''Tries to locate a document in the archive.

        This function tries to locate the document inside the archive. It
        returns a tuple where the first element is zero if the function
        was successful, and the second is the UnitInfo for that document.
        The UnitInfo is used to retrieve the document contents. When no
        archive is open, (1, None) is returned.
        '''
        if self.file:
            return chmlib.chm_resolve_object(self.file, document)
        return (1, None)

    def RetrieveObject(self, ui, start = -1, length = -1):
        '''Retrieves the contents of a document.

        This function takes a UnitInfo and two optional arguments, the
        first being the start address and the second is the length. These
        define the amount of data to be read from the archive. By default
        the whole document is read. Returns a (size, data) tuple, which is
        (0, '') when no archive is open or `ui` is empty.
        '''
        if not (self.file and ui):
            return (0, '')

        if length == -1:
            nbytes = ui.length
        else:
            nbytes = length
        if start == -1:
            offset = long(0)
        else:
            offset = long(start)
        return chmlib.chm_retrieve_object(self.file, ui, offset, nbytes)

    def Search(self, text, wholewords=0, titleonly=0):
        '''Performs full-text search on the archive.

        The first parameter is the word to look for, the second
        indicates if the search should be for whole words only, and
        the third parameter indicates if the search should be
        restricted to page titles.
        This method will return a tuple, the first item
        indicating if the search results were partial, and the second
        item being a dictionary containing the results. None is returned
        when `text` is empty or no archive is open.'''
        if text and self.file:
            return extra.search (self.file, text, wholewords,
                                 titleonly)
        return None

    def IsSearchable(self):
        '''Indicates if the full-text search is available for this
        archive - this flag is updated when GetArchiveInfo is called'''
        return self.searchable

    def GetEncoding(self):
        '''Returns a string that can be used with the codecs python package
        to encode or decode the files in the chm archive. If an error is
        found, or if it is not possible to find the encoding, None is
        returned.'''
        if self.encoding:
            vals = self.encoding.split(',')
            if len(vals) > 2:
                # The charset number is the third comma-separated field.
                try:
                    code = int(vals[2])
                except ValueError:
                    # Malformed number: documented to yield None.
                    return None
                return charset_table.get(code)
        return None

    def GetLCID(self):
        '''Returns the locale_table entry for the archive Locale ID, or
        None if the ID is unknown.'''
        return locale_table.get(self.lcid)

    def GetDWORD(self, buff, idx=0):
        '''Internal method.
        Reads a little-endian double word (4 bytes) from a buffer. The
        value 0xFFFFFFFF is reported as 0.
        '''
        result = buff[idx] + (buff[idx+1]<<8) + (buff[idx+2]<<16) + \
                 (buff[idx+3]<<24)

        if result == 0xFFFFFFFF:
            result = 0

        return result

    def GetString(self, text, idx):
        '''Internal method.
        Retrieves a NUL-terminated string from the #STRINGS buffer,
        starting at offset idx.
        '''
        end = text.find('\x00', idx)
        if end == -1:
            # No terminator: take everything up to the end of the buffer
            # instead of silently dropping the last character.
            return text[idx:]
        return text[idx:end]

    def _WithLeadingSlash(self, path):
        '''Internal method.
        Returns path prefixed with "/" unless it already starts with one.
        '''
        if path.startswith("/"):
            return path
        return "/" + path

    def GetWindowsInfo(self):
        '''Gets information from the #WINDOWS file.

        Checks the #WINDOWS file to see if it has any info that was
        not found in #SYSTEM (topics, index or default page).

        Returns a negative number identifying the step that failed, or
        None when the information was read.
        '''
        result, ui = chmlib.chm_resolve_object(self.file, '/#WINDOWS')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            return -1

        # Header: number of entries followed by the size of one entry.
        size, text = chmlib.chm_retrieve_object(self.file, ui, long(0), 8)
        if size < 8:
            return -2

        buff = array.array('B', text)
        num_entries = self.GetDWORD(buff, 0)
        entry_size = self.GetDWORD(buff, 4)

        if num_entries < 1:
            return -3

        # Only the first entry is examined.
        size, text = chmlib.chm_retrieve_object(self.file, ui, long(8),
                                                entry_size)
        # The last field read below ends at offset 0x6C; a shorter entry
        # cannot be parsed.
        if size < entry_size or size < 0x6C:
            return -4

        buff = array.array('B', text)
        toc_index = self.GetDWORD(buff, 0x60)
        idx_index = self.GetDWORD(buff, 0x64)
        dft_index = self.GetDWORD(buff, 0x68)

        # The three values above are offsets into the #STRINGS file.
        result, ui = chmlib.chm_resolve_object(self.file, '/#STRINGS')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            return -5

        size, text = chmlib.chm_retrieve_object(self.file, ui, long(0),
                                                ui.length)
        if size == 0:
            return -6

        if not self.topics:
            self.topics = self._WithLeadingSlash(
                self.GetString(text, toc_index))

        if not self.index:
            self.index = self._WithLeadingSlash(
                self.GetString(text, idx_index))

        if dft_index != 0:
            self.home = self._WithLeadingSlash(
                self.GetString(text, dft_index))

View File

@ -0,0 +1,93 @@
# This file was created automatically by SWIG.
# Don't modify this file, modify the SWIG interface instead.
# This file is compatible with both classic and new-style classes.
import _chmlib
def _swig_setattr(self,class_type,name,value):
if (name == "this"):
if isinstance(value, class_type):
self.__dict__[name] = value.this
if hasattr(value,"thisown"): self.__dict__["thisown"] = value.thisown
del value.thisown
return
method = class_type.__swig_setmethods__.get(name,None)
if method: return method(self,value)
self.__dict__[name] = value
def _swig_getattr(self,class_type,name):
method = class_type.__swig_getmethods__.get(name,None)
if method: return method(self)
raise AttributeError,name
import types
try:
_object = types.ObjectType
_newclass = 1
except AttributeError:
class _object : pass
_newclass = 0
# Constants re-exported unchanged from the _chmlib extension module.
CHM_UNCOMPRESSED = _chmlib.CHM_UNCOMPRESSED
CHM_COMPRESSED = _chmlib.CHM_COMPRESSED
CHM_MAX_PATHLEN = _chmlib.CHM_MAX_PATHLEN
class chmUnitInfo(_object):
    """SWIG proxy for the ``chmUnitInfo`` structure of _chmlib.

    The fields ``start``, ``length``, ``space`` and ``path`` are read and
    written through the accessor functions exported by _chmlib.
    """

    # Per-field accessor tables consulted by _swig_setattr/_swig_getattr.
    __swig_setmethods__ = {
        "start": _chmlib.chmUnitInfo_start_set,
        "length": _chmlib.chmUnitInfo_length_set,
        "space": _chmlib.chmUnitInfo_space_set,
        "path": _chmlib.chmUnitInfo_path_set,
    }
    __swig_getmethods__ = {
        "start": _chmlib.chmUnitInfo_start_get,
        "length": _chmlib.chmUnitInfo_length_get,
        "space": _chmlib.chmUnitInfo_space_get,
        "path": _chmlib.chmUnitInfo_path_get,
    }

    def __setattr__(self, name, value):
        return _swig_setattr(self, chmUnitInfo, name, value)

    def __getattr__(self, name):
        return _swig_getattr(self, chmUnitInfo, name)

    # With new-style classes the fields are additionally real properties.
    if _newclass:
        start = property(_chmlib.chmUnitInfo_start_get,
                         _chmlib.chmUnitInfo_start_set)
        length = property(_chmlib.chmUnitInfo_length_get,
                          _chmlib.chmUnitInfo_length_set)
        space = property(_chmlib.chmUnitInfo_space_get,
                         _chmlib.chmUnitInfo_space_set)
        path = property(_chmlib.chmUnitInfo_path_get,
                        _chmlib.chmUnitInfo_path_set)

    def __init__(self, *args):
        # Create the underlying C object and mark this proxy as its owner.
        _swig_setattr(self, chmUnitInfo, 'this',
                      _chmlib.new_chmUnitInfo(*args))
        _swig_setattr(self, chmUnitInfo, 'thisown', 1)

    def __del__(self, destroy= _chmlib.delete_chmUnitInfo):
        # `destroy` is bound as a default argument so it is still reachable
        # while the module is being torn down; any failure is ignored.
        try:
            if self.thisown:
                destroy(self)
        except:
            pass

    def __repr__(self):
        return "<C chmUnitInfo instance at %s>" % (self.this,)
class chmUnitInfoPtr(chmUnitInfo):
    """Proxy that wraps an already existing ``chmUnitInfo`` pointer.

    Unlike chmUnitInfo it does not create a new C object, and it does not
    claim ownership unless a ``thisown`` flag is already present.
    """

    def __init__(self, this):
        _swig_setattr(self, chmUnitInfo, 'this', this)
        if not hasattr(self, "thisown"):
            _swig_setattr(self, chmUnitInfo, 'thisown', 0)
        # Kept exactly as generated by SWIG: stores chmUnitInfo under a key
        # that is the instance's class object.
        _swig_setattr(self, chmUnitInfo, self.__class__, chmUnitInfo)
# Register the pointer proxy class with the extension module.
_chmlib.chmUnitInfo_swigregister(chmUnitInfoPtr)
# Everything below re-exports functions and constants of _chmlib unchanged.
# Opening and closing archives:
chm_open = _chmlib.chm_open
chm_close = _chmlib.chm_close
CHM_PARAM_MAX_BLOCKS_CACHED = _chmlib.CHM_PARAM_MAX_BLOCKS_CACHED
chm_set_param = _chmlib.chm_set_param
# Resolving and retrieving objects stored in an archive:
CHM_RESOLVE_SUCCESS = _chmlib.CHM_RESOLVE_SUCCESS
CHM_RESOLVE_FAILURE = _chmlib.CHM_RESOLVE_FAILURE
chm_resolve_object = _chmlib.chm_resolve_object
chm_retrieve_object = _chmlib.chm_retrieve_object
# Enumerating the objects stored in an archive:
CHM_ENUMERATE_NORMAL = _chmlib.CHM_ENUMERATE_NORMAL
CHM_ENUMERATE_META = _chmlib.CHM_ENUMERATE_META
CHM_ENUMERATE_SPECIAL = _chmlib.CHM_ENUMERATE_SPECIAL
CHM_ENUMERATE_FILES = _chmlib.CHM_ENUMERATE_FILES
CHM_ENUMERATE_DIRS = _chmlib.CHM_ENUMERATE_DIRS
CHM_ENUMERATE_ALL = _chmlib.CHM_ENUMERATE_ALL
CHM_ENUMERATOR_FAILURE = _chmlib.CHM_ENUMERATOR_FAILURE
CHM_ENUMERATOR_CONTINUE = _chmlib.CHM_ENUMERATOR_CONTINUE
CHM_ENUMERATOR_SUCCESS = _chmlib.CHM_ENUMERATOR_SUCCESS
chm_enumerate = _chmlib.chm_enumerate
chm_enumerate_dir = _chmlib.chm_enumerate_dir

Binary file not shown.

View File

@ -0,0 +1,348 @@
from __future__ import with_statement
''' CHM File decoding support '''
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
' and Alex Bramley <a.bramley at gmail.com>.'
import sys, logging, os, re, shutil, subprocess, uuid
from shutil import rmtree
from tempfile import mkdtemp
from mimetypes import guess_type as guess_mimetype
from htmlentitydefs import name2codepoint
from pprint import PrettyPrinter
from BeautifulSoup import BeautifulSoup
from lxml import html, etree
from calibre.ebooks.chm.chm.chm import CHMFile
from calibre.ebooks.chm.chm.chmlib import (
CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL,
chm_enumerate, chm_retrieve_object,
)
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.utils.config import OptionParser
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator, Guide
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
def match_string(s1, s2_already_lowered):
    '''Case-insensitive equality test against a pre-lowered string.

    Returns True when both arguments are not None and the lower-cased s1
    equals s2_already_lowered (which the caller must already have
    lower-cased); returns False otherwise.
    '''
    if s1 is None or s2_already_lowered is None:
        return False
    return s1.lower() == s2_already_lowered
def option_parser():
    '''Create and return the OptionParser describing the command line
    options of the CHM tool (usage: ``%prog [options] mybook.chm``).'''
    parser = OptionParser(usage=_('%prog [options] mybook.chm'))
    add = parser.add_option

    # Output control
    add('--output-dir', '-d', dest='output', default='.',
        help=_('Output directory. Defaults to current directory'))
    add('--verbose', dest='verbose', action='store_true', default=False)

    # Title and author metadata
    add("-t", "--title", dest="title", action="store", type="string",
        help=_("Set the book title"))
    add('--title-sort', dest='title_sort', action='store', type='string',
        default=None, help=_('Set sort key for the title'))
    add("-a", "--author", dest="author", action="store", type="string",
        help=_("Set the author"))
    add('--author-sort', dest='author_sort', action='store', type='string',
        default=None, help=_('Set sort key for the author'))

    # Category, thumbnail and comment
    add("-c", "--category", dest="category", action="store", type="string",
        help=_("The category this book belongs to. E.g.: History"))
    add("--thumbnail", dest="thumbnail", action="store", type="string",
        help=_("Path to a graphic that will be set as this files' thumbnail"))
    add("--comment", dest="freetext", action="store", type="string",
        help=_("Path to a txt file containing a comment."))
    add("--get-thumbnail", dest="get_thumbnail", action="store_true",
        default=False, help=_("Extract thumbnail from LRF file"))

    # Remaining book metadata
    add('--publisher', default=None, help=_('Set the publisher'))
    add('--classification', default=None,
        help=_('Set the book classification'))
    add('--creator', default=None, help=_('Set the book creator'))
    add('--producer', default=None, help=_('Set the book producer'))
    add('--get-cover', action='store_true', default=False,
        help=_('Extract cover from LRF file. Note that the LRF format has no defined cover, so we use some heuristics to guess the cover.'))
    add('--bookid', dest='book_id', action='store', type='string',
        default=None, help=_('Set book ID'))
    add('--font-delta', dest='font_delta', action='store', type='int',
        default=0, help=_('Set font delta'))

    return parser
class CHMError(Exception):
    '''Raised when a CHM file, or an object stored inside one, cannot be
    opened, located or read.'''
class CHMReader(CHMFile):
def __init__(self, input, log):
CHMFile.__init__(self)
if not self.LoadCHM(input):
raise CHMError("Unable to open CHM file '%s'"%(input,))
self.log = log
self._sourcechm = input
self._contents = None
self._playorder = 0
self._metadata = False
self._extracted = False
# location of '.hhc' file, which is the CHM TOC.
self.root, ext = os.path.splitext(self.topics.lstrip('/'))
self.hhc_path = self.root + ".hhc"
def _parse_toc(self, ul, basedir=os.getcwdu()):
toc = TOC(play_order=self._playorder, base_path=basedir, text='')
self._playorder += 1
for li in ul('li', recursive=False):
href = li.object('param', {'name': 'Local'})[0]['value']
if href.count('#'):
href, frag = href.split('#')
else:
frag = None
name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
#print "========>", name
toc.add_item(href, frag, name, play_order=self._playorder)
self._playorder += 1
if li.ul:
child = self._parse_toc(li.ul)
child.parent = toc
toc.append(child)
#print toc
return toc
def GetFile(self, path):
# have to have abs paths for ResolveObject, but Contents() deliberately
# makes them relative. So we don't have to worry, re-add the leading /.
# note this path refers to the internal CHM structure
if path[0] != '/':
path = '/' + path
res, ui = self.ResolveObject(path)
if res != CHM_RESOLVE_SUCCESS:
raise CHMError("Unable to locate '%s' within CHM file '%s'"%(path, self.filename))
size, data = self.RetrieveObject(ui)
if size == 0:
raise CHMError("'%s' is zero bytes in length!"%(path,))
return data
def ExtractFiles(self, output_dir=os.getcwdu()):
for path in self.Contents():
lpath = os.path.join(output_dir, path)
self._ensure_dir(lpath)
data = self.GetFile(path)
with open(lpath, 'wb') as f:
if guess_mimetype(path)[0] == ('text/html'):
data = self._reformat(data)
f.write(data)
#subprocess.call(['extract_chmLib.exe', self._sourcechm, output_dir])
self._extracted = True
def _reformat(self, data):
try:
html = BeautifulSoup(data)
except UnicodeEncodeError:
# hit some strange encoding problems...
print "Unable to parse html for cleaning, leaving it :("
return data
# nuke javascript...
[s.extract() for s in html('script')]
# remove forward and back nav bars from the top/bottom of each page
# cos they really fuck with the flow of things and generally waste space
# since we can't use [a,b] syntax to select arbitrary items from a list
# we'll have to do this manually...
t = html('table')
if t:
if (t[0].previousSibling is None
or t[0].previousSibling.previousSibling is None):
t[0].extract()
if (t[-1].nextSibling is None
or t[-1].nextSibling.nextSibling is None):
t[-1].extract()
# for some very odd reason each page's content appears to be in a table
# too. and this table has sub-tables for random asides... grr.
# some images seem to be broken in some chm's :/
for img in html('img'):
try:
# some are supposedly "relative"... lies.
while img['src'].startswith('../'): img['src'] = img['src'][3:]
# some have ";<junk>" at the end.
img['src'] = img['src'].split(';')[0]
except KeyError:
# and some don't even have a src= ?!
pass
# now give back some pretty html.
return html.prettify()
def Contents(self):
if self._contents is not None:
return self._contents
paths = []
def get_paths(chm, ui, ctx):
# skip directories
# note this path refers to the internal CHM structure
if ui.path[-1] != '/':
# and make paths relative
paths.append(ui.path.lstrip('/'))
chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None)
self._contents = paths
return self._contents
def _ensure_dir(self, path):
dir = os.path.dirname(path)
if not os.path.isdir(dir):
os.makedirs(dir)
def extract_content(self, output_dir=os.getcwdu()):
self.ExtractFiles(output_dir=output_dir)
class CHMInput(InputFormatPlugin):
    '''Input plugin that converts CHM files to an OEB book.

    The archive is unpacked into a temporary directory, the '.hhc' table
    of contents is parsed, and every page it references is added to the
    manifest, spine and TOC of a new OEB book.
    '''

    name        = 'CHM Input'
    author      = 'Kovid Goyal and Alex Bramley'
    description = 'Convert CHM files to OEB'
    file_types  = set(['chm'])

    options = set([
        OptionRecommendation(name='dummy_option', recommended_value=False,
            help=_('dummy option until real options are determined.')),
    ])

    def _chmtohtml(self, output_dir, chm_path, no_images, log):
        '''Extract the CHM file `chm_path` into `output_dir` and return
        the archive-relative path of its '.hhc' file. `no_images` is
        currently unused.
        '''
        log.debug('Opening CHM file')
        rdr = CHMReader(chm_path, log)
        try:
            log.debug('Extracting CHM to %s' % output_dir)
            rdr.extract_content(output_dir)
            return rdr.hhc_path
        finally:
            # Release the chmlib handle even if extraction fails.
            rdr.CloseCHM()

    def convert(self, stream, options, file_ext, log, accelerators):
        '''Convert the CHM file behind `stream` and return the OEB book.

        `stream` is closed, because the CHM library opens the file by
        name. The temporary extraction directory is removed before
        returning, also when the conversion fails.
        '''
        from calibre.ebooks.metadata.chm import get_metadata_
        from calibre.customize.ui import plugin_for_input_format

        log.debug('Processing CHM...')
        tdir = mkdtemp(prefix='chm2oeb_')
        try:
            # Start from the recommended values of the HTML input plugin.
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            no_images = False #options.no_images
            chm_name = stream.name
            #chm_data = stream.read()

            #closing stream so CHM can be opened by external library
            stream.close()
            log.debug('tdir=%s' % tdir)
            log.debug('stream.name=%s' % chm_name)
            mainname = self._chmtohtml(tdir, chm_name, no_images, log)
            mainpath = os.path.join(tdir, mainname)

            metadata = get_metadata_(tdir)

            # debug_pipeline is switched off while the book is built and
            # restored afterwards, whether or not building succeeds.
            odi = options.debug_pipeline
            options.debug_pipeline = None
            try:
                # try a custom conversion:
                oeb = self._create_oebbook(mainpath, tdir, options, log,
                                           metadata)
            finally:
                options.debug_pipeline = odi
        except Exception:
            # Do not leave the extracted files behind on failure, and do
            # not let a cleanup problem hide the original error.
            shutil.rmtree(tdir, ignore_errors=True)
            raise

        #log.debug('DEBUG: Not removing tempdir %s' % tdir)
        shutil.rmtree(tdir)
        return oeb

    def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
        '''Create the OEB book.

        `hhcpath` is the path of the extracted '.hhc' file, `basedir` the
        directory the CHM was extracted to and `mi` the metadata of the
        book. Every chapter listed in the '.hhc' file is added to the
        book. Returns the new book, which is also stored in `self.oeb`.
        '''
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import DirContainer

        oeb = create_oebbook(log, None, opts, self,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        metadata = oeb.metadata
        if mi.title:
            metadata.add('title', mi.title)
        if mi.authors:
            for a in mi.authors:
                metadata.add('creator', a, attrib={'role':'aut'})
        if mi.publisher:
            metadata.add('publisher', mi.publisher)
        if mi.isbn:
            metadata.add('identifier', mi.isbn, attrib={'scheme':'ISBN'})

        # Fill in the fields that are still missing.
        if not metadata.language:
            oeb.logger.warn(u'Language not specified')
            metadata.add('language', get_lang())
        if not metadata.creator:
            oeb.logger.warn('Creator not specified')
            metadata.add('creator', self.oeb.translate(__('Unknown')))
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))

        bookid = str(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                # Use the identifier that actually carries an id; the
                # first identifier may be the ISBN, which has none.
                self.oeb.uid = ident
                break

        hhcdata = self._read_file(hhcpath)
        hhcroot = html.fromstring(hhcdata)
        chapters = self._process_nodes(hhcroot)
        log.debug('Found %d section nodes' % len(chapters))

        if chapters:
            # All pages are looked up in the directory of the first one.
            path0 = chapters[0][1]
            subpath = os.path.dirname(path0)
            htmlpath = os.path.join(basedir, subpath)

            oeb.container = DirContainer(htmlpath, log)
            for title, path in chapters:
                self._add_item(oeb, title, os.path.basename(path))

            # Kept inside the guard so that htmlpath is always bound.
            oeb.container = DirContainer(htmlpath, oeb.log)
        return oeb

    def _read_file(self, name):
        '''Return the raw contents of the file `name`.'''
        with open(name, 'rb') as f:
            return f.read()

    def _visit_node(self, node, chapters):
        '''If `node` is an <object type="text/sitemap"> element that has
        both a "name" and a "local" <param>, append [title, path] to
        `chapters`. All other nodes are ignored.
        '''
        # check that node is a normal node (not a comment, DOCTYPE, etc.)
        # (normal nodes have string tags)
        if not isinstance(node.tag, basestring):
            return
        if not (match_string(node.tag, 'object') and
                match_string(node.attrib.get('type'), 'text/sitemap')):
            return

        # Headings have no "local" param, so both values must start out
        # as None; otherwise the check below fails on an unbound name.
        chapter_title = None
        chapter_path = None
        for child in node:
            if not isinstance(child.tag, basestring):
                continue
            if not match_string(child.tag, 'param'):
                continue
            param = child.attrib.get('name')
            if match_string(param, 'name'):
                chapter_title = child.attrib.get('value')
            if match_string(param, 'local'):
                chapter_path = child.attrib.get('value')

        if chapter_title is not None and chapter_path is not None:
            chapters.append([chapter_title, chapter_path])

    def _process_nodes(self, root):
        '''Return the [title, path] pairs of all chapters found below
        `root`, in document order.'''
        chapters = []
        for node in root.iter():
            self._visit_node(node, chapters)
        return chapters

    def _add_item(self, oeb, title, path):
        '''Add the HTML file `path` (only its base name is used) to the
        manifest, spine and TOC of `oeb` under the given `title`.'''
        bname = os.path.basename(path)
        item_id, href = oeb.manifest.generate(id='html',
                href=ascii_filename(bname))
        item = oeb.manifest.add(item_id, href, 'text/html')
        item.html_input_href = bname
        oeb.spine.add(item, True)
        oeb.toc.add(title, item.href)