diff --git a/src/calibre/ebooks/chm/__init__.py b/src/calibre/ebooks/chm/__init__.py new file mode 100644 index 0000000000..d7d77da4b6 --- /dev/null +++ b/src/calibre/ebooks/chm/__init__.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Used for chm input +''' diff --git a/src/calibre/ebooks/chm/chm/__init__.py b/src/calibre/ebooks/chm/chm/__init__.py new file mode 100644 index 0000000000..83fcb5c50e --- /dev/null +++ b/src/calibre/ebooks/chm/chm/__init__.py @@ -0,0 +1,34 @@ +## Copyright (C) 2003-2006 Rubens Ramos + +## pychm is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 2 of the +## License, or (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. + +## You should have received a copy of the GNU General Public +## License along with this program; see the file COPYING. If not, +## write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +## Boston, MA 02111-1307, USA + +## $Id: __init__.py,v 1.8 2006/06/18 10:50:43 rubensr Exp $ + +''' + chm - A package to manipulate CHM files + + The chm package provides four modules: chm, chmlib, extra and + _chmlib. _chmlib and chmlib are very low level libraries generated + from SWIG interface files, and are simple wrappers around the API + defined by the C library chmlib. + The extra module adds full-text search support. + the chm module provides some higher level classes to simplify + access to the CHM files information. 
+''' +__all__ = ["chm", "chmlib", "_chmlib", "extra"] +__version__ = "0.8.4" +__revision__ = "$Id: __init__.py,v 1.8 2006/06/18 10:50:43 rubensr Exp $" + diff --git a/src/calibre/ebooks/chm/chm/_chmlib.pyd b/src/calibre/ebooks/chm/chm/_chmlib.pyd new file mode 100644 index 0000000000..ef963bf873 Binary files /dev/null and b/src/calibre/ebooks/chm/chm/_chmlib.pyd differ diff --git a/src/calibre/ebooks/chm/chm/chm.py b/src/calibre/ebooks/chm/chm/chm.py new file mode 100644 index 0000000000..bed89af944 --- /dev/null +++ b/src/calibre/ebooks/chm/chm/chm.py @@ -0,0 +1,508 @@ +## Copyright (C) 2003-2006 Rubens Ramos + +## Based on code by: +## Copyright (C) 2003 Razvan Cojocaru + +## pychm is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 2 of the +## License, or (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. + +## You should have received a copy of the GNU General Public +## License along with this program; see the file COPYING. If not, +## write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +## Boston, MA 02111-1307, USA + +## $Id: chm.py,v 1.12 2006/08/07 12:31:51 rubensr Exp $ + +''' + chm - A high-level front end for the chmlib python module. + + The chm module provides high level access to the functionality + included in chmlib. It encapsulates functions in the CHMFile class, and + provides some additional features, such as the ability to obtain + the contents tree of a CHM archive. 
+ +''' + +import chmlib +import extra +import array +import string +import os.path +import sys + +charset_table = { + 0 : 'iso8859_1', # ANSI_CHARSET + 238 : 'iso8859_2', # EASTEUROPE_CHARSET + 178 : 'iso8859_6', # ARABIC_CHARSET + 161 : 'iso8859_7', # GREEK_CHARSET + 177 : 'iso8859_8', # HEBREW_CHARSET + 162 : 'iso8859_9', # TURKISH_CHARSET + 222 : 'iso8859_11', # THAI_CHARSET - hmm not in python 2.2... + 186 : 'iso8859_13', # BALTIC_CHARSET + 204 : 'cp1251', # RUSSIAN_CHARSET + 255 : 'cp437', # OEM_CHARSET + 128 : 'cp932', # SHIFTJIS_CHARSET + 134 : 'cp936', # GB2312_CHARSET + 129 : 'cp949', # HANGUL_CHARSET + 136 : 'cp950', # CHINESEBIG5_CHARSET + 1 : None, # DEFAULT_CHARSET + 2 : None, # SYMBOL_CHARSET + 130 : None, # JOHAB_CHARSET + 163 : None, # VIETNAMESE_CHARSET + 77 : None, # MAC_CHARSET +} + +locale_table = { + 0x0436 : ('iso8859_1', "Afrikaans", "Western Europe & US"), + 0x041c : ('iso8859_2', "Albanian", "Central Europe"), + 0x0401 : ('iso8859_6', "Arabic_Saudi_Arabia", "Arabic"), + 0x0801 : ('iso8859_6', "Arabic_Iraq", "Arabic"), + 0x0c01 : ('iso8859_6', "Arabic_Egypt", "Arabic"), + 0x1001 : ('iso8859_6', "Arabic_Libya", "Arabic"), + 0x1401 : ('iso8859_6', "Arabic_Algeria", "Arabic"), + 0x1801 : ('iso8859_6', "Arabic_Morocco", "Arabic"), + 0x1c01 : ('iso8859_6', "Arabic_Tunisia", "Arabic"), + 0x2001 : ('iso8859_6', "Arabic_Oman", "Arabic"), + 0x2401 : ('iso8859_6', "Arabic_Yemen", "Arabic"), + 0x2801 : ('iso8859_6', "Arabic_Syria", "Arabic"), + 0x2c01 : ('iso8859_6', "Arabic_Jordan", "Arabic"), + 0x3001 : ('iso8859_6', "Arabic_Lebanon", "Arabic"), + 0x3401 : ('iso8859_6', "Arabic_Kuwait", "Arabic"), + 0x3801 : ('iso8859_6', "Arabic_UAE", "Arabic"), + 0x3c01 : ('iso8859_6', "Arabic_Bahrain", "Arabic"), + 0x4001 : ('iso8859_6', "Arabic_Qatar", "Arabic"), + 0x042b : (None, "Armenian","Armenian"), + 0x042c : ('iso8859_9', "Azeri_Latin", "Turkish"), + 0x082c : ('cp1251', "Azeri_Cyrillic", "Cyrillic"), + 0x042d : ('iso8859_1', "Basque", "Western Europe & 
US"), + 0x0423 : ('cp1251', "Belarusian", "Cyrillic"), + 0x0402 : ('cp1251', "Bulgarian", "Cyrillic"), + 0x0403 : ('iso8859_1', "Catalan", "Western Europe & US"), + 0x0404 : ('cp950', "Chinese_Taiwan", "Traditional Chinese"), + 0x0804 : ('cp936', "Chinese_PRC", "Simplified Chinese"), + 0x0c04 : ('cp950', "Chinese_Hong_Kong", "Traditional Chinese"), + 0x1004 : ('cp936', "Chinese_Singapore", "Simplified Chinese"), + 0x1404 : ('cp950', "Chinese_Macau", "Traditional Chinese"), + 0x041a : ('iso8859_2', "Croatian", "Central Europe"), + 0x0405 : ('iso8859_2', "Czech", "Central Europe"), + 0x0406 : ('iso8859_1', "Danish", "Western Europe & US"), + 0x0413 : ('iso8859_1', "Dutch_Standard", "Western Europe & US"), + 0x0813 : ('iso8859_1', "Dutch_Belgian", "Western Europe & US"), + 0x0409 : ('iso8859_1', "English_United_States", "Western Europe & US"), + 0x0809 : ('iso8859_1', "English_United_Kingdom", "Western Europe & US"), + 0x0c09 : ('iso8859_1', "English_Australian", "Western Europe & US"), + 0x1009 : ('iso8859_1', "English_Canadian", "Western Europe & US"), + 0x1409 : ('iso8859_1', "English_New_Zealand", "Western Europe & US"), + 0x1809 : ('iso8859_1', "English_Irish", "Western Europe & US"), + 0x1c09 : ('iso8859_1', "English_South_Africa", "Western Europe & US"), + 0x2009 : ('iso8859_1', "English_Jamaica", "Western Europe & US"), + 0x2409 : ('iso8859_1', "English_Caribbean", "Western Europe & US"), + 0x2809 : ('iso8859_1', "English_Belize", "Western Europe & US"), + 0x2c09 : ('iso8859_1', "English_Trinidad", "Western Europe & US"), + 0x3009 : ('iso8859_1', "English_Zimbabwe", "Western Europe & US"), + 0x3409 : ('iso8859_1', "English_Philippines", "Western Europe & US"), + 0x0425 : ('iso8859_13',"Estonian", "Baltic",), + 0x0438 : ('iso8859_1', "Faeroese", "Western Europe & US"), + 0x0429 : ('iso8859_6', "Farsi", "Arabic"), + 0x040b : ('iso8859_1', "Finnish", "Western Europe & US"), + 0x040c : ('iso8859_1', "French_Standard", "Western Europe & US"), + 0x080c : 
('iso8859_1', "French_Belgian", "Western Europe & US"), + 0x0c0c : ('iso8859_1', "French_Canadian", "Western Europe & US"), + 0x100c : ('iso8859_1', "French_Swiss", "Western Europe & US"), + 0x140c : ('iso8859_1', "French_Luxembourg", "Western Europe & US"), + 0x180c : ('iso8859_1', "French_Monaco", "Western Europe & US"), + 0x0437 : (None, "Georgian", "Georgian"), + 0x0407 : ('iso8859_1', "German_Standard", "Western Europe & US"), + 0x0807 : ('iso8859_1', "German_Swiss", "Western Europe & US"), + 0x0c07 : ('iso8859_1', "German_Austrian", "Western Europe & US"), + 0x1007 : ('iso8859_1', "German_Luxembourg", "Western Europe & US"), + 0x1407 : ('iso8859_1', "German_Liechtenstein", "Western Europe & US"), + 0x0408 : ('iso8859_7', "Greek", "Greek"), + 0x040d : ('iso8859_8', "Hebrew", "Hebrew"), + 0x0439 : (None, "Hindi", "Indic"), + 0x040e : ('iso8859_2', "Hungarian", "Central Europe"), + 0x040f : ('iso8859_1', "Icelandic", "Western Europe & US"), + 0x0421 : ('iso8859_1', "Indonesian", "Western Europe & US"), + 0x0410 : ('iso8859_1', "Italian_Standard", "Western Europe & US"), + 0x0810 : ('iso8859_1', "Italian_Swiss", "Western Europe & US"), + 0x0411 : ('cp932', "Japanese", "Japanese"), + 0x043f : ('cp1251', "Kazakh", "Cyrillic"), + 0x0457 : (None, "Konkani", "Indic"), + 0x0412 : ('cp949', "Korean", "Korean"), + 0x0426 : ('iso8859_13',"Latvian", "Baltic",), + 0x0427 : ('iso8859_13',"Lithuanian", "Baltic",), + 0x042f : ('cp1251', "Macedonian", "Cyrillic"), + 0x043e : ('iso8859_1', "Malay_Malaysia", "Western Europe & US"), + 0x083e : ('iso8859_1', "Malay_Brunei_Darussalam", "Western Europe & US"), + 0x044e : (None, "Marathi", "Indic"), + 0x0414 : ('iso8859_1', "Norwegian_Bokmal", "Western Europe & US"), + 0x0814 : ('iso8859_1', "Norwegian_Nynorsk", "Western Europe & US"), + 0x0415 : ('iso8859_2', "Polish", "Central Europe"), + 0x0416 : ('iso8859_1', "Portuguese_Brazilian", "Western Europe & US"), + 0x0816 : ('iso8859_1', "Portuguese_Standard", "Western Europe & US"), + 
0x0418 : ('iso8859_2', "Romanian", "Central Europe"), + 0x0419 : ('cp1251', "Russian", "Cyrillic"), + 0x044f : (None, "Sanskrit", "Indic"), + 0x081a : ('iso8859_2', "Serbian_Latin", "Central Europe"), + 0x0c1a : ('cp1251', "Serbian_Cyrillic", "Cyrillic"), + 0x041b : ('iso8859_2', "Slovak", "Central Europe"), + 0x0424 : ('iso8859_2', "Slovenian", "Central Europe"), + 0x040a : ('iso8859_1', "Spanish_Trad_Sort", "Western Europe & US"), + 0x080a : ('iso8859_1', "Spanish_Mexican", "Western Europe & US"), + 0x0c0a : ('iso8859_1', "Spanish_Modern_Sort", "Western Europe & US"), + 0x100a : ('iso8859_1', "Spanish_Guatemala", "Western Europe & US"), + 0x140a : ('iso8859_1', "Spanish_Costa_Rica", "Western Europe & US"), + 0x180a : ('iso8859_1', "Spanish_Panama", "Western Europe & US"), + 0x1c0a : ('iso8859_1', "Spanish_Dominican_Repub", "Western Europe & US"), + 0x200a : ('iso8859_1', "Spanish_Venezuela", "Western Europe & US"), + 0x240a : ('iso8859_1', "Spanish_Colombia", "Western Europe & US"), + 0x280a : ('iso8859_1', "Spanish_Peru", "Western Europe & US"), + 0x2c0a : ('iso8859_1', "Spanish_Argentina", "Western Europe & US"), + 0x300a : ('iso8859_1', "Spanish_Ecuador", "Western Europe & US"), + 0x340a : ('iso8859_1', "Spanish_Chile", "Western Europe & US"), + 0x380a : ('iso8859_1', "Spanish_Uruguay", "Western Europe & US"), + 0x3c0a : ('iso8859_1', "Spanish_Paraguay", "Western Europe & US"), + 0x400a : ('iso8859_1', "Spanish_Bolivia", "Western Europe & US"), + 0x440a : ('iso8859_1', "Spanish_El_Salvador", "Western Europe & US"), + 0x480a : ('iso8859_1', "Spanish_Honduras", "Western Europe & US"), + 0x4c0a : ('iso8859_1', "Spanish_Nicaragua", "Western Europe & US"), + 0x500a : ('iso8859_1', "Spanish_Puerto_Rico", "Western Europe & US"), + 0x0441 : ('iso8859_1', "Swahili", "Western Europe & US"), + 0x041d : ('iso8859_1', "Swedish", "Western Europe & US"), + 0x081d : ('iso8859_1', "Swedish_Finland", "Western Europe & US"), + 0x0449 : (None, "Tamil", "Indic"), + 0x0444 : 
class CHMFile:
    '''Manage access to a single CHM archive.

    Wraps the low-level ``chmlib`` calls and caches the metadata read
    from the archive's ``/#SYSTEM`` and ``/#WINDOWS`` objects: title, home
    page, topics and index file names, encoding string and locale id.
    '''
    filename = ""       # name passed to LoadCHM; '' when nothing is loaded
    file = None         # handle returned by chmlib.chm_open; None when closed
    title = ""
    home = "/"
    index = None
    topics = None
    encoding = None
    lcid = None
    binaryindex = None  # not referenced by any method of this class

    def __init__(self):
        self.searchable = 0

    def LoadCHM(self, archiveName):
        '''Load a CHM archive.

        Any previously loaded archive is closed and the cached metadata is
        reset first. On success GetArchiveInfo is called to read the index
        file name, the topics file and the other metadata.

        Returns 1 on success and 0 if chmlib could not open the archive.
        '''
        # CloseCHM only touches chmlib when a handle is actually open.
        self.CloseCHM()

        self.file = chmlib.chm_open(archiveName)
        if self.file is None:
            return 0

        self.filename = archiveName
        self.GetArchiveInfo()

        return 1

    def CloseCHM(self):
        '''Close the CHM archive, if one is open, and reset the metadata.

        The original guard tested ``self.filename != None``, which is
        always true because filename defaults to ''. The open handle is
        tested instead so that chm_close never receives None.
        '''
        if self.file is not None:
            chmlib.chm_close(self.file)
        self.file = None
        self.filename = ''
        self.title = ""
        self.home = "/"
        self.index = None
        self.topics = None
        self.encoding = None
        self.lcid = None

    def GetArchiveInfo(self):
        '''Read the archive metadata from the /#SYSTEM object.

        /#SYSTEM is parsed (skipping its first four bytes) as a sequence
        of entries, each made of a 16-bit little-endian code, a 16-bit
        little-endian length and ``length`` bytes of data. The codes
        handled here are:

            0  -> topics file name      1  -> index file name
            2  -> home page             3  -> title
            4  -> locale id (first two data bytes)
            6  -> base name probed for '<name>.hhc' / '<name>.hhk'
            16 -> encoding string

        String values drop their last byte (presumably a NUL terminator).
        GetWindowsInfo is then consulted for anything still missing, and
        extra.get_lcid is used when no locale id was found.

        Called from LoadCHM. Returns 1 on success and 0 when /#SYSTEM is
        missing or empty.
        '''
        # extra.is_searchable crashed, so full-text search is reported as
        # unavailable.
        self.searchable = False
        self.lcid = None

        result, ui = chmlib.chm_resolve_object(self.file, '/#SYSTEM')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            sys.stderr.write('GetArchiveInfo: #SYSTEM does not exist\n')
            return 0

        # Offsets are handed to chmlib as long, as RetrieveObject does.
        size, text = chmlib.chm_retrieve_object(self.file, ui, long(4),
                                                ui.length)
        if size == 0:
            sys.stderr.write('GetArchiveInfo: file size = 0\n')
            return 0

        buff = array.array('B', text)
        # Never index past the data actually returned.
        limit = min(size, len(buff))

        pos = 0
        # A complete entry header (code + length) needs four bytes; a
        # truncated trailing header is ignored instead of raising
        # IndexError.
        while pos + 4 <= limit:
            code = buff[pos] + (buff[pos + 1] << 8)
            length = buff[pos + 2] + (buff[pos + 3] << 8)
            pos += 4
            value = text[pos:pos + length - 1]

            if code == 0:
                self.topics = '/' + value
            elif code == 1:
                self.index = '/' + value
            elif code == 2:
                self.home = '/' + value
            elif code == 3:
                self.title = value
            elif code == 4:
                if pos + 2 <= limit:
                    self.lcid = buff[pos] + (buff[pos + 1] << 8)
            elif code == 6:
                if not self.topics:
                    hhc = '/' + value + '.hhc'
                    hhk = '/' + value + '.hhk'
                    res1, ui1 = chmlib.chm_resolve_object(self.file, hhc)
                    res2, ui2 = chmlib.chm_resolve_object(self.file, hhk)
                    if res1 == chmlib.CHM_RESOLVE_SUCCESS:
                        self.topics = hhc
                    if (not self.index) and \
                           (res2 == chmlib.CHM_RESOLVE_SUCCESS):
                        self.index = hhk
            elif code == 16:
                self.encoding = value
            # Unknown codes are simply skipped.

            pos += length

        self.GetWindowsInfo()

        if not self.lcid:
            self.lcid = extra.get_lcid(self.file)

        return 1

    def _read_whole_object(self, path, caller):
        '''Return the full contents of the object at path, or None.

        caller is the method name used as prefix of the stderr message
        written when the object is empty.
        '''
        res, ui = chmlib.chm_resolve_object(self.file, path)
        if res != chmlib.CHM_RESOLVE_SUCCESS:
            return None

        size, text = chmlib.chm_retrieve_object(self.file, ui, long(0),
                                                ui.length)
        if size == 0:
            sys.stderr.write('%s: file size = 0\n' % caller)
            return None
        return text

    def GetTopicsTree(self):
        '''Read and return the contents of the topics tree file.

        Returns None when no topics file is known (None or empty name),
        when it cannot be resolved, or when it is empty.
        '''
        # An empty name used to fall through to an unbound ``ui``.
        if not self.topics:
            return None
        return self._read_whole_object(self.topics, 'GetTopicsTree')

    def GetIndex(self):
        '''Read and return the contents of the index tree file.

        Returns None when no index file is known (None or empty name),
        when it cannot be resolved, or when it is empty.
        '''
        if not self.index:
            return None
        return self._read_whole_object(self.index, 'GetIndex')

    def ResolveObject(self, document):
        '''Try to locate a document in the archive.

        Returns the tuple produced by chmlib.chm_resolve_object: a status
        code and the UnitInfo used to retrieve the document contents.
        When no archive is open, (1, None) is returned.
        '''
        if self.file:
            return chmlib.chm_resolve_object(self.file, document)
        return (1, None)

    def RetrieveObject(self, ui, start = -1, length = -1):
        '''Retrieve the contents of a document.

        ui is the UnitInfo of the document. start is the offset to read
        from (-1 means the beginning) and length the number of bytes to
        read (-1 means ui.length).

        Returns the (size, data) tuple from chmlib.chm_retrieve_object, or
        (0, '') when no archive is open or ui is empty.
        '''
        if not (self.file and ui):
            return (0, '')

        if length == -1:
            count = ui.length
        else:
            count = length
        if start == -1:
            offset = long(0)
        else:
            offset = long(start)
        return chmlib.chm_retrieve_object(self.file, ui, offset, count)

    def Search(self, text, wholewords=0, titleonly=0):
        '''Perform a full-text search on the archive.

        text is the word to look for, wholewords restricts the search to
        whole words and titleonly restricts it to page titles.

        Returns whatever extra.search returns (documented upstream as a
        tuple of a "results were partial" flag and a dictionary with the
        results), or None when text is empty or no archive is open.
        '''
        if text and self.file:
            return extra.search(self.file, text, wholewords, titleonly)
        return None

    def IsSearchable(self):
        '''Indicate whether full-text search is available for this
        archive - this flag is updated when GetArchiveInfo is called.'''
        return self.searchable

    def GetEncoding(self):
        '''Return a codec name usable with the codecs package, or None.

        The third comma-separated field of the encoding string is looked
        up as a number in charset_table. None is returned when there is
        no encoding string, when it has fewer than three fields, or when
        that field is not a known charset number.
        '''
        if self.encoding:
            vals = self.encoding.split(',')
            if len(vals) > 2:
                try:
                    return charset_table[int(vals[2])]
                except (KeyError, ValueError):
                    # Unknown or non-numeric charset: documented as None.
                    pass
        return None

    def GetLCID(self):
        '''Return the locale_table entry for the archive locale id, or
        None when the id is not in the table.'''
        if self.lcid in locale_table:
            return locale_table[self.lcid]
        return None

    def GetDWORD(self, buff, idx=0):
        '''Internal method.
        Read a little-endian double word (4 bytes) from buff at idx.
        The value 0xFFFFFFFF is reported as 0.
        '''
        result = buff[idx] + (buff[idx+1]<<8) + (buff[idx+2]<<16) + \
                 (buff[idx+3]<<24)

        if result == 0xFFFFFFFF:
            result = 0

        return result

    def GetString(self, text, idx):
        '''Internal method.
        Return the NUL-terminated string starting at idx in text.
        When no terminator follows idx, the rest of text is returned.
        '''
        end = text.find('\x00', idx)
        if end == -1:
            # find() reports "not found" as -1; slicing with it would
            # silently drop the last character.
            return text[idx:]
        return text[idx:end]

    def GetWindowsInfo(self):
        '''Get information from the /#WINDOWS object.

        Reads the first entry of /#WINDOWS and looks up the string offsets
        stored at 0x60, 0x64 and 0x68 in /#STRINGS. They fill in the
        topics and index file names when /#SYSTEM did not provide them,
        and set the home page when its offset is non-zero.

        Returns a negative number identifying the step that failed, and
        None on success.
        '''
        result, ui = chmlib.chm_resolve_object(self.file, '/#WINDOWS')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            return -1

        size, text = chmlib.chm_retrieve_object(self.file, ui, long(0), 8)
        if size < 8:
            return -2

        buff = array.array('B', text)
        num_entries = self.GetDWORD(buff, 0)
        entry_size = self.GetDWORD(buff, 4)

        if num_entries < 1:
            return -3

        size, text = chmlib.chm_retrieve_object(self.file, ui, long(8),
                                                entry_size)
        if size < entry_size:
            return -4

        buff = array.array('B', text)
        # The last field read below ends at offset 0x6C; a shorter entry
        # would raise IndexError in GetDWORD.
        if len(buff) < 0x6C:
            return -4

        toc_index = self.GetDWORD(buff, 0x60)
        idx_index = self.GetDWORD(buff, 0x64)
        dft_index = self.GetDWORD(buff, 0x68)

        result, ui = chmlib.chm_resolve_object(self.file, '/#STRINGS')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            return -5

        size, text = chmlib.chm_retrieve_object(self.file, ui, long(0),
                                                ui.length)
        if size == 0:
            return -6

        if not self.topics:
            self.topics = self.GetString(text, toc_index)
            if not self.topics.startswith("/"):
                self.topics = "/" + self.topics

        if not self.index:
            self.index = self.GetString(text, idx_index)
            if not self.index.startswith("/"):
                self.index = "/" + self.index

        if dft_index != 0:
            self.home = self.GetString(text, dft_index)
            if not self.home.startswith("/"):
                self.home = "/" + self.home
# NOTE: this module is SWIG output. Only three lines differ from the
# generated text: the AttributeError raise, the apply() call and the
# __repr__ format string (which was empty and therefore raised TypeError).
import _chmlib
def _swig_setattr(self,class_type,name,value):
    # Attribute setter shared by the proxy classes: adopts the underlying
    # pointer when "this" is assigned from another proxy, otherwise
    # dispatches to the class' registered C setter, falling back to the
    # instance dictionary.
    if (name == "this"):
        if isinstance(value, class_type):
            self.__dict__[name] = value.this
            if hasattr(value,"thisown"): self.__dict__["thisown"] = value.thisown
            del value.thisown
            return
    method = class_type.__swig_setmethods__.get(name,None)
    if method: return method(self,value)
    self.__dict__[name] = value

def _swig_getattr(self,class_type,name):
    # Attribute getter shared by the proxy classes: dispatches to the
    # class' registered C getter or raises AttributeError.
    method = class_type.__swig_getmethods__.get(name,None)
    if method: return method(self)
    raise AttributeError(name)

import types
try:
    _object = types.ObjectType
    _newclass = 1
except AttributeError:
    class _object : pass
    _newclass = 0


CHM_UNCOMPRESSED = _chmlib.CHM_UNCOMPRESSED
CHM_COMPRESSED = _chmlib.CHM_COMPRESSED
CHM_MAX_PATHLEN = _chmlib.CHM_MAX_PATHLEN
class chmUnitInfo(_object):
    # Proxy for the C chmUnitInfo structure; exposes its start, length,
    # space and path fields through the _chmlib accessor functions.
    __swig_setmethods__ = {}
    __setattr__ = lambda self, name, value: _swig_setattr(self, chmUnitInfo, name, value)
    __swig_getmethods__ = {}
    __getattr__ = lambda self, name: _swig_getattr(self, chmUnitInfo, name)
    __swig_setmethods__["start"] = _chmlib.chmUnitInfo_start_set
    __swig_getmethods__["start"] = _chmlib.chmUnitInfo_start_get
    if _newclass:start = property(_chmlib.chmUnitInfo_start_get,_chmlib.chmUnitInfo_start_set)
    __swig_setmethods__["length"] = _chmlib.chmUnitInfo_length_set
    __swig_getmethods__["length"] = _chmlib.chmUnitInfo_length_get
    if _newclass:length = property(_chmlib.chmUnitInfo_length_get,_chmlib.chmUnitInfo_length_set)
    __swig_setmethods__["space"] = _chmlib.chmUnitInfo_space_set
    __swig_getmethods__["space"] = _chmlib.chmUnitInfo_space_get
    if _newclass:space = property(_chmlib.chmUnitInfo_space_get,_chmlib.chmUnitInfo_space_set)
    __swig_setmethods__["path"] = _chmlib.chmUnitInfo_path_set
    __swig_getmethods__["path"] = _chmlib.chmUnitInfo_path_get
    if _newclass:path = property(_chmlib.chmUnitInfo_path_get,_chmlib.chmUnitInfo_path_set)
    def __init__(self,*args):
        _swig_setattr(self, chmUnitInfo, 'this', _chmlib.new_chmUnitInfo(*args))
        _swig_setattr(self, chmUnitInfo, 'thisown', 1)
    def __del__(self, destroy= _chmlib.delete_chmUnitInfo):
        # Best-effort destructor, as generated: errors are ignored.
        try:
            if self.thisown: destroy(self)
        except: pass
    def __repr__(self):
        # The format string needs a %s placeholder; with an empty string
        # the % operator raises "not all arguments converted".
        return "<C chmUnitInfo instance at %s>" % (self.this,)

class chmUnitInfoPtr(chmUnitInfo):
    # Proxy built around an existing pointer; it does not own the
    # structure unless thisown was already set.
    def __init__(self,this):
        _swig_setattr(self, chmUnitInfo, 'this', this)
        if not hasattr(self,"thisown"): _swig_setattr(self, chmUnitInfo, 'thisown', 0)
        _swig_setattr(self, chmUnitInfo,self.__class__,chmUnitInfo)
_chmlib.chmUnitInfo_swigregister(chmUnitInfoPtr)

chm_open = _chmlib.chm_open

chm_close = _chmlib.chm_close

CHM_PARAM_MAX_BLOCKS_CACHED = _chmlib.CHM_PARAM_MAX_BLOCKS_CACHED
chm_set_param = _chmlib.chm_set_param

CHM_RESOLVE_SUCCESS = _chmlib.CHM_RESOLVE_SUCCESS
CHM_RESOLVE_FAILURE = _chmlib.CHM_RESOLVE_FAILURE
chm_resolve_object = _chmlib.chm_resolve_object

chm_retrieve_object = _chmlib.chm_retrieve_object

CHM_ENUMERATE_NORMAL = _chmlib.CHM_ENUMERATE_NORMAL
CHM_ENUMERATE_META = _chmlib.CHM_ENUMERATE_META
CHM_ENUMERATE_SPECIAL = _chmlib.CHM_ENUMERATE_SPECIAL
CHM_ENUMERATE_FILES = _chmlib.CHM_ENUMERATE_FILES
CHM_ENUMERATE_DIRS = _chmlib.CHM_ENUMERATE_DIRS
CHM_ENUMERATE_ALL = _chmlib.CHM_ENUMERATE_ALL
CHM_ENUMERATOR_FAILURE = _chmlib.CHM_ENUMERATOR_FAILURE
CHM_ENUMERATOR_CONTINUE = _chmlib.CHM_ENUMERATOR_CONTINUE
CHM_ENUMERATOR_SUCCESS = _chmlib.CHM_ENUMERATOR_SUCCESS
chm_enumerate = _chmlib.chm_enumerate

chm_enumerate_dir = _chmlib.chm_enumerate_dir
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal ,' \
                ' and Alex Bramley .'

import sys, logging, os, re, shutil, subprocess, uuid
from shutil import rmtree
from tempfile import mkdtemp
from mimetypes import guess_type as guess_mimetype
from htmlentitydefs import name2codepoint
from pprint import PrettyPrinter

from BeautifulSoup import BeautifulSoup
from lxml import html, etree
from calibre.ebooks.chm.chm.chm import CHMFile
from calibre.ebooks.chm.chm.chmlib import (
    CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL,
    chm_enumerate, chm_retrieve_object,
)

from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.utils.config import OptionParser
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator, Guide
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename


def match_string(s1, s2_already_lowered):
    '''Return True when s1, lower-cased, equals s2_already_lowered.

    False is returned when either argument is None.
    '''
    if s1 is None or s2_already_lowered is None:
        return False
    return s1.lower() == s2_already_lowered


def option_parser():
    '''Build and return the command line OptionParser for this module.'''
    parser = OptionParser(usage=_('%prog [options] mybook.chm'))
    parser.add_option('--output-dir', '-d', default='.',
                      help=_('Output directory. Defaults to current directory'),
                      dest='output')
    parser.add_option('--verbose', default=False, action='store_true',
                      dest='verbose')
    parser.add_option("-t", "--title", action="store", type="string",
                      dest="title", help=_("Set the book title"))
    parser.add_option('--title-sort', action='store', type='string',
                      default=None, dest='title_sort',
                      help=_('Set sort key for the title'))
    parser.add_option("-a", "--author", action="store", type="string",
                      dest="author", help=_("Set the author"))
    parser.add_option('--author-sort', action='store', type='string',
                      default=None, dest='author_sort',
                      help=_('Set sort key for the author'))
    parser.add_option("-c", "--category", action="store", type="string",
                      dest="category", help=_("The category this book belongs"
                      " to. E.g.: History"))
    parser.add_option("--thumbnail", action="store", type="string",
                      dest="thumbnail", help=_("Path to a graphic that will be"
                      " set as this files' thumbnail"))
    parser.add_option("--comment", action="store", type="string",
                      dest="freetext",
                      help=_("Path to a txt file containing a comment."))
    parser.add_option("--get-thumbnail", action="store_true",
                      dest="get_thumbnail", default=False,
                      help=_("Extract thumbnail from LRF file"))
    parser.add_option('--publisher', default=None,
                      help=_('Set the publisher'))
    parser.add_option('--classification', default=None,
                      help=_('Set the book classification'))
    parser.add_option('--creator', default=None,
                      help=_('Set the book creator'))
    parser.add_option('--producer', default=None,
                      help=_('Set the book producer'))
    parser.add_option('--get-cover', action='store_true', default=False,
                      help=_('Extract cover from LRF file. Note that the LRF format has no defined cover, so we use some heuristics to guess the cover.'))
    parser.add_option('--bookid', action='store', type='string',
                      default=None, dest='book_id', help=_('Set book ID'))
    parser.add_option('--font-delta', action='store', type='int', default=0,
                      dest='font_delta', help=_('Set font delta'))
    return parser


class CHMError(Exception):
    '''Raised when a CHM archive or one of its members cannot be read.'''
    pass


class CHMReader(CHMFile):
    '''CHMFile that can list and extract the members of an archive.'''

    def __init__(self, input, log):
        '''Open the CHM file named by input.

        log is kept for diagnostics. Raises CHMError when the archive
        cannot be opened or no topics (TOC) file name was found in it.
        '''
        CHMFile.__init__(self)
        if not self.LoadCHM(input):
            raise CHMError("Unable to open CHM file '%s'"%(input,))
        self.log = log
        self._sourcechm = input
        self._contents = None
        self._playorder = 0
        self._metadata = False
        self._extracted = False

        # topics is None when neither #SYSTEM nor #WINDOWS names a TOC;
        # fail with a clear error instead of AttributeError on None.
        if not self.topics:
            self.CloseCHM()
            raise CHMError("No topics file found in CHM file '%s'"%(input,))

        # location of '.hhc' file, which is the CHM TOC.
        self.root, ext = os.path.splitext(self.topics.lstrip('/'))
        self.hhc_path = self.root + ".hhc"

    def _deentity(self, text):
        '''Replace HTML character references in text by their characters.

        _parse_toc calls this method, which no class in this file defined.
        Named references are resolved through name2codepoint; references
        that cannot be resolved are left untouched.
        '''
        def replace(match):
            ref = match.group(1)
            try:
                if ref[:2] in ('#x', '#X'):
                    return unichr(int(ref[2:], 16))
                if ref.startswith('#'):
                    return unichr(int(ref[1:]))
                return unichr(name2codepoint[ref])
            except (KeyError, ValueError, OverflowError):
                return match.group(0)
        return re.sub(r'&(#?\w+);', replace, text)

    def _parse_toc(self, ul, basedir=None):
        '''Build a TOC object from the <ul> element of a parsed .hhc file.

        basedir defaults to the current directory at call time; the
        original default was evaluated once, when the module was imported.
        '''
        if basedir is None:
            basedir = os.getcwdu()
        toc = TOC(play_order=self._playorder, base_path=basedir, text='')
        self._playorder += 1
        for li in ul('li', recursive=False):
            href = li.object('param', {'name': 'Local'})[0]['value']
            if href.count('#'):
                # Split on the first '#' only, so that a second '#' in the
                # fragment cannot break the two-value unpacking.
                href, frag = href.split('#', 1)
            else:
                frag = None
            name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
            toc.add_item(href, frag, name, play_order=self._playorder)
            self._playorder += 1
            if li.ul:
                child = self._parse_toc(li.ul, basedir)
                child.parent = toc
                toc.append(child)
        return toc

    def GetFile(self, path):
        '''Return the contents of the archive member at path.

        Raises CHMError when the member cannot be located or is empty.
        '''
        # ResolveObject needs absolute paths, but Contents() deliberately
        # makes them relative, so re-add the leading '/'.
        # Note this path refers to the internal CHM structure.
        if not path.startswith('/'):
            path = '/' + path
        res, ui = self.ResolveObject(path)
        if res != CHM_RESOLVE_SUCCESS:
            raise CHMError("Unable to locate '%s' within CHM file '%s'"%(path, self.filename))
        size, data = self.RetrieveObject(ui)
        if size == 0:
            raise CHMError("'%s' is zero bytes in length!"%(path,))
        return data

    def ExtractFiles(self, output_dir=None):
        '''Extract every member of the archive below output_dir.

        HTML members are cleaned with _reformat before being written.
        output_dir defaults to the current directory at call time.
        '''
        if output_dir is None:
            output_dir = os.getcwdu()
        root = os.path.abspath(output_dir)
        prefix = root.rstrip(os.sep) + os.sep
        for path in self.Contents():
            lpath = os.path.abspath(os.path.join(root, path))
            # Member names come from the archive: never write outside
            # output_dir (for instance through '../' components).
            if not lpath.startswith(prefix):
                self.log.debug("Skipping '%s': outside of '%s'" % (path, root))
                continue
            self._ensure_dir(lpath)
            data = self.GetFile(path)
            if guess_mimetype(path)[0] == 'text/html':
                data = self._reformat(data)
            with open(lpath, 'wb') as f:
                f.write(data)
        self._extracted = True

    def _reformat(self, data):
        '''Return a cleaned, prettified version of the HTML in data.

        Scripts are removed, a leading and a trailing navigation table are
        dropped and image sources are normalised. data is returned
        unchanged when it cannot be parsed.
        '''
        try:
            soup = BeautifulSoup(data)
        except UnicodeEncodeError:
            # hit some strange encoding problems...
            self.log.debug("Unable to parse html for cleaning, leaving it :(")
            return data
        # remove javascript
        for script in soup('script'):
            script.extract()
        # Remove the forward and back navigation bars from the top and
        # bottom of each page: they break the flow of the text and waste
        # space.
        tables = soup('table')
        if tables:
            first = tables[0]
            last = tables[-1]
            if (first.previousSibling is None
                    or first.previousSibling.previousSibling is None):
                first.extract()
            if (last.nextSibling is None
                    or last.nextSibling.nextSibling is None):
                last.extract()
        # For some reason each page's content appears to be in a table too,
        # and this table has sub-tables for random asides.

        # some images seem to be broken in some chm's
        for img in soup('img'):
            try:
                # some are supposedly "relative" when they are not
                while img['src'].startswith('../'):
                    img['src'] = img['src'][3:]
                # some have ";" at the end.
                img['src'] = img['src'].split(';')[0]
            except KeyError:
                # and some don't even have a src attribute
                pass
        return soup.prettify()

    def Contents(self):
        '''Return the relative paths of all files in the archive.

        The list is computed on first use and cached.
        '''
        if self._contents is not None:
            return self._contents
        paths = []
        def get_paths(chm, ui, ctx):
            # skip directories
            # note this path refers to the internal CHM structure
            if not ui.path.endswith('/'):
                # and make paths relative
                paths.append(ui.path.lstrip('/'))
        chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None)
        self._contents = paths
        return self._contents

    def _ensure_dir(self, path):
        '''Create the directory that will contain path, if needed.'''
        parent = os.path.dirname(path)
        if parent and not os.path.isdir(parent):
            os.makedirs(parent)

    def extract_content(self, output_dir=None):
        '''Extract the archive below output_dir (see ExtractFiles).'''
        self.ExtractFiles(output_dir=output_dir)


class CHMInput(InputFormatPlugin):
    '''Input plugin converting CHM files to an OEB book.'''

    name = 'CHM Input'
    author = 'Kovid Goyal and Alex Bramley'
    description = 'Convert CHM files to OEB'
    file_types = set(['chm'])

    options = set([
        OptionRecommendation(name='dummy_option', recommended_value=False,
            help=_('dummy option until real options are determined.')),
    ])

    def _chmtohtml(self, output_dir, chm_path, no_images, log):
        '''Extract chm_path into output_dir.

        Returns the path of the .hhc (TOC) file, relative to output_dir.
        no_images is currently unused.
        '''
        log.debug('Opening CHM file')
        rdr = CHMReader(chm_path, log)
        try:
            log.debug('Extracting CHM to %s' % output_dir)
            rdr.extract_content(output_dir)
            return rdr.hhc_path
        finally:
            # Release the archive handle even when extraction fails.
            rdr.CloseCHM()

    def convert(self, stream, options, file_ext, log, accelerators):
        '''Convert the CHM file behind stream and return the OEB book.

        The archive is extracted into a temporary directory, which is
        removed before returning, also when the conversion fails.
        '''
        from calibre.ebooks.metadata.chm import get_metadata_
        from calibre.customize.ui import plugin_for_input_format

        log.debug('Processing CHM...')
        tdir = mkdtemp(prefix='chm2oeb_')
        try:
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            no_images = False #options.no_images
            chm_name = stream.name

            # closing stream so CHM can be opened by external library
            stream.close()
            log.debug('tdir=%s' % tdir)
            log.debug('stream.name=%s' % stream.name)
            mainname = self._chmtohtml(tdir, chm_name, no_images, log)
            mainpath = os.path.join(tdir, mainname)

            metadata = get_metadata_(tdir)

            odi = options.debug_pipeline
            options.debug_pipeline = None
            try:
                # try a custom conversion:
                oeb = self._create_oebbook(mainpath, tdir, options, log,
                                           metadata)
            finally:
                options.debug_pipeline = odi
        finally:
            # NOTE(review): the returned book's container points inside
            # tdir; confirm nothing reads from it after this removal.
            shutil.rmtree(tdir, ignore_errors=True)
        return oeb

    def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
        '''Create the OEB book for the extracted archive.

        hhcpath is the .hhc file listing the chapters, basedir the
        directory the archive was extracted to, and mi the metadata to
        copy into the book.
        '''
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import DirContainer

        oeb = create_oebbook(log, None, opts, self,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        metadata = oeb.metadata
        if mi.title:
            metadata.add('title', mi.title)
        if mi.authors:
            for a in mi.authors:
                metadata.add('creator', a, attrib={'role':'aut'})
        if mi.publisher:
            metadata.add('publisher', mi.publisher)
        if mi.isbn:
            metadata.add('identifier', mi.isbn, attrib={'scheme':'ISBN'})
        if not metadata.language:
            oeb.logger.warn(u'Language not specified')
            metadata.add('language', get_lang())
        if not metadata.creator:
            oeb.logger.warn('Creator not specified')
            metadata.add('creator', self.oeb.translate(__('Unknown')))
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))

        bookid = str(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                # NOTE(review): this picks the first identifier rather
                # than ``ident``; kept as found - confirm it is intended.
                self.oeb.uid = metadata.identifier[0]
                break

        hhcdata = self._read_file(hhcpath)
        hhcroot = html.fromstring(hhcdata)
        chapters = self._process_nodes(hhcroot)
        log.debug('Found %d section nodes' % len(chapters))

        # Without chapters there is no html directory to point the
        # container at; htmlpath used to be referenced while undefined.
        if chapters:
            path0 = chapters[0][1]
            subpath = os.path.dirname(path0)
            htmlpath = os.path.join(basedir, subpath)

            oeb.container = DirContainer(htmlpath, log)
            for chapter in chapters:
                title = chapter[0]
                basename = os.path.basename(chapter[1])
                self._add_item(oeb, title, basename)

            oeb.container = DirContainer(htmlpath, oeb.log)
        return oeb

    def _read_file(self, name):
        '''Return the raw contents of the file called name.'''
        with open(name, 'rb') as f:
            return f.read()

    def _visit_node(self, node, chapters):
        '''Append [title, path] to chapters when node is a sitemap object.

        A node qualifies when it is an <object type="text/sitemap">
        element whose <param> children provide both a "name" and a "local"
        value. Tag and attribute comparisons ignore case.
        '''
        # check that node is a normal node (not a comment, DOCTYPE, etc.)
        # (normal nodes have string tags)
        if not isinstance(node.tag, basestring):
            return
        if not (match_string(node.tag, 'object') and
                match_string(node.attrib.get('type'), 'text/sitemap')):
            return

        # Start from None so that an object lacking one of the params is
        # skipped instead of raising UnboundLocalError.
        chapter_title = None
        chapter_path = None
        for child in node:
            if not isinstance(child.tag, basestring):
                continue
            if not match_string(child.tag, 'param'):
                continue
            param_name = child.attrib.get('name')
            if match_string(param_name, 'name'):
                chapter_title = child.attrib.get('value')
            if match_string(param_name, 'local'):
                chapter_path = child.attrib.get('value')
        if chapter_title is not None and chapter_path is not None:
            chapters.append([chapter_title, chapter_path])

    def _process_nodes(self, root):
        '''Return the [title, path] pairs found below root, in document
        order.'''
        chapters = []
        for node in root.iter():
            self._visit_node(node, chapters)
        return chapters

    def _add_item(self, oeb, title, path):
        '''Add the html file at path to the manifest, spine and TOC of oeb.'''
        bname = os.path.basename(path)
        item_id, href = oeb.manifest.generate(id='html',
                href=ascii_filename(bname))
        item = oeb.manifest.add(item_id, href, 'text/html')
        item.html_input_href = bname
        oeb.spine.add(item, True)
        oeb.toc.add(title, item.href)