mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Initial CHM changes
This commit is contained in:
parent
647ad206c9
commit
7a74dc3410
8
src/calibre/ebooks/chm/__init__.py
Normal file
8
src/calibre/ebooks/chm/__init__.py
Normal file
@ -0,0 +1,8 @@
|
||||
#!/usr/bin/env python
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Used for chm input
|
||||
'''
|
34
src/calibre/ebooks/chm/chm/__init__.py
Normal file
34
src/calibre/ebooks/chm/chm/__init__.py
Normal file
@ -0,0 +1,34 @@
|
||||
## Copyright (C) 2003-2006 Rubens Ramos <rubensr@users.sourceforge.net>
|
||||
|
||||
## pychm is free software; you can redistribute it and/or
|
||||
## modify it under the terms of the GNU General Public License as
|
||||
## published by the Free Software Foundation; either version 2 of the
|
||||
## License, or (at your option) any later version.
|
||||
|
||||
## This program is distributed in the hope that it will be useful,
|
||||
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
## General Public License for more details.
|
||||
|
||||
## You should have received a copy of the GNU General Public
|
||||
## License along with this program; see the file COPYING. If not,
|
||||
## write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
## Boston, MA 02111-1307, USA
|
||||
|
||||
## $Id: __init__.py,v 1.8 2006/06/18 10:50:43 rubensr Exp $
|
||||
|
||||
'''
|
||||
chm - A package to manipulate CHM files
|
||||
|
||||
The chm package provides four modules: chm, chmlib, extra and
|
||||
_chmlib. _chmlib and chmlib are very low level libraries generated
|
||||
from SWIG interface files, and are simple wrappers around the API
|
||||
defined by the C library chmlib.
|
||||
The extra module adds full-text search support.
|
||||
The chm module provides some higher level classes to simplify
|
||||
access to the CHM files information.
|
||||
'''
|
||||
__all__ = ["chm", "chmlib", "_chmlib", "extra"]
|
||||
__version__ = "0.8.4"
|
||||
__revision__ = "$Id: __init__.py,v 1.8 2006/06/18 10:50:43 rubensr Exp $"
|
||||
|
BIN
src/calibre/ebooks/chm/chm/_chmlib.pyd
Normal file
BIN
src/calibre/ebooks/chm/chm/_chmlib.pyd
Normal file
Binary file not shown.
508
src/calibre/ebooks/chm/chm/chm.py
Normal file
508
src/calibre/ebooks/chm/chm/chm.py
Normal file
@ -0,0 +1,508 @@
|
||||
## Copyright (C) 2003-2006 Rubens Ramos <rubensr@users.sourceforge.net>
|
||||
|
||||
## Based on code by:
|
||||
## Copyright (C) 2003 Razvan Cojocaru <razvanco@gmx.net>
|
||||
|
||||
## pychm is free software; you can redistribute it and/or
|
||||
## modify it under the terms of the GNU General Public License as
|
||||
## published by the Free Software Foundation; either version 2 of the
|
||||
## License, or (at your option) any later version.
|
||||
|
||||
## This program is distributed in the hope that it will be useful,
|
||||
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
## General Public License for more details.
|
||||
|
||||
## You should have received a copy of the GNU General Public
|
||||
## License along with this program; see the file COPYING. If not,
|
||||
## write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
## Boston, MA 02111-1307, USA
|
||||
|
||||
## $Id: chm.py,v 1.12 2006/08/07 12:31:51 rubensr Exp $
|
||||
|
||||
'''
|
||||
chm - A high-level front end for the chmlib python module.
|
||||
|
||||
The chm module provides high level access to the functionality
|
||||
included in chmlib. It encapsulates functions in the CHMFile class, and
|
||||
provides some additional features, such as the ability to obtain
|
||||
the contents tree of a CHM archive.
|
||||
|
||||
'''
|
||||
|
||||
import chmlib
|
||||
import extra
|
||||
import array
|
||||
import string
|
||||
import os.path
|
||||
import sys
|
||||
|
||||
# Maps a numeric Windows charset identifier (named in each trailing comment)
# to the Python codec name used to decode text from the archive.
# A value of None means no codec is assigned for that charset here.
# CHMFile.GetEncoding() looks its result up in this table.
charset_table = {
    0   : 'iso8859_1',  # ANSI_CHARSET
    238 : 'iso8859_2',  # EASTEUROPE_CHARSET
    178 : 'iso8859_6',  # ARABIC_CHARSET
    161 : 'iso8859_7',  # GREEK_CHARSET
    177 : 'iso8859_8',  # HEBREW_CHARSET
    162 : 'iso8859_9',  # TURKISH_CHARSET
    222 : 'iso8859_11', # THAI_CHARSET - hmm not in python 2.2...
    186 : 'iso8859_13', # BALTIC_CHARSET
    204 : 'cp1251',     # RUSSIAN_CHARSET
    255 : 'cp437',      # OEM_CHARSET
    128 : 'cp932',      # SHIFTJIS_CHARSET
    134 : 'cp936',      # GB2312_CHARSET
    129 : 'cp949',      # HANGUL_CHARSET
    136 : 'cp950',      # CHINESEBIG5_CHARSET
    1   : None,         # DEFAULT_CHARSET
    2   : None,         # SYMBOL_CHARSET
    130 : None,         # JOHAB_CHARSET
    163 : None,         # VIETNAMESE_CHARSET
    77  : None,         # MAC_CHARSET
}
|
||||
|
||||
# Maps a locale identifier (LCID) to a 3-tuple:
#   (Python codec name or None, language name, script/region group).
# CHMFile.GetLCID() returns the tuple for the archive's LCID, or None when
# the LCID is not listed here.
locale_table = {
    0x0436 : ('iso8859_1', "Afrikaans", "Western Europe & US"),
    0x041c : ('iso8859_2', "Albanian", "Central Europe"),
    0x0401 : ('iso8859_6', "Arabic_Saudi_Arabia", "Arabic"),
    0x0801 : ('iso8859_6', "Arabic_Iraq", "Arabic"),
    0x0c01 : ('iso8859_6', "Arabic_Egypt", "Arabic"),
    0x1001 : ('iso8859_6', "Arabic_Libya", "Arabic"),
    0x1401 : ('iso8859_6', "Arabic_Algeria", "Arabic"),
    0x1801 : ('iso8859_6', "Arabic_Morocco", "Arabic"),
    0x1c01 : ('iso8859_6', "Arabic_Tunisia", "Arabic"),
    0x2001 : ('iso8859_6', "Arabic_Oman", "Arabic"),
    0x2401 : ('iso8859_6', "Arabic_Yemen", "Arabic"),
    0x2801 : ('iso8859_6', "Arabic_Syria", "Arabic"),
    0x2c01 : ('iso8859_6', "Arabic_Jordan", "Arabic"),
    0x3001 : ('iso8859_6', "Arabic_Lebanon", "Arabic"),
    0x3401 : ('iso8859_6', "Arabic_Kuwait", "Arabic"),
    0x3801 : ('iso8859_6', "Arabic_UAE", "Arabic"),
    0x3c01 : ('iso8859_6', "Arabic_Bahrain", "Arabic"),
    0x4001 : ('iso8859_6', "Arabic_Qatar", "Arabic"),
    0x042b : (None, "Armenian","Armenian"),
    0x042c : ('iso8859_9', "Azeri_Latin", "Turkish"),
    0x082c : ('cp1251', "Azeri_Cyrillic", "Cyrillic"),
    0x042d : ('iso8859_1', "Basque", "Western Europe & US"),
    0x0423 : ('cp1251', "Belarusian", "Cyrillic"),
    0x0402 : ('cp1251', "Bulgarian", "Cyrillic"),
    0x0403 : ('iso8859_1', "Catalan", "Western Europe & US"),
    0x0404 : ('cp950', "Chinese_Taiwan", "Traditional Chinese"),
    0x0804 : ('cp936', "Chinese_PRC", "Simplified Chinese"),
    0x0c04 : ('cp950', "Chinese_Hong_Kong", "Traditional Chinese"),
    0x1004 : ('cp936', "Chinese_Singapore", "Simplified Chinese"),
    0x1404 : ('cp950', "Chinese_Macau", "Traditional Chinese"),
    0x041a : ('iso8859_2', "Croatian", "Central Europe"),
    0x0405 : ('iso8859_2', "Czech", "Central Europe"),
    0x0406 : ('iso8859_1', "Danish", "Western Europe & US"),
    0x0413 : ('iso8859_1', "Dutch_Standard", "Western Europe & US"),
    0x0813 : ('iso8859_1', "Dutch_Belgian", "Western Europe & US"),
    0x0409 : ('iso8859_1', "English_United_States", "Western Europe & US"),
    0x0809 : ('iso8859_1', "English_United_Kingdom", "Western Europe & US"),
    0x0c09 : ('iso8859_1', "English_Australian", "Western Europe & US"),
    0x1009 : ('iso8859_1', "English_Canadian", "Western Europe & US"),
    0x1409 : ('iso8859_1', "English_New_Zealand", "Western Europe & US"),
    0x1809 : ('iso8859_1', "English_Irish", "Western Europe & US"),
    0x1c09 : ('iso8859_1', "English_South_Africa", "Western Europe & US"),
    0x2009 : ('iso8859_1', "English_Jamaica", "Western Europe & US"),
    0x2409 : ('iso8859_1', "English_Caribbean", "Western Europe & US"),
    0x2809 : ('iso8859_1', "English_Belize", "Western Europe & US"),
    0x2c09 : ('iso8859_1', "English_Trinidad", "Western Europe & US"),
    0x3009 : ('iso8859_1', "English_Zimbabwe", "Western Europe & US"),
    0x3409 : ('iso8859_1', "English_Philippines", "Western Europe & US"),
    0x0425 : ('iso8859_13',"Estonian", "Baltic",),
    0x0438 : ('iso8859_1', "Faeroese", "Western Europe & US"),
    0x0429 : ('iso8859_6', "Farsi", "Arabic"),
    0x040b : ('iso8859_1', "Finnish", "Western Europe & US"),
    0x040c : ('iso8859_1', "French_Standard", "Western Europe & US"),
    0x080c : ('iso8859_1', "French_Belgian", "Western Europe & US"),
    0x0c0c : ('iso8859_1', "French_Canadian", "Western Europe & US"),
    0x100c : ('iso8859_1', "French_Swiss", "Western Europe & US"),
    0x140c : ('iso8859_1', "French_Luxembourg", "Western Europe & US"),
    0x180c : ('iso8859_1', "French_Monaco", "Western Europe & US"),
    0x0437 : (None, "Georgian", "Georgian"),
    0x0407 : ('iso8859_1', "German_Standard", "Western Europe & US"),
    0x0807 : ('iso8859_1', "German_Swiss", "Western Europe & US"),
    0x0c07 : ('iso8859_1', "German_Austrian", "Western Europe & US"),
    0x1007 : ('iso8859_1', "German_Luxembourg", "Western Europe & US"),
    0x1407 : ('iso8859_1', "German_Liechtenstein", "Western Europe & US"),
    0x0408 : ('iso8859_7', "Greek", "Greek"),
    0x040d : ('iso8859_8', "Hebrew", "Hebrew"),
    0x0439 : (None, "Hindi", "Indic"),
    0x040e : ('iso8859_2', "Hungarian", "Central Europe"),
    0x040f : ('iso8859_1', "Icelandic", "Western Europe & US"),
    0x0421 : ('iso8859_1', "Indonesian", "Western Europe & US"),
    0x0410 : ('iso8859_1', "Italian_Standard", "Western Europe & US"),
    0x0810 : ('iso8859_1', "Italian_Swiss", "Western Europe & US"),
    0x0411 : ('cp932', "Japanese", "Japanese"),
    0x043f : ('cp1251', "Kazakh", "Cyrillic"),
    0x0457 : (None, "Konkani", "Indic"),
    0x0412 : ('cp949', "Korean", "Korean"),
    0x0426 : ('iso8859_13',"Latvian", "Baltic",),
    0x0427 : ('iso8859_13',"Lithuanian", "Baltic",),
    0x042f : ('cp1251', "Macedonian", "Cyrillic"),
    0x043e : ('iso8859_1', "Malay_Malaysia", "Western Europe & US"),
    0x083e : ('iso8859_1', "Malay_Brunei_Darussalam", "Western Europe & US"),
    0x044e : (None, "Marathi", "Indic"),
    0x0414 : ('iso8859_1', "Norwegian_Bokmal", "Western Europe & US"),
    0x0814 : ('iso8859_1', "Norwegian_Nynorsk", "Western Europe & US"),
    0x0415 : ('iso8859_2', "Polish", "Central Europe"),
    0x0416 : ('iso8859_1', "Portuguese_Brazilian", "Western Europe & US"),
    0x0816 : ('iso8859_1', "Portuguese_Standard", "Western Europe & US"),
    0x0418 : ('iso8859_2', "Romanian", "Central Europe"),
    0x0419 : ('cp1251', "Russian", "Cyrillic"),
    0x044f : (None, "Sanskrit", "Indic"),
    0x081a : ('iso8859_2', "Serbian_Latin", "Central Europe"),
    0x0c1a : ('cp1251', "Serbian_Cyrillic", "Cyrillic"),
    0x041b : ('iso8859_2', "Slovak", "Central Europe"),
    0x0424 : ('iso8859_2', "Slovenian", "Central Europe"),
    0x040a : ('iso8859_1', "Spanish_Trad_Sort", "Western Europe & US"),
    0x080a : ('iso8859_1', "Spanish_Mexican", "Western Europe & US"),
    0x0c0a : ('iso8859_1', "Spanish_Modern_Sort", "Western Europe & US"),
    0x100a : ('iso8859_1', "Spanish_Guatemala", "Western Europe & US"),
    0x140a : ('iso8859_1', "Spanish_Costa_Rica", "Western Europe & US"),
    0x180a : ('iso8859_1', "Spanish_Panama", "Western Europe & US"),
    0x1c0a : ('iso8859_1', "Spanish_Dominican_Repub", "Western Europe & US"),
    0x200a : ('iso8859_1', "Spanish_Venezuela", "Western Europe & US"),
    0x240a : ('iso8859_1', "Spanish_Colombia", "Western Europe & US"),
    0x280a : ('iso8859_1', "Spanish_Peru", "Western Europe & US"),
    0x2c0a : ('iso8859_1', "Spanish_Argentina", "Western Europe & US"),
    0x300a : ('iso8859_1', "Spanish_Ecuador", "Western Europe & US"),
    0x340a : ('iso8859_1', "Spanish_Chile", "Western Europe & US"),
    0x380a : ('iso8859_1', "Spanish_Uruguay", "Western Europe & US"),
    0x3c0a : ('iso8859_1', "Spanish_Paraguay", "Western Europe & US"),
    0x400a : ('iso8859_1', "Spanish_Bolivia", "Western Europe & US"),
    0x440a : ('iso8859_1', "Spanish_El_Salvador", "Western Europe & US"),
    0x480a : ('iso8859_1', "Spanish_Honduras", "Western Europe & US"),
    0x4c0a : ('iso8859_1', "Spanish_Nicaragua", "Western Europe & US"),
    0x500a : ('iso8859_1', "Spanish_Puerto_Rico", "Western Europe & US"),
    0x0441 : ('iso8859_1', "Swahili", "Western Europe & US"),
    0x041d : ('iso8859_1', "Swedish", "Western Europe & US"),
    0x081d : ('iso8859_1', "Swedish_Finland", "Western Europe & US"),
    0x0449 : (None, "Tamil", "Indic"),
    0x0444 : ('cp1251', "Tatar", "Cyrillic"),
    0x041e : ('iso8859_11',"Thai", "Thai"),
    0x041f : ('iso8859_9', "Turkish", "Turkish"),
    0x0422 : ('cp1251', "Ukrainian", "Cyrillic"),
    0x0420 : ('iso8859_6', "Urdu", "Arabic"),
    0x0443 : ('iso8859_9', "Uzbek_Latin", "Turkish"),
    0x0843 : ('cp1251', "Uzbek_Cyrillic", "Cyrillic"),
    0x042a : (None, "Vietnamese", "Vietnamese")
}
|
||||
|
||||
class CHMFile:
    """Manage access to a single CHM archive through chmlib.

    Typical use: create an instance, call LoadCHM(path), read the
    attributes or call the Get*/Resolve*/Retrieve* methods, then call
    CloseCHM().
    """
    # Path of the loaded archive; '' when nothing is loaded.
    filename = ""
    # Handle returned by chmlib.chm_open, or None when nothing is loaded.
    file = None
    # Values read from /#SYSTEM (and /#WINDOWS) by GetArchiveInfo.
    title = ""
    home = "/"
    index = None
    topics = None
    encoding = None
    lcid = None
    # Not assigned anywhere in this class.
    binaryindex = None

    def __init__(self):
        """Create an object with no archive loaded."""
        self.searchable = 0

    def LoadCHM(self, archiveName):
        '''Loads a CHM archive.

        Any previously loaded archive is closed first. On success
        GetArchiveInfo is called to obtain information such as the index
        file name and the topics file.

        Returns 1 on success, and 0 if the archive cannot be opened.
        '''
        # Always reset state; CloseCHM only touches chmlib if a file is open.
        self.CloseCHM()

        self.file = chmlib.chm_open(archiveName)
        if self.file is None:
            return 0

        self.filename = archiveName
        self.GetArchiveInfo()
        return 1

    def CloseCHM(self):
        '''Closes the CHM archive.

        Closes the CHM file if one is open, and resets the archive
        attributes to their defaults. Safe to call when nothing is loaded.
        '''
        # The original guard tested self.filename against None, which is
        # never true (filename is '' when unset), so chm_close could be
        # handed a None handle. Test the handle itself instead.
        if self.file is not None:
            chmlib.chm_close(self.file)
        self.file = None
        self.filename = ''
        self.title = ""
        self.home = "/"
        self.index = None
        self.topics = None
        self.encoding = None
        self.lcid = None

    def GetArchiveInfo(self):
        '''Obtains information on CHM archive.

        Reads the /#SYSTEM file inside the CHM archive to obtain the
        index, home page, topics, encoding, title and locale id, then
        lets GetWindowsInfo fill in what is still missing. It is called
        from LoadCHM.

        Returns 1 on success, 0 if /#SYSTEM is missing or empty.
        '''
        # Full-text search is reported as unavailable: the original code
        # notes that extra.is_searchable crashed.
        self.searchable = False
        self.lcid = None

        result, ui = chmlib.chm_resolve_object(self.file, '/#SYSTEM')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            sys.stderr.write('GetArchiveInfo: #SYSTEM does not exist\n')
            return 0

        # Skip the first 4 bytes of the file. Offsets are passed as long,
        # as everywhere else in this class.
        size, text = chmlib.chm_retrieve_object(self.file, ui, long(4),
                                                ui.length)
        if size == 0:
            sys.stderr.write('GetArchiveInfo: file size = 0\n')
            return 0

        buff = array.array('B', text)
        limit = min(size, len(buff))

        # Each record is: 2-byte little-endian code, 2-byte little-endian
        # length, then `length` bytes of data. Stop at a truncated header
        # instead of indexing past the end of the buffer.
        index = 0
        while index + 4 <= limit:
            code = buff[index] + (buff[index + 1] * 256)
            length = buff[index + 2] + (buff[index + 3] * 256)
            index += 4

            if code == 4:
                if index + 2 <= len(buff):
                    self.lcid = buff[index] + (buff[index + 1] * 256)
            elif code in (0, 1, 2, 3, 6, 16):
                # String payload; the final byte is dropped (terminator).
                value = text[index:index + length - 1]
                if code == 0:
                    self.topics = '/' + value
                elif code == 1:
                    self.index = '/' + value
                elif code == 2:
                    self.home = '/' + value
                elif code == 3:
                    self.title = value
                elif code == 16:
                    self.encoding = value
                elif not self.topics:
                    # code 6: a base name; probe for <name>.hhc / <name>.hhk.
                    hhc = '/' + value + '.hhc'
                    hhk = '/' + value + '.hhk'
                    res1, ui1 = chmlib.chm_resolve_object(self.file, hhc)
                    res2, ui2 = chmlib.chm_resolve_object(self.file, hhk)
                    if res1 == chmlib.CHM_RESOLVE_SUCCESS:
                        self.topics = hhc
                    if (not self.index) and \
                            (res2 == chmlib.CHM_RESOLVE_SUCCESS):
                        self.index = hhk
            # Unknown codes are skipped.
            index += length

        self.GetWindowsInfo()

        if not self.lcid:
            self.lcid = extra.get_lcid(self.file)

        return 1

    def _read_object_text(self, path, caller):
        '''Internal method.

        Returns the full contents of the object at `path`, or None if
        `path` is empty, cannot be resolved, or has zero size. `caller`
        is the method name used in the diagnostic written to stderr.
        '''
        if not path:
            return None

        res, ui = chmlib.chm_resolve_object(self.file, path)
        if res != chmlib.CHM_RESOLVE_SUCCESS:
            return None

        size, text = chmlib.chm_retrieve_object(self.file, ui, long(0),
                                                ui.length)
        if size == 0:
            sys.stderr.write('%s: file size = 0\n' % caller)
            return None
        return text

    def GetTopicsTree(self):
        '''Reads and returns the topics tree.

        Returns the contents of the topics tree file for the CHM
        archive, or None if there is no topics file or it cannot be read.
        '''
        return self._read_object_text(self.topics, 'GetTopicsTree')

    def GetIndex(self):
        '''Reads and returns the index tree.

        Returns the contents of the index tree file for the CHM archive,
        or None if there is no index file or it cannot be read.
        '''
        return self._read_object_text(self.index, 'GetIndex')

    def ResolveObject(self, document):
        '''Tries to locate a document in the archive.

        Returns a tuple where the first element is zero if the function
        was successful, and the second is the UnitInfo for that document.
        The UnitInfo is used to retrieve the document contents. When no
        archive is loaded the result is (1, None).
        '''
        if not self.file:
            return (1, None)
        return chmlib.chm_resolve_object(self.file, document)

    def RetrieveObject(self, ui, start = -1, length = -1):
        '''Retrieves the contents of a document.

        Takes a UnitInfo and two optional arguments: the start offset
        (default: 0) and the number of bytes to read (default:
        ui.length). Returns what chmlib.chm_retrieve_object returns, or
        (0, '') when no archive is loaded or `ui` is false.
        '''
        if not (self.file and ui):
            return (0, '')

        nbytes = ui.length if length == -1 else length
        offset = long(0) if start == -1 else long(start)
        return chmlib.chm_retrieve_object(self.file, ui, offset, nbytes)

    def Search(self, text, wholewords=0, titleonly=0):
        '''Performs full-text search on the archive.

        The first parameter is the word to look for, the second
        indicates if the search should be for whole words only, and
        the third parameter indicates if the search should be
        restricted to page titles.
        This method will return a tuple, the first item
        indicating if the search results were partial, and the second
        item being a dictionary containing the results. None is returned
        when `text` is empty or no archive is loaded.'''
        if not text or not self.file:
            return None
        return extra.search(self.file, text, wholewords, titleonly)

    def IsSearchable(self):
        '''Indicates if the full-text search is available for this
        archive - this flag is updated when GetArchiveInfo is called'''
        return self.searchable

    def GetEncoding(self):
        '''Returns a string that can be used with the codecs python package
        to encode or decode the files in the chm archive. If an error is
        found, or if it is not possible to find the encoding, None is
        returned.'''
        if not self.encoding:
            return None

        vals = self.encoding.split(',')
        if len(vals) <= 2:
            return None

        # The third comma-separated field is the numeric charset id.
        try:
            charset = int(vals[2])
        except ValueError:
            return None
        return charset_table.get(charset)

    def GetLCID(self):
        '''Returns the locale_table entry for the archive Locale ID, or
        None if the LCID is not in the table.'''
        return locale_table.get(self.lcid)

    def GetDWORD(self, buff, idx=0):
        '''Internal method.

        Reads a little-endian double word (4 bytes) from a buffer,
        starting at `idx`. The value 0xFFFFFFFF is reported as 0.
        '''
        result = buff[idx] + (buff[idx+1]<<8) + (buff[idx+2]<<16) + \
                 (buff[idx+3]<<24)

        if result == 0xFFFFFFFF:
            result = 0

        return result

    def GetString(self, text, idx):
        '''Internal method.

        Retrieves the NUL-terminated string starting at `idx` in the
        #STRINGS buffer. If no terminator follows, the rest of the
        buffer is returned.
        '''
        end = text.find('\x00', idx)
        if end == -1:
            # Without this check the slice below would silently drop the
            # last character (text[idx:-1]).
            return text[idx:]
        return text[idx:end]

    def GetWindowsInfo(self):
        '''Gets information from the #WINDOWS file.

        Checks the #WINDOWS file to see if it has any info that was
        not found in #SYSTEM (topics, index or default page).

        Returns a negative number identifying the step that failed;
        falls off the end (None) on success.
        '''
        result, ui = chmlib.chm_resolve_object(self.file, '/#WINDOWS')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            return -1

        # Header: number of entries, then the size of one entry.
        size, text = chmlib.chm_retrieve_object(self.file, ui, long(0), 8)
        if size < 8:
            return -2

        buff = array.array('B', text)
        num_entries = self.GetDWORD(buff, 0)
        entry_size = self.GetDWORD(buff, 4)

        if num_entries < 1:
            return -3

        # Only the first entry is examined.
        size, text = chmlib.chm_retrieve_object(self.file, ui, long(8),
                                                entry_size)
        if size < entry_size:
            return -4

        buff = array.array('B', text)
        if len(buff) < 0x6C:
            # Entry too short to hold the three offsets read below.
            return -4
        toc_index = self.GetDWORD(buff, 0x60)
        idx_index = self.GetDWORD(buff, 0x64)
        dft_index = self.GetDWORD(buff, 0x68)

        result, ui = chmlib.chm_resolve_object(self.file, '/#STRINGS')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            return -5

        size, text = chmlib.chm_retrieve_object(self.file, ui, long(0),
                                                ui.length)
        if size == 0:
            return -6

        # The offsets above index into the #STRINGS buffer.
        if not self.topics:
            self.topics = self.GetString(text, toc_index)
            if not self.topics.startswith("/"):
                self.topics = "/" + self.topics

        if not self.index:
            self.index = self.GetString(text, idx_index)
            if not self.index.startswith("/"):
                self.index = "/" + self.index

        if dft_index != 0:
            self.home = self.GetString(text, dft_index)
            if not self.home.startswith("/"):
                self.home = "/" + self.home
|
93
src/calibre/ebooks/chm/chm/chmlib.py
Normal file
93
src/calibre/ebooks/chm/chm/chmlib.py
Normal file
@ -0,0 +1,93 @@
|
||||
# This file was created automatically by SWIG.
|
||||
# Don't modify this file, modify the SWIG interface instead.
|
||||
# This file is compatible with both classic and new-style classes.
|
||||
import _chmlib
|
||||
# SWIG-generated attribute setter shared by the wrapper classes.
#  - Assigning an instance of class_type to "this" copies that instance's
#    `this` into self and, if present, moves its `thisown` flag to self.
#  - Otherwise a setter registered in class_type.__swig_setmethods__ is
#    called if there is one.
#  - With no registered setter, the value is stored directly in
#    self.__dict__.
def _swig_setattr(self,class_type,name,value):
    if (name == "this"):
        if isinstance(value, class_type):
            self.__dict__[name] = value.this
            if hasattr(value,"thisown"): self.__dict__["thisown"] = value.thisown
            del value.thisown
            return
    method = class_type.__swig_setmethods__.get(name,None)
    if method: return method(self,value)
    self.__dict__[name] = value
|
||||
|
||||
# SWIG-generated attribute getter shared by the wrapper classes.
# Calls the getter registered for `name` in class_type.__swig_getmethods__;
# raises AttributeError(name) when none is registered.
def _swig_getattr(self,class_type,name):
    method = class_type.__swig_getmethods__.get(name,None)
    if method: return method(self)
    raise AttributeError,name
|
||||
|
||||
import types
# Pick the base class for the wrapper classes: types.ObjectType when the
# interpreter provides it (_newclass = 1), otherwise an empty classic class
# (_newclass = 0). _newclass gates the property() definitions in chmUnitInfo.
try:
    _object = types.ObjectType
    _newclass = 1
except AttributeError:
    class _object : pass
    _newclass = 0


# Constants re-exported from the C extension module.
CHM_UNCOMPRESSED = _chmlib.CHM_UNCOMPRESSED
CHM_COMPRESSED = _chmlib.CHM_COMPRESSED
CHM_MAX_PATHLEN = _chmlib.CHM_MAX_PATHLEN
|
||||
# SWIG-generated wrapper around the C chmUnitInfo structure.
# The fields start, length, space and path are read and written through the
# _chmlib accessor functions registered in __swig_getmethods__ /
# __swig_setmethods__ (and exposed as properties when _newclass is set).
class chmUnitInfo(_object):
    __swig_setmethods__ = {}
    __setattr__ = lambda self, name, value: _swig_setattr(self, chmUnitInfo, name, value)
    __swig_getmethods__ = {}
    __getattr__ = lambda self, name: _swig_getattr(self, chmUnitInfo, name)
    __swig_setmethods__["start"] = _chmlib.chmUnitInfo_start_set
    __swig_getmethods__["start"] = _chmlib.chmUnitInfo_start_get
    if _newclass:start = property(_chmlib.chmUnitInfo_start_get,_chmlib.chmUnitInfo_start_set)
    __swig_setmethods__["length"] = _chmlib.chmUnitInfo_length_set
    __swig_getmethods__["length"] = _chmlib.chmUnitInfo_length_get
    if _newclass:length = property(_chmlib.chmUnitInfo_length_get,_chmlib.chmUnitInfo_length_set)
    __swig_setmethods__["space"] = _chmlib.chmUnitInfo_space_set
    __swig_getmethods__["space"] = _chmlib.chmUnitInfo_space_get
    if _newclass:space = property(_chmlib.chmUnitInfo_space_get,_chmlib.chmUnitInfo_space_set)
    __swig_setmethods__["path"] = _chmlib.chmUnitInfo_path_set
    __swig_getmethods__["path"] = _chmlib.chmUnitInfo_path_get
    if _newclass:path = property(_chmlib.chmUnitInfo_path_get,_chmlib.chmUnitInfo_path_set)
    # Creates a new underlying object via _chmlib.new_chmUnitInfo and marks
    # this wrapper as its owner (thisown = 1).
    def __init__(self,*args):
        _swig_setattr(self, chmUnitInfo, 'this', apply(_chmlib.new_chmUnitInfo,args))
        _swig_setattr(self, chmUnitInfo, 'thisown', 1)
    # Destroys the underlying object only when this wrapper owns it; any
    # error raised while doing so is swallowed. `destroy` is bound as a
    # default argument at class-definition time.
    def __del__(self, destroy= _chmlib.delete_chmUnitInfo):
        try:
            if self.thisown: destroy(self)
        except: pass
    def __repr__(self):
        return "<C chmUnitInfo instance at %s>" % (self.this,)
|
||||
|
||||
# SWIG-generated pointer wrapper: wraps an existing `this` handle without
# taking ownership (thisown defaults to 0 when not already set), then
# assigns chmUnitInfo under the attribute key self.__class__.
# NOTE(review): the last call presumably is meant to make the instance
# behave as a plain chmUnitInfo -- confirm against the SWIG version used.
class chmUnitInfoPtr(chmUnitInfo):
    def __init__(self,this):
        _swig_setattr(self, chmUnitInfo, 'this', this)
        if not hasattr(self,"thisown"): _swig_setattr(self, chmUnitInfo, 'thisown', 0)
        _swig_setattr(self, chmUnitInfo,self.__class__,chmUnitInfo)
|
||||
# Register the pointer wrapper class with the C extension.
_chmlib.chmUnitInfo_swigregister(chmUnitInfoPtr)

# Everything below re-exports functions and constants from the C extension
# module under the same names.

# Opening and closing archives.
chm_open = _chmlib.chm_open

chm_close = _chmlib.chm_close

CHM_PARAM_MAX_BLOCKS_CACHED = _chmlib.CHM_PARAM_MAX_BLOCKS_CACHED
chm_set_param = _chmlib.chm_set_param

# Locating and reading objects inside an archive.
CHM_RESOLVE_SUCCESS = _chmlib.CHM_RESOLVE_SUCCESS
CHM_RESOLVE_FAILURE = _chmlib.CHM_RESOLVE_FAILURE
chm_resolve_object = _chmlib.chm_resolve_object

chm_retrieve_object = _chmlib.chm_retrieve_object

# Enumeration flags, enumerator return codes and enumeration functions.
CHM_ENUMERATE_NORMAL = _chmlib.CHM_ENUMERATE_NORMAL
CHM_ENUMERATE_META = _chmlib.CHM_ENUMERATE_META
CHM_ENUMERATE_SPECIAL = _chmlib.CHM_ENUMERATE_SPECIAL
CHM_ENUMERATE_FILES = _chmlib.CHM_ENUMERATE_FILES
CHM_ENUMERATE_DIRS = _chmlib.CHM_ENUMERATE_DIRS
CHM_ENUMERATE_ALL = _chmlib.CHM_ENUMERATE_ALL
CHM_ENUMERATOR_FAILURE = _chmlib.CHM_ENUMERATOR_FAILURE
CHM_ENUMERATOR_CONTINUE = _chmlib.CHM_ENUMERATOR_CONTINUE
CHM_ENUMERATOR_SUCCESS = _chmlib.CHM_ENUMERATOR_SUCCESS
chm_enumerate = _chmlib.chm_enumerate

chm_enumerate_dir = _chmlib.chm_enumerate_dir
|
||||
|
||||
|
BIN
src/calibre/ebooks/chm/chm/extra.pyd
Normal file
BIN
src/calibre/ebooks/chm/chm/extra.pyd
Normal file
Binary file not shown.
348
src/calibre/ebooks/chm/input.py
Normal file
348
src/calibre/ebooks/chm/input.py
Normal file
@ -0,0 +1,348 @@
|
||||
from __future__ import with_statement
|
||||
''' CHM File decoding support '''
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
|
||||
' and Alex Bramley <a.bramley at gmail.com>.'
|
||||
|
||||
import sys, logging, os, re, shutil, subprocess, uuid
|
||||
from shutil import rmtree
|
||||
from tempfile import mkdtemp
|
||||
from mimetypes import guess_type as guess_mimetype
|
||||
from htmlentitydefs import name2codepoint
|
||||
from pprint import PrettyPrinter
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
from lxml import html, etree
|
||||
from calibre.ebooks.chm.chm.chm import CHMFile
|
||||
from calibre.ebooks.chm.chm.chmlib import (
|
||||
CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL,
|
||||
chm_enumerate, chm_retrieve_object,
|
||||
)
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre.utils.config import OptionParser
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator, Guide
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
|
||||
from calibre.utils.localization import get_lang
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
|
||||
|
||||
def match_string(s1, s2_already_lowered):
    """Case-insensitively compare s1 with an already lower-cased string.

    Returns True only when both arguments are not None and s1.lower()
    equals s2_already_lowered; returns False otherwise.
    """
    if s1 is None or s2_already_lowered is None:
        return False
    if s1.lower() != s2_already_lowered:
        return False
    return True
|
||||
|
||||
def option_parser():
    """Build and return the command-line option parser.

    The parser accepts the output directory, a verbosity flag and a set
    of metadata options (title, author, category, thumbnail, ...).
    Options are registered in a fixed order.
    """
    parser = OptionParser(usage=_('%prog [options] mybook.chm'))
    add = parser.add_option

    # Output location and verbosity.
    add('--output-dir', '-d', dest='output', default='.',
        help=_('Output directory. Defaults to current directory'))
    add('--verbose', dest='verbose', action='store_true', default=False)

    # Title and author, each with an optional sort key.
    add("-t", "--title", dest="title", action="store", type="string",
        help=_("Set the book title"))
    add('--title-sort', dest='title_sort', action='store', type='string',
        default=None, help=_('Set sort key for the title'))
    add("-a", "--author", dest="author", action="store", type="string",
        help=_("Set the author"))
    add('--author-sort', dest='author_sort', action='store', type='string',
        default=None, help=_('Set sort key for the author'))

    # Remaining metadata.
    add("-c", "--category", dest="category", action="store", type="string",
        help=_("The category this book belongs"
               " to. E.g.: History"))
    add("--thumbnail", dest="thumbnail", action="store", type="string",
        help=_("Path to a graphic that will be"
               " set as this files' thumbnail"))
    add("--comment", dest="freetext", action="store", type="string",
        help=_("Path to a txt file containing a comment."))
    add("--get-thumbnail", dest="get_thumbnail", action="store_true",
        default=False, help=_("Extract thumbnail from LRF file"))
    add('--publisher', default=None, help=_('Set the publisher'))
    add('--classification', default=None,
        help=_('Set the book classification'))
    add('--creator', default=None, help=_('Set the book creator'))
    add('--producer', default=None, help=_('Set the book producer'))
    add('--get-cover', action='store_true', default=False,
        help=_('Extract cover from LRF file. Note that the LRF format has no defined cover, so we use some heuristics to guess the cover.'))
    add('--bookid', dest='book_id', action='store', type='string',
        default=None, help=_('Set book ID'))
    add('--font-delta', dest='font_delta', action='store', type='int',
        default=0, help=_('Set font delta'))
    return parser
|
||||
|
||||
class CHMError(Exception):
    '''Error raised when a CHM file, or an entry inside it, cannot be read.'''
|
||||
|
||||
class CHMReader(CHMFile):
    '''
    Open a CHM archive and extract the files it contains to disk.

    Extends CHMFile with helpers that list the files in the archive, read a
    single file, tidy HTML pages and write everything below an output
    directory.
    '''

    def __init__(self, input, log):
        '''
        Load the CHM file at `input`.

        :param input: path of the CHM file; it is handed to LoadCHM().
        :param log: logger object, stored as ``self.log``.
        :raises CHMError: if LoadCHM() reports failure.
        '''
        CHMFile.__init__(self)
        if not self.LoadCHM(input):
            raise CHMError("Unable to open CHM file '%s'"%(input,))
        self.log = log
        self._sourcechm = input
        self._contents = None    # cache filled by Contents()
        self._playorder = 0      # running counter used by _parse_toc()
        self._metadata = False
        self._extracted = False  # set to True once ExtractFiles() completes

        # Location of the '.hhc' file, which is the CHM TOC. It is derived
        # from self.topics by swapping the extension for '.hhc'.
        self.root, ext = os.path.splitext(self.topics.lstrip('/'))
        self.hhc_path = self.root + ".hhc"

    def _parse_toc(self, ul, basedir=None):
        '''
        Recursively build a TOC object from a ``<ul>`` element.

        :param ul: list element whose direct ``<li>`` children are read
            (presumably a BeautifulSoup tag -- TODO confirm against caller).
        :param basedir: base path given to every TOC node created; defaults
            to the current working directory at call time.
        :return: the TOC built for `ul`.
        '''
        if basedir is None:
            # Resolved per call: a default of os.getcwdu() in the signature
            # would be evaluated only once, when the module is imported.
            basedir = os.getcwdu()
        toc = TOC(play_order=self._playorder, base_path=basedir, text='')
        self._playorder += 1
        for li in ul('li', recursive=False):
            href = li.object('param', {'name': 'Local'})[0]['value']
            # Split on the first '#' only, so a fragment that itself
            # contains '#' no longer raises ValueError.
            href, sep, frag = href.partition('#')
            if not sep:
                frag = None
            # NOTE(review): _deentity is not defined in this class; it is
            # presumably provided by CHMFile -- confirm.
            name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
            toc.add_item(href, frag, name, play_order=self._playorder)
            self._playorder += 1
            if li.ul:
                # Hand basedir down so nested levels use the same base path.
                child = self._parse_toc(li.ul, basedir)
                child.parent = toc
                toc.append(child)
        return toc

    def GetFile(self, path):
        '''
        Return the data of the entry `path` inside the CHM file.

        :param path: path inside the CHM structure, with or without the
            leading '/'.
        :raises CHMError: if the entry cannot be resolved or is zero bytes
            long.
        '''
        # ResolveObject needs absolute paths, but Contents() deliberately
        # makes them relative, so re-add the leading '/' when it is missing.
        # Note that this path refers to the internal CHM structure.
        if not path.startswith('/'):
            path = '/' + path
        res, ui = self.ResolveObject(path)
        if res != CHM_RESOLVE_SUCCESS:
            raise CHMError("Unable to locate '%s' within CHM file '%s'"%(path, self.filename))
        size, data = self.RetrieveObject(ui)
        if size == 0:
            raise CHMError("'%s' is zero bytes in length!"%(path,))
        return data

    def ExtractFiles(self, output_dir=None):
        '''
        Write every file listed by Contents() below `output_dir`.

        Files whose guessed mimetype is text/html are passed through
        _reformat() before being written.

        :param output_dir: destination directory; defaults to the current
            working directory at call time.
        :raises CHMError: propagated from GetFile().
        '''
        if output_dir is None:
            output_dir = os.getcwdu()
        # Trailing separator so that '/out' does not match '/out-other/...'.
        root = os.path.join(os.path.abspath(output_dir), '')
        for path in self.Contents():
            lpath = os.path.join(output_dir, path)
            # The paths come from the archive itself; refuse entries such as
            # '../x' that would end up outside of the output directory.
            if not os.path.abspath(lpath).startswith(root):
                self.log.warn("Skipping '%s': it would be written outside of '%s'"
                        % (path, output_dir))
                continue
            self._ensure_dir(lpath)
            data = self.GetFile(path)
            if guess_mimetype(path)[0] == 'text/html':
                data = self._reformat(data)
            # Open the target only once the data is ready, so a failure
            # above does not leave an empty file behind.
            with open(lpath, 'wb') as f:
                f.write(data)
        self._extracted = True

    def _reformat(self, data):
        '''
        Clean up one HTML page and return the prettified markup.

        Removes <script> elements, removes the first/last <table> when it
        sits at the very start/end of its parent, and repairs <img> src
        attributes. If parsing raises UnicodeEncodeError, `data` is returned
        unchanged.
        '''
        try:
            html = BeautifulSoup(data)
        except UnicodeEncodeError:
            # hit some strange encoding problems...
            self.log.warn("Unable to parse html for cleaning, leaving it :(")
            return data
        # Drop javascript.
        for script in html('script'):
            script.extract()
        # Remove forward and back nav bars from the top/bottom of each page:
        # they break the flow of the text and waste space. A table counts as
        # a nav bar when it has at most one sibling before it (first table)
        # or after it (last table).
        tables = html('table')
        if tables:
            first, last = tables[0], tables[-1]
            if (first.previousSibling is None
                    or first.previousSibling.previousSibling is None):
                first.extract()
            if (last.nextSibling is None
                    or last.nextSibling.nextSibling is None):
                last.extract()
        # For some very odd reason each page's content appears to be in a
        # table too, and this table has sub-tables for random asides.

        # Some images seem to be broken in some CHMs.
        for img in html('img'):
            try:
                src = img['src']
            except KeyError:
                # Some <img> tags do not even have a src attribute.
                continue
            # Some are supposedly "relative", but the leading '../' is wrong.
            while src.startswith('../'):
                src = src[3:]
            # Some have ";<junk>" at the end.
            img['src'] = src.split(';')[0]
        # Now give back some pretty html.
        return html.prettify()

    def Contents(self):
        '''
        Return the list of file paths in the CHM file.

        Directory entries (paths ending in '/') are skipped and the leading
        '/' is stripped. The list is computed once and cached.
        '''
        if self._contents is not None:
            return self._contents
        paths = []
        def get_paths(chm, ui, ctx):
            # Skip directories and make paths relative.
            # Note that this path refers to the internal CHM structure.
            if not ui.path.endswith('/'):
                paths.append(ui.path.lstrip('/'))
        chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None)
        self._contents = paths
        return self._contents

    def _ensure_dir(self, path):
        '''Create the directory that will contain `path` if it is missing.'''
        dirname = os.path.dirname(path)
        # An empty dirname means "current directory": nothing to create, and
        # os.makedirs('') would raise.
        if dirname and not os.path.isdir(dirname):
            os.makedirs(dirname)

    def extract_content(self, output_dir=None):
        '''
        Extract the whole CHM file into `output_dir`.

        :param output_dir: destination directory; defaults to the current
            working directory at call time.
        '''
        self.ExtractFiles(output_dir=output_dir)
|
||||
|
||||
|
||||
class CHMInput(InputFormatPlugin):
    '''
    Input plugin that converts CHM files to OEB.

    The CHM file is extracted into a temporary directory with CHMReader and
    an OEB book is then built from the entries of its '.hhc' table of
    contents.
    '''

    name        = 'CHM Input'
    author      = 'Kovid Goyal and Alex Bramley'
    description = 'Convert CHM files to OEB'
    file_types  = set(['chm'])

    options = set([
        OptionRecommendation(name='dummy_option', recommended_value=False,
            help=_('dummy option until real options are determined.')),
    ])

    def _chmtohtml(self, output_dir, chm_path, no_images, log):
        '''
        Extract the CHM file at `chm_path` into `output_dir`.

        :param no_images: currently unused.
        :return: the reader's ``hhc_path`` (path of the '.hhc' TOC file).
        '''
        log.debug('Opening CHM file')
        rdr = CHMReader(chm_path, log)
        log.debug('Extracting CHM to %s' % output_dir)
        rdr.extract_content(output_dir)
        return rdr.hhc_path

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Convert the CHM file behind `stream` and return the OEB book.

        :param stream: open file object; only its ``name`` is used and the
            stream is closed before the CHM file is opened.
        :param options: conversion options. They are overwritten with the
            recommended values of the HTML input plugin, and
            ``input_encoding`` is forced to 'utf-8'.
        :param log: logger used for debug output.
        :return: the object produced by _create_oebbook().
        '''
        from calibre.ebooks.metadata.chm import get_metadata_
        from calibre.customize.ui import plugin_for_input_format

        log.debug('Processing CHM...')
        tdir = mkdtemp(prefix='chm2oeb_')
        try:
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            no_images = False #options.no_images
            chm_name = stream.name

            # Close the stream so the CHM can be opened by the external
            # library.
            stream.close()
            log.debug('tdir=%s' % tdir)
            log.debug('stream.name=%s' % stream.name)
            mainname = self._chmtohtml(tdir, chm_name, no_images, log)
            mainpath = os.path.join(tdir, mainname)

            metadata = get_metadata_(tdir)

            # debug_pipeline is switched off while the book is built and is
            # restored even when building fails.
            odi = options.debug_pipeline
            options.debug_pipeline = None
            try:
                oeb = self._create_oebbook(mainpath, tdir, options, log, metadata)
            finally:
                options.debug_pipeline = odi
        finally:
            # Always remove the temporary directory, including on errors.
            # NOTE(review): the returned book's container still points into
            # tdir; presumably everything needed has been read by now --
            # confirm.
            shutil.rmtree(tdir)
        return oeb

    def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
        '''
        Build an OEB book from the TOC file of an extracted CHM.

        :param hhcpath: path of the '.hhc' file on disk.
        :param basedir: directory the CHM file was extracted into.
        :param opts: conversion options; ``input_encoding`` is read.
        :param log: logger used for debug output.
        :param mi: metadata object; ``title``, ``authors``, ``publisher``
            and ``isbn`` are read.
        :return: the populated OEB book (also stored as ``self.oeb``).
        '''
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import DirContainer

        oeb = create_oebbook(log, None, opts, self,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        # Copy over the metadata that is available and fill in defaults for
        # the mandatory fields that are still missing.
        metadata = oeb.metadata
        if mi.title:
            metadata.add('title', mi.title)
        if mi.authors:
            for a in mi.authors:
                metadata.add('creator', a, attrib={'role':'aut'})
        if mi.publisher:
            metadata.add('publisher', mi.publisher)
        if mi.isbn:
            metadata.add('identifier', mi.isbn, attrib={'scheme':'ISBN'})
        if not metadata.language:
            oeb.logger.warn(u'Language not specified')
            metadata.add('language', get_lang())
        if not metadata.creator:
            oeb.logger.warn('Creator not specified')
            metadata.add('creator', self.oeb.translate(__('Unknown')))
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))

        bookid = str(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                # NOTE(review): this picks the first identifier rather than
                # `ident`, the one that carries the id -- confirm intent.
                self.oeb.uid = metadata.identifier[0]
                break

        hhcdata = self._read_file(hhcpath)
        hhcroot = html.fromstring(hhcdata)
        chapters = self._process_nodes(hhcroot)
        log.debug('Found %d section nodes' % len(chapters))

        # Without chapters there is no sub-directory to derive, so fall back
        # to the extraction directory instead of failing on an unbound name.
        htmlpath = basedir
        if chapters:
            # Every chapter is looked up by basename in the directory of the
            # first chapter.
            htmlpath = os.path.join(basedir, os.path.dirname(chapters[0][1]))
            oeb.container = DirContainer(htmlpath, log)
            for title, path in chapters:
                self._add_item(oeb, title, os.path.basename(path))
        else:
            oeb.logger.warn('No chapters found in the CHM table of contents')

        oeb.container = DirContainer(htmlpath, oeb.log)
        return oeb

    def _read_file(self, name):
        '''Return the raw bytes of the file `name`.'''
        with open(name, 'rb') as f:
            return f.read()

    def _visit_node(self, node, chapters):
        '''
        Append ``[title, path]`` to `chapters` if `node` is a TOC entry.

        A TOC entry is an ``object`` element of type 'text/sitemap'; its
        title and path come from the ``value`` of its ``param`` children
        named 'name' and 'local'. Entries lacking either one are ignored.
        '''
        # Check that node is a normal node (not a comment, DOCTYPE, etc.);
        # normal nodes have string tags.
        if not isinstance(node.tag, basestring):
            return
        if not (match_string(node.tag, 'object')
                and match_string(node.attrib.get('type', ''), 'text/sitemap')):
            return

        # Start from None so that an entry with a missing param is skipped
        # instead of raising on an unbound local.
        chapter_title = None
        chapter_path = None
        for child in node:
            if not match_string(child.tag, 'param'):
                continue
            param_name = child.attrib.get('name', '')
            if match_string(param_name, 'name'):
                chapter_title = child.attrib.get('value')
            if match_string(param_name, 'local'):
                chapter_path = child.attrib.get('value')
        if chapter_title is not None and chapter_path is not None:
            chapters.append([chapter_title, chapter_path])

    def _process_nodes(self, root):
        '''
        Return the ``[title, path]`` entries found anywhere below `root`,
        in the order ``root.iter()`` yields them.
        '''
        chapters = []
        for node in root.iter():
            self._visit_node(node, chapters)
        return chapters

    def _add_item(self, oeb, title, path):
        '''
        Register the HTML file `path` in the manifest, spine and TOC of
        `oeb`, using `title` as the TOC text. Only the basename of `path`
        is used.
        '''
        bname = os.path.basename(path)
        item_id, href = oeb.manifest.generate(id='html',
                href=ascii_filename(bname))
        item = oeb.manifest.add(item_id, href, 'text/html')
        item.html_input_href = bname
        oeb.spine.add(item, True)
        oeb.toc.add(title, item.href)
|
||||
|
Loading…
x
Reference in New Issue
Block a user