palmdoc pdb input.

This commit is contained in:
John Schember 2009-05-03 12:15:36 -04:00
parent 5c40057adf
commit e447b69bd2
4 changed files with 73 additions and 4 deletions

View File

@ -7,13 +7,13 @@ __docformat__ = 'restructuredtext en'
from calibre.ebooks.pdb.ereader.reader import Reader as eReader
from calibre.ebooks.pdb.ztxt.reader import Reader as zTXT
#from calibre.ebooks.pdb.palmdoc.reader import Reader as PalmDoc
from calibre.ebooks.pdb.palmdoc.reader import Reader as PalmDoc
# Lookup table from a format identifier string to the Reader class (imported
# above) that handles that format. Two different identifiers share eReader.
# NOTE(review): the keys look like the 8-character type+creator identity read
# from the PDB file header -- confirm against the code that does the lookup.
FORMATS = {
'PNPdPPrs' : eReader,
'PNRdPPrs' : eReader,
'zTXTGPlm' : zTXT,
# 'TEXtREAd' : PalmDoc,
'TEXtREAd' : PalmDoc,
}
IDENTITY_TO_NAME = {

View File

@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
'''
Read content from palmdoc pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, struct, zlib
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.mobi.palmdoc import decompress_doc
from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer
class HeaderRecord(object):
    '''
    Parsed form of the header record, i.e. the first record of the file.

    The header record describes where text, images and so on live in the
    file and is meant to be used together with the section table from the
    file header. Only two of its fields are extracted here, both as
    big-endian unsigned 16-bit integers:

    * ``compression`` -- read from bytes 0-1 of the record.
    * ``num_records`` -- read from bytes 8-9 of the record.
    '''

    def __init__(self, raw):
        '''
        :param raw: Raw bytes of the header record; at least 10 bytes are
            needed, otherwise ``struct.error`` is raised.
        '''
        # unpack_from reads the field in place at the given byte offset, so
        # no intermediate slices are created.
        (self.compression,) = struct.unpack_from('>H', raw, 0)
        (self.num_records,) = struct.unpack_from('>H', raw, 8)
class Reader(FormatReader):
    '''
    Reader for PalmDoc PDB files.

    Every section of the PDB container is loaded when the reader is
    created. Section 0 is parsed as the header record; the records that
    follow it hold the book text, which ``extract_content`` converts into
    an ``index.html`` and a ``metadata.opf`` file.
    '''

    def __init__(self, header, stream, log, encoding=None):
        '''
        :param header: PDB file header object; must provide ``num_sections``
            and ``section_data(index)``.
        :param stream: The PDB file object. It is kept so that metadata can
            be read from it in ``extract_content``.
        :param log: Logger object, stored on the instance.
        :param encoding: Name of the codec used to decode the text. When
            ``None`` the text is decoded as cp1252.
        '''
        self.stream = stream
        self.log = log
        self.encoding = encoding

        self.sections = [header.section_data(i)
                         for i in range(header.num_sections)]
        self.header_record = HeaderRecord(self.section_data(0))

    def section_data(self, number):
        '''
        Return the raw data of section ``number`` (section 0 is the header
        record). Raises ``IndexError`` for a section that does not exist.
        '''
        return self.sections[number]

    def _text_encoding(self):
        # Single place for the "no encoding given -> cp1252" fallback.
        return 'cp1252' if self.encoding is None else self.encoding

    def _raw_text(self, number):
        '''
        Return the still-encoded bytes of text record ``number``.

        Compression type 1 means the record is stored as is, type 2 means
        it has to be run through ``decompress_doc``. Any other compression
        type yields no data.
        '''
        compression = self.header_record.compression
        if compression == 1:
            return self.section_data(number)
        if compression == 2:
            return decompress_doc(self.section_data(number))
        return b''

    def decompress_text(self, number):
        '''
        Return text record ``number`` decompressed and decoded to text.

        Note that a single record may end in the middle of a multi-byte
        character; to decode a whole book, join the raw records first (as
        ``extract_content`` does) instead of calling this per record.
        '''
        return self._raw_text(number).decode(self._text_encoding())

    def extract_content(self, output_dir):
        '''
        Write the book into ``output_dir`` and return the path of the
        generated ``metadata.opf``.

        :param output_dir: Existing directory that receives ``index.html``
            (the text run through ``txt_to_markdown``, encoded as UTF-8)
            and ``metadata.opf``.
        '''
        last_record = self.header_record.num_records

        # Records are cut on byte boundaries, so a multi-byte character can
        # straddle two records. Join the raw bytes first and decode once;
        # decoding record by record would fail or corrupt such characters.
        # A single join also avoids repeated string concatenation.
        raw_txt = b''.join(self._raw_text(i)
                           for i in range(1, last_record + 1))
        txt = raw_txt.decode(self._text_encoding())

        html = txt_to_markdown(txt)
        with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
            index.write(html.encode('utf-8'))

        # Imported here rather than at module level, as in the original.
        # NOTE(review): presumably to avoid an import cycle -- confirm.
        from calibre.ebooks.metadata.meta import get_metadata
        mi = get_metadata(self.stream, 'pdb')

        manifest = [('index.html', None)]
        spine = ['index.html']
        opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)

        return os.path.join(output_dir, 'metadata.opf')

View File

@ -8,11 +8,10 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import StringIO, os, struct, zlib
import os, struct, zlib
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ztxt import zTXTError
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer
class HeaderRecord(object):