Minor cleanups in pdb code

This commit is contained in:
Kovid Goyal 2009-05-04 10:47:38 -07:00
parent a8ee3491df
commit 624390db05
2 changed files with 37 additions and 36 deletions

View File

@ -8,7 +8,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, re, sys, struct, zlib
import os, re, struct, zlib
from calibre import CurrentDir
from calibre.ebooks import DRMError
@ -30,7 +30,7 @@ class HeaderRecord(object):
def __init__(self, raw):
self.version, = struct.unpack('>H', raw[0:2])
self.non_text_offset, = struct.unpack('>H', raw[12:14])
self.non_text_offset, = struct.unpack('>H', raw[12:14])
self.footnote_rec, = struct.unpack('>H', raw[28:30])
self.sidebar_rec, = struct.unpack('>H', raw[30:32])
self.bookmark_offset, = struct.unpack('>H', raw[32:34])
@ -39,17 +39,17 @@ class HeaderRecord(object):
self.footnote_offset, = struct.unpack('>H', raw[48:50])
self.sidebar_offset, = struct.unpack('>H', raw[50:52])
self.last_data_offset, = struct.unpack('>H', raw[52:54])
self.num_text_pages = self.non_text_offset - 1
self.num_image_pages = self.metadata_offset - self.image_data_offset
class Reader(FormatReader):
def __init__(self, header, stream, log, encoding=None):
self.log = log
self.encoding = encoding
self.sections = []
for i in range(header.num_sections):
self.sections.append(header.section_data(i))
@ -61,17 +61,17 @@ class Reader(FormatReader):
raise DRMError('eReader DRM is not supported.')
else:
raise EreaderError('Unknown book version %i.' % self.header_record.version)
def section_data(self, number):
    '''
    Return the raw data of the section stored at index ``number`` in
    ``self.sections``.
    '''
    sections = self.sections
    return sections[number]
def decompress_text(self, number):
    '''
    Decompress section ``number`` and decode it to text.

    Header version 2 sections are expanded with ``decompress_doc`` and
    version 10 sections with zlib. The result is decoded with
    ``self.encoding``, or cp1252 when no encoding was given. For any
    other header version nothing is read and None is returned.
    '''
    version = self.header_record.version
    if version not in (2, 10):
        return None
    raw = self.section_data(number)
    inflate = decompress_doc if version == 2 else zlib.decompress
    text = inflate(raw)
    codec = self.encoding if self.encoding is not None else 'cp1252'
    return text.decode(codec)
def get_image(self, number):
if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
return 'empty', ''
@ -79,7 +79,7 @@ class Reader(FormatReader):
name = data[4:4+32].strip('\x00')
img = data[62:]
return name, img
def get_text_page(self, number):
'''
Only palmdoc and zlib compressed are supported. The text is
@ -88,21 +88,21 @@ class Reader(FormatReader):
'''
if number not in range(1, self.header_record.num_text_pages + 1):
return ''
return self.decompress_text(number)
def extract_content(self, output_dir):
output_dir = os.path.abspath(output_dir)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
html = u'<html><head><title></title></head><body>'
for i in range(1, self.header_record.num_text_pages + 1):
self.log.debug('Extracting text page %i' % i)
html += pml_to_html(self.get_text_page(i))
if self.header_record.footnote_rec > 0:
html += '<br /><h1>%s</h1>' % _('Footnotes')
footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
@ -110,8 +110,8 @@ class Reader(FormatReader):
self.log.debug('Extracting footnote page %i' % i)
html += '<dl>'
html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
html += '</dl>'
html += '</dl>'
if self.header_record.sidebar_rec > 0:
html += '<br /><h1>%s</h1>' % _('Sidebar')
sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
@ -120,9 +120,9 @@ class Reader(FormatReader):
html += '<dl>'
html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i))
html += '</dl>'
html += '</body></html>'
with CurrentDir(output_dir):
with open('index.html', 'wb') as index:
self.log.debug('Writing text to index.html')
@ -138,19 +138,19 @@ class Reader(FormatReader):
with open(name, 'wb') as imgf:
self.log.debug('Writing image %s to images/' % name)
imgf.write(img)
opf_path = self.create_opf(output_dir, images)
return opf_path
def create_opf(self, output_dir, images):
mi = MetaInformation(None, None)
with CurrentDir(output_dir):
opf = OPFCreator(output_dir, mi)
manifest = [('index.html', None)]
for i in images:
manifest.append((os.path.join('images/', i), None))
@ -158,21 +158,21 @@ class Reader(FormatReader):
opf.create_spine(['index.html'])
with open('metadata.opf', 'wb') as opffile:
opf.render(opffile)
return os.path.join(output_dir, 'metadata.opf')
def dump_pml(self):
    '''
    Return the PML markup of every text page joined into one string.

    This is primarily used for debugging and by 3rd party tools that
    want the pml markup that comprises the text in the file.
    '''
    last_page = self.header_record.num_text_pages
    # Text pages are numbered from 1 up to and including num_text_pages.
    pages = [self.get_text_page(page) for page in range(1, last_page + 1)]
    return ''.join(pages)
def dump_images(self, output_dir):
'''
This is primarily used for debugging and 3rd party tools to
@ -181,7 +181,7 @@ class Reader(FormatReader):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
with CurrentDir(output_dir):
with CurrentDir(output_dir):
for i in range(0, self.header_record.num_image_pages):
name, img = self.get_image(self.header_record.image_data_offset + i)
with open(name, 'wb') as imgf:

View File

@ -8,7 +8,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, re, struct, time
import re, struct, time
class PdbHeaderReader(object):
@ -53,7 +53,8 @@ class PdbHeaderReader(object):
start = self.section_offset(number)
if number == self.num_sections -1:
end = os.stat(self.stream.name).st_size
self.stream.seek(0, 2)
end = self.stream.tell()
else:
end = self.section_offset(number + 1)
self.stream.seek(start)
@ -65,18 +66,18 @@ class PdbHeaderBuilder(object):
def __init__(self, identity, title):
self.identity = identity.ljust(3, '\x00')[:8]
self.title = re.sub('[^-A-Za-z0-9]+', '_', title).ljust(32, '\x00')[:32]
def build_header(self, section_lengths, out_stream):
'''
section_lengths = Length of each section in file.
'''
now = int(time.time())
nrecords = len(section_lengths)
out_stream.write(self.title + struct.pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0))
out_stream.write(self.identity + struct.pack('>IIH', nrecords, 0, nrecords))
offset = 78 + (8 * nrecords) + 2
for id, record in enumerate(section_lengths):
out_stream.write(struct.pack('>LBBBB', long(offset), 0, 0, 0, 0))