Minor cleanups in pdb code

This commit is contained in:
Kovid Goyal 2009-05-04 10:47:38 -07:00
parent a8ee3491df
commit 624390db05
2 changed files with 37 additions and 36 deletions

View File

@ -8,7 +8,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, re, sys, struct, zlib import os, re, struct, zlib
from calibre import CurrentDir from calibre import CurrentDir
from calibre.ebooks import DRMError from calibre.ebooks import DRMError
@ -30,7 +30,7 @@ class HeaderRecord(object):
def __init__(self, raw): def __init__(self, raw):
self.version, = struct.unpack('>H', raw[0:2]) self.version, = struct.unpack('>H', raw[0:2])
self.non_text_offset, = struct.unpack('>H', raw[12:14]) self.non_text_offset, = struct.unpack('>H', raw[12:14])
self.footnote_rec, = struct.unpack('>H', raw[28:30]) self.footnote_rec, = struct.unpack('>H', raw[28:30])
self.sidebar_rec, = struct.unpack('>H', raw[30:32]) self.sidebar_rec, = struct.unpack('>H', raw[30:32])
self.bookmark_offset, = struct.unpack('>H', raw[32:34]) self.bookmark_offset, = struct.unpack('>H', raw[32:34])
@ -39,17 +39,17 @@ class HeaderRecord(object):
self.footnote_offset, = struct.unpack('>H', raw[48:50]) self.footnote_offset, = struct.unpack('>H', raw[48:50])
self.sidebar_offset, = struct.unpack('>H', raw[50:52]) self.sidebar_offset, = struct.unpack('>H', raw[50:52])
self.last_data_offset, = struct.unpack('>H', raw[52:54]) self.last_data_offset, = struct.unpack('>H', raw[52:54])
self.num_text_pages = self.non_text_offset - 1 self.num_text_pages = self.non_text_offset - 1
self.num_image_pages = self.metadata_offset - self.image_data_offset self.num_image_pages = self.metadata_offset - self.image_data_offset
class Reader(FormatReader): class Reader(FormatReader):
def __init__(self, header, stream, log, encoding=None): def __init__(self, header, stream, log, encoding=None):
self.log = log self.log = log
self.encoding = encoding self.encoding = encoding
self.sections = [] self.sections = []
for i in range(header.num_sections): for i in range(header.num_sections):
self.sections.append(header.section_data(i)) self.sections.append(header.section_data(i))
@ -61,17 +61,17 @@ class Reader(FormatReader):
raise DRMError('eReader DRM is not supported.') raise DRMError('eReader DRM is not supported.')
else: else:
raise EreaderError('Unknown book version %i.' % self.header_record.version) raise EreaderError('Unknown book version %i.' % self.header_record.version)
def section_data(self, number): def section_data(self, number):
return self.sections[number] return self.sections[number]
def decompress_text(self, number): def decompress_text(self, number):
if self.header_record.version == 2: if self.header_record.version == 2:
return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
if self.header_record.version == 10: if self.header_record.version == 10:
return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
def get_image(self, number): def get_image(self, number):
if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
return 'empty', '' return 'empty', ''
@ -79,7 +79,7 @@ class Reader(FormatReader):
name = data[4:4+32].strip('\x00') name = data[4:4+32].strip('\x00')
img = data[62:] img = data[62:]
return name, img return name, img
def get_text_page(self, number): def get_text_page(self, number):
''' '''
Only palmdoc and zlib compressed are supported. The text is Only palmdoc and zlib compressed are supported. The text is
@ -88,21 +88,21 @@ class Reader(FormatReader):
''' '''
if number not in range(1, self.header_record.num_text_pages + 1): if number not in range(1, self.header_record.num_text_pages + 1):
return '' return ''
return self.decompress_text(number) return self.decompress_text(number)
def extract_content(self, output_dir): def extract_content(self, output_dir):
output_dir = os.path.abspath(output_dir) output_dir = os.path.abspath(output_dir)
if not os.path.exists(output_dir): if not os.path.exists(output_dir):
os.makedirs(output_dir) os.makedirs(output_dir)
html = u'<html><head><title></title></head><body>' html = u'<html><head><title></title></head><body>'
for i in range(1, self.header_record.num_text_pages + 1): for i in range(1, self.header_record.num_text_pages + 1):
self.log.debug('Extracting text page %i' % i) self.log.debug('Extracting text page %i' % i)
html += pml_to_html(self.get_text_page(i)) html += pml_to_html(self.get_text_page(i))
if self.header_record.footnote_rec > 0: if self.header_record.footnote_rec > 0:
html += '<br /><h1>%s</h1>' % _('Footnotes') html += '<br /><h1>%s</h1>' % _('Footnotes')
footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
@ -110,8 +110,8 @@ class Reader(FormatReader):
self.log.debug('Extracting footnote page %i' % i) self.log.debug('Extracting footnote page %i' % i)
html += '<dl>' html += '<dl>'
html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
html += '</dl>' html += '</dl>'
if self.header_record.sidebar_rec > 0: if self.header_record.sidebar_rec > 0:
html += '<br /><h1>%s</h1>' % _('Sidebar') html += '<br /><h1>%s</h1>' % _('Sidebar')
sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
@ -120,9 +120,9 @@ class Reader(FormatReader):
html += '<dl>' html += '<dl>'
html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i)) html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i))
html += '</dl>' html += '</dl>'
html += '</body></html>' html += '</body></html>'
with CurrentDir(output_dir): with CurrentDir(output_dir):
with open('index.html', 'wb') as index: with open('index.html', 'wb') as index:
self.log.debug('Writing text to index.html') self.log.debug('Writing text to index.html')
@ -138,19 +138,19 @@ class Reader(FormatReader):
with open(name, 'wb') as imgf: with open(name, 'wb') as imgf:
self.log.debug('Writing image %s to images/' % name) self.log.debug('Writing image %s to images/' % name)
imgf.write(img) imgf.write(img)
opf_path = self.create_opf(output_dir, images) opf_path = self.create_opf(output_dir, images)
return opf_path return opf_path
def create_opf(self, output_dir, images): def create_opf(self, output_dir, images):
mi = MetaInformation(None, None) mi = MetaInformation(None, None)
with CurrentDir(output_dir): with CurrentDir(output_dir):
opf = OPFCreator(output_dir, mi) opf = OPFCreator(output_dir, mi)
manifest = [('index.html', None)] manifest = [('index.html', None)]
for i in images: for i in images:
manifest.append((os.path.join('images/', i), None)) manifest.append((os.path.join('images/', i), None))
@ -158,21 +158,21 @@ class Reader(FormatReader):
opf.create_spine(['index.html']) opf.create_spine(['index.html'])
with open('metadata.opf', 'wb') as opffile: with open('metadata.opf', 'wb') as opffile:
opf.render(opffile) opf.render(opffile)
return os.path.join(output_dir, 'metadata.opf') return os.path.join(output_dir, 'metadata.opf')
def dump_pml(self): def dump_pml(self):
''' '''
This is primarily used for debugging and 3rd party tools to This is primarily used for debugging and 3rd party tools to
get the pml markup that comprises the text in the file. get the pml markup that comprises the text in the file.
''' '''
pml = '' pml = ''
for i in range(1, self.header_record.num_text_pages + 1): for i in range(1, self.header_record.num_text_pages + 1):
pml += self.get_text_page(i) pml += self.get_text_page(i)
return pml return pml
def dump_images(self, output_dir): def dump_images(self, output_dir):
''' '''
This is primarily used for debugging and 3rd party tools to This is primarily used for debugging and 3rd party tools to
@ -181,7 +181,7 @@ class Reader(FormatReader):
if not os.path.exists(output_dir): if not os.path.exists(output_dir):
os.makedirs(output_dir) os.makedirs(output_dir)
with CurrentDir(output_dir): with CurrentDir(output_dir):
for i in range(0, self.header_record.num_image_pages): for i in range(0, self.header_record.num_image_pages):
name, img = self.get_image(self.header_record.image_data_offset + i) name, img = self.get_image(self.header_record.image_data_offset + i)
with open(name, 'wb') as imgf: with open(name, 'wb') as imgf:

View File

@ -8,7 +8,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, re, struct, time import re, struct, time
class PdbHeaderReader(object): class PdbHeaderReader(object):
@ -53,7 +53,8 @@ class PdbHeaderReader(object):
start = self.section_offset(number) start = self.section_offset(number)
if number == self.num_sections -1: if number == self.num_sections -1:
end = os.stat(self.stream.name).st_size self.stream.seek(0, 2)
end = self.stream.tell()
else: else:
end = self.section_offset(number + 1) end = self.section_offset(number + 1)
self.stream.seek(start) self.stream.seek(start)
@ -65,18 +66,18 @@ class PdbHeaderBuilder(object):
def __init__(self, identity, title): def __init__(self, identity, title):
self.identity = identity.ljust(3, '\x00')[:8] self.identity = identity.ljust(3, '\x00')[:8]
self.title = re.sub('[^-A-Za-z0-9]+', '_', title).ljust(32, '\x00')[:32] self.title = re.sub('[^-A-Za-z0-9]+', '_', title).ljust(32, '\x00')[:32]
def build_header(self, section_lengths, out_stream): def build_header(self, section_lengths, out_stream):
''' '''
section_lengths = Length of each section in file. section_lengths = Length of each section in file.
''' '''
now = int(time.time()) now = int(time.time())
nrecords = len(section_lengths) nrecords = len(section_lengths)
out_stream.write(self.title + struct.pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0)) out_stream.write(self.title + struct.pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0))
out_stream.write(self.identity + struct.pack('>IIH', nrecords, 0, nrecords)) out_stream.write(self.identity + struct.pack('>IIH', nrecords, 0, nrecords))
offset = 78 + (8 * nrecords) + 2 offset = 78 + (8 * nrecords) + 2
for id, record in enumerate(section_lengths): for id, record in enumerate(section_lengths):
out_stream.write(struct.pack('>LBBBB', long(offset), 0, 0, 0, 0)) out_stream.write(struct.pack('>LBBBB', long(offset), 0, 0, 0, 0))