diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index a1ab0a7a65..13e204fd5e 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -8,7 +8,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, re, sys, struct, zlib +import os, re, struct, zlib from calibre import CurrentDir from calibre.ebooks import DRMError @@ -30,7 +30,7 @@ class HeaderRecord(object): def __init__(self, raw): self.version, = struct.unpack('>H', raw[0:2]) - self.non_text_offset, = struct.unpack('>H', raw[12:14]) + self.non_text_offset, = struct.unpack('>H', raw[12:14]) self.footnote_rec, = struct.unpack('>H', raw[28:30]) self.sidebar_rec, = struct.unpack('>H', raw[30:32]) self.bookmark_offset, = struct.unpack('>H', raw[32:34]) @@ -39,17 +39,17 @@ class HeaderRecord(object): self.footnote_offset, = struct.unpack('>H', raw[48:50]) self.sidebar_offset, = struct.unpack('>H', raw[50:52]) self.last_data_offset, = struct.unpack('>H', raw[52:54]) - + self.num_text_pages = self.non_text_offset - 1 self.num_image_pages = self.metadata_offset - self.image_data_offset - + class Reader(FormatReader): def __init__(self, header, stream, log, encoding=None): self.log = log self.encoding = encoding - + self.sections = [] for i in range(header.num_sections): self.sections.append(header.section_data(i)) @@ -61,17 +61,17 @@ class Reader(FormatReader): raise DRMError('eReader DRM is not supported.') else: raise EreaderError('Unknown book version %i.' % self.header_record.version) - + def section_data(self, number): return self.sections[number] - + def decompress_text(self, number): if self.header_record.version == 2: return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) if self.header_record.version == 10: return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) - + def get_image(self, number): if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: return 'empty', '' @@ -79,7 +79,7 @@ class Reader(FormatReader): name = data[4:4+32].strip('\x00') img = data[62:] return name, img - + def get_text_page(self, number): ''' Only palmdoc and zlib compressed are supported. The text is @@ -88,21 +88,21 @@ class Reader(FormatReader): ''' if number not in range(1, self.header_record.num_text_pages + 1): return '' - + return self.decompress_text(number) def extract_content(self, output_dir): output_dir = os.path.abspath(output_dir) - + if not os.path.exists(output_dir): os.makedirs(output_dir) - + html = u'' - + for i in range(1, self.header_record.num_text_pages + 1): self.log.debug('Extracting text page %i' % i) html += pml_to_html(self.get_text_page(i)) - + if self.header_record.footnote_rec > 0: html += '

%s

' % _('Footnotes') footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) @@ -110,8 +110,8 @@ class Reader(FormatReader): self.log.debug('Extracting footnote page %i' % i) html += '
' html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) - html += '
' - + html += '' + if self.header_record.sidebar_rec > 0: html += '

%s

' % _('Sidebar') sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) @@ -120,9 +120,9 @@ class Reader(FormatReader): html += '
' html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i)) html += '
' - + html += '' - + with CurrentDir(output_dir): with open('index.html', 'wb') as index: self.log.debug('Writing text to index.html') @@ -138,19 +138,19 @@ class Reader(FormatReader): with open(name, 'wb') as imgf: self.log.debug('Writing image %s to images/' % name) imgf.write(img) - + opf_path = self.create_opf(output_dir, images) - + return opf_path - + def create_opf(self, output_dir, images): mi = MetaInformation(None, None) - + with CurrentDir(output_dir): opf = OPFCreator(output_dir, mi) - + manifest = [('index.html', None)] - + for i in images: manifest.append((os.path.join('images/', i), None)) @@ -158,21 +158,21 @@ class Reader(FormatReader): opf.create_spine(['index.html']) with open('metadata.opf', 'wb') as opffile: opf.render(opffile) - + return os.path.join(output_dir, 'metadata.opf') - + def dump_pml(self): ''' This is primarily used for debugging and 3rd party tools to get the plm markup that comprises the text in the file. ''' pml = '' - + for i in range(1, self.header_record.num_text_pages + 1): pml += self.get_text_page(i) - + return pml - + def dump_images(self, output_dir): ''' This is primarily used for debugging and 3rd party tools to @@ -181,7 +181,7 @@ class Reader(FormatReader): if not os.path.exists(output_dir): os.makedirs(output_dir) - with CurrentDir(output_dir): + with CurrentDir(output_dir): for i in range(0, self.header_record.num_image_pages): name, img = self.get_image(self.header_record.image_data_offset + i) with open(name, 'wb') as imgf: diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index 8a9b7b105c..48c39fc0ad 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -8,7 +8,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, re, struct, time +import re, struct, time class PdbHeaderReader(object): @@ -53,7 +53,8 @@ class PdbHeaderReader(object): start = self.section_offset(number) if number == self.num_sections -1: - end = os.stat(self.stream.name).st_size + self.stream.seek(0, 2) + end = self.stream.tell() else: end = self.section_offset(number + 1) self.stream.seek(start) @@ -65,18 +66,18 @@ class PdbHeaderBuilder(object): def __init__(self, identity, title): self.identity = identity.ljust(3, '\x00')[:8] self.title = re.sub('[^-A-Za-z0-9]+', '_', title).ljust(32, '\x00')[:32] - + def build_header(self, section_lengths, out_stream): ''' section_lengths = Lenght of each section in file. ''' - + now = int(time.time()) nrecords = len(section_lengths) - + out_stream.write(self.title + struct.pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0)) out_stream.write(self.identity + struct.pack('>IIH', nrecords, 0, nrecords)) - + offset = 78 + (8 * nrecords) + 2 for id, record in enumerate(section_lengths): out_stream.write(struct.pack('>LBBBB', long(offset), 0, 0, 0, 0))