From d7e20bb1e578b5556a7e7c5dd127c282ddb028ab Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 3 May 2009 15:05:55 -0400 Subject: [PATCH 1/3] PalmDoc pdb output. --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/pdb/__init__.py | 42 +++++++++------ src/calibre/ebooks/pdb/formatreader.py | 2 +- src/calibre/ebooks/pdb/formatwriter.py | 18 +++++++ src/calibre/ebooks/pdb/input.py | 1 - src/calibre/ebooks/pdb/output.py | 45 ++++++++++++++++ src/calibre/ebooks/pdb/palmdoc/writer.py | 67 ++++++++++++++++++++++++ src/calibre/ebooks/txt/output.py | 1 + 8 files changed, 160 insertions(+), 19 deletions(-) create mode 100644 src/calibre/ebooks/pdb/formatwriter.py create mode 100644 src/calibre/ebooks/pdb/output.py create mode 100644 src/calibre/ebooks/pdb/palmdoc/writer.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 2228424782..fd69a3745a 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -302,6 +302,7 @@ from calibre.web.feeds.input import RecipeInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.epub.output import EPUBOutput from calibre.ebooks.mobi.output import MOBIOutput +from calibre.ebooks.pdb.output import PDBOutput from calibre.ebooks.lrf.output import LRFOutput from calibre.ebooks.lit.output import LITOutput from calibre.ebooks.txt.output import TXTOutput @@ -323,7 +324,7 @@ from calibre.devices.jetbook.driver import JETBOOK plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput, FB2Input, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput, - PMLOutput, MOBIOutput, LRFOutput, LITOutput] + PMLOutput, MOBIOutput, PDBOutput, LRFOutput, LITOutput] plugins += [PRS500, PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, EB600, JETBOOK] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py index 1bf8ebeb89..f60cc91c93 100644 --- a/src/calibre/ebooks/pdb/__init__.py +++ b/src/calibre/ebooks/pdb/__init__.py @@ -5,15 +5,25 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -from calibre.ebooks.pdb.ereader.reader import Reader as eReader -from calibre.ebooks.pdb.ztxt.reader import Reader as zTXT -from calibre.ebooks.pdb.palmdoc.reader import Reader as PalmDoc +class PDBError(Exception): + pass + -FORMATS = { - 'PNPdPPrs' : eReader, - 'PNRdPPrs' : eReader, - 'zTXTGPlm' : zTXT, - 'TEXtREAd' : PalmDoc, +from calibre.ebooks.pdb.ereader.reader import Reader as ereader_reader +from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader +from calibre.ebooks.pdb.palmdoc.reader import Reader as palmdoc_reader + +FORMAT_READERS = { + 'PNPdPPrs' : ereader_reader, + 'PNRdPPrs' : ereader_reader, + 'zTXTGPlm' : ztxt_reader, + 'TEXtREAd' : palmdoc_reader, +} + +from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer + +FORMAT_WRITERS = { + 'doc' : palmdoc_writer, } IDENTITY_TO_NAME = { @@ -48,15 +58,15 @@ IDENTITY_TO_NAME = { 'BDOCWrdS' : 'WordSmith', } -class PDBError(Exception): - pass - - def get_reader(identity): ''' Returns None if no reader is found for the identity. ''' - if identity in FORMATS.keys(): - return FORMATS[identity] - else: - return None + return FORMAT_READERS.get(identity, None) + +def get_writer(extension): + ''' + Returns None if no writer is found for extension. + ''' + return FORMAT_WRITERS.get(extension, None) + diff --git a/src/calibre/ebooks/pdb/formatreader.py b/src/calibre/ebooks/pdb/formatreader.py index 25abb462cf..bde6c9ae35 100644 --- a/src/calibre/ebooks/pdb/formatreader.py +++ b/src/calibre/ebooks/pdb/formatreader.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement + ''' Interface defining the necessary public functions for a pdb format reader. ''' diff --git a/src/calibre/ebooks/pdb/formatwriter.py b/src/calibre/ebooks/pdb/formatwriter.py new file mode 100644 index 0000000000..18b5f56219 --- /dev/null +++ b/src/calibre/ebooks/pdb/formatwriter.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- + +''' +Interface defining the necessary public functions for a pdb format writer. +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + + +class FormatWriter(object): + + def __init__(self, opts, log): + raise NotImplementedError() + + def write_content(self, oeb_book, output_stream, ): + raise NotImplementedError() diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 1a7e32e3eb..68e709e8b8 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' diff --git a/src/calibre/ebooks/pdb/output.py b/src/calibre/ebooks/pdb/output.py new file mode 100644 index 0000000000..2095b64ab2 --- /dev/null +++ b/src/calibre/ebooks/pdb/output.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import OutputFormatPlugin +from calibre.ebooks.pdb import PDBError, get_writer + +class PDBOutput(OutputFormatPlugin): + + name = 'PDB Output' + author = 'John Schember' + file_type = 'pdb' + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + close = False + if not hasattr(output_path, 'write'): + # Determine the format to write based upon the sub extension + format = os.path.splitext(os.path.splitext(output_path)[0])[1][1:] + close = True + if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': + os.makedirs(os.path.dirname(output_path)) + out_stream = open(output_path, 'wb') + else: + format = os.path.splitext(os.path.splitext(output_path.name)[0])[1][1:] + out_stream = output_path + + Writer = get_writer(format) + + if Writer is None: + raise PDBError('No writer avaliable for format %s.' % format) + + writer = Writer(opts, log) + + out_stream.seek(0) + out_stream.truncate() + + writer.write_content(oeb_book, out_stream) + + if close: + out_stream.close() + diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py new file mode 100644 index 0000000000..a93bc94d26 --- /dev/null +++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- + +''' +Writer content to palmdoc pdb file. +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import struct + +from calibre.ebooks.pdb.formatwriter import FormatWriter +from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines, TxtMetadata +from calibre.ebooks.mobi.palmdoc import compress_doc +from calibre.ebooks.pdb.header import PdbHeaderBuilder + +MAX_RECORD_SIZE = 4096 + +class Writer(FormatWriter): + + def __init__(self, opts, log): + self.opts = opts + self.log = log + + def write_content(self, oeb_book, out_stream): + title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') + + txt_records, txt_length = self._generate_text(oeb_book.spine) + header_record = self._header_record(txt_length, len(txt_records)) + + section_lengths = [len(header_record)] + for i in range(0, len(txt_records)): + txt_records[i] = compress_doc(txt_records[i].encode('utf-8')) + section_lengths.append(len(txt_records[i])) + + out_stream.seek(0) + hb = PdbHeaderBuilder('TEXtREAd', title) + hb.build_header(section_lengths, out_stream) + + for record in [header_record]+txt_records: + out_stream.write(record) + + def _generate_text(self, spine): + txt_writer = TxtWriter(TxtNewlines('system').newline, self.log) + txt = txt_writer.dump(spine, TxtMetadata()) + + txt_length = len(txt) + + txt_records = [] + for i in range(0, (len(txt) / MAX_RECORD_SIZE) + 1): + txt_records.append(txt[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]) + + return txt_records, txt_length + + def _header_record(self, txt_length, record_count): + record = '' + + record += struct.pack('>H', 2) # [0:2], PalmDoc compression. (1 = No compression). + record += struct.pack('>H', 0) # [2:4], Always 0. + record += struct.pack('>L', txt_length) # [4:8], Uncompressed length of the entire text of the book. + record += struct.pack('>H', record_count) # [8:10], Number of PDB records used for the text of the book. + record += struct.pack('>H', MAX_RECORD_SIZE) # [10-12], Maximum size of each record containing text, always 4096. + record += struct.pack('>L', 0) # [12-16], Current reading position, as an offset into the uncompressed text. + + return record + diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 62c07c3d04..219e1d3111 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- + __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' From 5b8974dbb1680e1ea796c884acc2e8d413fb0b10 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 3 May 2009 18:44:09 -0400 Subject: [PATCH 2/3] PDB ztxt writer. --- src/calibre/ebooks/pdb/__init__.py | 4 +- src/calibre/ebooks/pdb/palmdoc/writer.py | 2 +- src/calibre/ebooks/pdb/ztxt/writer.py | 78 ++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 src/calibre/ebooks/pdb/ztxt/writer.py diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py index f60cc91c93..70a12ceb96 100644 --- a/src/calibre/ebooks/pdb/__init__.py +++ b/src/calibre/ebooks/pdb/__init__.py @@ -21,9 +21,11 @@ FORMAT_READERS = { } from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer +from calibre.ebooks.pdb.ztxt.writer import Writer as ztxt_writer FORMAT_WRITERS = { - 'doc' : palmdoc_writer, + 'doc' : palmdoc_writer, + 'ztxt' : ztxt_writer, } IDENTITY_TO_NAME = { diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py index a93bc94d26..705b01daee 100644 --- a/src/calibre/ebooks/pdb/palmdoc/writer.py +++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -34,7 +34,7 @@ class Writer(FormatWriter): txt_records[i] = compress_doc(txt_records[i].encode('utf-8')) section_lengths.append(len(txt_records[i])) - out_stream.seek(0) + out_stream.seek(0) hb = PdbHeaderBuilder('TEXtREAd', title) hb.build_header(section_lengths, out_stream) diff --git a/src/calibre/ebooks/pdb/ztxt/writer.py b/src/calibre/ebooks/pdb/ztxt/writer.py new file mode 100644 index 0000000000..81a0df48af --- /dev/null +++ b/src/calibre/ebooks/pdb/ztxt/writer.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- + +''' +Writer content to ztxt pdb file. +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import struct, zlib + +from calibre.ebooks.pdb.formatwriter import FormatWriter +from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines, TxtMetadata +from calibre.ebooks.pdb.header import PdbHeaderBuilder + +MAX_RECORD_SIZE = 8192 + +class Writer(FormatWriter): + + def __init__(self, opts, log): + self.opts = opts + self.log = log + + def write_content(self, oeb_book, out_stream): + title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') + + txt_records, txt_length = self._generate_text(oeb_book.spine) + + crc32 = 0 + section_lengths = [] + compressor = zlib.compressobj(9) + for i in range(0, len(txt_records)): + txt_records[i] = compressor.compress(txt_records[i].encode('utf-8')) + txt_records[i] = txt_records[i] + compressor.flush(zlib.Z_FULL_FLUSH) + section_lengths.append(len(txt_records[i])) + crc32 = zlib.crc32(txt_records[i], crc32) & 0xffffffff + + header_record = self._header_record(txt_length, len(txt_records), crc32) + section_lengths.insert(0, len(header_record)) + + out_stream.seek(0) + hb = PdbHeaderBuilder('zTXTGPlm', title) + hb.build_header(section_lengths, out_stream) + + for record in [header_record]+txt_records: + out_stream.write(record) + + def _generate_text(self, spine): + txt_writer = TxtWriter(TxtNewlines('system').newline, self.log) + txt = txt_writer.dump(spine, TxtMetadata()) + + txt_length = len(txt) + + txt_records = [] + for i in range(0, (len(txt) / MAX_RECORD_SIZE) + 1): + txt_records.append(txt[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]) + + return txt_records, txt_length + + def _header_record(self, txt_length, record_count, crc32): + record = '' + + record += struct.pack('>H', 0x012c) # [0:2], version. 0x012c = 1.44 + record += struct.pack('>H', record_count) # [2:4], Number of PDB records used for the text of the book. + record += struct.pack('>L', txt_length) # [4:8], Uncompressed length of the entire text of the book. + record += struct.pack('>H', MAX_RECORD_SIZE) # [8:10], Maximum size of each record containing text + record += struct.pack('>H', 0) # [10:12], Number of bookmarks. + record += struct.pack('>H', 0) # [12:14], Bookmark record. 0 if there are no bookmarks. + record += struct.pack('>H', 0) # [14:16], Number of annotations. + record += struct.pack('>H', 0) # [16:18], Annotation record. 0 if there are no annotations. + record += struct.pack('>B', 1) # [18:19], Flags. Bitmask, 0x01 = Random Access. 0x02 = Non-Uniform text block size. + record += struct.pack('>B', 0) # [19:20], Reserved. + record += struct.pack('>L', crc32) # [20:24], crc32 + record += struct.pack('>LL', 0, 0) # [24:32], padding + + return record + From d77f83f9a656fa90d3fefbc751bdaa30d06dff1c Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 3 May 2009 19:07:21 -0400 Subject: [PATCH 3/3] ztxt reader: check version and flags (only newer ztxt format is supported). log info and debug for ztxt and palmdoc input/output. --- src/calibre/ebooks/pdb/palmdoc/reader.py | 3 +++ src/calibre/ebooks/pdb/palmdoc/writer.py | 2 ++ src/calibre/ebooks/pdb/ztxt/reader.py | 16 ++++++++++++++++ src/calibre/ebooks/pdb/ztxt/writer.py | 2 ++ 4 files changed, 23 insertions(+) diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index a5a58b4d81..ba35a2317e 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -53,9 +53,12 @@ class Reader(FormatReader): def extract_content(self, output_dir): txt = '' + self.log.info('Decompressing text...') for i in range(1, self.header_record.num_records + 1): + self.log.debug('\tDecompressing text section %i' % i) txt += self.decompress_text(i) + self.log.info('Converting text to OEB...') html = txt_to_markdown(txt) with open(os.path.join(output_dir, 'index.html'), 'wb') as index: index.write(html.encode('utf-8')) diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py index 705b01daee..784eca5523 100644 --- a/src/calibre/ebooks/pdb/palmdoc/writer.py +++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -30,7 +30,9 @@ class Writer(FormatWriter): header_record = self._header_record(txt_length, len(txt_records)) section_lengths = [len(header_record)] + self.log.info('Compessing data...') for i in range(0, len(txt_records)): + self.log.debug('\tCompressing record %i' % i) txt_records[i] = compress_doc(txt_records[i].encode('utf-8')) section_lengths.append(len(txt_records[i])) diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 3d96018def..ccc26a3fdc 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -14,6 +14,8 @@ from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ztxt import zTXTError from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer +SUPPORTED_VERSION = (1, 40) + class HeaderRecord(object): ''' The first record in the file is always the header record. It holds @@ -27,6 +29,7 @@ class HeaderRecord(object): self.num_records, = struct.unpack('>H', raw[2:4]) self.size, = struct.unpack('>L', raw[4:8]) self.record_size, = struct.unpack('>H', raw[8:10]) + self.flags, = struct.unpack('>B', raw[18:19]) class Reader(FormatReader): @@ -41,6 +44,16 @@ class Reader(FormatReader): self.sections.append(header.section_data(i)) self.header_record = HeaderRecord(self.section_data(0)) + + vmajor = (self.header_record.version & 0x0000FF00) >> 8 + vminor = self.header_record.version & 0x000000FF + if vmajor < 1 or (vmajor == 1 and vminor < 40): + raise zTXTError('Unsupported ztxt version (%i.%i). Only versions newer than %i.%i are supported.' % (vmajor, vminor, SUPPORTED_VERSION[0], SUPPORTED_VERSION[1])) + + if (self.header_record.flags & 0x01) == 0: + raise zTXTError('Only compression method 1 (random access) is supported') + + self.log.debug('Foud ztxt version: %i.%i' % (vmajor, vminor)) # Initalize the decompressor self.uncompressor = zlib.decompressobj() @@ -57,9 +70,12 @@ class Reader(FormatReader): def extract_content(self, output_dir): txt = '' + self.log.info('Decompressing text...') for i in range(1, self.header_record.num_records + 1): + self.log.debug('\tDecompressing text section %i' % i) txt += self.decompress_text(i) + self.log.info('Converting text to OEB...') html = txt_to_markdown(txt) with open(os.path.join(output_dir, 'index.html'), 'wb') as index: index.write(html.encode('utf-8')) diff --git a/src/calibre/ebooks/pdb/ztxt/writer.py b/src/calibre/ebooks/pdb/ztxt/writer.py index 81a0df48af..6e974d1a67 100644 --- a/src/calibre/ebooks/pdb/ztxt/writer.py +++ b/src/calibre/ebooks/pdb/ztxt/writer.py @@ -30,7 +30,9 @@ class Writer(FormatWriter): crc32 = 0 section_lengths = [] compressor = zlib.compressobj(9) + self.log.info('Compressing data...') for i in range(0, len(txt_records)): + self.log.debug('\tCompressing record %i' % i) txt_records[i] = compressor.compress(txt_records[i].encode('utf-8')) txt_records[i] = txt_records[i] + compressor.flush(zlib.Z_FULL_FLUSH) section_lengths.append(len(txt_records[i]))