From 6f81378544f32c8651fb0bb57da713c541732066 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 3 May 2009 08:34:06 -0400 Subject: [PATCH 1/7] ... --- src/calibre/ebooks/pdb/ereader/__init__.py | 1 - src/calibre/ebooks/pdb/ereader/reader.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/__init__.py b/src/calibre/ebooks/pdb/ereader/__init__.py index b39467c6e3..185a44d1a9 100644 --- a/src/calibre/ebooks/pdb/ereader/__init__.py +++ b/src/calibre/ebooks/pdb/ereader/__init__.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index ecf5c706c4..a1ab0a7a65 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -16,7 +16,7 @@ from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ereader import EreaderError from calibre.ebooks.pml.pmlconverter import pml_to_html, \ - footnote_sidebar_to_html + footnote_sidebar_to_html from calibre.ebooks.mobi.palmdoc import decompress_doc from calibre.ebooks.metadata.opf2 import OPFCreator From 9b9381da3aae3384b29975e352cacd371daaf8d1 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 3 May 2009 08:35:44 -0400 Subject: [PATCH 2/7] plucker identity --- src/calibre/ebooks/pdb/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py index 8c4f45337f..c4e0349f7b 100644 --- a/src/calibre/ebooks/pdb/__init__.py +++ b/src/calibre/ebooks/pdb/__init__.py @@ -6,15 +6,18 @@ __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' from calibre.ebooks.pdb.ereader.reader import Reader as eReader +from calibre.ebooks.pdb.plucker.reader import Reader as Plucker FORMATS = { 'PNPdPPrs' 
: eReader, 'PNRdPPrs' : eReader, + 'DataPlkr' : Plucker, } IDENTITY_TO_NAME = { 'PNPdPPrs' : 'eReader', 'PNRdPPrs' : 'eReader', + 'DataPlkr' : 'Plucker', } class PDBError(Exception): From daf6e43523e22055a0ae9e0982900c2026872f12 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 3 May 2009 10:54:07 -0400 Subject: [PATCH 3/7] Fix bug in pdfoutput. Add ztxt input. --- src/calibre/ebooks/pdb/__init__.py | 6 +- src/calibre/ebooks/pdb/ztxt/__init__.py | 11 ++++ src/calibre/ebooks/pdb/ztxt/reader.py | 78 +++++++++++++++++++++++++ src/calibre/ebooks/pdf/output.py | 6 +- src/calibre/ebooks/txt/input.py | 17 ++---- src/calibre/ebooks/txt/processor.py | 30 ++++++++++ 6 files changed, 131 insertions(+), 17 deletions(-) create mode 100644 src/calibre/ebooks/pdb/ztxt/__init__.py create mode 100644 src/calibre/ebooks/pdb/ztxt/reader.py create mode 100644 src/calibre/ebooks/txt/processor.py diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py index c4e0349f7b..614d610078 100644 --- a/src/calibre/ebooks/pdb/__init__.py +++ b/src/calibre/ebooks/pdb/__init__.py @@ -6,18 +6,18 @@ __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' from calibre.ebooks.pdb.ereader.reader import Reader as eReader -from calibre.ebooks.pdb.plucker.reader import Reader as Plucker +from calibre.ebooks.pdb.ztxt.reader import Reader as zTXT FORMATS = { 'PNPdPPrs' : eReader, 'PNRdPPrs' : eReader, - 'DataPlkr' : Plucker, + 'zTXTGPlm' : zTXT, } IDENTITY_TO_NAME = { 'PNPdPPrs' : 'eReader', 'PNRdPPrs' : 'eReader', - 'DataPlkr' : 'Plucker', + 'zTXTGPlm' : 'zTXT', } class PDBError(Exception): diff --git a/src/calibre/ebooks/pdb/ztxt/__init__.py b/src/calibre/ebooks/pdb/ztxt/__init__.py new file mode 100644 index 0000000000..2c2028b74f --- /dev/null +++ b/src/calibre/ebooks/pdb/ztxt/__init__.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + 
+class zTXTError(Exception): + pass + diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py new file mode 100644 index 0000000000..cfd5ba6e79 --- /dev/null +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- + +''' +Read content from ztxt pdb file. +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import StringIO, os, struct, zlib + +from calibre.ebooks.pdb.formatreader import FormatReader +from calibre.ebooks.pdb.ztxt import zTXTError +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer + +class HeaderRecord(object): + ''' + The first record in the file is always the header record. It holds + information related to the location of text, images, and so on + in the file. This is used in conjunction with the sections + defined in the file header. + ''' + + def __init__(self, raw): + self.version, = struct.unpack('>H', raw[0:2]) + self.num_records, = struct.unpack('>H', raw[2:4]) + self.size, = struct.unpack('>L', raw[4:8]) + self.record_size, = struct.unpack('>H', raw[8:10]) + self.crc32, = struct.unpack('>L', raw[18:22]) + + +class Reader(FormatReader): + + def __init__(self, header, stream, log, encoding=None): + self.log = log + self.encoding = encoding + + self.sections = [] + for i in range(header.num_sections): + self.sections.append(header.section_data(i)) + + self.header_record = HeaderRecord(self.section_data(0)) + + # Initalize the decompressor + self.uncompressor = zlib.decompressobj() + self.uncompressor.decompress(self.section_data(1)) + +# if self.header_record.version not in (1, 2) or self.header_record.uid != 1: +# raise zTXTError('Unknown book version %i.' 
% self.header_record.version) + + + def section_data(self, number): + return self.sections[number] + + def decompress_text(self, number): + if number == 1: + self.uncompressor = zlib.decompressobj() + return self.uncompressor.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) + + def extract_content(self, output_dir): + txt = '' + + for i in range(1, self.header_record.num_records + 1): + txt += self.decompress_text(i) + + html = txt_to_markdown(txt) + with open(os.path.join(output_dir, 'index.html'), 'wb') as index: + index.write(html.encode('utf-8')) + + mi = MetaInformation(_('Unknown'), _('Unknown')) + manifest = [('index.html', None)] + spine = ['index.html'] + opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) + + return os.path.join(output_dir, 'metadata.opf') + diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py index 4eb23877d9..ae44d270f7 100644 --- a/src/calibre/ebooks/pdf/output.py +++ b/src/calibre/ebooks/pdf/output.py @@ -62,12 +62,12 @@ class PDFOutput(OutputFormatPlugin): self.write(ImagePDFWriter, images) def convert_text(self, oeb_book): - with TemporaryDirectory('_pdf_out') as oebdir: + with TemporaryDirectory('_pdf_out') as oeb_dir: from calibre.customize.ui import plugin_for_output_format oeb_output = plugin_for_output_format('oeb') - oeb_output.convert(oeb, oeb_dir, self.input_plugin, self.opts, self.log) + oeb_output.convert(oeb_book, oeb_dir, self.input_plugin, self.opts, self.log) - opfpath = glob.glob(os.path.join(oebdir, '*.opf'))[0] + opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0] opf = OPF(opfpath, os.path.dirname(opfpath)) self.write(PDFWriter, [s.path for s in opf.spine]) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 34fafc91fc..b94d3be467 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -8,8 +8,7 @@ __docformat__ = 'restructuredtext en' import os from 
calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.markdown import markdown -from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer class TXTInput(InputFormatPlugin): @@ -25,19 +24,15 @@ class TXTInput(InputFormatPlugin): ienc = options.input_encoding txt = stream.read().decode(ienc) - md = markdown.Markdown( - extensions=['footnotes', 'tables', 'toc'], - safe_mode=False,) - html = '</head><body>'+md.convert(txt)+'</body></html>' + html = txt_to_markdown(txt) with open('index.html', 'wb') as index: index.write(html.encode('utf-8')) from calibre.ebooks.metadata.meta import get_metadata mi = get_metadata(stream, 'txt') - opf = OPFCreator(os.getcwd(), mi) - opf.create_manifest([('index.html', None)]) - opf.create_spine(['index.html']) - with open('metadata.opf', 'wb') as opffile: - opf.render(opffile) + manifest = [('index.html', None)] + spine = ['index.html'] + opf_writer(os.getcwd(), 'metadata.opf', manifest, spine, mi) return os.path.join(os.getcwd(), 'metadata.opf') + diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py new file mode 100644 index 0000000000..c8f2690622 --- /dev/null +++ b/src/calibre/ebooks/txt/processor.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- + +''' +Read content from txt file. 
+''' + +import os + +from calibre.ebooks.markdown import markdown +from calibre.ebooks.metadata.opf2 import OPFCreator + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +def txt_to_markdown(txt): + md = markdown.Markdown( + extensions=['footnotes', 'tables', 'toc'], + safe_mode=False,) + html = '<html><head><title /></head><body>'+md.convert(txt)+'</body></html>' + + return html + +def opf_writer(path, opf_name, manifest, spine, mi): + opf = OPFCreator(path, mi) + opf.create_manifest(manifest) + opf.create_spine(spine) + with open(os.path.join(path, opf_name), 'wb') as opffile: + opf.render(opffile) + From 6bfe619896f9aa9f97a9686d3b8df4c910416db5 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 3 May 2009 10:59:54 -0400 Subject: [PATCH 4/7] fix bug in pdb metadata. Allow ztxt to use pdb metadata info. --- src/calibre/ebooks/metadata/pdb.py | 2 +- src/calibre/ebooks/pdb/ztxt/reader.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/pdb.py b/src/calibre/ebooks/metadata/pdb.py index e473925b87..a6f7c6796b 100644 --- a/src/calibre/ebooks/metadata/pdb.py +++ b/src/calibre/ebooks/metadata/pdb.py @@ -29,7 +29,7 @@ def get_metadata(stream, extract_cover=True): MetadataReader = MREADER.get(pheader.ident, None) if MetadataReader is None: - return MetaInformation(_('Unknown'), [_('Unknown')]) + return MetaInformation(pheader.title, [_('Unknown')]) return MetadataReader(stream, extract_cover) diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index cfd5ba6e79..42638669f4 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -34,6 +34,7 @@ class HeaderRecord(object): class Reader(FormatReader): def __init__(self, header, stream, log, encoding=None): + self.stream = stream self.log = log self.encoding = encoding @@ -69,7 +70,8 @@ class 
Reader(FormatReader): with open(os.path.join(output_dir, 'index.html'), 'wb') as index: index.write(html.encode('utf-8')) - mi = MetaInformation(_('Unknown'), _('Unknown')) + from calibre.ebooks.metadata.meta import get_metadata + mi = get_metadata(self.stream, 'pdb') manifest = [('index.html', None)] spine = ['index.html'] opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) From aa7cd1c4d8f02758cb6a365b96167a848f2de0bc Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 3 May 2009 11:43:31 -0400 Subject: [PATCH 5/7] ztxt cleanup --- src/calibre/ebooks/pdb/ztxt/reader.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 42638669f4..19c04b66b4 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -28,7 +28,6 @@ class HeaderRecord(object): self.num_records, = struct.unpack('>H', raw[2:4]) self.size, = struct.unpack('>L', raw[4:8]) self.record_size, = struct.unpack('>H', raw[8:10]) - self.crc32, = struct.unpack('>L', raw[18:22]) class Reader(FormatReader): @@ -47,10 +46,6 @@ class Reader(FormatReader): # Initalize the decompressor self.uncompressor = zlib.decompressobj() self.uncompressor.decompress(self.section_data(1)) - -# if self.header_record.version not in (1, 2) or self.header_record.uid != 1: -# raise zTXTError('Unknown book version %i.' 
% self.header_record.version) - def section_data(self, number): return self.sections[number] From 5c40057adf085c13fd95b4f9a4e6f8ed5711c546 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 3 May 2009 12:07:45 -0400 Subject: [PATCH 6/7] pdb input: Better reporting of unknown formats --- src/calibre/ebooks/pdb/__init__.py | 28 ++++++++++++++++++++++++++++ src/calibre/ebooks/pdb/input.py | 2 +- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py index 614d610078..a7fb2760fd 100644 --- a/src/calibre/ebooks/pdb/__init__.py +++ b/src/calibre/ebooks/pdb/__init__.py @@ -7,17 +7,45 @@ __docformat__ = 'restructuredtext en' from calibre.ebooks.pdb.ereader.reader import Reader as eReader from calibre.ebooks.pdb.ztxt.reader import Reader as zTXT +#from calibre.ebooks.pdb.palmdoc.reader import Reader as PalmDoc FORMATS = { 'PNPdPPrs' : eReader, 'PNRdPPrs' : eReader, 'zTXTGPlm' : zTXT, +# 'TEXtREAd' : PalmDoc, } IDENTITY_TO_NAME = { 'PNPdPPrs' : 'eReader', 'PNRdPPrs' : 'eReader', 'zTXTGPlm' : 'zTXT', + 'TEXtREAd' : 'PalmDOC', + + '.pdfADBE' : 'Adobe Reader', + 'BVokBDIC' : 'BDicty', + 'DB99DBOS' : 'DB (Database program)', + 'vIMGView' : 'FireViewer (ImageViewer)', + 'PmDBPmDB' : 'HanDBase', + 'InfoINDB' : 'InfoView', + 'ToGoToGo' : 'iSilo', + 'SDocSilX' : 'iSilo 3', + 'JbDbJBas' : 'JFile', + 'JfDbJFil' : 'JFile Pro', + 'DATALSdb' : 'LIST', + 'Mdb1Mdb1' : 'MobileDB', + 'BOOKMOBI' : 'MobiPocket', + 'DataPlkr' : 'Plucker', + 'DataSprd' : 'QuickSheet', + 'SM01SMem' : 'SuperMemo', + 'TEXtTlDc' : 'TealDoc', + 'InfoTlIf' : 'TealInfo', + 'DataTlMl' : 'TealMeal', + 'DataTlPt' : 'TealPaint', + 'dataTDBP' : 'ThinkDB', + 'TdatTide' : 'Tides', + 'ToRaTRPW' : 'TomeRaider', + 'BDOCWrdS' : 'WordSmith', } class PDBError(Exception): diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 24bc8a1025..1a7e32e3eb 100644 --- a/src/calibre/ebooks/pdb/input.py +++ 
b/src/calibre/ebooks/pdb/input.py @@ -24,7 +24,7 @@ class PDBInput(InputFormatPlugin): Reader = get_reader(header.ident) if Reader is None: - raise PDBError('Unknown format in pdb file. Identity is %s' % header.identity) + raise PDBError('No reader avaliable for format within container.\n Identity is %s. Book type is %s' % (header.ident, IDENTITY_TO_NAME.get(header.ident, _('Unknown')))) log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident)) From e447b69bd2365051376fcfa262d19b520328cbe3 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 3 May 2009 12:15:36 -0400 Subject: [PATCH 7/7] palmdoc pdb input. --- src/calibre/ebooks/pdb/__init__.py | 4 +- src/calibre/ebooks/pdb/palmdoc/__init__.py | 0 src/calibre/ebooks/pdb/palmdoc/reader.py | 70 ++++++++++++++++++++++ src/calibre/ebooks/pdb/ztxt/reader.py | 3 +- 4 files changed, 73 insertions(+), 4 deletions(-) create mode 100644 src/calibre/ebooks/pdb/palmdoc/__init__.py create mode 100644 src/calibre/ebooks/pdb/palmdoc/reader.py diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py index a7fb2760fd..1bf8ebeb89 100644 --- a/src/calibre/ebooks/pdb/__init__.py +++ b/src/calibre/ebooks/pdb/__init__.py @@ -7,13 +7,13 @@ __docformat__ = 'restructuredtext en' from calibre.ebooks.pdb.ereader.reader import Reader as eReader from calibre.ebooks.pdb.ztxt.reader import Reader as zTXT -#from calibre.ebooks.pdb.palmdoc.reader import Reader as PalmDoc +from calibre.ebooks.pdb.palmdoc.reader import Reader as PalmDoc FORMATS = { 'PNPdPPrs' : eReader, 'PNRdPPrs' : eReader, 'zTXTGPlm' : zTXT, -# 'TEXtREAd' : PalmDoc, + 'TEXtREAd' : PalmDoc, } IDENTITY_TO_NAME = { diff --git a/src/calibre/ebooks/pdb/palmdoc/__init__.py b/src/calibre/ebooks/pdb/palmdoc/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py new file mode 100644 
index 0000000000..a5a58b4d81 --- /dev/null +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- + +''' +Read content from palmdoc pdb file. +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +import os, struct, zlib + +from calibre.ebooks.pdb.formatreader import FormatReader +from calibre.ebooks.mobi.palmdoc import decompress_doc +from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer + +class HeaderRecord(object): + ''' + The first record in the file is always the header record. It holds + information related to the location of text, images, and so on + in the file. This is used in conjunction with the sections + defined in the file header. + ''' + + def __init__(self, raw): + self.compression, = struct.unpack('>H', raw[0:2]) + self.num_records, = struct.unpack('>H', raw[8:10]) + + +class Reader(FormatReader): + + def __init__(self, header, stream, log, encoding=None): + self.stream = stream + self.log = log + self.encoding = encoding + + self.sections = [] + for i in range(header.num_sections): + self.sections.append(header.section_data(i)) + + self.header_record = HeaderRecord(self.section_data(0)) + + def section_data(self, number): + return self.sections[number] + + def decompress_text(self, number): + if self.header_record.compression == 1: + return self.section_data(number).decode('cp1252' if self.encoding is None else self.encoding) + if self.header_record.compression == 2: + return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) + return '' + + def extract_content(self, output_dir): + txt = '' + + for i in range(1, self.header_record.num_records + 1): + txt += self.decompress_text(i) + + html = txt_to_markdown(txt) + with open(os.path.join(output_dir, 'index.html'), 'wb') as index: + index.write(html.encode('utf-8')) + + from calibre.ebooks.metadata.meta import 
get_metadata + mi = get_metadata(self.stream, 'pdb') + manifest = [('index.html', None)] + spine = ['index.html'] + opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) + + return os.path.join(output_dir, 'metadata.opf') + diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 19c04b66b4..3d96018def 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -8,11 +8,10 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' -import StringIO, os, struct, zlib +import os, struct, zlib from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ztxt import zTXTError -from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer class HeaderRecord(object):