From 68e7e1b1122b7b461dfe90f56062948084d7ee55 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 21 Apr 2009 19:09:03 -0400 Subject: [PATCH 1/4] initial ereader input --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/pdb/__init__.py | 26 +++ src/calibre/ebooks/pdb/ereader/__init__.py | 12 ++ .../ebooks/pdb/ereader/pmlconverter.py | 98 +++++++++ src/calibre/ebooks/pdb/ereader/reader.py | 199 ++++++++++++++++++ src/calibre/ebooks/pdb/header.py | 60 ++++++ src/calibre/ebooks/pdb/input.py | 32 +++ 7 files changed, 429 insertions(+), 1 deletion(-) create mode 100644 src/calibre/ebooks/pdb/__init__.py create mode 100644 src/calibre/ebooks/pdb/ereader/__init__.py create mode 100644 src/calibre/ebooks/pdb/ereader/pmlconverter.py create mode 100644 src/calibre/ebooks/pdb/ereader/reader.py create mode 100644 src/calibre/ebooks/pdb/header.py create mode 100644 src/calibre/ebooks/pdb/input.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 08824a3591..ade60fcc9f 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -278,6 +278,7 @@ class PDFMetadataWriter(MetadataWriterPlugin): from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.mobi.input import MOBIInput +from calibre.ebooks.pdb.input import PDBInput from calibre.ebooks.pdf.input import PDFInput from calibre.ebooks.txt.input import TXTInput from calibre.ebooks.lit.input import LITInput @@ -287,7 +288,7 @@ from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput from calibre.customize.profiles import input_profiles, output_profiles -plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput, +plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] diff --git 
a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py new file mode 100644 index 0000000000..5e51a807e9 --- /dev/null +++ b/src/calibre/ebooks/pdb/__init__.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +from calibre.ebooks.pdb.ereader.reader import Reader as eReader + +FORMATS = { + 'PNPdPPrs' : eReader, + 'PNRdPPrs' : eReader, +} + +class PDBError(Exception): + pass + + +def get_reader(identity): + ''' + Returns None if no reader is found for the identity. + ''' + if identity in FORMATS.keys(): + return FORMATS[identity] + else: + return None diff --git a/src/calibre/ebooks/pdb/ereader/__init__.py b/src/calibre/ebooks/pdb/ereader/__init__.py new file mode 100644 index 0000000000..f2f1761cad --- /dev/null +++ b/src/calibre/ebooks/pdb/ereader/__init__.py @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement +''' +Write content to TXT. +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +class EreaderError(Exception): + pass diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py new file mode 100644 index 0000000000..a85f1c84ac --- /dev/null +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement +''' +Convert pml markup to html +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import re + +from calibre.ebooks.htmlsymbols import HTML_SYMBOLS + +PML_HTML_RULES = [ + (re.compile('\\\\p'), lambda match: '

'), + (re.compile('\\\\x(?P.+?)\\\\x', re.DOTALL), lambda match: '

%s

' % match.group('text')), + (re.compile('\\\\X(?P[0-4])(?P.+?)\\\\X[0-4]', re.DOTALL), lambda match: '%i' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), + (re.compile('\\\\C\d=".+"'), lambda match: ''), # This should be made to create a TOC entry + (re.compile('\\\\c(?P.+?)\\\\c', re.DOTALL), lambda match: '
%s
' % match.group('text')), + (re.compile('\\\\r(?P.+?)\\\\r', re.DOTALL), lambda match: '
%s
' % match.group('text')), + (re.compile('\\\\i(?P.+?)\\\\i', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile('\\\\u(?P.+?)\\\\u', re.DOTALL), lambda match: '
%s
' % match.group('text')), + (re.compile('\\\\o(?P.+?)\\\\o', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile('\\\\v(?P.+?)\\\\v', re.DOTALL), lambda match: '' % match.group('text')), + (re.compile('\\\\t(?P.+?)\\\\t', re.DOTALL), lambda match: '
%s
' % match.group('text')), + (re.compile('\\\\T="(?P\d+%*)"(?P.+?)$', re.MULTILINE), lambda match: '
%s
' % (match.group('val'), match.group('text'))), + (re.compile('\\\\w="(?P\d+)%"'), lambda match: '
' % match.group('val')), + (re.compile('\\\\n'), lambda match: ''), + (re.compile('\\\\s'), lambda match: ''), + (re.compile('\\\\b(?P.+?)\\\\b', re.DOTALL), lambda match: '%s' % match.group('text')), # \b is deprecated; \B should be used instead. + (re.compile('\\\\l(?P.+?)\\\\l', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile('\\\\B(?P.+?)\\\\B', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile('\\\\Sp(?P.+?)\\\\Sp', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile('\\\\Sb(?P.+?)\\\\Sb', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile('\\\\k(?P.+?)\\\\k', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile('\\\\a(?P\d\d\d)'), lambda match: '&#%s;' % match.group('num')), + (re.compile('\\\\U(?P\d\d\d\d)'), lambda match: '&#%i;' % int(match.group('num'))), + (re.compile('\\\\m="(?P.+?)"'), lambda match: '' % match.group('name')), + (re.compile('\\\\q="(?P#.+?)"(?P)\\\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), + (re.compile('\\\\Q="(?P.+?)"'), lambda match: '
' % match.group('target')), + (re.compile('\\\\-'), lambda match: ''), + # Todo: Footnotes need link. + (re.compile('\\\\Fn="(?P.+?)"(?P.+?)\\\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text'))), + (re.compile('\\\\Sd="(?P.+?)"(?P.+?)\\\\Sd'), lambda match: '%s' % (match.group('target'), match.group('text'))), + (re.compile('\\\\I'), lambda match: ''), + + # eReader files are one paragraph per line. + # This forces the lines to wrap properly. + (re.compile('^(?P.+)$', re.MULTILINE), lambda match: '

%s

' % match.group('text')), + + # Remove unmatched pml codes. + (re.compile('(?<=[^\\\\])\\\\[pxcriouvtblBk]'), lambda match: ''), + (re.compile('(?<=[^\\\\])\\\\X[0-4]'), lambda match: ''), + (re.compile('(?<=[^\\\\])\\\\Sp'), lambda match: ''), + (re.compile('(?<=[^\\\\])\\\\Sb'), lambda match: ''), + + # Replace \\ with \. + (re.compile('\\\\\\\\'), lambda match: '\\'), +] + +FOOTNOTE_HTML_RULES = [ + (re.compile('(?P.+?)', re.DOTALL), lambda match: '
%s
') +] + +SIDEBAR_HTML_RULES = [ + (re.compile('(?P.+?)', re.DOTALL), lambda match: '') +] + + +def pml_to_html(pml): + html = pml + for rule in PML_HTML_RULES: + html = rule[0].sub(rule[1], html) + + for symbol in HTML_SYMBOLS.keys(): + if ord(symbol) > 128: + html = html.replace(symbol, HTML_SYMBOLS[symbol][len(HTML_SYMBOLS[symbol]) - 1]) + + return html + +def footnote_to_html(footnotes): + html = footnotes + for rule in FOOTNOTE_HTML_RULES: + html = rule[0].sub(rule[1], html) + + html = pml_to_html(html) + + return html + +def sidebar_to_html(sidebars): + html = sidebars + for rule in FOOTNOTE_HTML_RULES: + html = rule[0].sub(rule[1], html) + + html = pml_to_html(html) + + return html diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py new file mode 100644 index 0000000000..6883649921 --- /dev/null +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -0,0 +1,199 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement +''' +Read content from ereader pdb file. +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os, sys, struct, zlib + +from calibre import CurrentDir +from calibre.ebooks import DRMError +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.pdb.ereader import EreaderError +from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \ + footnote_to_html, sidebar_to_html +from calibre.ebooks.mobi.palmdoc import decompress_doc +from calibre.ebooks.metadata.opf2 import OPFCreator + +class HeaderRecord(object): + ''' + The first record in the file is always the header record. It holds + information related to the location of text, images, and so on + in the file. This is used in conjunction with the sections + defined in the file header. 
+ ''' + + def __init__(self, raw): + self.version, = struct.unpack('>H', raw[0:2]) + self.non_text_offset, = struct.unpack('>H', raw[12:14]) + self.footnote_rec, = struct.unpack('>H', raw[28:30]) + self.sidebar_rec, = struct.unpack('>H', raw[30:32]) + self.bookmark_offset, = struct.unpack('>H', raw[32:34]) + self.image_data_offset, = struct.unpack('>H', raw[40:42]) + self.metadata_offset, = struct.unpack('>H', raw[44:46]) + self.footnote_offset, = struct.unpack('>H', raw[48:50]) + self.sidebar_offset, = struct.unpack('>H', raw[50:52]) + self.last_data_offset, = struct.unpack('>H', raw[52:54]) + + self.num_text_pages = self.non_text_offset -1 + self.num_image_pages = self.metadata_offset - self.image_data_offset + + # Can't tell which is sidebar and footnote if they have same offset. + # They don't exist if offset is larget than last_record. + self.num_footnote_pages = self.sidebar_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0 + self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0 + + +class Reader(object): + + def __init__(self, header, stream): + raw = stream.read() + + self.sections = [] + for i in range(header.num_sections): + self.sections.append(header.section_data(i)) + + self.header_record = HeaderRecord(self.section_data(0)) + + if self.header_record.version not in (2, 10): + if self.header_record.version in (260, 272): + raise DRMError('eReader DRM is not supported.') + else: + raise EreaderError('Unknown book version %i.' 
% self.header_record.version) + + def section_data(self, number): + return self.sections[number] + + def decompress_text(self, number): + if self.header_record.version == 2: + return decompress_doc(self.section_data(number)).decode('cp1252') + if self.header_record.version == 10: + return zlib.decompress(self.section_data(number)).decode('cp1252') + + + def get_image(self, number): + if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: + return 'empty', '' + data = self.section_data(number) + name = data[4:4+32].strip('\0') + img = data[62:] + return name, img + + def get_text_page(self, number): + ''' + Only palmdoc and zlib compressed are supported. The text is + assumed to be encoded as Windows-1252. The encoding is part of + the eReader file spec and should always be this encoding. + ''' + if number < 1 or number > self.header_record.num_text_pages: + return '' + + return self.decompress_text(number) + + def get_footnote_page(self, number): + if number < self.header_record.footnote_offset or number > self.header_record.footnote_offset + self.header_record.num_footnote_pages - 1: + return '' + + return self.decompress_text(number) + + def get_sidebar_page(self, number): + if number < self.header_record.sidebar_offset or number > self.header_record.sidebar_offset + self.header_record.num_sidebar_pages - 1: + return '' + + return self.decompress_text(number) + + def has_footnotes(self): + if self.header_record.num_footnote_pages > 1: + try: + content = self.decompress_text(self.header_record.footnote_offset) + + if content.contains(''): + return True + except: + pass + return False + + def has_sidebar(self): + if self.header_record.num_sidebar_pages > 1: + try: + content = self.decompress_text(self.header_record.sidebar_offset) + + if content.contains(''): + return True + except: + pass + return False + + def extract_content(self, output_dir): + output_dir = 
os.path.abspath(output_dir) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + html = '' + + for i in range(1, self.header_record.num_text_pages + 1): + html += pml_to_html(self.get_text_page(i)) + + # Untested: The num_.._pages variable may not be correct! + # Possibly use .._rec instead? + ''' + if has_footnotes(): + html += '

%s

' % _('Footnotes') + for i in range(self.header_record.footnote_offset, self.header_record.num_footnote_pages): + html += footnote_to_html(self.get_footnote_page(i)) + + if has_sidebar(): + html += '

%s

' % _('Sidebar') + for i in range(self.header_record.sidebar_offset, self.header_record.num_sidebar_pages): + html += sidebar_to_html(self.get_sidebar_page(i)) + ''' + + html += '' + + with CurrentDir(output_dir): + with open('index.html', 'wb') as index: + index.write(html.encode('utf-8')) + + if not os.path.exists(os.path.join(output_dir, 'images/')): + os.makedirs(os.path.join(output_dir, 'images/')) + images = [] + with CurrentDir(os.path.join(output_dir, 'images/')): + for i in range(0, self.header_record.num_image_pages): + name, img = self.get_image(self.header_record.image_data_offset + i) + images.append(name) + with open(name, 'wb') as imgf: + imgf.write(img) + + self.create_opf(output_dir, images) + + return os.path.join(output_dir, 'metadata.opf') + + def create_opf(self, output_dir, images): + mi = MetaInformation(None, None) + + with CurrentDir(output_dir): + opf = OPFCreator(output_dir, mi) + + manifest = [('index.html', None)] + + for i in images: + manifest.append((os.path.join('images/', i), None)) + + opf.create_manifest(manifest) + opf.create_spine(['index.html']) + with open('metadata.opf', 'wb') as opffile: + opf.render(opffile) + + def dump_pml(self): + pml = '' + + for i in range(1, self.header_record.num_text_pages + 1): + pml += self.get_text_page(i) + + return pml + diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py new file mode 100644 index 0000000000..a3aa56a718 --- /dev/null +++ b/src/calibre/ebooks/pdb/header.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement +''' +Read the header data from a pdb file. 
+''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os, struct + +class PdbHeader(object): + + def __init__(self, stream): + self.stream = stream + self.ident = self.identity() + self.num_sections = self.section_count() + self.title = self.name() + + def identity(self): + self.stream.seek(60) + ident = self.stream.read(8) + return ident + + def section_count(self): + self.stream.seek(76) + return struct.unpack('>H', self.stream.read(2))[0] + + def name(self): + self.stream.seek(0) + return self.stream.read(32).replace('\x00', '') + + def full_section_info(self, number): + if number > self.num_sections: + raise ValueError('Not a valid section number %i' % number) + + self.stream.seek(78+number*8) + offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', self.stream.read(8))[0] + flags, val = a1, a2<<16 | a3<<8 | a4 + return (offset, flags, val) + + def section_offset(self, number): + if number > self.num_sections: + raise ValueError('Not a valid section number %i' % number) + + self.stream.seek(78+number*8) + return struct.unpack('>LBBBB', self.stream.read(8))[0] + + def section_data(self, number): + if number > self.num_sections: + raise ValueError('Not a valid section number %i' % number) + + start = self.section_offset(number) + if number == self.num_sections -1: + end = os.stat(self.stream.name).st_size + else: + end = self.section_offset(number + 1) + self.stream.seek(start) + return self.stream.read(end - start) diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py new file mode 100644 index 0000000000..47125f28ab --- /dev/null +++ b/src/calibre/ebooks/pdb/input.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import InputFormatPlugin +from calibre.ebooks.pdb.header import PdbHeader +from 
calibre.ebooks.pdb import PDBError, get_reader + +class PDBInput(InputFormatPlugin): + + name = 'PDB Input' + author = 'John Schember' + description = 'Convert PDB to HTML' + file_types = set(['pdb']) + + def convert(self, stream, options, file_ext, log, + accelerators): + header = PdbHeader(stream) + Reader = get_reader(header.ident) + + if Reader is None: + raise PDBError('Unknown format identity is %s' % header.identity) + + reader = Reader(header, stream) + opf = reader.extract_content(os.getcwd()) + + return opf From e968f529dab1949ef65c840107c77bf36b8aeec1 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 21 Apr 2009 19:37:37 -0400 Subject: [PATCH 2/4] Working eReader input. --- src/calibre/ebooks/pdb/__init__.py | 5 +++++ src/calibre/ebooks/pdb/ereader/pmlconverter.py | 1 - src/calibre/ebooks/pdb/ereader/reader.py | 15 ++++++++++++--- src/calibre/ebooks/pdb/input.py | 8 +++++--- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py index 5e51a807e9..8c4f45337f 100644 --- a/src/calibre/ebooks/pdb/__init__.py +++ b/src/calibre/ebooks/pdb/__init__.py @@ -12,6 +12,11 @@ FORMATS = { 'PNRdPPrs' : eReader, } +IDENTITY_TO_NAME = { + 'PNPdPPrs' : 'eReader', + 'PNRdPPrs' : 'eReader', +} + class PDBError(Exception): pass diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index a85f1c84ac..454510f699 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -40,7 +40,6 @@ PML_HTML_RULES = [ (re.compile('\\\\q="(?P#.+?)"(?P)\\\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), (re.compile('\\\\Q="(?P.+?)"'), lambda match: '
' % match.group('target')), (re.compile('\\\\-'), lambda match: ''), - # Todo: Footnotes need link. (re.compile('\\\\Fn="(?P.+?)"(?P.+?)\\\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text'))), (re.compile('\\\\Sd="(?P.+?)"(?P.+?)\\\\Sd'), lambda match: '%s' % (match.group('target'), match.group('text'))), (re.compile('\\\\I'), lambda match: ''), diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 6883649921..9354787447 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -44,13 +44,15 @@ class HeaderRecord(object): # Can't tell which is sidebar and footnote if they have same offset. # They don't exist if offset is larget than last_record. + # Todo: Determine if the subtraction is necessary and find out + # what _rec means. self.num_footnote_pages = self.sidebar_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0 self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0 class Reader(object): - def __init__(self, header, stream): + def __init__(self, header, stream, log): raw = stream.read() self.sections = [] @@ -169,9 +171,9 @@ class Reader(object): with open(name, 'wb') as imgf: imgf.write(img) - self.create_opf(output_dir, images) + opf_path = self.create_opf(output_dir, images) - return os.path.join(output_dir, 'metadata.opf') + return opf_path def create_opf(self, output_dir, images): mi = MetaInformation(None, None) @@ -188,6 +190,8 @@ class Reader(object): opf.create_spine(['index.html']) with open('metadata.opf', 'wb') as opffile: opf.render(opffile) + + return os.path.join(output_dir, 'metadata.opf') def dump_pml(self): pml = '' @@ -197,3 +201,8 @@ class Reader(object): return pml + +class EreaderMetadata(object): + + def __init__(self, record): + pass diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 
47125f28ab..d64e2aa51b 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -9,7 +9,7 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.pdb.header import PdbHeader -from calibre.ebooks.pdb import PDBError, get_reader +from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader class PDBInput(InputFormatPlugin): @@ -24,9 +24,11 @@ class PDBInput(InputFormatPlugin): Reader = get_reader(header.ident) if Reader is None: - raise PDBError('Unknown format identity is %s' % header.identity) + raise PDBError('Unknown format in pdb file. Identity is %s' % header.identity) + + log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident)) - reader = Reader(header, stream) + reader = Reader(header, stream, log) opf = reader.extract_content(os.getcwd()) return opf From 3bbd277d2b95f2b539a11362a1be128bbb818de9 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 22 Apr 2009 07:30:22 -0400 Subject: [PATCH 3/4] ereader reader debug output --- src/calibre/ebooks/pdb/ereader/__init__.py | 3 --- src/calibre/ebooks/pdb/ereader/pmlconverter.py | 2 +- src/calibre/ebooks/pdb/ereader/reader.py | 18 ++++++++++++------ src/calibre/ebooks/pdb/header.py | 6 +++--- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/__init__.py b/src/calibre/ebooks/pdb/ereader/__init__.py index f2f1761cad..89d9dfdd35 100644 --- a/src/calibre/ebooks/pdb/ereader/__init__.py +++ b/src/calibre/ebooks/pdb/ereader/__init__.py @@ -1,8 +1,5 @@ # -*- coding: utf-8 -*- from __future__ import with_statement -''' -Write content to TXT. 
-''' __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index 454510f699..250b74eb56 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import with_statement ''' -Convert pml markup to html +Convert pml markup to and from html ''' __license__ = 'GPL v3' diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 9354787447..f6bbc3d23f 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -46,15 +46,16 @@ class HeaderRecord(object): # They don't exist if offset is larget than last_record. # Todo: Determine if the subtraction is necessary and find out # what _rec means. - self.num_footnote_pages = self.sidebar_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0 + end_footnote_offset = self.sidebar_offset if self.sidebar_offset != self.footnote_offset else self.last_data_offset + self.num_footnote_pages = end_footnote_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0 self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0 class Reader(object): def __init__(self, header, stream, log): - raw = stream.read() - + self.log = log + self.sections = [] for i in range(header.num_sections): self.sections.append(header.section_data(i)) @@ -91,19 +92,19 @@ class Reader(object): assumed to be encoded as Windows-1252. The encoding is part of the eReader file spec and should always be this encoding. 
''' - if number < 1 or number > self.header_record.num_text_pages: + if number not in range(1, self.header_record.num_text_pages): return '' return self.decompress_text(number) def get_footnote_page(self, number): - if number < self.header_record.footnote_offset or number > self.header_record.footnote_offset + self.header_record.num_footnote_pages - 1: + if number not in range(self.header_record.footnote_offset, self.header_record.footnote_offset + self.header_record.num_footnote_pages): return '' return self.decompress_text(number) def get_sidebar_page(self, number): - if number < self.header_record.sidebar_offset or number > self.header_record.sidebar_offset + self.header_record.num_sidebar_pages - 1: + if number not in range(self.header_record.sidebar_offset, self.header_record.sidebar_offset + self.header_record.num_sidebar_pages - 1): return '' return self.decompress_text(number) @@ -139,6 +140,7 @@ class Reader(object): html = '' for i in range(1, self.header_record.num_text_pages + 1): + self.log.debug('Extracting text page %i' % i) html += pml_to_html(self.get_text_page(i)) # Untested: The num_.._pages variable may not be correct! @@ -147,11 +149,13 @@ class Reader(object): if has_footnotes(): html += '

%s

' % _('Footnotes') for i in range(self.header_record.footnote_offset, self.header_record.num_footnote_pages): + self.log.debug('Extracting footnote page %i' % i) html += footnote_to_html(self.get_footnote_page(i)) if has_sidebar(): html += '

%s

' % _('Sidebar') for i in range(self.header_record.sidebar_offset, self.header_record.num_sidebar_pages): + self.log.debug('Extracting sidebar page %i' % i) html += sidebar_to_html(self.get_sidebar_page(i)) ''' @@ -159,6 +163,7 @@ class Reader(object): with CurrentDir(output_dir): with open('index.html', 'wb') as index: + self.log.debug('Writing text to index.html') index.write(html.encode('utf-8')) if not os.path.exists(os.path.join(output_dir, 'images/')): @@ -169,6 +174,7 @@ class Reader(object): name, img = self.get_image(self.header_record.image_data_offset + i) images.append(name) with open(name, 'wb') as imgf: + self.log.debug('Writing image %s to images/' % name) imgf.write(img) opf_path = self.create_opf(output_dir, images) diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index a3aa56a718..efa727dac9 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -32,7 +32,7 @@ class PdbHeader(object): return self.stream.read(32).replace('\x00', '') def full_section_info(self, number): - if number > self.num_sections: + if number not in range(0, self.num_sections): raise ValueError('Not a valid section number %i' % number) self.stream.seek(78+number*8) @@ -41,14 +41,14 @@ class PdbHeader(object): return (offset, flags, val) def section_offset(self, number): - if number > self.num_sections: + if number not in range(0, self.num_sections): raise ValueError('Not a valid section number %i' % number) self.stream.seek(78+number*8) return struct.unpack('>LBBBB', self.stream.read(8))[0] def section_data(self, number): - if number > self.num_sections: + if number not in range(0, self.num_sections): raise ValueError('Not a valid section number %i' % number) start = self.section_offset(number) From f158c9c6430821568cdfd6a58ac7a08e948c8a93 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 22 Apr 2009 08:04:19 -0400 Subject: [PATCH 4/4] Interface for pdb format readers. 
PDB: support user input encodings --- src/calibre/ebooks/pdb/ereader/reader.py | 10 ++++++---- src/calibre/ebooks/pdb/formatreader.py | 18 ++++++++++++++++++ src/calibre/ebooks/pdb/input.py | 2 +- 3 files changed, 25 insertions(+), 5 deletions(-) create mode 100644 src/calibre/ebooks/pdb/formatreader.py diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index f6bbc3d23f..b696005e85 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -13,6 +13,7 @@ import os, sys, struct, zlib from calibre import CurrentDir from calibre.ebooks import DRMError from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ereader import EreaderError from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \ footnote_to_html, sidebar_to_html @@ -51,10 +52,11 @@ class HeaderRecord(object): self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0 -class Reader(object): +class Reader(FormatReader): - def __init__(self, header, stream, log): + def __init__(self, header, stream, log, encoding=None): self.log = log + self.encoding = encoding self.sections = [] for i in range(header.num_sections): @@ -73,9 +75,9 @@ class Reader(object): def decompress_text(self, number): if self.header_record.version == 2: - return decompress_doc(self.section_data(number)).decode('cp1252') + return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) if self.header_record.version == 10: - return zlib.decompress(self.section_data(number)).decode('cp1252') + return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) def get_image(self, number): diff --git a/src/calibre/ebooks/pdb/formatreader.py b/src/calibre/ebooks/pdb/formatreader.py new file mode 100644 index 
0000000000..25abb462cf --- /dev/null +++ b/src/calibre/ebooks/pdb/formatreader.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement +''' +Interface defining the necessary public functions for a pdb format reader. +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + + +class FormatReader(object): + + def __init__(self, header, stream, log, encoding=None): + raise NotImplementedError() + + def extract_content(self, output_dir): + raise NotImplementedError() diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index d64e2aa51b..9d848b1c24 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -28,7 +28,7 @@ class PDBInput(InputFormatPlugin): log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident)) - reader = Reader(header, stream, log) + reader = Reader(header, stream, log, options.input_encoding) opf = reader.extract_content(os.getcwd()) return opf