eReader format support from driver-dev

2025-07-09 03:04:10 -04:00 · 2009-04-22 14:38:40 -07:00 · 2009-04-22 14:38:40 -07:00 · b93029a4fe
commit b93029a4fe
parent 2905b9aedb f158c9c643
8 changed files with 467 additions and 1 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -278,6 +278,7 @@ class PDFMetadataWriter(MetadataWriterPlugin):
 from calibre.ebooks.epub.input import EPUBInput
 from calibre.ebooks.mobi.input import MOBIInput
 from calibre.ebooks.pdb.input import PDBInput
 from calibre.ebooks.pdf.input import PDFInput
 from calibre.ebooks.txt.input import TXTInput
 from calibre.ebooks.lit.input import LITInput
@ -290,7 +291,7 @@ from calibre.ebooks.txt.output import TXTOutput
 from calibre.ebooks.pdf.output import PDFOutput
 from calibre.customize.profiles import input_profiles, output_profiles
-plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
+plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput,
        TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput,
        FB2Input, ODTInput, RTFInput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
--- a/src/calibre/ebooks/pdb/init.py
+++ b/src/calibre/ebooks/pdb/init.py
@ -0,0 +1,31 @@
 # -*- coding: utf-8 -*-
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 from calibre.ebooks.pdb.ereader.reader import Reader as eReader
 # Maps the identity string read from a pdb header to the reader class
 # that handles the format. Looked up by get_reader().
 FORMATS = {
    'PNPdPPrs' : eReader,
    'PNRdPPrs' : eReader,
 }
 # Maps the same identity strings to a human readable format name.
 IDENTITY_TO_NAME = {
    'PNPdPPrs' : 'eReader',
    'PNRdPPrs' : 'eReader',
 }
 class PDBError(Exception):
    '''
    Error raised for problems handling a pdb file.
    '''
 def get_reader(identity):
    '''
    Look up the reader class registered in FORMATS for a pdb identity.

    Returns None if no reader is found for the identity.
    '''
    # A single dictionary lookup replaces the membership test on
    # FORMATS.keys() followed by a second indexing operation.
    return FORMATS.get(identity)
--- a/src/calibre/ebooks/pdb/ereader/init.py
+++ b/src/calibre/ebooks/pdb/ereader/init.py
@ -0,0 +1,9 @@
 # -*- coding: utf-8 -*-
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 class EreaderError(Exception):
    '''
    Error raised for problems specific to eReader books.
    '''
--- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py
+++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py
@ -0,0 +1,97 @@
 # -*- coding: utf-8 -*-
 from __future__ import with_statement
 '''
 Convert pml markup to and from html
 '''
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 import re
 from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
 # Ordered list of (compiled pattern, replacement function) pairs used to
 # turn pml markup into html. The rules are applied one after another, so
 # their order matters. Raw strings are used for the patterns: a pml code
 # starts with a single backslash, which is written \\ in the regex.
 PML_HTML_RULES = [
    (re.compile(r'\\p'), lambda match: '<br /><br style="page-break-after: always;" />'),
    (re.compile(r'\\x(?P<text>.+?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text')),
    # The heading text is a string, so it has to be formatted with %s.
    (re.compile(r'\\X(?P<val>[0-4])(?P<text>.+?)\\X[0-4]', re.DOTALL), lambda match: '<h%i style="page-break-before: always;">%s</h%i>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
    # Non greedy so only the quoted title is removed, not everything up
    # to the last quote on the line.
    (re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry
    (re.compile(r'\\c(?P<text>.+?)\\c', re.DOTALL), lambda match: '<div style="text-align: center; display: block; margin: auto;">%s</div>' % match.group('text')),
    (re.compile(r'\\r(?P<text>.+?)\\r', re.DOTALL), lambda match: '<div style="text-align: right; display: block;">%s</div>' % match.group('text')),
    (re.compile(r'\\i(?P<text>.+?)\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text')),
    (re.compile(r'\\u(?P<text>.+?)\\u', re.DOTALL), lambda match: '<div style="text-decoration: underline;">%s</div>' % match.group('text')),
    (re.compile(r'\\o(?P<text>.+?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text')),
    (re.compile(r'\\v(?P<text>.+?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text')),
    (re.compile(r'\\t(?P<text>.+?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%">%s</div>' % match.group('text')),
    # Only the digits are captured; the percent sign is written by the
    # template (%% is a literal percent sign).
    (re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.+?)$', re.MULTILINE), lambda match: '<div style="margin-left: %s%%">%s</div>' % (match.group('val'), match.group('text'))),
    (re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')),
    (re.compile(r'\\n'), lambda match: ''),
    (re.compile(r'\\s'), lambda match: ''),
    (re.compile(r'\\b(?P<text>.+?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')), # \b is deprecated; \B should be used instead.
    (re.compile(r'\\l(?P<text>.+?)\\l', re.DOTALL), lambda match: '<big>%s</big>' % match.group('text')),
    (re.compile(r'\\B(?P<text>.+?)\\B', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')),
    (re.compile(r'\\Sp(?P<text>.+?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text')),
    (re.compile(r'\\Sb(?P<text>.+?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text')),
    (re.compile(r'\\k(?P<text>.+?)\\k', re.DOTALL), lambda match: '<small>%s</small>' % match.group('text')),
    (re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
    # NOTE(review): the four digits are read as a decimal number here;
    # confirm against the pml specification whether they are hexadecimal.
    (re.compile(r'\\U(?P<num>\d\d\d\d)'), lambda match: '&#%i;' % int(match.group('num'))),
    (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % match.group('name')),
    # The link text group must capture something, otherwise only links
    # without any text would ever match.
    (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))),
    (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')),
    (re.compile(r'\\-'), lambda match: ''),
    (re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.+?)\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text'))),
    (re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.+?)\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text'))),
    (re.compile(r'\\I'), lambda match: ''),
    # eReader files are one paragraph per line.
    # This forces the lines to wrap properly.
    (re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')),
    # Remove unmatched pml codes.
    (re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''),
    (re.compile(r'(?<=[^\\])\\X[0-4]'), lambda match: ''),
    (re.compile(r'(?<=[^\\])\\Sp'), lambda match: ''),
    (re.compile(r'(?<=[^\\])\\Sb'), lambda match: ''),
    # Replace \\ with \.
    (re.compile(r'\\\\'), lambda match: '\\'),
 ]
 # Rules turning <footnote id="..."> elements into html divs whose id
 # matches the #footnote-... links produced by the \Fn rule.
 FOOTNOTE_HTML_RULES = [
    # The captured id and text have to be interpolated into the template;
    # returning the bare template would emit a literal '%s'.
    (re.compile('<footnote id="(?P<id>.+?)">(?P<text>.+?)</footnote>', re.DOTALL), lambda match: '<div id="footnote-%s">%s</div>' % (match.group('id'), match.group('text')))
 ]
 # Rules turning <sidebar id="..."> elements into html divs whose id
 # matches the #sidebar-... links produced by the \Sd rule.
 SIDEBAR_HTML_RULES = [
    # The captured id and text have to be interpolated into the template;
    # returning the bare template would emit a literal '%s'.
    (re.compile('<sidebar id="(?P<id>.+?)">(?P<text>.+?)</sidebar>', re.DOTALL), lambda match: '<div id="sidebar-%s">%s</div>' % (match.group('id'), match.group('text')))
 ]
 def pml_to_html(pml):
    '''
    Convert pml markup to html.

    Every rule in PML_HTML_RULES is applied in order. Afterwards each
    key of HTML_SYMBOLS whose code point is above 128 is replaced by
    the last element of its HTML_SYMBOLS entry.
    '''
    html = pml
    for pattern, replacement in PML_HTML_RULES:
        html = pattern.sub(replacement, html)
    for symbol, names in HTML_SYMBOLS.items():
        if ord(symbol) > 128:
            html = html.replace(symbol, names[-1])
    return html
 def footnote_to_html(footnotes):
    '''
    Convert footnote markup to html.

    The FOOTNOTE_HTML_RULES are applied first, the result is then run
    through pml_to_html.
    '''
    converted = footnotes
    for pattern, replacement in FOOTNOTE_HTML_RULES:
        converted = pattern.sub(replacement, converted)
    return pml_to_html(converted)
 def sidebar_to_html(sidebars):
    '''
    Convert sidebar markup to html.

    The SIDEBAR_HTML_RULES are applied first, the result is then run
    through pml_to_html.
    '''
    html = sidebars
    # Sidebars need their own rules; the footnote rules only match
    # <footnote> elements and would leave <sidebar> elements untouched.
    for pattern, replacement in SIDEBAR_HTML_RULES:
        html = pattern.sub(replacement, html)
    return pml_to_html(html)
--- a/src/calibre/ebooks/pdb/ereader/reader.py
+++ b/src/calibre/ebooks/pdb/ereader/reader.py
@ -0,0 +1,216 @@
 # -*- coding: utf-8 -*-
 from __future__ import with_statement
 '''
 Read content from ereader pdb file.
 '''
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 import os, sys, struct, zlib
 from calibre import CurrentDir
 from calibre.ebooks import DRMError
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.pdb.formatreader import FormatReader
 from calibre.ebooks.pdb.ereader import EreaderError
 from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \
    footnote_to_html, sidebar_to_html 
 from calibre.ebooks.mobi.palmdoc import decompress_doc
 from calibre.ebooks.metadata.opf2 import OPFCreator
 class HeaderRecord(object):
    '''
    The first record in the file is always the header record. It holds
    information related to the location of text, images, and so on
    in the file. This is used in conjunction with the sections
    defined in the file header.

    Every field is read as a big-endian unsigned 16 bit integer from a
    fixed position in the raw record. The highest position read is 52,
    so the record has to be at least 54 bytes long.
    '''
    def __init__(self, raw):
        '''
        :param raw: The raw data of the header record.

        Raises struct.error if raw is too short for a field.
        '''
        self.version = self._read_short(raw, 0)
        self.non_text_offset = self._read_short(raw, 12)
        self.footnote_rec = self._read_short(raw, 28)
        self.sidebar_rec = self._read_short(raw, 30)
        self.bookmark_offset = self._read_short(raw, 32)
        self.image_data_offset = self._read_short(raw, 40)
        self.metadata_offset = self._read_short(raw, 44)
        self.footnote_offset = self._read_short(raw, 48)
        self.sidebar_offset = self._read_short(raw, 50)
        self.last_data_offset = self._read_short(raw, 52)
        self.num_text_pages = self.non_text_offset - 1
        self.num_image_pages = self.metadata_offset - self.image_data_offset
        # Can't tell which is sidebar and footnote if they have same offset.
        # They don't exist if offset is larger than last_record.
        # Todo: Determine if the subtraction is necessary and find out
        # what _rec means.
        self.num_footnote_pages = self._page_count(self.footnote_offset,
            self.sidebar_offset, self.last_data_offset)
        self.num_sidebar_pages = self._page_count(self.sidebar_offset,
            self.footnote_offset, self.last_data_offset)
    @staticmethod
    def _read_short(raw, position):
        # Read one big-endian unsigned 16 bit value starting at position.
        return struct.unpack('>H', raw[position:position + 2])[0]
    @staticmethod
    def _page_count(start, other_start, last):
        '''
        Number of records in the region beginning at start.

        The region ends where the other region begins when that start
        lies strictly between start and last, otherwise it runs up to
        last. A start at or beyond last means the region does not
        exist and 0 is returned, so the count is never negative.
        '''
        if start >= last:
            return 0
        end = other_start if start < other_start < last else last
        return end - start
 class Reader(FormatReader):
    '''
    Reads the text and images out of an eReader pdb file.

    The data of every section is loaded when the reader is created.
    Section 0 is parsed as a HeaderRecord, which tells where the text,
    image, footnote and sidebar records are located.
    '''
    def __init__(self, header, stream, log, encoding=None):
        '''
        :param header: Must provide num_sections and section_data(number).
        :param stream: Not used here, all data is read through header.
        :param log: Log object, only its debug method is used.
        :param encoding: Encoding of the text. cp1252 is used when None.

        Raises DRMError for book versions 260 and 272 and EreaderError
        for any other version that is not 2 or 10.
        '''
        self.log = log
        self.encoding = encoding
        self.sections = [header.section_data(i) for i in range(header.num_sections)]
        self.header_record = HeaderRecord(self.section_data(0))
        version = self.header_record.version
        if version not in (2, 10):
            if version in (260, 272):
                raise DRMError('eReader DRM is not supported.')
            raise EreaderError('Unknown book version %i.' % version)
    def section_data(self, number):
        '''
        Return the raw data of the section with the given number.
        '''
        return self.sections[number]
    def decompress_text(self, number):
        '''
        Decompress and decode the section with the given number.

        Version 2 books are decompressed with decompress_doc, version 10
        books with zlib. None is returned for any other version.
        '''
        version = self.header_record.version
        if version == 2:
            data = decompress_doc(self.section_data(number))
        elif version == 10:
            data = zlib.decompress(self.section_data(number))
        else:
            return None
        return data.decode('cp1252' if self.encoding is None else self.encoding)
    def get_image(self, number):
        '''
        Return (name, data) for the image stored in section number.

        The name is taken from bytes 4 to 36 of the section with NUL
        characters stripped, the image data starts at byte 62.
        ('empty', '') is returned if number is not an image section.
        '''
        first = self.header_record.image_data_offset
        if not first <= number < first + self.header_record.num_image_pages:
            return 'empty', ''
        data = self.section_data(number)
        name = data[4:4+32].strip('\0')
        img = data[62:]
        return name, img
    def get_text_page(self, number):
        '''
        Only palmdoc and zlib compressed are supported. The text is
        assumed to be encoded as Windows-1252. The encoding is part of
        the eReader file spec and should always be this encoding.

        Returns an empty string if number is not a text page.
        '''
        # Text pages are numbered 1 to num_text_pages inclusive. This is
        # the range extract_content and dump_pml iterate over, so the
        # last page must not be rejected here.
        if not 1 <= number <= self.header_record.num_text_pages:
            return ''
        return self.decompress_text(number)
    def get_footnote_page(self, number):
        '''
        Return the text of footnote page number or an empty string if
        number is not a footnote page.
        '''
        first = self.header_record.footnote_offset
        if not first <= number < first + self.header_record.num_footnote_pages:
            return ''
        return self.decompress_text(number)
    def get_sidebar_page(self, number):
        '''
        Return the text of sidebar page number or an empty string if
        number is not a sidebar page.
        '''
        # Same bounds as for footnote pages; the last sidebar page is
        # included.
        first = self.header_record.sidebar_offset
        if not first <= number < first + self.header_record.num_sidebar_pages:
            return ''
        return self.decompress_text(number)
    def _page_has_marker(self, page_count, offset, marker):
        # True if the page at offset can be decompressed and contains
        # marker.
        # NOTE(review): the threshold of more than one page is kept as it
        # was; confirm whether a single page should count as well.
        if page_count > 1:
            try:
                return marker in self.decompress_text(offset)
            except Exception:
                # Best effort: a page that cannot be read is treated as
                # not containing the marker.
                pass
        return False
    def has_footnotes(self):
        '''
        True if the first footnote page contains a closing footnote tag.
        '''
        return self._page_has_marker(self.header_record.num_footnote_pages,
            self.header_record.footnote_offset, '</footnote>')
    def has_sidebar(self):
        '''
        True if the first sidebar page contains a closing sidebar tag.
        '''
        return self._page_has_marker(self.header_record.num_sidebar_pages,
            self.header_record.sidebar_offset, '</sidebar>')
    def extract_content(self, output_dir):
        '''
        Write index.html, the images directory and metadata.opf into
        output_dir, creating the directories if necessary.

        Returns the path to the opf file.
        '''
        output_dir = os.path.abspath(output_dir)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # The pieces are collected and joined once instead of growing a
        # string page by page.
        parts = ['<html><head><title></title></head><body>']
        for i in range(1, self.header_record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            parts.append(pml_to_html(self.get_text_page(i)))
        # Footnote and sidebar extraction is disabled.
        # Untested: The num_.._pages variable may not be correct!
        # Possibly use .._rec instead?
        #
        # if self.has_footnotes():
        #     parts.append('<br /><h1>%s</h1>' % _('Footnotes'))
        #     for i in range(self.header_record.footnote_offset, self.header_record.num_footnote_pages):
        #         self.log.debug('Extracting footnote page %i' % i)
        #         parts.append(footnote_to_html(self.get_footnote_page(i)))
        # if self.has_sidebar():
        #     parts.append('<br /><h1>%s</h1>' % _('Sidebar'))
        #     for i in range(self.header_record.sidebar_offset, self.header_record.num_sidebar_pages):
        #         self.log.debug('Extracting sidebar page %i' % i)
        #         parts.append(sidebar_to_html(self.get_sidebar_page(i)))
        parts.append('</body></html>')
        html = ''.join(parts)
        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))
        images_dir = os.path.join(output_dir, 'images/')
        if not os.path.exists(images_dir):
            os.makedirs(images_dir)
        images = []
        with CurrentDir(images_dir):
            for i in range(0, self.header_record.num_image_pages):
                name, img = self.get_image(self.header_record.image_data_offset + i)
                images.append(name)
                # NOTE(review): name is read from the book file and used
                # as a file name without any checking.
                with open(name, 'wb') as imgf:
                    self.log.debug('Writing image %s to images/' % name)
                    imgf.write(img)
        opf_path = self.create_opf(output_dir, images)
        return opf_path
    def create_opf(self, output_dir, images):
        '''
        Write metadata.opf into output_dir. The manifest lists index.html
        and every image, the spine contains only index.html.

        Returns the path to the opf file.
        '''
        mi = MetaInformation(None, None)
        with CurrentDir(output_dir):
            opf = OPFCreator(output_dir, mi)
            manifest = [('index.html', None)]
            manifest.extend((os.path.join('images/', i), None) for i in images)
            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            with open('metadata.opf', 'wb') as opffile:
                opf.render(opffile)
        return os.path.join(output_dir, 'metadata.opf')
    def dump_pml(self):
        '''
        Return the pml markup of all text pages joined together.
        '''
        return ''.join(self.get_text_page(i)
            for i in range(1, self.header_record.num_text_pages + 1))
 class EreaderMetadata(object):
    '''
    Placeholder for the metadata of an eReader book. Nothing is read
    from the record yet.
    '''
    def __init__(self, record):
        '''
        Accepts a metadata record and ignores it.
        '''
--- a/src/calibre/ebooks/pdb/formatreader.py
+++ b/src/calibre/ebooks/pdb/formatreader.py
@ -0,0 +1,18 @@
 # -*- coding: utf-8 -*-
 from __future__ import with_statement
 '''
 Interface defining the necessary public functions for a pdb format reader.
 '''
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 class FormatReader(object):
    '''
    Base class listing the methods a pdb format reader has to provide.
    Both methods raise NotImplementedError and must be overridden.
    '''
    def __init__(self, header, stream, log, encoding=None):
        '''
        Set up the reader. Has to be implemented by the format reader.
        '''
        raise NotImplementedError()
    def extract_content(self, output_dir):
        '''
        Extract the book into output_dir. Has to be implemented by the
        format reader.
        '''
        raise NotImplementedError()
--- a/src/calibre/ebooks/pdb/header.py
+++ b/src/calibre/ebooks/pdb/header.py
@ -0,0 +1,60 @@
 # -*- coding: utf-8 -*-
 from __future__ import with_statement
 '''
 Read the header data from a pdb file.
 '''
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 import os, struct
 class PdbHeader(object):
    '''
    Reads the header data of a pdb file from a seekable stream.

    The identity is the 8 bytes at position 60, the number of sections
    is the big-endian 16 bit value at position 76 and the title is the
    first 32 bytes with NUL characters removed. The section table
    starts at position 78 and uses 8 bytes per section.
    '''
    def __init__(self, stream):
        '''
        :param stream: Seekable file like object holding the pdb file.
        '''
        self.stream = stream
        self.ident = self.identity()
        self.num_sections = self.section_count()
        self.title = self.name()
    def identity(self):
        '''
        Return the 8 byte identity of the file.
        '''
        self.stream.seek(60)
        return self.stream.read(8)
    def section_count(self):
        '''
        Return the number of sections in the file.
        '''
        self.stream.seek(76)
        return struct.unpack('>H', self.stream.read(2))[0]
    def name(self):
        '''
        Return the first 32 bytes of the file with NUL characters removed.
        '''
        self.stream.seek(0)
        return self.stream.read(32).replace('\x00', '')
    def _read_section_entry(self, number):
        # Validate the section number and return its unpacked 8 byte
        # entry of the section table as (offset, a1, a2, a3, a4).
        if not 0 <= number < self.num_sections:
            raise ValueError('Not a valid section number %i' % number)
        self.stream.seek(78+number*8)
        return struct.unpack('>LBBBB', self.stream.read(8))
    def full_section_info(self, number):
        '''
        Return (offset, flags, val) for the section. flags is the first
        byte after the offset, val is built from the remaining 3 bytes.

        Raises ValueError if number is not a valid section number.
        '''
        # All five values of the entry are needed here, not only the
        # first one.
        offset, a1, a2, a3, a4 = self._read_section_entry(number)
        flags, val = a1, a2<<16 | a3<<8 | a4
        return (offset, flags, val)
    def section_offset(self, number):
        '''
        Return the position in the stream where the section starts.

        Raises ValueError if number is not a valid section number.
        '''
        return self._read_section_entry(number)[0]
    def section_data(self, number):
        '''
        Return the raw data of the section. A section ends where the
        next one starts; the last section runs to the end of the stream.

        Raises ValueError if number is not a valid section number.
        '''
        start = self.section_offset(number)
        if number == self.num_sections - 1:
            # Ask the stream itself for its size so that streams which
            # are not backed by a named file work as well.
            self.stream.seek(0, os.SEEK_END)
            end = self.stream.tell()
        else:
            end = self.section_offset(number + 1)
        self.stream.seek(start)
        return self.stream.read(end - start)
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@ -0,0 +1,34 @@
 # -*- coding: utf-8 -*-
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 import os
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.pdb.header import PdbHeader
 from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
 class PDBInput(InputFormatPlugin):
    '''
    Input plugin for pdb files. The reader is chosen by the identity
    found in the pdb header.
    '''
    name        = 'PDB Input'
    author      = 'John Schember'
    description = 'Convert PDB to HTML'
    file_types  = set(['pdb'])
    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Extract the book in stream into the current working directory.

        Returns whatever the reader's extract_content returns, for the
        eReader reader the path to the opf file. Raises PDBError if no
        reader is known for the identity of the file.
        '''
        header = PdbHeader(stream)
        Reader = get_reader(header.ident)
        if Reader is None:
            # header.identity is a method; the value read from the file
            # is stored in header.ident.
            raise PDBError('Unknown format in pdb file. Identity is %s' % header.ident)
        log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident))
        reader = Reader(header, stream, log, options.input_encoding)
        opf = reader.extract_content(os.getcwd())
        return opf
        return opf