initial ereader input

2025-08-11 09:13:57 -04:00 · 2009-04-21 19:09:03 -04:00 · 2009-04-21 19:09:03 -04:00 · 68e7e1b112
commit 68e7e1b112
parent ac9f766a8d
7 changed files with 429 additions and 1 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -278,6 +278,7 @@ class PDFMetadataWriter(MetadataWriterPlugin):

 from calibre.ebooks.epub.input import EPUBInput
 from calibre.ebooks.mobi.input import MOBIInput
+from calibre.ebooks.pdb.input import PDBInput
 from calibre.ebooks.pdf.input import PDFInput
 from calibre.ebooks.txt.input import TXTInput
 from calibre.ebooks.lit.input import LITInput
@ -287,7 +288,7 @@ from calibre.ebooks.txt.output import TXTOutput
 from calibre.ebooks.pdf.output import PDFOutput
 from calibre.customize.profiles import input_profiles, output_profiles

-plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
+plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput,
        TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
                                        x.__name__.endswith('MetadataReader')]
--- a/src/calibre/ebooks/pdb/__init__.py
+++ b/src/calibre/ebooks/pdb/__init__.py
@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.ebooks.pdb.ereader.reader import Reader as eReader
+
+# Identity string -> reader class. get_reader() looks identities up here.
+# Both identities listed are handled by the same eReader reader class.
+FORMATS = dict.fromkeys(('PNPdPPrs', 'PNRdPPrs'), eReader)
+
+class PDBError(Exception):
+    '''Error raised for problems with PDB files.'''
+    
+
+def get_reader(identity):
+    '''
+    Look up the reader class registered for a PDB identity.
+
+    :param identity: identity string to look up in ``FORMATS``.
+    :return: the matching reader class, or None if no reader is found
+             for the identity.
+    '''
+    # dict.get performs a single lookup and already defaults to None.
+    return FORMATS.get(identity)
--- a/src/calibre/ebooks/pdb/ereader/__init__.py
+++ b/src/calibre/ebooks/pdb/ereader/__init__.py
@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+from __future__ import with_statement
+'''
+Support for eReader PDB books.
+'''
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+class EreaderError(Exception):
+    '''Error raised for problems specific to eReader books.'''
--- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py
+++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py
@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+from __future__ import with_statement
+'''
+Convert pml markup to html
+'''
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
+
+# Ordered (compiled pattern, replacement callable) pairs that rewrite PML
+# markup as HTML. pml_to_html() applies them one after another, each to the
+# output of the previous rule, so the order matters: markup codes first,
+# then every line is wrapped in a paragraph, then left-over codes are
+# stripped and finally escaped backslashes are unescaped.
+PML_HTML_RULES = [
+    (re.compile('\\\\p'), lambda match: '<br /><br style="page-break-after: always;" />'),
+    (re.compile('\\\\x(?P<text>.+?)\\\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text')),
+    # Only the heading level is numeric; the heading text is a string and
+    # has to be formatted with %s.
+    (re.compile('\\\\X(?P<val>[0-4])(?P<text>.+?)\\\\X[0-4]', re.DOTALL), lambda match: '<h%i style="page-break-before: always;">%s</h%i>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
+    # Non-greedy so only the quoted value is removed, not everything up to
+    # the last quote on the line.
+    (re.compile('\\\\C\\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry
+    (re.compile('\\\\c(?P<text>.+?)\\\\c', re.DOTALL), lambda match: '<div style="text-align: center; display: block; margin: auto;">%s</div>' % match.group('text')),
+    (re.compile('\\\\r(?P<text>.+?)\\\\r', re.DOTALL), lambda match: '<div style="text-align: right; display: block;">%s</div>' % match.group('text')),
+    (re.compile('\\\\i(?P<text>.+?)\\\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text')),
+    (re.compile('\\\\u(?P<text>.+?)\\\\u', re.DOTALL), lambda match: '<div style="text-decoration: underline;">%s</div>' % match.group('text')),
+    (re.compile('\\\\o(?P<text>.+?)\\\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text')),
+    (re.compile('\\\\v(?P<text>.+?)\\\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text')),
+    (re.compile('\\\\t(?P<text>.+?)\\\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%">%s</div>' % match.group('text')),
+    # The percent sign is matched outside the group and emitted literally
+    # (%%), so the value is always rendered as a percentage.
+    (re.compile('\\\\T="(?P<val>\\d+)%*"(?P<text>.+?)$', re.MULTILINE), lambda match: '<div style="margin-left: %s%%">%s</div>' % (match.group('val'), match.group('text'))),
+    (re.compile('\\\\w="(?P<val>\\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')),
+    (re.compile('\\\\n'), lambda match: ''),
+    (re.compile('\\\\s'), lambda match: ''),
+    (re.compile('\\\\b(?P<text>.+?)\\\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')), # \b is deprecated; \B should be used instead.
+    (re.compile('\\\\l(?P<text>.+?)\\\\l', re.DOTALL), lambda match: '<big>%s</big>' % match.group('text')),
+    (re.compile('\\\\B(?P<text>.+?)\\\\B', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')),
+    (re.compile('\\\\Sp(?P<text>.+?)\\\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text')),
+    (re.compile('\\\\Sb(?P<text>.+?)\\\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text')),
+    (re.compile('\\\\k(?P<text>.+?)\\\\k', re.DOTALL), lambda match: '<small>%s</small>' % match.group('text')),
+    (re.compile('\\\\a(?P<num>\\d\\d\\d)'), lambda match: '&#%s;' % match.group('num')),
+    # NOTE(review): PML specifies \Uxxxx as four hexadecimal digits, so the
+    # code is emitted as a hexadecimal character reference - confirm
+    # against the PML reference.
+    (re.compile('\\\\U(?P<num>[0-9a-fA-F]{4})'), lambda match: '&#x%s;' % match.group('num')),
+    (re.compile('\\\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % match.group('name')),
+    # The link text group must actually capture the text between the tags.
+    (re.compile('\\\\q="(?P<target>#.+?)"(?P<text>.+?)\\\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))),
+    (re.compile('\\\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')),
+    (re.compile('\\\\-'), lambda match: ''),
+    # Todo: Footnotes need link.
+    (re.compile('\\\\Fn="(?P<target>.+?)"(?P<text>.+?)\\\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text'))),
+    (re.compile('\\\\Sd="(?P<target>.+?)"(?P<text>.+?)\\\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text'))),
+    (re.compile('\\\\I'), lambda match: ''),
+
+    # eReader files are one paragraph per line.
+    # This forces the lines to wrap properly.
+    (re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')),
+
+    # Remove unmatched PML codes.
+    (re.compile('(?<=[^\\\\])\\\\[pxcriouvtblBk]'), lambda match: ''),
+    (re.compile('(?<=[^\\\\])\\\\X[0-4]'), lambda match: ''),
+    (re.compile('(?<=[^\\\\])\\\\Sp'), lambda match: ''),
+    (re.compile('(?<=[^\\\\])\\\\Sb'), lambda match: ''),
+
+    # Replace \\ with \.
+    (re.compile('\\\\\\\\'), lambda match: '\\'),
+]
+
+# Rewrites <footnote id="..."> elements as <div> blocks whose id is the
+# footnote id prefixed with "footnote-". The replacement has to substitute
+# the captured id and text into the template, otherwise the literal
+# "%s" placeholders end up in the output.
+FOOTNOTE_HTML_RULES = [
+    (re.compile('<footnote id="(?P<id>.+?)">(?P<text>.+?)</footnote>', re.DOTALL), lambda match: '<div id="footnote-%s">%s</div>' % (match.group('id'), match.group('text')))
+]
+
+# Rewrites <sidebar id="..."> elements as <div> blocks whose id is the
+# sidebar id prefixed with "sidebar-". The replacement has to substitute
+# the captured id and text into the template, otherwise the literal
+# "%s" placeholders end up in the output.
+SIDEBAR_HTML_RULES = [
+    (re.compile('<sidebar id="(?P<id>.+?)">(?P<text>.+?)</sidebar>', re.DOTALL), lambda match: '<div id="sidebar-%s">%s</div>' % (match.group('id'), match.group('text')))
+]
+
+
+def pml_to_html(pml):
+    '''
+    Convert PML markup to HTML.
+
+    Every rule in PML_HTML_RULES is applied in order. Afterwards each key
+    of HTML_SYMBOLS whose code point is above 128 is replaced by the last
+    element of its HTML_SYMBOLS entry.
+    '''
+    html = pml
+    for pattern, replacement in PML_HTML_RULES:
+        html = pattern.sub(replacement, html)
+
+    for symbol, names in HTML_SYMBOLS.items():
+        if ord(symbol) <= 128:
+            continue
+        html = html.replace(symbol, names[-1])
+
+    return html
+
+def footnote_to_html(footnotes):
+    '''
+    Convert footnote markup to HTML: apply FOOTNOTE_HTML_RULES first and
+    then run the result through pml_to_html.
+    '''
+    converted = footnotes
+    for pattern, replacement in FOOTNOTE_HTML_RULES:
+        converted = pattern.sub(replacement, converted)
+
+    return pml_to_html(converted)
+    
+def sidebar_to_html(sidebars):
+    '''
+    Convert sidebar markup to HTML: apply SIDEBAR_HTML_RULES first and
+    then run the result through pml_to_html.
+
+    :param sidebars: text containing <sidebar id="..."> elements.
+    :return: the converted HTML text.
+    '''
+    html = sidebars
+    # Sidebars have their own rule set; the footnote rules never match
+    # <sidebar> elements.
+    for pattern, replacement in SIDEBAR_HTML_RULES:
+        html = pattern.sub(replacement, html)
+
+    return pml_to_html(html)
--- a/src/calibre/ebooks/pdb/ereader/reader.py
+++ b/src/calibre/ebooks/pdb/ereader/reader.py
@ -0,0 +1,199 @@
+# -*- coding: utf-8 -*-
+from __future__ import with_statement
+'''
+Read content from ereader pdb file.
+'''
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os, sys, struct, zlib
+
+from calibre import CurrentDir
+from calibre.ebooks import DRMError
+from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.pdb.ereader import EreaderError
+from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \
+    footnote_to_html, sidebar_to_html 
+from calibre.ebooks.mobi.palmdoc import decompress_doc
+from calibre.ebooks.metadata.opf2 import OPFCreator
+
+class HeaderRecord(object):
+    '''
+    The first record in the file is always the header record. It holds
+    information related to the location of text, images, and so on
+    in the file. This is used in conjunction with the sections
+    defined in the file header.
+
+    Every field is a big-endian unsigned 16-bit value read from a fixed
+    byte offset of the raw record. The ``num_*_pages`` attributes are
+    derived from those fields.
+    '''
+
+    def __init__(self, raw):
+        '''
+        :param raw: the raw bytes of the header record; at least 54 bytes
+                    are read.
+        '''
+        self.version, = struct.unpack('>H', raw[0:2])
+        self.non_text_offset, = struct.unpack('>H', raw[12:14])
+        self.footnote_rec, = struct.unpack('>H', raw[28:30])
+        self.sidebar_rec, = struct.unpack('>H', raw[30:32])
+        self.bookmark_offset, = struct.unpack('>H', raw[32:34])
+        self.image_data_offset, = struct.unpack('>H', raw[40:42])
+        self.metadata_offset, = struct.unpack('>H', raw[44:46])
+        self.footnote_offset, = struct.unpack('>H', raw[48:50])
+        self.sidebar_offset, = struct.unpack('>H', raw[50:52])
+        self.last_data_offset, = struct.unpack('>H', raw[52:54])
+
+        # Record 0 is this header, so text occupies records 1 up to (but
+        # not including) non_text_offset.
+        self.num_text_pages = self.non_text_offset - 1
+        self.num_image_pages = self.metadata_offset - self.image_data_offset
+
+        # Can't tell which is sidebar and footnote if they have same offset.
+        # They don't exist if offset is larger than last_record.
+        if self.footnote_offset < self.last_data_offset:
+            self.num_footnote_pages = self.sidebar_offset - self.footnote_offset
+        else:
+            self.num_footnote_pages = 0
+
+        # Sidebars run from sidebar_offset up to last_data_offset, so the
+        # count is the distance between the two and the guard has to look
+        # at the sidebar offset itself.
+        if self.sidebar_offset < self.last_data_offset:
+            self.num_sidebar_pages = self.last_data_offset - self.sidebar_offset
+        else:
+            self.num_sidebar_pages = 0
+        
+
+class Reader(object):
+    '''
+    Extracts the text and images of an eReader book.
+
+    All sections are loaded up front through ``header``; section 0 is
+    parsed as the HeaderRecord that locates text, images, footnotes and
+    sidebars among the remaining sections.
+    '''
+
+    def __init__(self, header, stream):
+        '''
+        :param header: object providing ``num_sections`` and
+                       ``section_data(index)``.
+        :param stream: the book's file object. Kept for interface
+                       compatibility; all data is read through ``header``.
+        :raises DRMError: if the header record version is 260 or 272.
+        :raises EreaderError: for any other version besides 2 and 10.
+        '''
+        self.sections = [header.section_data(i)
+                         for i in range(header.num_sections)]
+
+        self.header_record = HeaderRecord(self.section_data(0))
+
+        version = self.header_record.version
+        if version in (260, 272):
+            raise DRMError('eReader DRM is not supported.')
+        if version not in (2, 10):
+            raise EreaderError('Unknown book version %i.' % version)
+
+    def section_data(self, number):
+        '''Return the raw data of section ``number``.'''
+        return self.sections[number]
+
+    def decompress_text(self, number):
+        '''
+        Return section ``number`` decompressed and decoded from cp1252.
+
+        Version 2 books are decompressed with decompress_doc, version 10
+        books with zlib. None is returned for any other version.
+        '''
+        if self.header_record.version == 2:
+            return decompress_doc(self.section_data(number)).decode('cp1252')
+        if self.header_record.version == 10:
+            return zlib.decompress(self.section_data(number)).decode('cp1252')
+        return None
+
+    def get_image(self, number):
+        '''
+        Return ``(name, data)`` for the image stored in section ``number``.
+
+        ``('empty', '')`` is returned when ``number`` is not one of the
+        image sections.
+        '''
+        first = self.header_record.image_data_offset
+        last = first + self.header_record.num_image_pages - 1
+        if number < first or number > last:
+            return 'empty', ''
+        data = self.section_data(number)
+        # The name occupies 32 NUL-padded bytes starting at byte 4; the
+        # image data itself starts at byte 62.
+        name = data[4:4+32].strip('\0')
+        img = data[62:]
+        return name, img
+
+    def get_text_page(self, number):
+        '''
+        Only palmdoc and zlib compressed are supported. The text is
+        assumed to be encoded as Windows-1252. The encoding is part of
+        the eReader file spec and should always be this encoding.
+
+        An empty string is returned when ``number`` is not a text page.
+        '''
+        if number < 1 or number > self.header_record.num_text_pages:
+            return ''
+
+        return self.decompress_text(number)
+
+    def get_footnote_page(self, number):
+        '''
+        Return the text of footnote section ``number``, or an empty string
+        when ``number`` is outside the footnote sections.
+        '''
+        first = self.header_record.footnote_offset
+        last = first + self.header_record.num_footnote_pages - 1
+        if number < first or number > last:
+            return ''
+
+        return self.decompress_text(number)
+
+    def get_sidebar_page(self, number):
+        '''
+        Return the text of sidebar section ``number``, or an empty string
+        when ``number`` is outside the sidebar sections.
+        '''
+        first = self.header_record.sidebar_offset
+        last = first + self.header_record.num_sidebar_pages - 1
+        if number < first or number > last:
+            return ''
+
+        return self.decompress_text(number)
+
+    def _section_contains(self, number, marker):
+        # Best effort: a section that cannot be read, decompressed or
+        # decoded is treated as not containing the marker.
+        try:
+            content = self.decompress_text(number)
+        except Exception:
+            return False
+        return content is not None and marker in content
+
+    def has_footnotes(self):
+        '''
+        Return True if there is more than one footnote page and the first
+        one contains a closing </footnote> tag.
+        '''
+        if self.header_record.num_footnote_pages > 1:
+            return self._section_contains(
+                self.header_record.footnote_offset, '</footnote>')
+        return False
+
+    def has_sidebar(self):
+        '''
+        Return True if there is more than one sidebar page and the first
+        one contains a closing </sidebar> tag.
+        '''
+        if self.header_record.num_sidebar_pages > 1:
+            return self._section_contains(
+                self.header_record.sidebar_offset, '</sidebar>')
+        return False
+
+    def extract_content(self, output_dir):
+        '''
+        Write the book into ``output_dir`` as index.html, an images/
+        directory and metadata.opf.
+
+        :param output_dir: target directory; created if it does not exist.
+        :return: the path of the written metadata.opf.
+        '''
+        output_dir = os.path.abspath(output_dir)
+
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
+        # Collect the pieces and join once instead of growing one string
+        # page by page.
+        parts = ['<html><head><title></title></head><body>']
+        for i in range(1, self.header_record.num_text_pages + 1):
+            parts.append(pml_to_html(self.get_text_page(i)))
+
+        # Footnote and sidebar output is disabled.
+        # Untested: The num_.._pages variable may not be correct!
+        # Possibly use .._rec instead?
+        #
+        # if self.has_footnotes():
+        #     parts.append('<br /><h1>%s</h1>' % _('Footnotes'))
+        #     for i in range(self.header_record.footnote_offset, self.header_record.num_footnote_pages):
+        #         parts.append(footnote_to_html(self.get_footnote_page(i)))
+        #
+        # if self.has_sidebar():
+        #     parts.append('<br /><h1>%s</h1>' % _('Sidebar'))
+        #     for i in range(self.header_record.sidebar_offset, self.header_record.num_sidebar_pages):
+        #         parts.append(sidebar_to_html(self.get_sidebar_page(i)))
+
+        parts.append('</body></html>')
+        html = ''.join(parts)
+
+        with CurrentDir(output_dir):
+            with open('index.html', 'wb') as index:
+                index.write(html.encode('utf-8'))
+
+        images_dir = os.path.join(output_dir, 'images/')
+        if not os.path.exists(images_dir):
+            os.makedirs(images_dir)
+        images = []
+        with CurrentDir(images_dir):
+            for i in range(0, self.header_record.num_image_pages):
+                name, img = self.get_image(self.header_record.image_data_offset + i)
+                # The name comes from the book file, so keep only its last
+                # path component to stay inside the images directory.
+                name = os.path.basename(name)
+                if not name or name in ('.', '..'):
+                    continue
+                images.append(name)
+                with open(name, 'wb') as imgf:
+                    imgf.write(img)
+
+        self.create_opf(output_dir, images)
+
+        return os.path.join(output_dir, 'metadata.opf')
+
+    def create_opf(self, output_dir, images):
+        '''
+        Write metadata.opf into ``output_dir``. The manifest lists
+        index.html and every name in ``images`` under images/; the spine
+        contains only index.html.
+        '''
+        mi = MetaInformation(None, None)
+
+        with CurrentDir(output_dir):
+            opf = OPFCreator(output_dir, mi)
+
+            manifest = [('index.html', None)]
+            manifest.extend([(os.path.join('images/', i), None) for i in images])
+
+            opf.create_manifest(manifest)
+            opf.create_spine(['index.html'])
+            with open('metadata.opf', 'wb') as opffile:
+                opf.render(opffile)
+
+    def dump_pml(self):
+        '''Return the text of all text pages concatenated in page order.'''
+        return ''.join([self.get_text_page(i)
+                        for i in range(1, self.header_record.num_text_pages + 1)])
+        
--- a/src/calibre/ebooks/pdb/header.py
+++ b/src/calibre/ebooks/pdb/header.py
@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+from __future__ import with_statement
+'''
+Read the header data from a pdb file.
+'''
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os, struct
+
+class PdbHeader(object):
+    '''
+    Reads header fields and section data from a PDB file stream.
+
+    The stream must be seekable; every accessor seeks to the position it
+    needs before reading.
+    '''
+
+    def __init__(self, stream):
+        '''
+        :param stream: seekable file object positioned anywhere in the
+                       PDB file.
+        '''
+        self.stream = stream
+        self.ident = self.identity()
+        self.num_sections = self.section_count()
+        self.title = self.name()
+
+    def identity(self):
+        '''Return the 8 bytes stored at offset 60.'''
+        self.stream.seek(60)
+        return self.stream.read(8)
+
+    def section_count(self):
+        '''Return the big-endian unsigned 16-bit value at offset 76.'''
+        self.stream.seek(76)
+        return struct.unpack('>H', self.stream.read(2))[0]
+
+    def name(self):
+        '''Return the first 32 bytes of the file with NUL bytes removed.'''
+        self.stream.seek(0)
+        return self.stream.read(32).replace('\x00', '')
+
+    def _section_record(self, number):
+        # Validate the index, then read the 8-byte record for the section:
+        # a 32-bit offset followed by four single bytes. Valid indexes are
+        # 0 .. num_sections - 1.
+        if number < 0 or number >= self.num_sections:
+            raise ValueError('Not a valid section number %i' % number)
+
+        self.stream.seek(78 + number * 8)
+        return struct.unpack('>LBBBB', self.stream.read(8))
+
+    def full_section_info(self, number):
+        '''
+        Return ``(offset, flags, val)`` for section ``number``. ``flags``
+        is the first byte after the offset and ``val`` the 24-bit value
+        formed by the remaining three bytes.
+
+        :raises ValueError: if ``number`` is not a valid section number.
+        '''
+        offset, a1, a2, a3, a4 = self._section_record(number)
+        flags, val = a1, a2<<16 | a3<<8 | a4
+        return (offset, flags, val)
+
+    def section_offset(self, number):
+        '''
+        Return the file offset at which section ``number`` starts.
+
+        :raises ValueError: if ``number`` is not a valid section number.
+        '''
+        return self._section_record(number)[0]
+
+    def section_data(self, number):
+        '''
+        Return the raw bytes of section ``number``. A section ends where
+        the next one starts; the last section runs to the end of the
+        stream.
+
+        :raises ValueError: if ``number`` is not a valid section number.
+        '''
+        start = self.section_offset(number)
+        if number == self.num_sections - 1:
+            # Ask the stream itself for its size so that streams without a
+            # file name on disk work too.
+            self.stream.seek(0, os.SEEK_END)
+            end = self.stream.tell()
+        else:
+            end = self.section_offset(number + 1)
+        self.stream.seek(start)
+        return self.stream.read(end - start)
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import InputFormatPlugin
+from calibre.ebooks.pdb.header import PdbHeader
+from calibre.ebooks.pdb import PDBError, get_reader
+
+class PDBInput(InputFormatPlugin):
+    '''Input plugin that converts PDB books using a format specific reader.'''
+
+    name        = 'PDB Input'
+    author      = 'John Schember'
+    description = 'Convert PDB to HTML'
+    file_types  = set(['pdb'])
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        '''
+        Extract the book in ``stream`` into the current working directory.
+
+        :param stream: file object of the PDB book.
+        :return: whatever the reader's ``extract_content`` returns for the
+                 current working directory.
+        :raises PDBError: if no reader is registered for the identity
+                          found in the PDB header.
+        '''
+        header = PdbHeader(stream)
+        Reader = get_reader(header.ident)
+
+        if Reader is None:
+            # Report the identity value itself; header.identity is the
+            # method that reads it, not the value.
+            raise PDBError('Unknown format identity is %s' % header.ident)
+
+        reader = Reader(header, stream)
+        return reader.extract_content(os.getcwd())