eReader input: support Makebook-produced books (202-byte header documents).

2025-07-08 18:54:09 -04:00 · 2009-05-21 17:22:32 -04:00 · 2009-05-21 17:22:32 -04:00 · e4ee664bb3
commit e4ee664bb3
parent 24ca1a1134
4 changed files with 382 additions and 188 deletions
--- a/src/calibre/ebooks/metadata/ereader.py
+++ b/src/calibre/ebooks/metadata/ereader.py
@ -8,11 +8,13 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'

-import re
+import struct

-from calibre.ebooks.metadata import MetaInformation, authors_to_string
-from calibre.ebooks.pdb.header import PdbHeaderReader, PdbHeaderBuilder
-from calibre.ebooks.pdb.ereader.reader import HeaderRecord
+from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.metadata import authors_to_string
+from calibre.ebooks.pdb.ereader.reader132 import HeaderRecord
+from calibre.ebooks.pdb.header import PdbHeaderBuilder
+from calibre.ebooks.pdb.header import PdbHeaderReader

 def get_metadata(stream, extract_cover=True):
    """
@ -20,14 +22,14 @@ def get_metadata(stream, extract_cover=True):
    """
    mi = MetaInformation(None, [_('Unknown')])
    stream.seek(0)
-    
+
    pheader = PdbHeaderReader(stream)
    hr = HeaderRecord(pheader.section_data(0))
-        
+
    if hr.version in (2, 10) and hr.has_metadata == 1:
        try:
            mdata = pheader.section_data(hr.metadata_offset)
-    
+
            mdata = mdata.split('\x00')
            mi.title = mdata[0]
            mi.authors = [mdata[1]]
@ -35,7 +37,7 @@ def get_metadata(stream, extract_cover=True):
            mi.isbn = mdata[4]
        except:
            pass
-        
+
    if not mi.title:
        mi.title = pheader.title if pheader.title else _('Unknown')

@ -43,26 +45,31 @@ def get_metadata(stream, extract_cover=True):

 def set_metadata(stream, mi):
    pheader = PdbHeaderReader(stream)
+
+    # Only Dropbook produced 132 byte record0 files are supported
+    if pheader.section_data(0) != 132:
+        return
+
    sections = [pheader.section_data(x) for x in range(0, pheader.section_count())]
    hr = HeaderRecord(sections[0])
-    
+
    if hr.version not in (2, 10):
        return
-    
+
    # Create a metadata record for the file if one does not already exist
    if not hr.has_metadata:
        sections += ['', 'MeTaInFo\x00']
        last_data = len(sections) - 1
-        
+
        for i in range(0, 132, 2):
-            val, = struct.unpack('>H', sections[0][i:i+2])
+            val, = struct.unpack('>H', sections[0][i:i + 2])
            if val >= hr.last_data_offset:
-                sections[0][i:i+2] = struct.pack('>H', last_data)
-            
+                sections[0][i:i + 2] = struct.pack('>H', last_data)
+
        sections[0][24:26] = struct.pack('>H', 1) # Set has metadata
        sections[0][44:46] = struct.pack('>H', last_data - 1) # Set location of metadata
        sections[0][52:54] = struct.pack('>H', last_data) # Ensure last data offset is updated
-    
+
    # Merge the metadata into the file
    file_mi = get_metadata(stream, False)
    file_mi.smart_update(mi)
@ -79,4 +86,3 @@ def set_metadata(stream, mi):
    # Write the data back to the file
    for item in sections:
        stream.write(item)
-
--- a/src/calibre/ebooks/pdb/ereader/reader.py
+++ b/src/calibre/ebooks/pdb/ereader/reader.py
@ -8,186 +8,27 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'

-import os
-import re
-import struct
-import zlib
-
-from calibre import CurrentDir
-from calibre.ebooks import DRMError
-from calibre.ebooks.metadata.opf2 import OPFCreator
-from calibre.ebooks.compression.palmdoc import decompress_doc
-from calibre.ebooks.pdb.ereader import EreaderError
 from calibre.ebooks.pdb.formatreader import FormatReader
-from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html
-from calibre.ebooks.pml.pmlconverter import pml_to_html
-
-class HeaderRecord(object):
-    '''
-    The first record in the file is always the header record. It holds
-    information related to the location of text, images, and so on
-    in the file. This is used in conjunction with the sections
-    defined in the file header.
-    '''
-
-    def __init__(self, raw):
-        self.version, = struct.unpack('>H', raw[0:2])
-        self.non_text_offset, = struct.unpack('>H', raw[12:14])
-        self.has_metadata, = struct.unpack('>H', raw[24:26])
-        self.footnote_rec, = struct.unpack('>H', raw[28:30])
-        self.sidebar_rec, = struct.unpack('>H', raw[30:32])
-        self.bookmark_offset, = struct.unpack('>H', raw[32:34])
-        self.image_data_offset, = struct.unpack('>H', raw[40:42])
-        self.metadata_offset, = struct.unpack('>H', raw[44:46])
-        self.footnote_offset, = struct.unpack('>H', raw[48:50])
-        self.sidebar_offset, = struct.unpack('>H', raw[50:52])
-        self.last_data_offset, = struct.unpack('>H', raw[52:54])
-
-        self.num_text_pages = self.non_text_offset - 1
-        self.num_image_pages = self.metadata_offset - self.image_data_offset
-
+from calibre.ebooks.pdb.ereader.reader132 import Reader132
+from calibre.ebooks.pdb.ereader.reader202 import Reader202

 class Reader(FormatReader):

    def __init__(self, header, stream, log, encoding=None):
-        self.log = log
-        self.encoding = encoding
+        record0_size = len(header.section_data(0))

-        self.sections = []
-        for i in range(header.num_sections):
-            self.sections.append(header.section_data(i))
-
-        self.header_record = HeaderRecord(self.section_data(0))
-
-        if self.header_record.version not in (2, 10):
-            if self.header_record.version in (260, 272):
-                raise DRMError('eReader DRM is not supported.')
-            else:
-                raise EreaderError('Unknown book version %i.' % self.header_record.version)
-
-        from calibre.ebooks.metadata.pdb import get_metadata
-        self.mi = get_metadata(stream, False)
-
-    def section_data(self, number):
-        return self.sections[number]
-
-    def decompress_text(self, number):
-        if self.header_record.version == 2:
-            return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
-        if self.header_record.version == 10:
-            return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
-
-
-    def get_image(self, number):
-        if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
-            return 'empty', ''
-        data = self.section_data(number)
-        name = data[4:4 + 32].strip('\x00')
-        img = data[62:]
-        return name, img
-
-    def get_text_page(self, number):
-        '''
-        Only palmdoc and zlib compressed are supported. The text is
-        assumed to be encoded as Windows-1252. The encoding is part of
-        the eReader file spec and should always be this encoding.
-        '''
-        if number not in range(1, self.header_record.num_text_pages + 1):
-            return ''
-
-        return self.decompress_text(number)
+        if record0_size == 132:
+            self.reader = Reader132(header, stream, log, encoding)
+        elif record0_size == 202:
+            self.reader = Reader202(header, stream, log, encoding)
+        else:
+            raise ValueError('Unknown eReader Header')

    def extract_content(self, output_dir):
-        output_dir = os.path.abspath(output_dir)
-
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-
-        html = u'<html><head><title></title></head><body>'
-
-        for i in range(1, self.header_record.num_text_pages + 1):
-            self.log.debug('Extracting text page %i' % i)
-            html += pml_to_html(self.get_text_page(i))
-
-        if self.header_record.footnote_rec > 0:
-            html += '<br /><h1>%s</h1>' % _('Footnotes')
-            footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
-            for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)):
-                self.log.debug('Extracting footnote page %i' % i)
-                html += '<dl>'
-                html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
-                html += '</dl>'
-
-        if self.header_record.sidebar_rec > 0:
-            html += '<br /><h1>%s</h1>' % _('Sidebar')
-            sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
-            for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)):
-                self.log.debug('Extracting sidebar page %i' % i)
-                html += '<dl>'
-                html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i))
-                html += '</dl>'
-
-        html += '</body></html>'
-
-        with CurrentDir(output_dir):
-            with open('index.html', 'wb') as index:
-                self.log.debug('Writing text to index.html')
-                index.write(html.encode('utf-8'))
-
-        if not os.path.exists(os.path.join(output_dir, 'images/')):
-            os.makedirs(os.path.join(output_dir, 'images/'))
-        images = []
-        with CurrentDir(os.path.join(output_dir, 'images/')):
-            for i in range(0, self.header_record.num_image_pages):
-                name, img = self.get_image(self.header_record.image_data_offset + i)
-                images.append(name)
-                with open(name, 'wb') as imgf:
-                    self.log.debug('Writing image %s to images/' % name)
-                    imgf.write(img)
-
-        opf_path = self.create_opf(output_dir, images)
-
-        return opf_path
-
-    def create_opf(self, output_dir, images):
-        with CurrentDir(output_dir):
-            opf = OPFCreator(output_dir, self.mi)
-
-            manifest = [('index.html', None)]
-
-            for i in images:
-                manifest.append((os.path.join('images/', i), None))
-
-            opf.create_manifest(manifest)
-            opf.create_spine(['index.html'])
-            with open('metadata.opf', 'wb') as opffile:
-                opf.render(opffile)
-
-        return os.path.join(output_dir, 'metadata.opf')
+        return self.reader.extract_content(output_dir)

    def dump_pml(self):
-        '''
-        This is primarily used for debugging and 3rd party tools to
-        get the plm markup that comprises the text in the file.
-        '''
-        pml = ''
-
-        for i in range(1, self.header_record.num_text_pages + 1):
-            pml += self.get_text_page(i)
-
-        return pml
-
-    def dump_images(self, output_dir):
-        '''
-        This is primarily used for debugging and 3rd party tools to
-        get the images in the file.
-        '''
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-
-        with CurrentDir(output_dir):
-            for i in range(0, self.header_record.num_image_pages):
-                name, img = self.get_image(self.header_record.image_data_offset + i)
-                with open(name, 'wb') as imgf:
-                    imgf.write(img)
+        return self.reader.dump_pml()

+    def dump_images(self, output_dir):
+        '''
+        Write the images contained in the book into output_dir.
+
+        The work is delegated to the header specific reader chosen in
+        __init__. Both Reader132.dump_images and Reader202.dump_images
+        require the output directory, so it has to be accepted here and
+        passed through; without it every call failed with a TypeError.
+        '''
+        return self.reader.dump_images(output_dir)
--- a/src/calibre/ebooks/pdb/ereader/reader132.py
+++ b/src/calibre/ebooks/pdb/ereader/reader132.py
@ -0,0 +1,192 @@
+# -*- coding: utf-8 -*-
+
+'''
+Read content from ereader pdb file with a 132 byte header created by Dropbook.
+'''
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+import re
+import struct
+import zlib
+
+from calibre import CurrentDir
+from calibre.ebooks import DRMError
+from calibre.ebooks.compression.palmdoc import decompress_doc
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.pdb.ereader import EreaderError
+from calibre.ebooks.pdb.formatreader import FormatReader
+from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html
+from calibre.ebooks.pml.pmlconverter import pml_to_html
+
+class HeaderRecord(object):
+    '''
+    Parsed view of record 0, the header record, of an eReader file with a
+    132 byte header.
+
+    Every attribute is a big-endian unsigned 16 bit value taken from a
+    fixed byte position in raw. Together with the sections defined in
+    the PDB file header they describe where text, images, metadata,
+    footnotes and sidebars are stored.
+    '''
+
+    def __init__(self, raw):
+        def u16(start):
+            # One big-endian unsigned short beginning at byte `start`.
+            value, = struct.unpack('>H', raw[start:start + 2])
+            return value
+
+        self.version = u16(0)
+        self.non_text_offset = u16(12)
+        self.has_metadata = u16(24)
+        self.footnote_rec = u16(28)
+        self.sidebar_rec = u16(30)
+        self.bookmark_offset = u16(32)
+        self.image_data_offset = u16(40)
+        self.metadata_offset = u16(44)
+        self.footnote_offset = u16(48)
+        self.sidebar_offset = u16(50)
+        self.last_data_offset = u16(52)
+
+        # Derived counts: text pages are the records between the header
+        # record and non_text_offset, images are the records between
+        # image_data_offset and metadata_offset.
+        self.num_text_pages = self.non_text_offset - 1
+        self.num_image_pages = self.metadata_offset - self.image_data_offset
+
+
+class Reader132(FormatReader):
+    '''
+    Reads eReader PDB files whose header record is the 132 byte variant
+    created by Dropbook. Version 2 (PalmDoc compressed) and version 10
+    (zlib compressed) books are handled; DRM protected versions are
+    rejected.
+    '''
+
+    def __init__(self, header, stream, log, encoding=None):
+        self.log = log
+        self.encoding = encoding
+
+        # Pull every section into memory; section 0 is the header record.
+        self.sections = [header.section_data(idx) for idx in range(header.num_sections)]
+
+        self.header_record = HeaderRecord(self.section_data(0))
+
+        version = self.header_record.version
+        if version in (260, 272):
+            raise DRMError('eReader DRM is not supported.')
+        if version not in (2, 10):
+            raise EreaderError('Unknown book version %i.' % version)
+
+        # Imported here rather than at module level, as in the original.
+        from calibre.ebooks.metadata.pdb import get_metadata
+        self.mi = get_metadata(stream, False)
+
+    def section_data(self, number):
+        '''
+        Return the raw data of section number.
+        '''
+        return self.sections[number]
+
+    def decompress_text(self, number):
+        '''
+        Decompress record number with the scheme selected by the book
+        version and decode it with the user supplied encoding, or cp1252
+        when none was given.
+        '''
+        codec = 'cp1252' if self.encoding is None else self.encoding
+        version = self.header_record.version
+
+        if version == 2:
+            raw = decompress_doc(self.section_data(number))
+        elif version == 10:
+            raw = zlib.decompress(self.section_data(number))
+        else:
+            return None
+
+        return raw.decode(codec)
+
+    def get_image(self, number):
+        '''
+        Return (name, data) for image record number, or ('empty', '')
+        when number lies outside the image records.
+        '''
+        first = self.header_record.image_data_offset
+        last = first + self.header_record.num_image_pages - 1
+        if not first <= number <= last:
+            return 'empty', ''
+
+        record = self.section_data(number)
+        # The name is a NUL padded 32 byte field at byte 4; the image
+        # data starts at byte 62.
+        return record[4:4 + 32].strip('\x00'), record[62:]
+
+    def get_text_page(self, number):
+        '''
+        Return the text of page number, or an empty string when number is
+        not a text page.
+
+        Only palmdoc and zlib compressed text is supported. The text is
+        assumed to be encoded as Windows-1252. The encoding is part of
+        the eReader file spec and should always be this encoding.
+        '''
+        text_pages = range(1, self.header_record.num_text_pages + 1)
+        if number in text_pages:
+            return self.decompress_text(number)
+
+        return ''
+
+    def _notes_to_html(self, heading, log_format, offset, rec_count):
+        '''
+        Render one group of footnotes or sidebars as HTML. The record at
+        offset lists the ids, the records following it (up to
+        offset + rec_count - 1) hold the text of each entry.
+        '''
+        html = '<br /><h1>%s</h1>' % heading
+        ids = re.findall('\w+(?=\x00)', self.section_data(offset).decode('cp1252' if self.encoding is None else self.encoding))
+        for pos, rec in enumerate(range(offset + 1, offset + rec_count)):
+            self.log.debug(log_format % rec)
+            html += '<dl>'
+            html += footnote_sidebar_to_html(ids[pos], self.decompress_text(rec))
+            html += '</dl>'
+        return html
+
+    def extract_content(self, output_dir):
+        '''
+        Convert the book into output_dir: index.html receives the text,
+        footnotes and sidebars, images/ receives the images and
+        metadata.opf describes them. Returns the path of metadata.opf.
+        '''
+        hdr = self.header_record
+        output_dir = os.path.abspath(output_dir)
+
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
+        html = u'<html><head><title></title></head><body>'
+
+        for page in range(1, hdr.num_text_pages + 1):
+            self.log.debug('Extracting text page %i' % page)
+            html += pml_to_html(self.get_text_page(page))
+
+        if hdr.footnote_rec > 0:
+            html += self._notes_to_html(_('Footnotes'), 'Extracting footnote page %i', hdr.footnote_offset, hdr.footnote_rec)
+
+        if hdr.sidebar_rec > 0:
+            html += self._notes_to_html(_('Sidebar'), 'Extracting sidebar page %i', hdr.sidebar_offset, hdr.sidebar_rec)
+
+        html += '</body></html>'
+
+        with CurrentDir(output_dir):
+            with open('index.html', 'wb') as index:
+                self.log.debug('Writing text to index.html')
+                index.write(html.encode('utf-8'))
+
+        images_dir = os.path.join(output_dir, 'images/')
+        if not os.path.exists(images_dir):
+            os.makedirs(images_dir)
+
+        images = []
+        with CurrentDir(images_dir):
+            for rec in range(hdr.image_data_offset, hdr.image_data_offset + hdr.num_image_pages):
+                name, img = self.get_image(rec)
+                images.append(name)
+                with open(name, 'wb') as imgf:
+                    self.log.debug('Writing image %s to images/' % name)
+                    imgf.write(img)
+
+        return self.create_opf(output_dir, images)
+
+    def create_opf(self, output_dir, images):
+        '''
+        Write metadata.opf into output_dir listing index.html and every
+        name in images, and return the path of the written file.
+        '''
+        with CurrentDir(output_dir):
+            opf = OPFCreator(output_dir, self.mi)
+
+            manifest = [('index.html', None)]
+            manifest.extend([(os.path.join('images/', name), None) for name in images])
+
+            opf.create_manifest(manifest)
+            opf.create_spine(['index.html'])
+            with open('metadata.opf', 'wb') as opffile:
+                opf.render(opffile)
+
+        return os.path.join(output_dir, 'metadata.opf')
+
+    def dump_pml(self):
+        '''
+        This is primarily used for debugging and 3rd party tools to
+        get the PML markup that comprises the text in the file.
+        '''
+        last_page = self.header_record.num_text_pages
+        return ''.join([self.get_text_page(page) for page in range(1, last_page + 1)])
+
+    def dump_images(self, output_dir):
+        '''
+        This is primarily used for debugging and 3rd party tools to
+        get the images in the file.
+        '''
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
+        first = self.header_record.image_data_offset
+        with CurrentDir(output_dir):
+            for rec in range(first, first + self.header_record.num_image_pages):
+                name, img = self.get_image(rec)
+                with open(name, 'wb') as imgf:
+                    imgf.write(img)
+
--- a/src/calibre/ebooks/pdb/ereader/reader202.py
+++ b/src/calibre/ebooks/pdb/ereader/reader202.py
@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+
+'''
+Read content from ereader pdb file with a 202 byte header created by Makebook.
+'''
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+import struct
+
+from calibre import CurrentDir
+from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.pml.pmlconverter import pml_to_html
+from calibre.ebooks.compression.palmdoc import decompress_doc
+from calibre.ebooks.pdb.formatreader import FormatReader
+from calibre.ebooks.pdb.ereader import EreaderError
+
+class HeaderRecord(object):
+    '''
+    Parsed view of record 0, the header record, of an eReader file with a
+    202 byte header.
+
+    Only the book version and the number of the first non text record
+    are read; both are big-endian unsigned 16 bit values. They are used
+    together with the sections defined in the PDB file header.
+    '''
+
+    def __init__(self, raw):
+        def u16(start):
+            # One big-endian unsigned short beginning at byte `start`.
+            value, = struct.unpack('>H', raw[start:start + 2])
+            return value
+
+        self.version = u16(0)
+        self.non_text_offset = u16(8)
+
+        # Text pages are the records between the header record and
+        # non_text_offset.
+        self.num_text_pages = self.non_text_offset - 1
+
+
+class Reader202(FormatReader):
+    '''
+    Reads eReader PDB files whose header record is the 202 byte variant
+    created by Makebook. Only version 4 books are handled.
+    '''
+
+    def __init__(self, header, stream, log, encoding=None):
+        '''
+        Load every section of the file and parse the header record.
+
+        stream is not read here. Raises EreaderError when the book
+        version is not 4.
+        '''
+        self.log = log
+        self.encoding = encoding
+
+        self.sections = [header.section_data(i) for i in range(header.num_sections)]
+
+        self.header_record = HeaderRecord(self.section_data(0))
+
+        if self.header_record.version != 4:
+            raise EreaderError('Unknown book version %i.' % self.header_record.version)
+
+    def section_data(self, number):
+        '''
+        Return the raw data of section number.
+        '''
+        return self.sections[number]
+
+    def decompress_text(self, number):
+        '''
+        Return the decoded text of record number. Every byte is xored
+        with 0xA5 before PalmDoc decompression; the result is decoded
+        with the user supplied encoding, or cp1252 when none was given.
+        '''
+        unscrambled = ''.join([chr(ord(x) ^ 0xA5) for x in self.section_data(number)])
+        return decompress_doc(unscrambled).decode('cp1252' if self.encoding is None else self.encoding)
+
+    def get_image(self, number):
+        '''
+        Return (name, data) for record number when it starts with 'PNG',
+        otherwise (None, None).
+        '''
+        name = None
+        img = None
+
+        data = self.section_data(number)
+        if data.startswith('PNG'):
+            # The name is a NUL padded 32 byte field at byte 4; the image
+            # data starts at byte 62.
+            name = data[4:4 + 32].strip('\x00')
+            img = data[62:]
+
+        return name, img
+
+    def get_text_page(self, number):
+        '''
+        Return the text of page number, or an empty string when number is
+        not a text page.
+
+        Only palmdoc compression is supported. The text is xored with 0xA5 and
+        assumed to be encoded as Windows-1252. The encoding is part of
+        the eReader file spec and should always be this encoding.
+        '''
+        if number not in range(1, self.header_record.num_text_pages + 1):
+            return ''
+
+        return self.decompress_text(number)
+
+    def _iter_images(self):
+        '''
+        Yield (name, data) for every record from non_text_offset to the
+        end of the file that get_image recognises as a named image.
+
+        This header variant stores no image count or image offset, so
+        the records after the text have to be scanned instead.
+        '''
+        for i in range(self.header_record.non_text_offset, len(self.sections)):
+            name, img = self.get_image(i)
+            if name:
+                yield name, img
+
+    def extract_content(self, output_dir):
+        '''
+        Convert the book into output_dir: index.html receives the text,
+        images/ receives the images and metadata.opf describes them.
+        Returns the path of metadata.opf.
+        '''
+        output_dir = os.path.abspath(output_dir)
+
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
+        html = u'<html><head><title></title></head><body>'
+
+        for i in range(1, self.header_record.num_text_pages + 1):
+            self.log.debug('Extracting text page %i' % i)
+            html += pml_to_html(self.get_text_page(i))
+
+        html += '</body></html>'
+
+        with CurrentDir(output_dir):
+            with open('index.html', 'wb') as index:
+                self.log.debug('Writing text to index.html')
+                index.write(html.encode('utf-8'))
+
+        images_dir = os.path.join(output_dir, 'images/')
+        if not os.path.exists(images_dir):
+            os.makedirs(images_dir)
+        images = []
+        with CurrentDir(images_dir):
+            for name, img in self._iter_images():
+                images.append(name)
+                with open(name, 'wb') as imgf:
+                    self.log.debug('Writing image %s to images/' % name)
+                    imgf.write(img)
+
+        opf_path = self.create_opf(output_dir, images)
+
+        return opf_path
+
+    def create_opf(self, output_dir, images):
+        '''
+        Write metadata.opf into output_dir listing index.html and every
+        name in images, and return the path of the written file.
+
+        No metadata is read from the file, so placeholders are used.
+        '''
+        with CurrentDir(output_dir):
+            # Authors are passed as a list, matching how get_metadata
+            # builds its MetaInformation; a bare string is not a list of
+            # author names.
+            opf = OPFCreator(output_dir, MetaInformation(_('Unknown'), [_('Unknown')]))
+
+            manifest = [('index.html', None)]
+
+            for i in images:
+                manifest.append((os.path.join('images/', i), None))
+
+            opf.create_manifest(manifest)
+            opf.create_spine(['index.html'])
+            with open('metadata.opf', 'wb') as opffile:
+                opf.render(opffile)
+
+        return os.path.join(output_dir, 'metadata.opf')
+
+    def dump_pml(self):
+        '''
+        This is primarily used for debugging and 3rd party tools to
+        get the PML markup that comprises the text in the file.
+        '''
+        pml = ''
+
+        for i in range(1, self.header_record.num_text_pages + 1):
+            pml += self.get_text_page(i)
+
+        return pml
+
+    def dump_images(self, output_dir):
+        '''
+        This is primarily used for debugging and 3rd party tools to
+        get the images in the file.
+
+        The records after the text are scanned for images, the same way
+        extract_content finds them, because the header record of this
+        variant has no num_image_pages or image_data_offset attribute.
+        Records that are not named images are skipped.
+        '''
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
+        with CurrentDir(output_dir):
+            for name, img in self._iter_images():
+                with open(name, 'wb') as imgf:
+                    imgf.write(img)