eReader input: support Makebook-produced books (202-byte header documents).

2025-07-09 03:04:10 -04:00 · 2009-05-21 17:22:32 -04:00 · 2009-05-21 17:22:32 -04:00 · e4ee664bb3
commit e4ee664bb3
parent 24ca1a1134
4 changed files with 382 additions and 188 deletions
--- a/src/calibre/ebooks/metadata/ereader.py
+++ b/src/calibre/ebooks/metadata/ereader.py
@ -8,11 +8,13 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
-import re
+import struct
-from calibre.ebooks.metadata import MetaInformation, authors_to_string
+from calibre.ebooks.metadata import MetaInformation
-from calibre.ebooks.pdb.header import PdbHeaderReader, PdbHeaderBuilder
+from calibre.ebooks.metadata import authors_to_string
-from calibre.ebooks.pdb.ereader.reader import HeaderRecord
+from calibre.ebooks.pdb.ereader.reader132 import HeaderRecord
 from calibre.ebooks.pdb.header import PdbHeaderBuilder
 from calibre.ebooks.pdb.header import PdbHeaderReader
 def get_metadata(stream, extract_cover=True):
    """
@ -20,14 +22,14 @@ def get_metadata(stream, extract_cover=True):
    """
    mi = MetaInformation(None, [_('Unknown')])
    stream.seek(0)
-    
+
    pheader = PdbHeaderReader(stream)
    hr = HeaderRecord(pheader.section_data(0))
-        
+
    if hr.version in (2, 10) and hr.has_metadata == 1:
        try:
            mdata = pheader.section_data(hr.metadata_offset)
-    
+
            mdata = mdata.split('\x00')
            mi.title = mdata[0]
            mi.authors = [mdata[1]]
@ -35,7 +37,7 @@ def get_metadata(stream, extract_cover=True):
            mi.isbn = mdata[4]
        except:
            pass
-        
+
    if not mi.title:
        mi.title = pheader.title if pheader.title else _('Unknown')
@ -43,26 +45,31 @@ def get_metadata(stream, extract_cover=True):
 def set_metadata(stream, mi):
    pheader = PdbHeaderReader(stream)
    # Only Dropbook produced 132 byte record0 files are supported
    if pheader.section_data(0) != 132:
        return
    sections = [pheader.section_data(x) for x in range(0, pheader.section_count())]
    hr = HeaderRecord(sections[0])
-    
+
    if hr.version not in (2, 10):
        return
-    
+
    # Create a metadata record for the file if one does not already exist
    if not hr.has_metadata:
        sections += ['', 'MeTaInFo\x00']
        last_data = len(sections) - 1
-        
+
        for i in range(0, 132, 2):
-            val, = struct.unpack('>H', sections[0][i:i+2])
+            val, = struct.unpack('>H', sections[0][i:i + 2])
            if val >= hr.last_data_offset:
-                sections[0][i:i+2] = struct.pack('>H', last_data)
+                sections[0][i:i + 2] = struct.pack('>H', last_data)
-            
+
        sections[0][24:26] = struct.pack('>H', 1) # Set has metadata
        sections[0][44:46] = struct.pack('>H', last_data - 1) # Set location of metadata
        sections[0][52:54] = struct.pack('>H', last_data) # Ensure last data offset is updated
-    
+
    # Merge the metadata into the file
    file_mi = get_metadata(stream, False)
    file_mi.smart_update(mi)
@ -79,4 +86,3 @@ def set_metadata(stream, mi):
    # Write the data back to the file
    for item in sections:
        stream.write(item)
--- a/src/calibre/ebooks/pdb/ereader/reader.py
+++ b/src/calibre/ebooks/pdb/ereader/reader.py
@ -8,186 +8,27 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 import os
 import re
 import struct
 import zlib
 from calibre import CurrentDir
 from calibre.ebooks import DRMError
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.compression.palmdoc import decompress_doc
 from calibre.ebooks.pdb.ereader import EreaderError
 from calibre.ebooks.pdb.formatreader import FormatReader
-from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html
+from calibre.ebooks.pdb.ereader.reader132 import Reader132
-from calibre.ebooks.pml.pmlconverter import pml_to_html
+from calibre.ebooks.pdb.ereader.reader202 import Reader202
 class HeaderRecord(object):
    '''
    Decoded form of the header record, which is always the first record
    in the file. It describes where text, images, and so on are located
    and is meant to be used together with the sections defined in the
    file header.
    '''

    def __init__(self, raw):
        def u16(start):
            # Unsigned big-endian 16-bit value stored at raw[start:start + 2].
            (number,) = struct.unpack('>H', raw[start:start + 2])
            return number

        self.version = u16(0)
        self.non_text_offset = u16(12)
        self.has_metadata = u16(24)
        self.footnote_rec = u16(28)
        self.sidebar_rec = u16(30)
        self.bookmark_offset = u16(32)
        self.image_data_offset = u16(40)
        self.metadata_offset = u16(44)
        self.footnote_offset = u16(48)
        self.sidebar_offset = u16(50)
        self.last_data_offset = u16(52)

        # Derived counts: record 0 is the header itself, and image records
        # run from image_data_offset up to (not including) metadata_offset.
        self.num_text_pages = self.non_text_offset - 1
        self.num_image_pages = self.metadata_offset - self.image_data_offset
 class Reader(FormatReader):
    def __init__(self, header, stream, log, encoding=None):
-        self.log = log
+        record0_size = len(header.section_data(0))
        self.encoding = encoding
-        self.sections = []
+        if record0_size == 132:
-        for i in range(header.num_sections):
+            self.reader = Reader132(header, stream, log, encoding)
-            self.sections.append(header.section_data(i))
+        elif record0_size == 202:
-
+            self.reader = Reader202(header, stream, log, encoding)
-        self.header_record = HeaderRecord(self.section_data(0))
+        else:
-
+            raise ValueError('Unknown eReader Header')
        if self.header_record.version not in (2, 10):
            if self.header_record.version in (260, 272):
                raise DRMError('eReader DRM is not supported.')
            else:
                raise EreaderError('Unknown book version %i.' % self.header_record.version)
        from calibre.ebooks.metadata.pdb import get_metadata
        self.mi = get_metadata(stream, False)
    def section_data(self, number):
        return self.sections[number]
    def decompress_text(self, number):
        if self.header_record.version == 2:
            return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
        if self.header_record.version == 10:
            return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
    def get_image(self, number):
        if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
            return 'empty', ''
        data = self.section_data(number)
        name = data[4:4 + 32].strip('\x00')
        img = data[62:]
        return name, img
    def get_text_page(self, number):
        '''
        Only palmdoc and zlib compressed are supported. The text is
        assumed to be encoded as Windows-1252. The encoding is part of
        the eReader file spec and should always be this encoding.
        '''
        if number not in range(1, self.header_record.num_text_pages + 1):
            return ''
        return self.decompress_text(number)
    def extract_content(self, output_dir):
-        output_dir = os.path.abspath(output_dir)
+        return self.reader.extract_content(output_dir)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        html = u'<html><head><title></title></head><body>'
        for i in range(1, self.header_record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            html += pml_to_html(self.get_text_page(i))
        if self.header_record.footnote_rec > 0:
            html += '<br /><h1>%s</h1>' % _('Footnotes')
            footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
            for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)):
                self.log.debug('Extracting footnote page %i' % i)
                html += '<dl>'
                html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
                html += '</dl>'
        if self.header_record.sidebar_rec > 0:
            html += '<br /><h1>%s</h1>' % _('Sidebar')
            sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
            for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)):
                self.log.debug('Extracting sidebar page %i' % i)
                html += '<dl>'
                html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i))
                html += '</dl>'
        html += '</body></html>'
        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))
        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        images = []
        with CurrentDir(os.path.join(output_dir, 'images/')):
            for i in range(0, self.header_record.num_image_pages):
                name, img = self.get_image(self.header_record.image_data_offset + i)
                images.append(name)
                with open(name, 'wb') as imgf:
                    self.log.debug('Writing image %s to images/' % name)
                    imgf.write(img)
        opf_path = self.create_opf(output_dir, images)
        return opf_path
    def create_opf(self, output_dir, images):
        with CurrentDir(output_dir):
            opf = OPFCreator(output_dir, self.mi)
            manifest = [('index.html', None)]
            for i in images:
                manifest.append((os.path.join('images/', i), None))
            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            with open('metadata.opf', 'wb') as opffile:
                opf.render(opffile)
        return os.path.join(output_dir, 'metadata.opf')
    def dump_pml(self):
-        '''
+        return self.reader.dump_pml()
        This is primarily used for debugging and 3rd party tools to
        get the plm markup that comprises the text in the file.
        '''
        pml = ''
        for i in range(1, self.header_record.num_text_pages + 1):
            pml += self.get_text_page(i)
        return pml
    def dump_images(self, output_dir):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the images in the file.
        '''
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        with CurrentDir(output_dir):
            for i in range(0, self.header_record.num_image_pages):
                name, img = self.get_image(self.header_record.image_data_offset + i)
                with open(name, 'wb') as imgf:
                    imgf.write(img)
    def dump_images(self):
        return self.reader.dump_images()
--- a/src/calibre/ebooks/pdb/ereader/reader132.py
+++ b/src/calibre/ebooks/pdb/ereader/reader132.py
@ -0,0 +1,192 @@
 # -*- coding: utf-8 -*-
 '''
 Read content from ereader pdb file with a 132 byte header created by Dropbook.
 '''
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 import os
 import re
 import struct
 import zlib
 from calibre import CurrentDir
 from calibre.ebooks import DRMError
 from calibre.ebooks.compression.palmdoc import decompress_doc
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.pdb.ereader import EreaderError
 from calibre.ebooks.pdb.formatreader import FormatReader
 from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html
 from calibre.ebooks.pml.pmlconverter import pml_to_html
 class HeaderRecord(object):
    '''
    The header record is always the first record in the file. It carries
    the information needed to locate text, images, and so on, and is
    used in conjunction with the sections defined in the file header.
    '''

    def __init__(self, raw):
        # (attribute name, byte offset) of each big-endian unsigned short
        # read from the record, in the order the attributes are assigned.
        layout = (
            ('version', 0),
            ('non_text_offset', 12),
            ('has_metadata', 24),
            ('footnote_rec', 28),
            ('sidebar_rec', 30),
            ('bookmark_offset', 32),
            ('image_data_offset', 40),
            ('metadata_offset', 44),
            ('footnote_offset', 48),
            ('sidebar_offset', 50),
            ('last_data_offset', 52),
        )
        for attr, start in layout:
            value, = struct.unpack('>H', raw[start:start + 2])
            setattr(self, attr, value)

        # Values computed from the fields above.
        self.num_text_pages = self.non_text_offset - 1
        self.num_image_pages = self.metadata_offset - self.image_data_offset
 class Reader132(FormatReader):
    '''
    Read the content of an eReader pdb file whose header record is the
    132 byte variant created by Dropbook: text pages, footnotes, sidebars
    and images.
    '''

    def __init__(self, header, stream, log, encoding=None):
        '''
        :param header: PDB header reader; ``num_sections`` and
                       ``section_data(index)`` are used to load every section.
        :param stream: The book file, passed on to ``get_metadata``.
        :param log: Logger that receives debug messages during extraction.
        :param encoding: Encoding used to decode text; ``None`` means cp1252.
        :raises DRMError: If the header version is 260 or 272.
        :raises EreaderError: If the header version is anything else other
                              than 2 or 10.
        '''
        self.log = log
        self.encoding = encoding

        # All sections are loaded up front; they are addressed by number.
        self.sections = [header.section_data(i) for i in range(header.num_sections)]

        self.header_record = HeaderRecord(self.section_data(0))

        if self.header_record.version not in (2, 10):
            if self.header_record.version in (260, 272):
                raise DRMError('eReader DRM is not supported.')
            else:
                raise EreaderError('Unknown book version %i.' % self.header_record.version)

        from calibre.ebooks.metadata.pdb import get_metadata
        self.mi = get_metadata(stream, False)

    def _text_encoding(self):
        '''
        Return the encoding used to decode text: the one given to the
        constructor, or cp1252 when none was given.
        '''
        return 'cp1252' if self.encoding is None else self.encoding

    def section_data(self, number):
        '''
        Return the raw data of section ``number``.
        '''
        return self.sections[number]

    def decompress_text(self, number):
        '''
        Decompress and decode section ``number``. Version 2 books are
        decompressed with ``decompress_doc`` and version 10 books with zlib.
        ``None`` is returned for any other version; the constructor rejects
        those, so this is not expected to happen.
        '''
        if self.header_record.version == 2:
            return decompress_doc(self.section_data(number)).decode(self._text_encoding())
        if self.header_record.version == 10:
            return zlib.decompress(self.section_data(number)).decode(self._text_encoding())

    def get_image(self, number):
        '''
        Return ``(name, data)`` for the image stored in section ``number``.
        ``('empty', '')`` is returned when ``number`` is outside the image
        sections described by the header record.
        '''
        first = self.header_record.image_data_offset
        last = first + self.header_record.num_image_pages - 1
        if number < first or number > last:
            return 'empty', ''
        data = self.section_data(number)
        # The name is a NUL padded 32 byte field starting at byte 4; the
        # image data itself starts at byte 62.
        name = data[4:4 + 32].strip('\x00')
        img = data[62:]
        return name, img

    def get_text_page(self, number):
        '''
        Only palmdoc and zlib compressed are supported. The text is
        assumed to be encoded as Windows-1252. The encoding is part of
        the eReader file spec and should always be this encoding.

        An empty string is returned when ``number`` is not a text page.
        '''
        if number not in range(1, self.header_record.num_text_pages + 1):
            return ''
        return self.decompress_text(number)

    def _notes_to_html(self, title, kind, offset, count):
        '''
        Render a footnote or sidebar group as a list of HTML fragments.

        :param title: Already translated heading for the group.
        :param kind: Word used in the debug message ('footnote' or 'sidebar').
        :param offset: Section holding the NUL terminated ids of the group;
                       the text records follow it.
        :param count: Record count from the header; nothing is produced
                      unless it is greater than zero.
        '''
        if not count > 0:
            return []
        parts = ['<br /><h1>%s</h1>' % title]
        # Raw string: \w and \x00 are both interpreted by the re module.
        ids = re.findall(r'\w+(?=\x00)', self.section_data(offset).decode(self._text_encoding()))
        for pos, i in enumerate(range(offset + 1, offset + count)):
            self.log.debug('Extracting %s page %i' % (kind, i))
            parts.append('<dl>')
            parts.append(footnote_sidebar_to_html(ids[pos], self.decompress_text(i)))
            parts.append('</dl>')
        return parts

    def _iter_images(self):
        '''
        Yield ``(name, data)`` for every image section listed in the header.
        '''
        start = self.header_record.image_data_offset
        for i in range(0, self.header_record.num_image_pages):
            yield self.get_image(start + i)

    def extract_content(self, output_dir):
        '''
        Write the book into ``output_dir`` as ``index.html``, an ``images/``
        directory and ``metadata.opf``.

        :return: Path of the written ``metadata.opf``.
        '''
        output_dir = os.path.abspath(output_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Collect fragments and join once instead of growing a string.
        parts = [u'<html><head><title></title></head><body>']

        for i in range(1, self.header_record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            parts.append(pml_to_html(self.get_text_page(i)))

        parts.extend(self._notes_to_html(_('Footnotes'), 'footnote',
            self.header_record.footnote_offset, self.header_record.footnote_rec))
        parts.extend(self._notes_to_html(_('Sidebar'), 'sidebar',
            self.header_record.sidebar_offset, self.header_record.sidebar_rec))

        parts.append('</body></html>')
        html = u''.join(parts)

        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))

        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        images = []
        with CurrentDir(os.path.join(output_dir, 'images/')):
            for name, img in self._iter_images():
                images.append(name)
                with open(name, 'wb') as imgf:
                    self.log.debug('Writing image %s to images/' % name)
                    imgf.write(img)

        opf_path = self.create_opf(output_dir, images)

        return opf_path

    def create_opf(self, output_dir, images):
        '''
        Write ``metadata.opf`` into ``output_dir`` with ``index.html`` and
        every name in ``images`` in the manifest and ``index.html`` as the
        only spine item.

        :return: Path of the written ``metadata.opf``.
        '''
        with CurrentDir(output_dir):
            opf = OPFCreator(output_dir, self.mi)

            manifest = [('index.html', None)]
            manifest.extend((os.path.join('images/', i), None) for i in images)

            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            with open('metadata.opf', 'wb') as opffile:
                opf.render(opffile)

        return os.path.join(output_dir, 'metadata.opf')

    def dump_pml(self):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the pml markup that comprises the text in the file.
        '''
        return ''.join([self.get_text_page(i)
            for i in range(1, self.header_record.num_text_pages + 1)])

    def dump_images(self, output_dir):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the images in the file.
        '''
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with CurrentDir(output_dir):
            for name, img in self._iter_images():
                with open(name, 'wb') as imgf:
                    imgf.write(img)
--- a/src/calibre/ebooks/pdb/ereader/reader202.py
+++ b/src/calibre/ebooks/pdb/ereader/reader202.py
@ -0,0 +1,155 @@
 # -*- coding: utf-8 -*-
 '''
 Read content from ereader pdb file with a 202 byte header created by Makebook.
 '''
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 import os
 import struct
 from calibre import CurrentDir
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.pml.pmlconverter import pml_to_html
 from calibre.ebooks.compression.palmdoc import decompress_doc
 from calibre.ebooks.pdb.formatreader import FormatReader
 from calibre.ebooks.pdb.ereader import EreaderError
 class HeaderRecord(object):
    '''
    Decoded form of the header record, which is always the first record
    in the file. It holds information about where text, images, and so on
    are located and is used in conjunction with the sections defined in
    the file header.
    '''

    def __init__(self, raw):
        # Both fields are big-endian unsigned shorts.
        (version,) = struct.unpack('>H', raw[0:2])
        (first_non_text,) = struct.unpack('>H', raw[8:10])

        self.version = version
        self.non_text_offset = first_non_text
        # Record 0 is the header itself, so it is not counted as text.
        self.num_text_pages = first_non_text - 1
 class Reader202(FormatReader):
    '''
    Read the content of an eReader pdb file whose header record is the
    202 byte variant created by Makebook: text pages and PNG images.
    '''

    def __init__(self, header, stream, log, encoding=None):
        '''
        :param header: PDB header reader; ``num_sections`` and
                       ``section_data(index)`` are used to load every section.
        :param stream: The book file. It is not read by this class.
        :param log: Logger that receives debug messages during extraction.
        :param encoding: Encoding used to decode text; ``None`` means cp1252.
        :raises EreaderError: If the header version is not 4.
        '''
        self.log = log
        self.encoding = encoding

        # All sections are loaded up front; they are addressed by number.
        self.sections = [header.section_data(i) for i in range(header.num_sections)]

        self.header_record = HeaderRecord(self.section_data(0))

        if self.header_record.version != 4:
            raise EreaderError('Unknown book version %i.' % self.header_record.version)

    def section_data(self, number):
        '''
        Return the raw data of section ``number``.
        '''
        return self.sections[number]

    def decompress_text(self, number):
        '''
        Return the decoded text of section ``number``. Every byte is xored
        with 0xA5 before the palmdoc decompression is applied.
        '''
        unmasked = ''.join([chr(ord(x) ^ 0xA5) for x in self.section_data(number)])
        return decompress_doc(unmasked).decode('cp1252' if self.encoding is None else self.encoding)

    def get_image(self, number):
        '''
        Return ``(name, data)`` for section ``number`` when its data starts
        with ``PNG``; otherwise return ``(None, None)``.
        '''
        name = None
        img = None

        data = self.section_data(number)
        if data.startswith('PNG'):
            # The name is a NUL padded 32 byte field starting at byte 4;
            # the image data itself starts at byte 62.
            name = data[4:4 + 32].strip('\x00')
            img = data[62:]

        return name, img

    def get_text_page(self, number):
        '''
        Only palmdoc compression is supported. The text is xored with 0xA5 and
        assumed to be encoded as Windows-1252. The encoding is part of
        the eReader file spec and should always be this encoding.

        An empty string is returned when ``number`` is not a text page.
        '''
        if number not in range(1, self.header_record.num_text_pages + 1):
            return ''

        return self.decompress_text(number)

    def _iter_images(self):
        '''
        Yield ``(name, data)`` for every section after the text pages that
        ``get_image`` recognises as an image.
        '''
        for i in range(self.header_record.non_text_offset, len(self.sections)):
            name, img = self.get_image(i)
            if name:
                yield name, img

    def extract_content(self, output_dir):
        '''
        Write the book into ``output_dir`` as ``index.html``, an ``images/``
        directory and ``metadata.opf``.

        :return: Path of the written ``metadata.opf``.
        '''
        output_dir = os.path.abspath(output_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Collect fragments and join once instead of growing a string.
        parts = [u'<html><head><title></title></head><body>']

        for i in range(1, self.header_record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            parts.append(pml_to_html(self.get_text_page(i)))

        parts.append('</body></html>')
        html = u''.join(parts)

        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))

        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        images = []
        with CurrentDir(os.path.join(output_dir, 'images/')):
            for name, img in self._iter_images():
                images.append(name)
                with open(name, 'wb') as imgf:
                    self.log.debug('Writing image %s to images/' % name)
                    imgf.write(img)

        opf_path = self.create_opf(output_dir, images)

        return opf_path

    def create_opf(self, output_dir, images):
        '''
        Write ``metadata.opf`` into ``output_dir`` with ``index.html`` and
        every name in ``images`` in the manifest and ``index.html`` as the
        only spine item. No metadata is read from the book, so placeholder
        title and author values are used.

        :return: Path of the written ``metadata.opf``.
        '''
        with CurrentDir(output_dir):
            # Authors are passed as a list of names, not a bare string.
            opf = OPFCreator(output_dir, MetaInformation(_('Unknown'), [_('Unknown')]))

            manifest = [('index.html', None)]
            manifest.extend((os.path.join('images/', i), None) for i in images)

            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            with open('metadata.opf', 'wb') as opffile:
                opf.render(opffile)

        return os.path.join(output_dir, 'metadata.opf')

    def dump_pml(self):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the pml markup that comprises the text in the file.
        '''
        return ''.join([self.get_text_page(i)
            for i in range(1, self.header_record.num_text_pages + 1)])

    def dump_images(self, output_dir):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the images in the file.
        '''
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # The header record of this format has no image offset or count,
        # so the sections after the text pages are scanned and anything
        # that is not an image is skipped.
        with CurrentDir(output_dir):
            for name, img in self._iter_images():
                with open(name, 'wb') as imgf:
                    imgf.write(img)