eReader output. HTML_TO_PML cleanup.

2025-07-08 02:34:06 -04:00 · 2009-05-10 12:23:26 -04:00 · 2009-05-10 12:23:26 -04:00 · 4c49aa5c60
commit 4c49aa5c60
parent 49c7999064
9 changed files with 86 additions and 88 deletions
--- a/src/calibre/ebooks/pdb/init.py
+++ b/src/calibre/ebooks/pdb/init.py
@ -22,10 +22,12 @@ FORMAT_READERS = {
 from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer
 from calibre.ebooks.pdb.ztxt.writer import Writer as ztxt_writer
 from calibre.ebooks.pdb.ereader.writer import Writer as ereader_writer
 FORMAT_WRITERS = {
    'doc'     : palmdoc_writer,
    'ztxt'    : ztxt_writer,
    'ereader' : ereader_writer,
 }
 IDENTITY_TO_NAME = {
--- a/src/calibre/ebooks/pdb/ereader/reader.py
+++ b/src/calibre/ebooks/pdb/ereader/reader.py
@ -126,7 +126,7 @@ class Reader(FormatReader):
        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
-                index.write(html)
+                index.write(html.encode('utf-8'))
        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
--- a/src/calibre/ebooks/pdb/ereader/writer.py
+++ b/src/calibre/ebooks/pdb/ereader/writer.py
@ -12,19 +12,29 @@ import struct, zlib
 import Image, cStringIO
 from calibre.ebooks.pdb.formatwriter import FormatWriter
 from calibre.ebooks.oeb.base import OEB_IMAGES
 from calibre.ebooks.pdb.header import PdbHeaderBuilder
 from calibre.ebooks.pdb.ereader import image_name
 from calibre.ebooks.pml.pmlconverter import html_to_pml
 # We are using the older identity because we do not use newer features
 # (sidebar, footnotes). This will ensure compatibility with older readers.
 # If newer features are used (anything supported by dropbook but not by makebook),
 # change the identity to the newer PNRdPPrs.
 IDENTITY = 'PNPdPPrs'
-class Writer(object):
+# This is an arbitrary number that is small enough to work. The actual maximum
 # record size is unknown.
 MAX_RECORD_SIZE = 3560
-    def __init__(self, log):
+class Writer(FormatWriter):
    def __init__(self, opts, log):
        self.opts = opts
        self.log = log
-    def dump(self, oeb_book, out_stream, metadata=None):
+    def write_content(self, oeb_book, out_stream, metadata=None):
        text = self._text(oeb_book.spine)
        images = self._images(oeb_book.manifest)
        metadata = [self._metadata(metadata)]
@ -42,26 +52,28 @@ class Writer(object):
            out_stream.write(item)
    def _text(self, pages):
-        # Todo: Split pages over 65505 Bytes
+        pml = ''
        pml_pages = []
        for page in pages:
-            pml_pages.append(zlib.compress(html_to_pml(unicode(page)).encode('utf-8')))
+            pml += html_to_pml(unicode(page)).encode('cp1252')
        pml_pages = []
        for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
            pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]))
        return pml_pages            
    def _images(self, manifest):
        # Todo: resize images over 65505 Bytes
        images = []
        for item in manifest:
            if item.media_type in OEB_IMAGES:
-                image = '\x00\x00\x00\x00'
+                image = 'PNG '
                image += image_name(item.href)
                image = image.ljust(62, '\x00')
-                im = Image.open(cStringIO.StringIO(item.data))
+                im = Image.open(cStringIO.StringIO(item.data)).convert('P')
                im.thumbnail((300,300), Image.ANTIALIAS)
                data = cStringIO.StringIO()
                im.save(data, 'PNG')
@ -83,22 +95,38 @@ class Writer(object):
        publisher\x00
        isbn\x00
        '''
-        return '\x00\x00\x00\x00\x00'
+        
        title = _('Unknown')
        author = _('Unknown')
        copyright = ''
        publisher = ''
        isbn = ''
        if metadata != None:
            if len(metadata.title) >= 1:
                title = metadata.title[0].value
            if len(metadata.creator) >= 1:
                from calibre.ebooks.metadata import authors_to_string
                author = authors_to_string([x.value for x in metadata.creator])
            if len(metadata.rights) >= 1:
                copyright = metadata.rights[0].value
            if len(metadata.publisher) >= 1:
                publisher = metadata.publisher[0].value
        return '%s\x00%s\x00%s\x00%s\x00%s\x00' % (title, author, copyright, publisher, isbn)
    def _header_record(self, text_items, image_items):
        # Todo: Find out more about the header and add correct values so the file
        # can be read by eReader reader software.
        '''
        text_items = the number of text pages
        image_items = the number of images
        '''
-        version = 10
+        version = 10 # Zlib compression
        non_text_offset = text_items + 1
        if image_items > 0:
            image_data_offset = text_items + 1
            meta_data_offset = image_data_offset + image_items
-            last_data_offset = meta_data_offset + 2
+            last_data_offset = meta_data_offset + 1
        else:
            meta_data_offset = text_items + 1
            last_data_offset = meta_data_offset + 1
@ -106,61 +134,36 @@ class Writer(object):
        record = ''
-        record += struct.pack('>H', version)                # [0:2]
+        record += struct.pack('>H', version)                # [0:2]    # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
        record += struct.pack('>H', 0)                      # [2:4]
        record += struct.pack('>H', 0)                      # [4:6]
-        record += struct.pack('>H', 25152)                  # [6:8]   # 25152 is MAGIC
+        record += struct.pack('>H', 25152)                  # [6:8]    # 25152 is MAGIC. Somehow represents the cp1252 encoding of the text
-        record += struct.pack('>H', last_data_offset)       # [8:10]
+        record += struct.pack('>H', 0)                      # [8:10]
-        record += struct.pack('>H', last_data_offset)       # [10:12]
+        record += struct.pack('>H', 0)                      # [10:12]
        record += struct.pack('>H', non_text_offset)        # [12:14]  # non_text_offset
-        record += struct.pack('>H', non_text_offset)        # [14:16]
+        record += struct.pack('>H', 0)                      # [14:16]
-        record += struct.pack('>H', 1)                      # [16:18]
+        record += struct.pack('>H', 0)                      # [16:18]
-        record += struct.pack('>H', 1)                      # [18:20]
+        record += struct.pack('>H', 0)                      # [18:20]
-        record += struct.pack('>H', 0)                      # [20:22]
+        record += struct.pack('>H', image_items)            # [20:22]  # Number of images
-        record += struct.pack('>H', 1)                      # [22:24]
+        record += struct.pack('>H', 0)                      # [22:24]
-        record += struct.pack('>H', 1)                      # [24:26]
+        record += struct.pack('>H', 1)                      # [24:26]  # 1 if has metadata, 0 if not
        record += struct.pack('>H', 0)                      # [26:28]
        record += struct.pack('>H', 0)                      # [28:30]  # footnote_rec
        record += struct.pack('>H', 0)                      # [30:32]  # sidebar_rec
        record += struct.pack('>H', last_data_offset)       # [32:34]  # bookmark_offset
        record += struct.pack('>H', 2560)                   # [34:36]  # 2560 is MAGIC
-        record += struct.pack('>H', non_text_offset)        # [36:38]
+        record += struct.pack('>H', 0)                      # [36:38]
-        record += struct.pack('>H', non_text_offset + 1)    # [38:40]
+        record += struct.pack('>H', 0)                      # [38:40]
-        record += struct.pack('>H', image_data_offset)      # [40:42]
+        record += struct.pack('>H', image_data_offset)      # [40:42]  # image_data_offset. This will be the last data offset if there are no images
-        record += struct.pack('>H', image_data_offset)      # [42:44]
+        record += struct.pack('>H', 0)                      # [42:44]
-        record += struct.pack('>H', meta_data_offset)       # [44:46]
+        record += struct.pack('>H', meta_data_offset)       # [44:46]  # meta_data_offset. This will be the last data offset if there are no images
-        record += struct.pack('>H', meta_data_offset)       # [46:48]
+        record += struct.pack('>H', 0)                      # [46:48]
-        record += struct.pack('>H', last_data_offset)       # [48:50] # footnote_offset
+        record += struct.pack('>H', last_data_offset)       # [48:50]  # footnote_offset. This will be the last data offset if there are no images
-        record += struct.pack('>H', last_data_offset)       # [50:52] # sidebar_offset
+        record += struct.pack('>H', last_data_offset)       # [50:52]  # sidebar_offset. This will be the last data offset if there are no images
        record += struct.pack('>H', last_data_offset)       # [52:54]  # last_data_offset
-        record += struct.pack('>H', 1)       # [54:56]
+        for i in range(54, 132, 2):
-        for i in range(56, 132, 2):
+            record += struct.pack('>H', 0)                  # [54:132]
            record += struct.pack('>H', 0)
        '''
        # Version
        record += struct.pack('>H', version)
        record = record.ljust(12, '\x00')
        # Non-text offset, everything between record 0 and non_text_offset is text pages
        record += struct.pack('>H', non_text_offset)
        record = record.ljust(28, '\x00')
        # Footnote and Sidebar rec
        record += struct.pack('>H', 0)
        record += struct.pack('>H', 0)
        record += struct.pack('>H', last_data_offset)
        record = record.ljust(40, '\x00')
        # image pages
        record += struct.pack('>H', image_data_offset)
        record = record.ljust(44, '\x00')
        # metadata string
        record += struct.pack('>H', meta_data_offset)
        record = record.ljust(48, '\x00')
        # footnote and sidebar offsets
        record += struct.pack('>H', last_data_offset)
        record += struct.pack('>H', last_data_offset)
        record = record.ljust(52, '\x00')
        record += struct.pack('>H', last_data_offset)
        '''
        return record
--- a/src/calibre/ebooks/pdb/formatwriter.py
+++ b/src/calibre/ebooks/pdb/formatwriter.py
@ -14,5 +14,5 @@ class FormatWriter(object):
    def __init__(self, opts, log):
        raise NotImplementedError()
-    def write_content(self, oeb_book, output_stream, ):
+    def write_content(self, oeb_book, output_stream, metadata=None):
        raise NotImplementedError()
--- a/src/calibre/ebooks/pdb/output.py
+++ b/src/calibre/ebooks/pdb/output.py
@ -44,7 +44,7 @@ class PDBOutput(OutputFormatPlugin):
        out_stream.seek(0)
        out_stream.truncate()
-        writer.write_content(oeb_book, out_stream)
+        writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        if close:
            out_stream.close()
--- a/src/calibre/ebooks/pdb/palmdoc/writer.py
+++ b/src/calibre/ebooks/pdb/palmdoc/writer.py
@ -23,7 +23,7 @@ class Writer(FormatWriter):
        self.opts = opts
        self.log = log
-    def write_content(self, oeb_book, out_stream):
+    def write_content(self, oeb_book, out_stream, metadata=None):
        title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown')
        txt_records, txt_length = self._generate_text(oeb_book.spine)
--- a/src/calibre/ebooks/pdb/ztxt/writer.py
+++ b/src/calibre/ebooks/pdb/ztxt/writer.py
@ -22,7 +22,7 @@ class Writer(FormatWriter):
        self.opts = opts
        self.log = log
-    def write_content(self, oeb_book, out_stream):
+    def write_content(self, oeb_book, out_stream, metadata=None):
        title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown')
        txt_records, txt_length = self._generate_text(oeb_book.spine)
--- a/src/calibre/ebooks/pml/input.py
+++ b/src/calibre/ebooks/pml/input.py
@ -41,7 +41,7 @@ class PMLInput(InputFormatPlugin):
            ienc = self.options.input_encoding
        html = pml_to_html(pml_stream.read().decode(ienc)) 
-        html_stream.write('<html><head><title /></head><body>' + html + '</body></html>')
+        html_stream.write('<html><head><title /></head><body>' + html.encode('utf-8') + '</body></html>')
        if pclose:
            pml_stream.close()
@ -67,6 +67,8 @@ class PMLInput(InputFormatPlugin):
                    self.process_pml(pml, html_path)
                imgs = glob.glob(os.path.join(tdir, '*.png'))
                if len(imgs) > 0:
                    os.makedirs(os.path.join(os.getcwd(), 'images'))
                for img in imgs:
                    pimg_name = os.path.basename(img)
                    pimg_path = os.path.join(os.getcwd(), 'images', pimg_name)
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
 import re
 from calibre import entity_to_unicode
 from calibre.ebooks.pdb.ereader import image_name
 from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
@ -86,10 +87,9 @@ HTML_PML_RULES = [
    (re.compile('<a.*?href="#footnote-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))),
    (re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')),
    (re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
-    #(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')),
+    (re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name')).strip('\x00')),
-    (re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name').strip('\x00'))),
+    (re.compile('&(?P<num>#\d+);'), lambda match: entity_to_unicode(match)),
-    (re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % hex(int(match.group('num')))[2:].rjust(4, '0')),
+    (re.compile('&(?P<num>.+);'), lambda match: entity_to_unicode(match)),
    (re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')),
    (re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
    (re.compile('<small>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
    (re.compile('<sub .*?>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
@ -162,13 +162,4 @@ def html_to_pml(html):
        pml += body
    # Replace symbols outside of cp1252 with \Uxxxx
    chars = set(pml)
    unichars = []
    for c in chars:
        if ord(c) > 128:
            unichars.append(c)
    for u in unichars:
        pml = pml.replace(u, '\U%s' % hex(ord(u))[2:].rjust(4, '0'))
    return pml