From b3ad9f0160839ecc1115a038608128f594261ee3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 12 Oct 2009 07:35:27 -0600
Subject: [PATCH] eReader PDB output: proper length of indexes and do not try to add them if they are not available. PML Output: cleanup. PML Input: read unicode and entity PML tags correctly.

---
 src/calibre/ebooks/fb2/fb2ml.py             |   1 -
 src/calibre/ebooks/pdb/ereader/reader132.py |   1 -
 src/calibre/ebooks/pdb/ereader/writer.py    | 184 +++++++++++++++-----
 src/calibre/ebooks/pml/pmlconverter.py      |  15 +-
 src/calibre/ebooks/pml/pmlml.py             |   5 +
 5 files changed, 149 insertions(+), 57 deletions(-)

diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index ff914568d2..aaf8361b99 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -75,7 +75,6 @@ class FB2MLizer(object):
         output.append(self.fb2mlize_images())
         output.append(self.fb2_footer())
         output = ''.join(output).replace(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk', self.get_toc())
-        return output
         return u'<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
 
     def fb2_header(self):
diff --git a/src/calibre/ebooks/pdb/ereader/reader132.py b/src/calibre/ebooks/pdb/ereader/reader132.py
index 98dbe13790..49fdfb8980 100644
--- a/src/calibre/ebooks/pdb/ereader/reader132.py
+++ b/src/calibre/ebooks/pdb/ereader/reader132.py
@@ -34,7 +34,6 @@ class HeaderRecord(object):
         self.has_metadata, = struct.unpack('>H', raw[24:26])
         self.footnote_rec, = struct.unpack('>H', raw[28:30])
         self.sidebar_rec, = struct.unpack('>H', raw[30:32])
-        self.bookmark_offset, = struct.unpack('>H', raw[32:34])
         self.image_data_offset, = struct.unpack('>H', raw[40:42])
         self.metadata_offset, = struct.unpack('>H', raw[44:46])
         self.footnote_offset, = struct.unpack('>H', raw[48:50])
diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py
index 2f4e3bf16f..263f6964bf 100644
--- a/src/calibre/ebooks/pdb/ereader/writer.py
+++ b/src/calibre/ebooks/pdb/ereader/writer.py
@@ -8,6 +8,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
+import re
 import struct
 import zlib
 
@@ -28,7 +29,7 @@ IDENTITY = 'PNRdPPrs'
 
 # This is an arbitrary number that is small enough to work. The actual maximum
 # record size is unknown.
-MAX_RECORD_SIZE = 3560
+MAX_RECORD_SIZE = 8192
 
 class Writer(FormatWriter):
 
@@ -37,13 +38,33 @@ class Writer(FormatWriter):
         self.log = log
 
     def write_content(self, oeb_book, out_stream, metadata=None):
-        text, image_hrefs = self._text(oeb_book)
-        images = self._images(oeb_book.manifest, image_hrefs)
+        pmlmlizer = PMLMLizer(self.log)
+        pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
+
+        text, text_sizes = self._text(pml)
+        chapter_index = self._chapter_index(pml)
+        link_index = self._link_index(pml)
+        images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs)
         metadata = [self._metadata(metadata)]
+        hr = [self._header_record(len(text), len(chapter_index), len(link_index), len(images))]
 
-        hr = [self._header_record(len(text), len(images))]
-
-        sections = hr+text+images+metadata+['MeTaInFo\x00']
+        '''
+        Record order as generated by Dropbook.
+            1. eReader Header
+            2. Compressed text
+            3. Small font page index
+            4. Large font page index
+            5. Chapter index
+            6. Links index
+            7. Images
+            8. (Extrapolation: there should be one more record type here though yet uncovered what it might be).
+            9. Metadata
+            10. Sidebar records
+            11. Footnote records
+            12. Text block size record
+            13. "MeTaInFo\x00" word record
+        '''
+        sections = hr+text+chapter_index+link_index+images+metadata+[text_sizes]+['MeTaInFo\x00']
 
         lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections]
 
@@ -57,17 +78,74 @@ class Writer(FormatWriter):
             else:
                 out_stream.write(item)
 
-    def _text(self, oeb_book):
-        pmlmlizer = PMLMLizer(self.log)
-        pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
-
+    def _text(self, pml):
         pml_pages = []
-        for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
-            pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]))
+        text_sizes = ''
+        index = 0
+        while index < len(pml):
+            '''
+            Split on the space character closest to MAX_RECORD_SIZE when possible.
+            '''
+            split = pml.rfind(' ', index, MAX_RECORD_SIZE)
+            if split == -1:
+                len_end = len(pml[index:])
+                if len_end > MAX_RECORD_SIZE:
+                    split = MAX_RECORD_SIZE
+                else:
+                    split = len_end
+            if split == 0:
+                split = 1
+            pml_pages.append(zlib.compress(pml[index:index+split]))
+            text_sizes += struct.pack('>H', split)
+            index += split
 
-        return pml_pages, pmlmlizer.image_hrefs
+        return pml_pages, text_sizes
+
+    def _index_item(self, mo):
+        index = ''
+        if 'text' in mo.groupdict().keys():
+            index += struct.pack('>L', mo.start())
+            text = mo.group('text')
+            # Strip all PML tags from text
+            text = re.sub(r'\\U[0-9a-z]{4}', '', text)
+            text = re.sub(r'\\a\d{3}', '', text)
+            text = re.sub(r'\\.', '', text)
+            # Add appropriate spacing to denote the various levels of headings
+            if 'val' in mo.groupdict().keys():
+                text = '%s%s' % (' ' * 4 * int(mo.group('val')), text)
+            index += text
+            index += '\x00'
+        return index
+
+    def _chapter_index(self, pml):
+        chapter_marks = [
+            r'(?s)\\x(?P<text>.+?)\\x',
+            r'(?s)\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]',
+            r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"',
+        ]
+        index = []
+        for chapter_mark in chapter_marks:
+            for mo in re.finditer(chapter_mark, pml):
+                index.append(self._index_item(mo))
+        return index
+
+    def _link_index(self, pml):
+        index = []
+        for mo in re.finditer(r'(?s)\\Q="(?P<text>.+?)"', pml):
+            index.append(self._index_item(mo))
+        return index
 
     def _images(self, manifest, image_hrefs):
+        '''
+        Image format.
+
+        0-4   : 'PNG '. There must be a space after PNG.
+        4-36  : Image name. Must be exactly 32 bytes long. Pad with \x00 for names shorter than 32 bytes
+        36-58 : Unknown.
+        58-60 : Width.
+        60-62 : Height.
+        62-...: Raw image data in 8 bit PNG format.
+        '''
         images = []
 
         for item in manifest:
@@ -82,6 +160,8 @@
 
                 header = 'PNG '
                 header += image_hrefs[item.href].ljust(32, '\x00')[:32]
+                header = header.ljust(58, '\x00')
+                header += struct.pack('>HH', im.size[0], im.size[1])
                 header = header.ljust(62, '\x00')
 
                 if len(data) + len(header) < 65505:
@@ -121,52 +201,60 @@
 
         return '%s\x00%s\x00%s\x00%s\x00%s\x00' % (title, author, copyright, publisher, isbn)
 
-    def _header_record(self, text_items, image_items):
+    def _header_record(self, text_count, chapter_count, link_count, image_count):
         '''
-        text_items = the number of text pages
-        image_items = the number of images
+        text_count = the number of text pages
+        image_count = the number of images
         '''
-        version = 10 # Zlib compression
-        non_text_offset = text_items + 1
+        compression = 10 # zlib compression.
+        non_text_offset = text_count + 1
 
-        if image_items > 0:
-            image_data_offset = text_items + 1
-            meta_data_offset = image_data_offset + image_items
+        chapter_offset = non_text_offset
+        link_offset = chapter_offset + chapter_count
+
+        if image_count > 0:
+            image_data_offset = link_offset + link_count
+            meta_data_offset = image_data_offset + image_count
             last_data_offset = meta_data_offset + 1
         else:
-            meta_data_offset = text_items + 1
+            meta_data_offset = link_offset + link_count
             last_data_offset = meta_data_offset + 1
             image_data_offset = last_data_offset
 
+        if chapter_count == 0:
+            chapter_offset = last_data_offset
+        if link_count == 0:
+            link_offset = last_data_offset
+
         record = ''
 
-        record += struct.pack('>H', version)           # [0:2]   # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
-        record += struct.pack('>H', 0)                 # [2:4]
-        record += struct.pack('>H', 0)                 # [4:6]
+        record += struct.pack('>H', compression)       # [0:2]   # Compression. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
+        record += struct.pack('>H', 0)                 # [2:4]   # Unknown.
+        record += struct.pack('>H', 0)                 # [4:6]   # Unknown.
         record += struct.pack('>H', 25152)             # [6:8]   # 25152 is MAGIC. Somehow represents the cp1252 encoding of the text
-        record += struct.pack('>H', 0)                 # [8:10]
-        record += struct.pack('>H', 0)                 # [10:12]
-        record += struct.pack('>H', non_text_offset)   # [12:14] # non_text_offset
-        record += struct.pack('>H', 0)                 # [14:16]
-        record += struct.pack('>H', 0)                 # [16:18]
-        record += struct.pack('>H', 0)                 # [18:20]
-        record += struct.pack('>H', image_items)       # [20:22] # Number of images
-        record += struct.pack('>H', 0)                 # [22:24]
-        record += struct.pack('>H', 1)                 # [24:26] # 1 if has metadata, 0 if not
-        record += struct.pack('>H', 0)                 # [26:28]
-        record += struct.pack('>H', 0)                 # [28:30] # footnote_rec
-        record += struct.pack('>H', 0)                 # [30:32] # sidebar_rec
-        record += struct.pack('>H', last_data_offset)  # [32:34] # bookmark_offset
-        record += struct.pack('>H', 2560)              # [34:36] # 2560 is MAGIC
-        record += struct.pack('>H', 0)                 # [36:38]
-        record += struct.pack('>H', 0)                 # [38:40]
-        record += struct.pack('>H', image_data_offset) # [40:42] # image_data_offset. This will be the last data offset if there are no images
-        record += struct.pack('>H', 0)                 # [42:44]
-        record += struct.pack('>H', meta_data_offset)  # [44:46] # meta_data_offset. This will be the last data offset if there are no images
-        record += struct.pack('>H', 0)                 # [46:48]
-        record += struct.pack('>H', last_data_offset)  # [48:50] # footnote_offset. This will be the last data offset if there are no images
-        record += struct.pack('>H', last_data_offset)  # [50:52] # sidebar_offset. This will be the last data offset if there are no images
-        record += struct.pack('>H', last_data_offset)  # [52:54] # last_data_offset
+        record += struct.pack('>H', 0)                 # [8:10]  # Number of small font pages. 0 if page index is not built.
+        record += struct.pack('>H', 0)                 # [10:12] # Number of large font pages. 0 if page index is not built.
+        record += struct.pack('>H', non_text_offset)   # [12:14] # Non-Text record start.
+        record += struct.pack('>H', chapter_count)     # [14:16] # Number of chapter index records.
+        record += struct.pack('>H', 0)                 # [16:18] # Number of small font page index records.
+        record += struct.pack('>H', 0)                 # [18:20] # Number of large font page index records.
+        record += struct.pack('>H', image_count)       # [20:22] # Number of images.
+        record += struct.pack('>H', link_count)        # [22:24] # Number of links.
+        record += struct.pack('>H', 1)                 # [24:26] # 1 if has metadata, 0 if not.
+        record += struct.pack('>H', 0)                 # [26:28] # Unknown.
+        record += struct.pack('>H', 0)                 # [28:30] # Number of Footnotes.
+        record += struct.pack('>H', 0)                 # [30:32] # Number of Sidebars.
+        record += struct.pack('>H', chapter_offset)    # [32:34] # Chapter index offset.
+        record += struct.pack('>H', 2560)              # [34:36] # 2560 is MAGIC.
+        record += struct.pack('>H', last_data_offset)  # [36:38] # Small font page offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', last_data_offset)  # [38:40] # Large font page offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', image_data_offset) # [40:42] # Image offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', link_offset)       # [42:44] # Links offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', meta_data_offset)  # [44:46] # Metadata offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', 0)                 # [46:48] # Unknown.
+        record += struct.pack('>H', last_data_offset)  # [48:50] # Footnote offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', last_data_offset)  # [50:52] # Sidebar offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', last_data_offset)  # [52:54] # Last data offset.
 
         for i in range(54, 132, 2):
             record += struct.pack('>H', 0)             # [54:132]
diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index b4ab238da9..c72a21a5f9 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -18,10 +18,10 @@ PML_HTML_RULES = [
     (re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', re.DOTALL), lambda match: '<h%s>%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''),
     (re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry
-    (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\i(?P<text>.*?)\\i', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\o(?P<text>.*?)\\o', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
@@ -35,8 +35,8 @@ PML_HTML_RULES = [
     (re.compile(r'\\Sp(?P<text>.*?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\Sb(?P<text>.*?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '%s' % match.group('text').upper() if match.group('text') else ''),
-    (re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
-    (re.compile(r'\\U(?P<num>\d\d\d\d)'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
+    (re.compile(r'\\a(?P<num>\d{3})'), lambda match: '&#%s;' % match.group('num')),
+    (re.compile(r'\\U(?P<num>[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
     (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
     (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.*?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
     (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')),
@@ -64,7 +64,7 @@ PML_HTML_RULES = [
     (re.compile(r'(?<=[^\\])\\Sp'), lambda match: ''),
     (re.compile(r'(?<=[^\\])\\Sb'), lambda match: ''),
     # Remove invalid single item pml codes.
-    (re.compile(r'(?<=[^\\])\\.'), lambda match: ''),
+    (re.compile(r'(?<=[^\\])\\[^\\]'), lambda match: ''),
 
     # Replace \\ with \.
     (re.compile(r'\\\\'), lambda match: '\\'),
@@ -78,6 +78,7 @@ def pml_to_html(pml):
     return html
 
 def footnote_sidebar_to_html(id, pml):
+    if id.startswith('\x01'):
+        id = id[2:]
     html = '%s' % (id, id, pml_to_html(pml))
     return html
-
diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
index 2438fd9bef..9582d2bfbb 100644
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -154,10 +154,15 @@ class PMLMLizer(object):
         for unused in anchors.difference(links):
             text = text.replace('\\Q="%s"' % unused, '')
 
+        # Turn all html entities into unicode. This should not be necessary as
+        # lxml should have already done this but we want to be sure it happens.
        for entity in set(re.findall('&.+?;', text)):
             mo = re.search('(%s)' % entity[1:-1], text)
             text = text.replace(entity, entity_to_unicode(mo))
 
+        # Turn all unicode characters into their PML hex equivalent
+        text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text)
+
         return text
 
     def dump_text(self, elem, stylizer, page, tag_stack=[]):
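
For orientation, a minimal, hypothetical sketch (Python 2, matching the code above; it is not part of the patch) of the unicode round trip this change sets up: on the output side pmlml.py rewrites every non-ASCII character as a PML \Uxxxx code, and on the input side pmlconverter.py turns \Uxxxx codes back into characters. The helper names below are invented for illustration; calibre itself uses the PML_HTML_RULES table and my_unichr.

import re

def pml_escape_unicode(text):
    # Output side (as in pmlml.py above): any character outside 0x00-0x7f
    # becomes a \Uxxxx code with four lowercase hex digits.
    return re.sub(u'[^\x00-\x7f]', lambda mo: '\\U%04x' % ord(mo.group()), text)

def pml_unescape_unicode(text):
    # Input side (as in pmlconverter.py above): \U followed by four hex
    # digits is converted back into the corresponding unicode character.
    return re.sub(r'\\U(?P<num>[0-9a-f]{4})',
                  lambda mo: unichr(int(mo.group('num'), 16)), text)

if __name__ == '__main__':
    sample = u'caf\xe9 na\xefve'
    escaped = pml_escape_unicode(sample)    # u'caf\\U00e9 na\\U00efve'
    assert pml_unescape_unicode(escaped) == sample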