From b857fd3fd13a3cf57d9f6cd3231898444f00b382 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 9 Apr 2011 20:01:49 -0400
Subject: [PATCH 01/13] Start of plucker input support.

---
 src/calibre/ebooks/pdb/__init__.py         |   4 +-
 src/calibre/ebooks/pdb/plucker/__init__.py |   0
 src/calibre/ebooks/pdb/plucker/reader.py   | 149 +++++++++++++++++++++
 3 files changed, 152 insertions(+), 1 deletion(-)
 create mode 100644 src/calibre/ebooks/pdb/plucker/__init__.py
 create mode 100644 src/calibre/ebooks/pdb/plucker/reader.py

diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py
index 092c8a21bd..c8089297db 100644
--- a/src/calibre/ebooks/pdb/__init__.py
+++ b/src/calibre/ebooks/pdb/__init__.py
@@ -12,6 +12,7 @@ from calibre.ebooks.pdb.ereader.reader import Reader as ereader_reader
 from calibre.ebooks.pdb.palmdoc.reader import Reader as palmdoc_reader
 from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader
 from calibre.ebooks.pdb.pdf.reader import Reader as pdf_reader
+from calibre.ebooks.pdb.plucker.reader import Reader as plucker_reader
 
 FORMAT_READERS = {
     'PNPdPPrs': ereader_reader,
@@ -19,6 +20,7 @@ FORMAT_READERS = {
     'zTXTGPlm': ztxt_reader,
     'TEXtREAd': palmdoc_reader,
     '.pdfADBE': pdf_reader,
+    'DataPlkr': plucker_reader,
 }
 
 from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer
@@ -37,6 +39,7 @@ IDENTITY_TO_NAME = {
     'zTXTGPlm': 'zTXT',
     'TEXtREAd': 'PalmDOC',
     '.pdfADBE': 'Adobe Reader',
+    'DataPlkr': 'Plucker',
 
     'BVokBDIC': 'BDicty',
     'DB99DBOS': 'DB (Database program)',
@@ -50,7 +53,6 @@ IDENTITY_TO_NAME = {
     'DATALSdb': 'LIST',
     'Mdb1Mdb1': 'MobileDB',
     'BOOKMOBI': 'MobiPocket',
-    'DataPlkr': 'Plucker',
     'DataSprd': 'QuickSheet',
     'SM01SMem': 'SuperMemo',
     'TEXtTlDc': 'TealDoc',
diff --git a/src/calibre/ebooks/pdb/plucker/__init__.py b/src/calibre/ebooks/pdb/plucker/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py
new file mode 100644
index 0000000000..d1e5931580
--- /dev/null
+++ b/src/calibre/ebooks/pdb/plucker/reader.py
@@ -0,0 +1,149 @@
+# -*- coding: utf-8 -*-
+
+#from __future__ import (unicode_literals, division, absolute_import, print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '20011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+import struct
+import zlib
+
+from calibre import CurrentDir
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.pdb.formatreader import FormatReader
+
+DATATYPE_PHTML = 0
+DATATYPE_PHTML_COMPRESSED = 1
+DATATYPE_TBMP = 2
+DATATYPE_TBMP_COMPRESSED = 3
+DATATYPE_MAILTO = 4
+DATATYPE_LINK_INDEX = 5
+DATATYPE_LINKS = 6
+DATATYPE_LINKS_COMPRESSED = 7
+DATATYPE_BOOKMARKS = 8
+DATATYPE_CATEGORY = 9
+DATATYPE_METADATA = 10
+DATATYPE_STYLE_SHEET = 11
+DATATYPE_FONT_PAGE = 12
+DATATYPE_TABLE = 13
+DATATYPE_TABLE_COMPRESSED = 14
+DATATYPE_COMPOSITE_IMAGE = 15
+DATATYPE_PAGELIST_METADATA = 16
+DATATYPE_SORTED_URL_INDEX = 17
+DATATYPE_SORTED_URL = 18
+DATATYPE_SORTED_URL_COMPRESSED = 19
+DATATYPE_EXT_ANCHOR_INDEX = 20
+DATATYPE_EXT_ANCHOR = 21
+DATATYPE_EXT_ANCHOR_COMPRESSED = 22
+
+class HeaderRecord(object):
+
+    def __init__(self, raw):
+        self.uid, = struct.unpack('>H', raw[0:2])
+        # This is labeled "version" in the spec.
+        # 2 is ZLIB compressed,
+        # 1 is DOC compressed
+        self.compression, = struct.unpack('>H', raw[2:4])
+        self.records, = struct.unpack('>H', raw[4:6])
+        
+        self.reserved = {}
+        for i in xrange(self.records):
+            adv = 4*i
+            name, = struct.unpack('>H', raw[6+adv:8+adv])
+            id, = struct.unpack('>H', raw[8+adv:10+adv])
+            self.reserved[id] = name
+
+
+class SectionHeader(object):
+    
+    def __init__(self, raw):
+        self.uid, = struct.unpack('>H', raw[0:2])
+        self.paragraphs, = struct.unpack('>H', raw[2:4])
+        self.size, = struct.unpack('>H', raw[4:6])
+        self.type, = struct.unpack('>B', raw[6])
+        self.flags, = struct.unpack('>B', raw[7])
+
+
+class SectionHeaderText(object):
+    
+    def __init__(self, data_header, raw):
+        self.sizes = []
+        self.attributes = []
+
+        for i in xrange(data_header.paragraphs):
+            adv = 4*i
+            self.sizes.append(struct.unpack('>H', raw[8+adv:10+adv])[0])
+            self.attributes.append(struct.unpack('>H', raw[10+adv:12+adv])[0])
+
+
+class Reader(FormatReader):
+
+    def __init__(self, header, stream, log, options):
+        self.stream = stream
+        self.log = log
+        self.options = options
+        
+        self.sections = []
+        for i in range(1, header.num_sections):
+            start = 8
+            raw_data = header.section_data(i)
+            data_header = SectionHeader(raw_data)
+            sub_header = None
+            if data_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED):
+                sub_header = SectionHeaderText(data_header, raw_data)
+                start += data_header.paragraphs * 4
+            self.sections.append((data_header, sub_header, raw_data[start:]))
+
+        self.header_record = HeaderRecord(header.section_data(0))
+
+        from calibre.ebooks.metadata.pdb import get_metadata
+        self.mi = get_metadata(stream, False)
+
+    def extract_content(self, output_dir):
+        html = u''
+        images = []
+        
+        for header, sub_header, data in self.sections:
+            if header.type == DATATYPE_PHTML:
+                html += data
+            elif header.type == DATATYPE_PHTML_COMPRESSED:
+                d = self.decompress_phtml(data).decode('latin-1', 'replace')
+                print len(d) == header.size
+                html += d
+        
+        print html
+        with CurrentDir(output_dir):
+            with open('index.html', 'wb') as index:
+                self.log.debug('Writing text to index.html')
+                index.write(html.encode('utf-8'))
+        
+        opf_path = self.create_opf(output_dir, images)
+
+        return opf_path
+
+    def decompress_phtml(self, data):
+        if self.header_record.compression == 2:
+            raise NotImplementedError
+            #return zlib.decompress(data)
+        elif self.header_record.compression == 1:
+            from calibre.ebooks.compression.palmdoc import decompress_doc
+            return decompress_doc(data)
+            
+
+    def create_opf(self, output_dir, images):
+        with CurrentDir(output_dir):
+            opf = OPFCreator(output_dir, self.mi)
+
+            manifest = [('index.html', None)]
+
+            for i in images:
+                manifest.append((os.path.join('images/', i), None))
+
+            opf.create_manifest(manifest)
+            opf.create_spine(['index.html'])
+            with open('metadata.opf', 'wb') as opffile:
+                opf.render(opffile)
+
+        return os.path.join(output_dir, 'metadata.opf')

From 0f3228e6585dadcf6f4aa6110ed3619966bbfff2 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Mon, 11 Apr 2011 19:04:56 -0400
Subject: [PATCH 02/13] Basic plucker working (text, non-composite images).

---
 src/calibre/ebooks/pdb/plucker/reader.py | 455 +++++++++++++++++++++--
 1 file changed, 425 insertions(+), 30 deletions(-)

diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py
index d1e5931580..502682baba 100644
--- a/src/calibre/ebooks/pdb/plucker/reader.py
+++ b/src/calibre/ebooks/pdb/plucker/reader.py
@@ -10,9 +10,13 @@ import os
 import struct
 import zlib
 
+from collections import OrderedDict
+
 from calibre import CurrentDir
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.pdb.formatreader import FormatReader
+from calibre.ptempfile import TemporaryFile
+from calibre.utils.magick import Image
 
 DATATYPE_PHTML = 0
 DATATYPE_PHTML_COMPRESSED = 1
@@ -38,6 +42,100 @@ DATATYPE_EXT_ANCHOR_INDEX = 20
 DATATYPE_EXT_ANCHOR = 21
 DATATYPE_EXT_ANCHOR_COMPRESSED = 22
 
+# IETF IANA MIBenum value for the character set.
+# See http://www.iana.org/assignments/character-sets for valid values.
+# Not all character sets are handled by Python. This is the small subset of
+# MIBenum values that map to Python standard encodings
+# from http://docs.python.org/library/codecs.html#standard-encodings
+MIBNUM_TO_NAME = {
+    3: 'ascii',
+    4: 'latin_1',
+    5: 'iso8859_2',
+    6: 'iso8859_3',
+    7: 'iso8859_4',
+    8: 'iso8859_5',
+    9: 'iso8859_6',
+    10: 'iso8859_7',
+    11: 'iso8859_8',
+    12: 'iso8859_9',
+    13: 'iso8859_10',
+    17: 'shift_jis',
+    18: 'euc_jp',
+    27: 'utf_7',
+    36: 'euc_kr',
+    37: 'iso2022_kr',
+    38: 'euc_kr',
+    39: 'iso2022_jp',
+    40: 'iso2022_jp_2',
+    106: 'utf-8',
+    109: 'iso8859_13',
+    110: 'iso8859_14',
+    111: 'iso8859_15',
+    112: 'iso8859_16',
+    1013: 'utf_16_be',
+    1014: 'utf_16_le',
+    1015: 'utf_16',
+    2009: 'cp850',
+    2010: 'cp852',
+    2011: 'cp437',
+    2013: 'cp862',
+    2025: 'gb2312',
+    2026: 'big5',
+    2028: 'cp037',
+    2043: 'cp424',
+    2044: 'cp500',
+    2046: 'cp855',
+    2047: 'cp857',
+    2048: 'cp860',
+    2049: 'cp861',
+    2050: 'cp863',
+    2051: 'cp864',
+    2052: 'cp865',
+    2054: 'cp869',
+    2063: 'cp1026',
+    2085: 'hz',
+    2086: 'cp866',
+    2087: 'cp775',
+    2089: 'cp858',
+    2091: 'cp1140',
+    2102: 'big5hkscs',
+    2250: 'cp1250',
+    2251: 'cp1251',
+    2252: 'cp1252',
+    2253: 'cp1253',
+    2254: 'cp1254',
+    2255: 'cp1255',
+    2256: 'cp1256',
+    2257: 'cp1257',
+    2258: 'cp1258',    
+}
+
+def decompress_doc(data):
+    buffer = [ord(i) for i in data]
+    res = []
+    i = 0
+    while i < len(buffer):
+        c = buffer[i]
+        i += 1
+        if c >= 1 and c <= 8:
+            res.extend(buffer[i:i+c])
+            i += c
+        elif c <= 0x7f:
+            res.append(c)
+        elif c >= 0xc0:
+            res.extend( (ord(' '), c^0x80) )
+        else:
+            c = (c << 8) + buffer[i]
+            i += 1
+            di = (c & 0x3fff) >> 3
+            j = len(res)
+            num = (c & ((1 << 3) - 1)) + 3
+
+            for k in range( num ):
+                res.append(res[j - di+k])
+
+    return ''.join([chr(i) for i in res])
+
 class HeaderRecord(object):
 
     def __init__(self, raw):
@@ -68,14 +166,62 @@ class SectionHeader(object):
 
 class SectionHeaderText(object):
     
-    def __init__(self, data_header, raw):
+    def __init__(self, section_header, raw):
         self.sizes = []
         self.attributes = []
 
-        for i in xrange(data_header.paragraphs):
+        for i in xrange(section_header.paragraphs):
             adv = 4*i
-            self.sizes.append(struct.unpack('>H', raw[8+adv:10+adv])[0])
-            self.attributes.append(struct.unpack('>H', raw[10+adv:12+adv])[0])
+            self.sizes.append(struct.unpack('>H', raw[adv:2+adv])[0])
+            self.attributes.append(struct.unpack('>H', raw[2+adv:4+adv])[0])
+
+class SectionMetadata(object):
+    
+    def __init__(self, raw):
+        self.default_encoding = 'utf-8'
+        self.exceptional_uid_encodings = {}
+        self.owner_id = None
+        
+        record_count, = struct.unpack('>H', raw[0:2])
+        
+        adv = 0
+        for i in xrange(record_count):
+            type, = struct.unpack('>H', raw[2+adv:4+adv])
+            length, = struct.unpack('>H', raw[4+adv:6+adv])
+            
+            # CharSet
+            if type == 1:
+                val, = struct.unpack('>H', raw[6+adv:8+adv])
+                self.default_encoding = MIBNUM_TO_NAME.get(val, 'utf-8')
+            # ExceptionalCharSets
+            elif type == 2:
+                ii_adv = 0
+                for ii in xrange(length / 2):
+                    uid, = struct.unpack('>H', raw[6+adv+ii_adv:8+adv+ii_adv])
+                    mib, = struct.unpack('>H', raw[8+adv+ii_adv:10+adv+ii_adv])
+                    self.exceptional_uid_encodings[uid] = MIBNUM_TO_NAME.get(mib, 'utf-8')
+                    ii_adv += 4
+            # OwnerID
+            elif type == 3:
+                self.owner_id = struct.unpack('>I', raw[6+adv:10+adv])
+            # Author, Title, PubDate
+            # Ignored here. The metadata reader plugin
+            # will get this info because, if it's missing,
+            # the metadata reader plugin will use fallback
+            # data from elsewhere in the file.
+            elif type in (4, 5, 6):
+                pass
+            # Linked Documents
+            elif type == 7:
+                pass
+             
+            adv += 2*length
+
+class SectionText(object):
+    
+    def __init__(self, section_header, raw):
+        self.header = SectionHeaderText(section_header, raw)
+        self.data = raw[section_header.paragraphs * 4:]
 
 
 class Reader(FormatReader):
@@ -84,53 +230,302 @@ class Reader(FormatReader):
         self.stream = stream
         self.log = log
         self.options = options
-        
-        self.sections = []
-        for i in range(1, header.num_sections):
-            start = 8
-            raw_data = header.section_data(i)
-            data_header = SectionHeader(raw_data)
-            sub_header = None
-            if data_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED):
-                sub_header = SectionHeaderText(data_header, raw_data)
-                start += data_header.paragraphs * 4
-            self.sections.append((data_header, sub_header, raw_data[start:]))
 
+        # Mapping of section uid to our internal
+        # list of sections.
+        self.uid_section_number = OrderedDict()
+        self.uid_text_secion_number = OrderedDict()
+        self.uid_text_secion_encoding = {}
+        self.uid_image_section_number = {}
+        self.metadata_section_number = None
+        self.default_encoding = 'utf-8'
+        self.owner_id = None
+        self.sections = []
+        
         self.header_record = HeaderRecord(header.section_data(0))
+        
+        for i in range(1, header.num_sections):
+            section_number = i - 1
+            start = 8
+            section = None
+            
+            raw_data = header.section_data(i)
+            section_header = SectionHeader(raw_data)
+            
+            self.uid_section_number[section_header.uid] = section_number
+            
+            if section_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED):
+                self.uid_text_secion_number[section_header.uid] = section_number
+                section = SectionText(section_header, raw_data[start:])
+            elif section_header.type in (DATATYPE_TBMP, DATATYPE_TBMP_COMPRESSED):
+                self.uid_image_section_number[section_header.uid] = section_number
+                section = raw_data[start:]
+            elif section_header.type == DATATYPE_METADATA:
+                self.metadata_section_number = section_number
+                section = SectionMetadata(raw_data[start:])
+            elif section_header.type == DATATYPE_COMPOSITE_IMAGE:
+                
+
+            self.sections.append((section_header, section))
+
+        if self.metadata_section_number:
+            mdata_section = self.sections[self.metadata_section_number][1]
+            for k, v in mdata_section.exceptional_uid_encodings.items():
+                self.uid_text_secion_encoding[k] = v
+            self.default_encoding = mdata_section.default_encoding
+            self.owner_id = mdata_section.owner_id
 
         from calibre.ebooks.metadata.pdb import get_metadata
         self.mi = get_metadata(stream, False)
 
     def extract_content(self, output_dir):
-        html = u''
+        html = u'<html><body>'
         images = []
-        
-        for header, sub_header, data in self.sections:
-            if header.type == DATATYPE_PHTML:
-                html += data
-            elif header.type == DATATYPE_PHTML_COMPRESSED:
-                d = self.decompress_phtml(data).decode('latin-1', 'replace')
-                print len(d) == header.size
-                html += d
-        
-        print html
+
+        for uid, num in self.uid_text_secion_number.items():
+            section_header, section_data = self.sections[num]
+            if section_header.type == DATATYPE_PHTML:
+                html += self.process_phtml(section_data.header, section_data.data.decode(self.get_text_uid_encoding(section_header.uid), 'replace'))
+            elif section_header.type == DATATYPE_PHTML_COMPRESSED:
+                d = self.decompress_phtml(section_data.data).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
+                html += self.process_phtml(section_data.header, d)
+
+        html += '</body></html>'
+
         with CurrentDir(output_dir):
             with open('index.html', 'wb') as index:
                 self.log.debug('Writing text to index.html')
                 index.write(html.encode('utf-8'))
-        
+
+        if not os.path.exists(os.path.join(output_dir, 'images/')):
+            os.makedirs(os.path.join(output_dir, 'images/'))
+        with CurrentDir(os.path.join(output_dir, 'images/')):
+            #im.read('/Users/john/Tmp/plkr/apnx.palm')
+            for uid, num in self.uid_image_section_number.items():
+                section_header, section_data = self.sections[num]
+                if section_data:
+                    idata = None
+                    if section_header.type == DATATYPE_TBMP:
+                        idata = section_data
+                    elif section_header.type == DATATYPE_TBMP_COMPRESSED:
+                        if self.header_record.compression == 1:
+                            idata = decompress_doc(section_data)
+                        elif self.header_record.compression == 2:
+                            idata = zlib.decompress(section_data)
+                    try:
+                        with TemporaryFile(suffix='.palm') as itn:
+                            with open(itn, 'wb') as itf: 
+                                itf.write(idata)
+                            im = Image()
+                            im.read(itn)
+                            im.set_compression_quality(70)
+                            im.save('%s.jpg' % uid)
+                            self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
+                    except Exception as e:
+                        self.log.error('Failed to write image with uid %s: %s' % (uid, e))
+                    images.append('%s.jpg' % uid)
+                else:
+                    self.log.error('Failed to write image with uid %s: No data.' % uid)
+
         opf_path = self.create_opf(output_dir, images)
 
         return opf_path
 
     def decompress_phtml(self, data):
         if self.header_record.compression == 2:
-            raise NotImplementedError
-            #return zlib.decompress(data)
+            if self.owner_id:
+                raise NotImplementedError
+            return zlib.decompress(data)
         elif self.header_record.compression == 1:
-            from calibre.ebooks.compression.palmdoc import decompress_doc
+            #from calibre.ebooks.compression.palmdoc import decompress_doc
             return decompress_doc(data)
             
+    def process_phtml(self, sub_header, d):
+        html = u''
+        offset = 0
+        paragraph_open = False
+        paragraph_offsets = []
+        running_offset = 0
+        for size in sub_header.sizes:
+            running_offset += size
+            paragraph_offsets.append(running_offset)
+        
+        while offset < len(d):
+            if not paragraph_open:
+                html += u'<p>'
+                paragraph_open = True
+
+            c = ord(d[offset])
+            if c == 0x0:
+                offset += 1
+                c = ord(d[offset])
+                # Page link begins
+                # 2 Bytes
+                # record ID
+                if c == 0x0a:
+                    offset += 2
+                # Targeted page link begins
+                # 3 Bytes
+                # record ID, target
+                elif c == 0x0b:
+                    offset += 3
+                # Paragraph link begins
+                # 4 Bytes
+                # record ID, paragraph number
+                elif c == 0x0c:
+                    offset += 4
+                # Targeted paragraph link begins
+                # 5 Bytes
+                # record ID, paragraph number, target
+                elif c == 0x0d:
+                    offset += 5
+                # Link ends
+                # 0 Bytes
+                elif c == 0x08:
+                    pass
+                # Set font
+                # 1 Bytes
+                # font specifier
+                elif c == 0x11:
+                    offset += 1
+                # Embedded image
+                # 2 Bytes
+                # image record ID
+                elif c == 0x1a:
+                    offset += 1
+                    uid = struct.unpack('>H', d[offset:offset+2])[0]
+                    html += '<img src="images/%s.jpg" />' % uid
+                    offset += 1
+                # Set margin
+                # 2 Bytes
+                # left margin, right margin
+                elif c == 0x22:
+                    offset += 2
+                # Alignment of text
+                # 1 Bytes
+                # alignment
+                elif c == 0x29:
+                    offset += 1
+                # Horizontal rule
+                # 3 Bytes
+                # 8-bit height, 8-bit width (pixels), 8-bit width (%, 1-100)
+                elif c == 0x33:
+                    offset += 3
+                    if paragraph_open:
+                        html += u'</p>'
+                        paragraph_open = False
+                    html += u'<hr />'
+                # New line
+                # 0 Bytes
+                elif c == 0x38:
+                    if paragraph_open:
+                        html += u'</p>\n'
+                        paragraph_open = False
+                # Italic text begins
+                # 0 Bytes
+                elif c == 0x40:
+                    html += u'<i>'
+                # Italic text ends
+                # 0 Bytes
+                elif c == 0x48:
+                    html += u'</i>'
+                # Set text color
+                # 3 Bytes
+                # 8-bit red, 8-bit green, 8-bit blue
+                elif c == 0x53:
+                    offset += 3
+                # Multiple embedded image
+                # 4 Bytes
+                # alternate image record ID, image record ID
+                elif c == 0x5c:
+                    offset += 4
+                # Underline text begins
+                # 0 Bytes
+                elif c == 0x60:
+                    html += u'<u>'
+                # Underline text ends
+                # 0 Bytes
+                elif c == 0x68:
+                    html += u'</u>'
+                # Strike-through text begins
+                # 0 Bytes
+                elif c == 0x70:
+                    html += u'<s>'
+                # Strike-through text ends
+                # 0 Bytes
+                elif c == 0x78:
+                    html += u'</s>'
+                # 16-bit Unicode character
+                # 3 Bytes
+                # alternate text length, 16-bit unicode character
+                elif c == 0x83:
+                    #offset += 2
+                    #c16 = d[offset:offset+2]
+                    #html += c16.decode('utf-16')
+                    #offset += 1
+                    offset += 3
+                # 32-bit Unicode character
+                # 5 Bytes
+                # alternate text length, 32-bit unicode character
+                elif c == 0x85:
+                    #offset += 2
+                    #c32 = d[offset:offset+4]
+                    #html += c32.decode('utf-32')
+                    #offset += 3
+                    offset += 5
+                # Begin custom font span
+                # 6 Bytes
+                # font page record ID, X page position, Y page position
+                elif c == 0x8e:
+                    offset += 6
+                # Adjust custom font glyph position
+                # 4 Bytes
+                # X page position, Y page position
+                elif c == 0x8c:
+                    offset += 4
+                # Change font page
+                # 2 Bytes
+                # font record ID
+                elif c == 0x8a:
+                    offset += 2
+                # End custom font span
+                # 0 Bytes
+                elif c == 0x88:
+                    pass
+                # Begin new table row
+                # 0 Bytes
+                elif c == 0x90:
+                    pass
+                # Insert table (or table link)
+                # 2 Bytes
+                # table record ID
+                elif c == 0x92:
+                    offset += 2
+                # Table cell data
+                # 7 Bytes
+                # 8-bit alignment, 16-bit image record ID, 8-bit columns, 8-bit rows, 16-bit text length
+                elif c == 0x97:
+                    offset += 7
+                # Exact link modifier
+                # 2 Bytes
+                # Paragraph Offset (The Exact Link Modifier modifies a Paragraph Link or Targeted Paragraph Link function to specify an exact byte offset within the paragraph. This function must be followed immediately by the function it modifies).
+                elif c == 0x9a:
+                    offset += 2
+            else:
+                html += unichr(c)
+            offset += 1
+            if offset in paragraph_offsets:
+                if paragraph_open:
+                    html += u'</p>\n'
+                    paragraph_open = False
+        
+        if paragraph_open:
+            html += u'</p>'
+        
+        return html
+
+    def get_text_uid_encoding(self, uid):
+        return self.uid_text_secion_encoding.get(uid, self.default_encoding)
 
     def create_opf(self, output_dir, images):
         with CurrentDir(output_dir):

From acaa06de53fe280084c753408b682df835b1cf2d Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 16 Apr 2011 14:13:45 -0400
Subject: [PATCH 03/13] Fix decoding text. Add internal link support.

---
 src/calibre/ebooks/pdb/plucker/reader.py | 43 +++++++++++++++++-------
 1 file changed, 30 insertions(+), 13 deletions(-)

diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py
index 502682baba..13dea343a7 100644
--- a/src/calibre/ebooks/pdb/plucker/reader.py
+++ b/src/calibre/ebooks/pdb/plucker/reader.py
@@ -263,7 +263,7 @@ class Reader(FormatReader):
             elif section_header.type == DATATYPE_METADATA:
                 self.metadata_section_number = section_number
                 section = SectionMetadata(raw_data[start:])
-            elif section_header.type == DATATYPE_COMPOSITE_IMAGE:
+            #elif section_header.type == DATATYPE_COMPOSITE_IMAGE:
                 
 
             self.sections.append((section_header, section))
@@ -285,10 +285,10 @@ class Reader(FormatReader):
         for uid, num in self.uid_text_secion_number.items():
             section_header, section_data = self.sections[num]
             if section_header.type == DATATYPE_PHTML:
-                html += self.process_phtml(section_data.header, section_data.data.decode(self.get_text_uid_encoding(section_header.uid), 'replace'))
+                html += self.process_phtml(section_data.header, section_data.data)
             elif section_header.type == DATATYPE_PHTML_COMPRESSED:
-                d = self.decompress_phtml(section_data.data).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
-                html += self.process_phtml(section_data.header, d)
+                d = self.decompress_phtml(section_data.data)
+                html += self.process_phtml(section_header.uid, section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
 
         html += '</body></html>'
 
@@ -300,7 +300,6 @@ class Reader(FormatReader):
         if not os.path.exists(os.path.join(output_dir, 'images/')):
             os.makedirs(os.path.join(output_dir, 'images/'))
         with CurrentDir(os.path.join(output_dir, 'images/')):
-            #im.read('/Users/john/Tmp/plkr/apnx.palm')
             for uid, num in self.uid_image_section_number.items():
                 section_header, section_data = self.sections[num]
                 if section_data:
@@ -340,10 +339,12 @@ class Reader(FormatReader):
             #from calibre.ebooks.compression.palmdoc import decompress_doc
             return decompress_doc(data)
             
-    def process_phtml(self, sub_header, d):
-        html = u''
+    def process_phtml(self, uid, sub_header, d):
+        html = u'<a id="p%s" /><p id="p%s-0">' % (uid, uid)
         offset = 0
-        paragraph_open = False
+        paragraph_open = True
+        need_set_p_id = False
+        p_num = 1
         paragraph_offsets = []
         running_offset = 0
         for size in sub_header.sizes:
@@ -352,7 +353,12 @@ class Reader(FormatReader):
         
         while offset < len(d):
             if not paragraph_open:
-                html += u'<p>'
+                if need_set_p_id:
+                    html += u'<p id="p%s-%s">' % (uid, p_num)
+                    p_num += 1
+                    need_set_p_id = False
+                else:
+                    html += u'<p>'
                 paragraph_open = True
 
             c = ord(d[offset])
@@ -363,26 +369,36 @@ class Reader(FormatReader):
                 # 2 Bytes
                 # record ID
                 if c == 0x0a:
-                    offset += 2
+                    offset += 1
+                    id = struct.unpack('>H', d[offset:offset+2])[0]
+                    html += '<a href="#p%s">' % id
+                    offset += 1
                 # Targeted page link begins
                 # 3 Bytes
                 # record ID, target
                 elif c == 0x0b:
                     offset += 3
+                    html += '<a>'
                 # Paragraph link begins
                 # 4 Bytes
                 # record ID, paragraph number
                 elif c == 0x0c:
-                    offset += 4
+                    offset += 1
+                    id = struct.unpack('>H', d[offset:offset+2])[0]
+                    offset += 2
+                    pid = struct.unpack('>H', d[offset:offset+2])[0]
+                    html += '<a href="#p%s-%s">' % (id, pid)
+                    offset += 1
                 # Targeted paragraph link begins
                 # 5 Bytes
                 # record ID, paragraph number, target
                 elif c == 0x0d:
                     offset += 5
+                    html += '<a>'
                 # Link ends
                 # 0 Bytes
                 elif c == 0x08:
-                    pass
+                    html += '</a>'
                 # Set font
                 # 1 Bytes
                 # font specifier
@@ -515,10 +531,11 @@ class Reader(FormatReader):
                 html += unichr(c)
             offset += 1
             if offset in paragraph_offsets:
+                need_set_p_id = True
                 if paragraph_open:
                     html += u'</p>\n'
                     paragraph_open = False
-        
+
         if paragraph_open:
             html += u'</p>'
         

From 8557981a51d551907154684b7b16f4d89c56247b Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 16 Apr 2011 14:43:36 -0400
Subject: [PATCH 04/13] Don't put every PHTML record into one ordered html
 file. Plucker documents are groups of separate PHTML pages that are linked
 via hyperlinks.

---
 src/calibre/ebooks/pdb/plucker/reader.py | 78 ++++++++++++------------
 1 file changed, 39 insertions(+), 39 deletions(-)

diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py
index 13dea343a7..171c051bbd 100644
--- a/src/calibre/ebooks/pdb/plucker/reader.py
+++ b/src/calibre/ebooks/pdb/plucker/reader.py
@@ -145,6 +145,7 @@ class HeaderRecord(object):
         # 1 is DOC compressed
         self.compression, = struct.unpack('>H', raw[2:4])
         self.records, = struct.unpack('>H', raw[4:6])
+        self.home_html = None
         
         self.reserved = {}
         for i in xrange(self.records):
@@ -152,6 +153,8 @@ class HeaderRecord(object):
             name, = struct.unpack('>H', raw[6+adv:8+adv])
             id, = struct.unpack('>H', raw[8+adv:10+adv])
             self.reserved[id] = name
+            if name == 0:
+                self.home_html = id
 
 
 class SectionHeader(object):
@@ -279,24 +282,21 @@ class Reader(FormatReader):
         self.mi = get_metadata(stream, False)
 
     def extract_content(self, output_dir):
-        html = u'<html><body>'
-        images = []
-
-        for uid, num in self.uid_text_secion_number.items():
-            section_header, section_data = self.sections[num]
-            if section_header.type == DATATYPE_PHTML:
-                html += self.process_phtml(section_data.header, section_data.data)
-            elif section_header.type == DATATYPE_PHTML_COMPRESSED:
-                d = self.decompress_phtml(section_data.data)
-                html += self.process_phtml(section_header.uid, section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
-
-        html += '</body></html>'
-
         with CurrentDir(output_dir):
-            with open('index.html', 'wb') as index:
-                self.log.debug('Writing text to index.html')
-                index.write(html.encode('utf-8'))
+            for uid, num in self.uid_text_secion_number.items():
+                self.log.debug(_('Writing record with uid: %s as %s.html' % (uid, uid)))
+                with open('%s.html' % uid, 'wb') as htmlf:
+                    html = u'<html><body>'
+                    section_header, section_data = self.sections[num]
+                    if section_header.type == DATATYPE_PHTML:
+                        html += self.process_phtml(section_data.header, section_data.data)
+                    elif section_header.type == DATATYPE_PHTML_COMPRESSED:
+                        d = self.decompress_phtml(section_data.data)
+                        html += self.process_phtml(section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
+                    html += '</body></html>'
+                    htmlf.write(html.encode('utf-8'))
 
+        images = []
         if not os.path.exists(os.path.join(output_dir, 'images/')):
             os.makedirs(os.path.join(output_dir, 'images/'))
         with CurrentDir(os.path.join(output_dir, 'images/')):
@@ -326,9 +326,25 @@ class Reader(FormatReader):
                 else:
                     self.log.error('Failed to write image with uid %s: No data.' % uid)
 
-        opf_path = self.create_opf(output_dir, images)
+        # Run the HTML through the html processing plugin.
+        from calibre.customize.ui import plugin_for_input_format
+        html_input = plugin_for_input_format('html')
+        for opt in html_input.options:
+            setattr(self.options, opt.option.name, opt.recommended_value)
+        self.options.input_encoding = 'utf-8'
+        odi = self.options.debug_pipeline
+        self.options.debug_pipeline = None
+        # Generate oeb from html conversion.
+        try:
+            home_html = self.header_record.home_html
+            if not home_html:
+                home_html = self.uid_text_secion_number.items()[0][0]
+        except:
+            raise Exception(_('Could not determine home.html'))
+        oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {})
+        self.options.debug_pipeline = odi
 
-        return opf_path
+        return oeb
 
     def decompress_phtml(self, data):
         if self.header_record.compression == 2:
@@ -339,8 +355,8 @@ class Reader(FormatReader):
             #from calibre.ebooks.compression.palmdoc import decompress_doc
             return decompress_doc(data)
             
-    def process_phtml(self, uid, sub_header, d):
-        html = u'<a id="p%s" /><p id="p%s-0">' % (uid, uid)
+    def process_phtml(self, sub_header, d):
+        html = u'<p id="p0">'
         offset = 0
         paragraph_open = True
         need_set_p_id = False
@@ -354,7 +370,7 @@ class Reader(FormatReader):
         while offset < len(d):
             if not paragraph_open:
                 if need_set_p_id:
-                    html += u'<p id="p%s-%s">' % (uid, p_num)
+                    html += u'<p id="p%s">' % p_num
                     p_num += 1
                     need_set_p_id = False
                 else:
@@ -371,7 +387,7 @@ class Reader(FormatReader):
                 if c == 0x0a:
                     offset += 1
                     id = struct.unpack('>H', d[offset:offset+2])[0]
-                    html += '<a href="#p%s">' % id
+                    html += '<a href="%s.html">' % id
                     offset += 1
                 # Targeted page link begins
                 # 3 Bytes
@@ -387,7 +403,7 @@ class Reader(FormatReader):
                     id = struct.unpack('>H', d[offset:offset+2])[0]
                     offset += 2
                     pid = struct.unpack('>H', d[offset:offset+2])[0]
-                    html += '<a href="#p%s-%s">' % (id, pid)
+                    html += '<a href="%s.html#p%s">' % (id, pid)
                     offset += 1
                 # Targeted paragraph link begins
                 # 5 Bytes
@@ -543,19 +559,3 @@ class Reader(FormatReader):
 
     def get_text_uid_encoding(self, uid):
         return self.uid_text_secion_encoding.get(uid, self.default_encoding)
-
-    def create_opf(self, output_dir, images):
-        with CurrentDir(output_dir):
-            opf = OPFCreator(output_dir, self.mi)
-
-            manifest = [('index.html', None)]
-
-            for i in images:
-                manifest.append((os.path.join('images/', i), None))
-
-            opf.create_manifest(manifest)
-            opf.create_spine(['index.html'])
-            with open('metadata.opf', 'wb') as opffile:
-                opf.render(opffile)
-
-        return os.path.join(output_dir, 'metadata.opf')

From 644335d97b1494ee04c3c657d435a3aeef44551c Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 16 Apr 2011 21:23:13 -0400
Subject: [PATCH 05/13] Ignore non-internal links. Support composite images.

---
 src/calibre/ebooks/pdb/plucker/reader.py | 109 ++++++++++++++++++++---
 1 file changed, 99 insertions(+), 10 deletions(-)

diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py
index 171c051bbd..c6c404b125 100644
--- a/src/calibre/ebooks/pdb/plucker/reader.py
+++ b/src/calibre/ebooks/pdb/plucker/reader.py
@@ -13,10 +13,9 @@ import zlib
 from collections import OrderedDict
 
 from calibre import CurrentDir
-from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.pdb.formatreader import FormatReader
 from calibre.ptempfile import TemporaryFile
-from calibre.utils.magick import Image
+from calibre.utils.magick import Image, create_canvas
 
 DATATYPE_PHTML = 0
 DATATYPE_PHTML_COMPRESSED = 1
@@ -178,6 +177,7 @@ class SectionHeaderText(object):
             self.sizes.append(struct.unpack('>H', raw[adv:2+adv])[0])
             self.attributes.append(struct.unpack('>H', raw[2+adv:4+adv])[0])
 
+
 class SectionMetadata(object):
     
     def __init__(self, raw):
@@ -220,6 +220,7 @@ class SectionMetadata(object):
              
             adv += 2*length
 
+
 class SectionText(object):
     
     def __init__(self, section_header, raw):
@@ -227,6 +228,34 @@ class SectionText(object):
         self.data = raw[section_header.paragraphs * 4:]
 
 
+class SectionCompositeImage(object):
+    
+    def __init__(self, raw):
+        self.columns, = struct.unpack('>H', raw[0:2])
+        self.rows, = struct.unpack('>H', raw[2:4])
+
+        # [
+        #  row [col, col, col...],
+        #  row [col, col, col...],
+        #  ...
+        # ]
+        #
+        # Each item in the layout is in it's
+        # correct position in the final
+        # composite.
+        #
+        # Each item in the layout is a uid
+        # to an image record.
+        self.layout = []
+        offset = 4
+        for i in xrange(self.rows):
+            col = []
+            for j in xrange(self.columns):
+                col.append(struct.unpack('>H', raw[offset:offset+2])[0])
+                offset += 2 
+            self.layout.append(col)
+
+
 class Reader(FormatReader):
 
     def __init__(self, header, stream, log, options):
@@ -240,6 +269,7 @@ class Reader(FormatReader):
         self.uid_text_secion_number = OrderedDict()
         self.uid_text_secion_encoding = {}
         self.uid_image_section_number = {}
+        self.uid_composite_image_section_number = {}
         self.metadata_section_number = None
         self.default_encoding = 'utf-8'
         self.owner_id = None
@@ -266,8 +296,9 @@ class Reader(FormatReader):
             elif section_header.type == DATATYPE_METADATA:
                 self.metadata_section_number = section_number
                 section = SectionMetadata(raw_data[start:])
-            #elif section_header.type == DATATYPE_COMPOSITE_IMAGE:
-                
+            elif section_header.type == DATATYPE_COMPOSITE_IMAGE:
+                self.uid_composite_image_section_number[section_header.uid] = section_number
+                section = SectionCompositeImage(raw_data[start:])
 
             self.sections.append((section_header, section))
 
@@ -282,6 +313,9 @@ class Reader(FormatReader):
         self.mi = get_metadata(stream, False)
 
     def extract_content(self, output_dir):
+        # Each text record is independent (unless the continuation
+        # value is set in the previous record). Put each converted
+        # text recored into a separate file.
         with CurrentDir(output_dir):
             for uid, num in self.uid_text_secion_number.items():
                 self.log.debug(_('Writing record with uid: %s as %s.html' % (uid, uid)))
@@ -297,9 +331,11 @@ class Reader(FormatReader):
                     htmlf.write(html.encode('utf-8'))
 
         images = []
+        image_sizes = {}
         if not os.path.exists(os.path.join(output_dir, 'images/')):
             os.makedirs(os.path.join(output_dir, 'images/'))
         with CurrentDir(os.path.join(output_dir, 'images/')):
+            # Single images.
             for uid, num in self.uid_image_section_number.items():
                 section_header, section_data = self.sections[num]
                 if section_data:
@@ -317,6 +353,7 @@ class Reader(FormatReader):
                                 itf.write(idata)
                             im = Image()
                             im.read(itn)
+                            image_sizes[uid] = im.size
                             im.set_compression_quality(70)
                             im.save('%s.jpg' % uid)
                             self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
@@ -325,6 +362,49 @@ class Reader(FormatReader):
                     images.append('%s.jpg' % uid)
                 else:
                     self.log.error('Failed to write image with uid %s: No data.' % uid)
+            # Composite images.
+            for uid, num in self.uid_composite_image_section_number.items():
+                try:
+                    section_header, section_data = self.sections[num]
+                    # Get the final width and height.
+                    width = 0
+                    height = 0
+                    for row in section_data.layout:
+                        row_width = 0
+                        col_height = 0
+                        for col in row:
+                            if col not in image_sizes:
+                                raise Exception('Image with uid: %s missing.' % col)
+                            im = Image()
+                            im.read('%s.jpg' % col)
+                            w, h = im.size
+                            row_width += w
+                            if col_height < h:
+                                col_height = h
+                        if width < row_width:
+                            width = row_width
+                        height += col_height
+                    # Create a new image the total size of all image
+                    # parts. Put the parts into the new image.
+                    canvas = create_canvas(width, height)
+                    y_off = 0
+                    for row in section_data.layout:
+                        x_off = 0
+                        largest_height = 0
+                        for col in row:
+                            im = Image()
+                            im.read('%s.jpg' % col)
+                            canvas.compose(im, x_off, y_off)
+                            w, h = im.size
+                            x_off += w
+                            if largest_height < h:
+                                largest_height = h
+                        y_off += largest_height
+                    canvas.set_compression_quality(70)
+                    canvas.save('%s.jpg' % uid)
+                    self.log.debug('Wrote composite image with uid %s to images/%s.jpg' % (uid, uid))
+                except Exception as e:
+                    self.log.error('Failed to write composite image with uid %s: %s' % (uid, e))
 
         # Run the HTML through the html processing plugin.
         from calibre.customize.ui import plugin_for_input_format
@@ -334,13 +414,17 @@ class Reader(FormatReader):
         self.options.input_encoding = 'utf-8'
         odi = self.options.debug_pipeline
         self.options.debug_pipeline = None
-        # Generate oeb from html conversion.
+        # Determine the home.html record uid. This should be set in the
+        # reserved values in the metadata record. home.html is the first
+        # text record (should have hyper link references to other records)
+        # in the document.
         try:
             home_html = self.header_record.home_html
             if not home_html:
                 home_html = self.uid_text_secion_number.items()[0][0]
         except:
             raise Exception(_('Could not determine home.html'))
+        # Generate oeb from html conversion.
         oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {})
         self.options.debug_pipeline = odi
 
@@ -359,6 +443,7 @@ class Reader(FormatReader):
         html = u'<p id="p0">'
         offset = 0
         paragraph_open = True
+        link_open = False
         need_set_p_id = False
         p_num = 1
         paragraph_offsets = []
@@ -387,14 +472,15 @@ class Reader(FormatReader):
                 if c == 0x0a:
                     offset += 1
                     id = struct.unpack('>H', d[offset:offset+2])[0]
-                    html += '<a href="%s.html">' % id
+                    if id in self.uid_text_secion_number:
+                        html += '<a href="%s.html">' % id
+                        link_open = True
                     offset += 1
                 # Targeted page link begins
                 # 3 Bytes
                 # record ID, target
                 elif c == 0x0b:
                     offset += 3
-                    html += '<a>'
                 # Paragraph link begins
                 # 4 Bytes
                 # record ID, paragraph number
@@ -403,18 +489,21 @@ class Reader(FormatReader):
                     id = struct.unpack('>H', d[offset:offset+2])[0]
                     offset += 2
                     pid = struct.unpack('>H', d[offset:offset+2])[0]
-                    html += '<a href="%s.html#p%s">' % (id, pid)
+                    if id in self.uid_text_secion_number:
+                        html += '<a href="%s.html#p%s">' % (id, pid)
+                        link_open = True
                     offset += 1
                 # Targeted paragraph link begins
                 # 5 Bytes
                 # record ID, paragraph number, target
                 elif c == 0x0d:
                     offset += 5
-                    html += '<a>'
                 # Link ends
                 # 0 Bytes
                 elif c == 0x08:
-                    html += '</a>'
+                    if link_open:
+                        html += '</a>'
+                        link_open = False
                 # Set font
                 # 1 Bytes
                 # font specifier

From 494c040d36b2ee5620143f5bf70600a811d2de1e Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 16 Apr 2011 21:54:28 -0400
Subject: [PATCH 06/13] Comments.

---
 src/calibre/ebooks/pdb/plucker/reader.py | 74 ++++++++++++++++++++----
 1 file changed, 62 insertions(+), 12 deletions(-)

diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py
index c6c404b125..20943be3f0 100644
--- a/src/calibre/ebooks/pdb/plucker/reader.py
+++ b/src/calibre/ebooks/pdb/plucker/reader.py
@@ -136,6 +136,9 @@ def decompress_doc(data):
     return ''.join([chr(i) for i in res])
 
 class HeaderRecord(object):
+    '''
+    Plucker header. PDB record 0.
+    '''
 
     def __init__(self, raw):
         self.uid, = struct.unpack('>H', raw[0:2])
@@ -144,6 +147,8 @@ class HeaderRecord(object):
         # 1 is DOC compressed
         self.compression, = struct.unpack('>H', raw[2:4])
         self.records, = struct.unpack('>H', raw[4:6])
+        # uid of the first html file. This should link
+        # to other files which in turn may link to others.
         self.home_html = None
         
         self.reserved = {}
@@ -157,6 +162,10 @@ class HeaderRecord(object):
 
 
 class SectionHeader(object):
+    '''
+    Every section (record) has this header. It gives
+    details about the section such as its uid.
+    '''
     
     def __init__(self, raw):
         self.uid, = struct.unpack('>H', raw[0:2])
@@ -167,9 +176,14 @@ class SectionHeader(object):
 
 
 class SectionHeaderText(object):
+    '''
+    Sub header for text records.
+    '''
     
     def __init__(self, section_header, raw):
+        # The uncompressed size of each paragraph.
         self.sizes = []
+        # Paragraph attributes.
         self.attributes = []
 
         for i in xrange(section_header.paragraphs):
@@ -179,6 +193,19 @@ class SectionHeaderText(object):
 
 
 class SectionMetadata(object):
+    '''
+    Metadata.
+    
+    This does not store metadata such as title, or author.
+    That metadata would be best retrieved with the PDB (plucker)
+    metadata reader.
+    
+    This stores document specific information such as the
+    text encoding.
+    
+    Note: There is a default encoding but each text section
+    can be assigned a different encoding.
+    '''
     
     def __init__(self, raw):
         self.default_encoding = 'utf-8'
@@ -222,6 +249,9 @@ class SectionMetadata(object):
 
 
 class SectionText(object):
+    '''
+    Text data. Stores a text section header and the PHTML.
+    '''
     
     def __init__(self, section_header, raw):
         self.header = SectionHeaderText(section_header, raw)
@@ -229,14 +259,19 @@ class SectionText(object):
 
 
 class SectionCompositeImage(object):
+    '''
+    A composite image consists of a 2D array
+    of rows and columns. The entries in the array
+    are uid's. 
+    '''
     
     def __init__(self, raw):
         self.columns, = struct.unpack('>H', raw[0:2])
         self.rows, = struct.unpack('>H', raw[2:4])
 
         # [
-        #  row [col, col, col...],
-        #  row [col, col, col...],
+        #  [uid, uid, uid, ...],
+        #  [uid, uid, uid, ...],
         #  ...
         # ]
         #
@@ -275,18 +310,21 @@ class Reader(FormatReader):
         self.owner_id = None
         self.sections = []
         
+        # The Plucker record0 header
         self.header_record = HeaderRecord(header.section_data(0))
         
         for i in range(1, header.num_sections):
-            section_number = i - 1
+            section_number = len(self.sections)
+            # The length of the section header.
+            # Where the actual data in the section starts.
             start = 8
             section = None
             
             raw_data = header.section_data(i)
+            # Every sections has a section header.
             section_header = SectionHeader(raw_data)
-            
-            self.uid_section_number[section_header.uid] = section_number
-            
+        
+            # Store sections we care about.
             if section_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED):
                 self.uid_text_secion_number[section_header.uid] = section_number
                 section = SectionText(section_header, raw_data[start:])
@@ -300,8 +338,13 @@ class Reader(FormatReader):
                 self.uid_composite_image_section_number[section_header.uid] = section_number
                 section = SectionCompositeImage(raw_data[start:])
 
-            self.sections.append((section_header, section))
+            # Store the section.
+            if section:
+                self.uid_section_number[section_header.uid] = section_number
+                self.sections.append((section_header, section))
 
+        # Store useful information from the metadata section locally
+        # to make access easier.
         if self.metadata_section_number:
             mdata_section = self.sections[self.metadata_section_number][1]
             for k, v in mdata_section.exceptional_uid_encodings.items():
@@ -309,13 +352,16 @@ class Reader(FormatReader):
             self.default_encoding = mdata_section.default_encoding
             self.owner_id = mdata_section.owner_id
 
+        # Get the metadata (title, author, ...) with the metadata reader.
         from calibre.ebooks.metadata.pdb import get_metadata
         self.mi = get_metadata(stream, False)
 
     def extract_content(self, output_dir):
         # Each text record is independent (unless the continuation
         # value is set in the previous record). Put each converted
-        # text recored into a separate file.
+        # text recored into a separate file. We will reference the
+        # home.html file as the first file and let the HTML input
+        # plugin assemble the order based on hyperlinks.
         with CurrentDir(output_dir):
             for uid, num in self.uid_text_secion_number.items():
                 self.log.debug(_('Writing record with uid: %s as %s.html' % (uid, uid)))
@@ -329,8 +375,9 @@ class Reader(FormatReader):
                         html += self.process_phtml(section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
                     html += '</body></html>'
                     htmlf.write(html.encode('utf-8'))
-
-        images = []
+        
+        # Images.
+        # Cache the image sizes in case they are used by a composite image.
         image_sizes = {}
         if not os.path.exists(os.path.join(output_dir, 'images/')):
             os.makedirs(os.path.join(output_dir, 'images/'))
@@ -359,10 +406,10 @@ class Reader(FormatReader):
                             self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
                     except Exception as e:
                         self.log.error('Failed to write image with uid %s: %s' % (uid, e))
-                    images.append('%s.jpg' % uid)
                 else:
                     self.log.error('Failed to write image with uid %s: No data.' % uid)
             # Composite images.
+            # We're going to use the already compressed .jpg images here.
             for uid, num in self.uid_composite_image_section_number.items():
                 try:
                     section_header, section_data = self.sections[num]
@@ -559,7 +606,10 @@ class Reader(FormatReader):
                 # 4 Bytes
                 # alternate image record ID, image record ID
                 elif c == 0x5c:
-                    offset += 4
+                    offset += 3
+                    uid = struct.unpack('>H', d[offset:offset+2])[0]
+                    html += '<img src="images/%s.jpg" />' % uid
+                    offset += 1
                 # Underline text begins
                 # 0 Bytes
                 elif c == 0x60:

From 93492a9ec8f01233723bd8b6038a0440c738a705 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 17 Apr 2011 09:42:00 -0400
Subject: [PATCH 07/13] Add font changes.

---
 src/calibre/ebooks/pdb/plucker/reader.py | 50 ++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py
index 20943be3f0..5c128fa3d3 100644
--- a/src/calibre/ebooks/pdb/plucker/reader.py
+++ b/src/calibre/ebooks/pdb/plucker/reader.py
@@ -493,6 +493,7 @@ class Reader(FormatReader):
         link_open = False
         need_set_p_id = False
         p_num = 1
+        font_specifier_close = ''
         paragraph_offsets = []
         running_offset = 0
         for size in sub_header.sizes:
@@ -556,6 +557,55 @@ class Reader(FormatReader):
                 # font specifier
                 elif c == 0x11:
                     offset += 1
+                    specifier = ord(d[offset])  # d[offset] is a 1-char str; compare its byte value
+                    html += font_specifier_close
+                    # Regular text
+                    if specifier == 0:
+                        font_specifier_close = ''
+                    # h1
+                    elif specifier == 1:
+                        html += '<h1>'
+                        font_specifier_close = '</h1>'
+                    # h2
+                    elif specifier == 2:
+                        html += '<h2>'
+                        font_specifier_close = '</h2>'
+                    # h3
+                    elif specifier == 3:
+                        html += '<h3>'
+                        font_specifier_close = '</h3>'
+                    # h4
+                    elif specifier == 4:
+                        html += '<h4>'
+                        font_specifier_close = '</h4>'
+                    # h5
+                    elif specifier == 5:
+                        html += '<h5>'
+                        font_specifier_close = '</h5>'
+                    # h6
+                    elif specifier == 6:
+                        html += '<h6>'
+                        font_specifier_close = '</h6>'
+                    # Bold
+                    elif specifier == 7:
+                        html += '<b>'
+                        font_specifier_close = '</b>'
+                    # Fixed-width
+                    elif specifier == 8:
+                        html += '<tt>'
+                        font_specifier_close = '</tt>'
+                    # Small
+                    elif specifier == 9:
+                        html += '<small>'
+                        font_specifier_close = '</small>'
+                    # Subscript
+                    elif specifier == 10:
+                        html += '<sub>'
+                        font_specifier_close = '</sub>'
+                    # Superscript
+                    elif specifier == 11:
+                        html += '<sup>'
+                        font_specifier_close = '</sup>'
                 # Embedded image
                 # 2 Bytes
                 # image record ID

From 87bb34d9940a39553d4f72a056486cb20a88e587 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 17 Apr 2011 10:03:50 -0400
Subject: [PATCH 08/13] Use latin-1 instead of utf-8 for default encoding.

---
 src/calibre/ebooks/pdb/plucker/reader.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py
index 5c128fa3d3..9ae449e579 100644
--- a/src/calibre/ebooks/pdb/plucker/reader.py
+++ b/src/calibre/ebooks/pdb/plucker/reader.py
@@ -208,7 +208,7 @@ class SectionMetadata(object):
     '''
     
     def __init__(self, raw):
-        self.default_encoding = 'utf-8'
+        self.default_encoding = 'latin-1'
         self.exceptional_uid_encodings = {}
         self.owner_id = None
         
@@ -222,14 +222,14 @@ class SectionMetadata(object):
             # CharSet
             if type == 1:
                 val, = struct.unpack('>H', raw[6+adv:8+adv])
-                self.default_encoding = MIBNUM_TO_NAME.get(val, 'utf-8')
+                self.default_encoding = MIBNUM_TO_NAME.get(val, 'latin-1')
             # ExceptionalCharSets
             elif type == 2:
                 ii_adv = 0
                 for ii in xrange(length / 2):
                     uid, = struct.unpack('>H', raw[6+adv+ii_adv:8+adv+ii_adv])
                     mib, = struct.unpack('>H', raw[8+adv+ii_adv:10+adv+ii_adv])
-                    self.exceptional_uid_encodings[uid] = MIBNUM_TO_NAME.get(mib, 'utf-8')
+                    self.exceptional_uid_encodings[uid] = MIBNUM_TO_NAME.get(mib, 'latin-1')
                     ii_adv += 4
             # OwnerID
             elif type == 3:
@@ -306,7 +306,7 @@ class Reader(FormatReader):
         self.uid_image_section_number = {}
         self.uid_composite_image_section_number = {}
         self.metadata_section_number = None
-        self.default_encoding = 'utf-8'
+        self.default_encoding = 'latin-1'
         self.owner_id = None
         self.sections = []
         
@@ -680,10 +680,12 @@ class Reader(FormatReader):
                 # 3 Bytes
                 # alternate text length, 16-bit unicode character
                 elif c == 0x83:
-                    #offset += 2
+                    #offset += 1
+                    #alt_len = struct.unpack('>B', str(d[offset]))[0]
+                    #offset += 1
                     #c16 = d[offset:offset+2]
                     #html += c16.decode('utf-16')
-                    #offset += 1
+                    #offset += 1 + alt_len
                     offset += 3
                 # 32-bit Unicode character
                 # 5 Bytes

From 15a0384481e3e8ec9ee3adce0b082c21bde51fd2 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 17 Apr 2011 10:59:15 -0400
Subject: [PATCH 09/13] Remove commented-out UTF-16/32 character handling.

---
 src/calibre/ebooks/pdb/plucker/reader.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py
index 9ae449e579..ced9dafc0f 100644
--- a/src/calibre/ebooks/pdb/plucker/reader.py
+++ b/src/calibre/ebooks/pdb/plucker/reader.py
@@ -680,21 +680,11 @@ class Reader(FormatReader):
                 # 3 Bytes
                 # alternate text length, 16-bit unicode character
                 elif c == 0x83:
-                    #offset += 1
-                    #alt_len = struct.unpack('>B', str(d[offset]))[0]
-                    #offset += 1
-                    #c16 = d[offset:offset+2]
-                    #html += c16.decode('utf-16')
-                    #offset += 1 + alt_len
                     offset += 3
                 # 32-bit Unicode character
                 # 5 Bytes
                 # alternate text length, 32-bit unicode character
                 elif c == 0x85:
-                    #offset += 2
-                    #c32 = d[offset:offset+4]
-                    #html += c32.decode('utf-32')
-                    #offset += 3
                     offset += 5
                 # Begin custom font span
                 # 6 Bytes

From 05fc3eec93fd3b05af981bc3d20a1627673aa043 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 17 Apr 2011 11:09:46 -0400
Subject: [PATCH 10/13] Add todo for non supported features.

---
 src/calibre/ebooks/pdb/plucker/reader.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py
index ced9dafc0f..207a466178 100644
--- a/src/calibre/ebooks/pdb/plucker/reader.py
+++ b/src/calibre/ebooks/pdb/plucker/reader.py
@@ -292,6 +292,18 @@ class SectionCompositeImage(object):
 
 
 class Reader(FormatReader):
+    '''
+    Convert a plucker archive into HTML.
+    
+    TODO:
+          * UTF 16 and 32 characters.
+          * Margins.
+          * Alignment.
+          * DATATYPE_MAILTO
+          * DATATYPE_TABLE(_COMPRESSED)
+          * DATATYPE_EXT_ANCHOR_INDEX
+          * DATATYPE_EXT_ANCHOR(_COMPRESSED)
+    '''
 
     def __init__(self, header, stream, log, options):
         self.stream = stream

From c0cf0e91d47b1213b2093bac4cdd1317a87b258f Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 17 Apr 2011 19:28:04 -0400
Subject: [PATCH 11/13] Allow user specify input encoding and override what is
 specified by the file. Turn 0xa0 character into nbsp entity.

---
 src/calibre/ebooks/pdb/plucker/reader.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py
index 207a466178..5fa66e1246 100644
--- a/src/calibre/ebooks/pdb/plucker/reader.py
+++ b/src/calibre/ebooks/pdb/plucker/reader.py
@@ -523,6 +523,7 @@ class Reader(FormatReader):
                 paragraph_open = True
 
             c = ord(d[offset])
+            # PHTML "functions"
             if c == 0x0:
                 offset += 1
                 c = ord(d[offset])
@@ -736,6 +737,8 @@ class Reader(FormatReader):
                 # Paragraph Offset (The Exact Link Modifier modifies a Paragraph Link or Targeted Paragraph Link function to specify an exact byte offset within the paragraph. This function must be followed immediately by the function it modifies).
                 elif c == 0x9a:
                     offset += 2
+            elif c == 0xa0:
+                html += '&nbsp;'
             else:
                 html += unichr(c)
             offset += 1
@@ -751,4 +754,4 @@ class Reader(FormatReader):
         return html
 
     def get_text_uid_encoding(self, uid):
-        return self.uid_text_secion_encoding.get(uid, self.default_encoding)
+        return self.options.input_encoding if self.options.input_encoding else self.uid_text_secion_encoding.get(uid, self.default_encoding)

From 377313df7d4343ab9e4035877ac2d184f5dd73ba Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 17 Apr 2011 19:42:39 -0400
Subject: [PATCH 12/13] cleanup.

---
 src/calibre/ebooks/pdb/plucker/reader.py | 25 +++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py
index 5fa66e1246..9f1d2ad426 100644
--- a/src/calibre/ebooks/pdb/plucker/reader.py
+++ b/src/calibre/ebooks/pdb/plucker/reader.py
@@ -183,6 +183,9 @@ class SectionHeaderText(object):
     def __init__(self, section_header, raw):
         # The uncompressed size of each paragraph.
         self.sizes = []
+        # uncompressed offset of each paragraph starting
+        # at the beginning of the PHTML.
+        self.paragraph_offsets = []
         # Paragraph attributes.
         self.attributes = []
 
@@ -191,6 +194,11 @@ class SectionHeaderText(object):
             self.sizes.append(struct.unpack('>H', raw[adv:2+adv])[0])
             self.attributes.append(struct.unpack('>H', raw[2+adv:4+adv])[0])
 
+        running_offset = 0
+        for size in self.sizes:
+            running_offset += size
+            self.paragraph_offsets.append(running_offset)
+
 
 class SectionMetadata(object):
     '''
@@ -299,6 +307,7 @@ class Reader(FormatReader):
           * UTF 16 and 32 characters.
           * Margins.
           * Alignment.
+          * Font color.
           * DATATYPE_MAILTO
           * DATATYPE_TABLE(_COMPRESSED)
           * DATATYPE_EXT_ANCHOR_INDEX
@@ -381,13 +390,13 @@ class Reader(FormatReader):
                     html = u'<html><body>'
                     section_header, section_data = self.sections[num]
                     if section_header.type == DATATYPE_PHTML:
-                        html += self.process_phtml(section_data.header, section_data.data)
+                        html += self.process_phtml(section_data.data, section_data.header.paragraph_offsets)
                     elif section_header.type == DATATYPE_PHTML_COMPRESSED:
                         d = self.decompress_phtml(section_data.data)
-                        html += self.process_phtml(section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
+                        html += self.process_phtml(d, section_data.header.paragraph_offsets).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
                     html += '</body></html>'
                     htmlf.write(html.encode('utf-8'))
-        
+
         # Images.
         # Cache the image sizes in case they are used by a composite image.
         image_sizes = {}
@@ -498,7 +507,7 @@ class Reader(FormatReader):
             #from calibre.ebooks.compression.palmdoc import decompress_doc
             return decompress_doc(data)
             
-    def process_phtml(self, sub_header, d):
+    def process_phtml(self, d, paragraph_offsets=[]):
         html = u'<p id="p0">'
         offset = 0
         paragraph_open = True
@@ -506,11 +515,6 @@ class Reader(FormatReader):
         need_set_p_id = False
         p_num = 1
         font_specifier_close = ''
-        paragraph_offsets = []
-        running_offset = 0
-        for size in sub_header.sizes:
-            running_offset += size
-            paragraph_offsets.append(running_offset)
         
         while offset < len(d):
             if not paragraph_open:
@@ -754,4 +758,7 @@ class Reader(FormatReader):
         return html
 
     def get_text_uid_encoding(self, uid):
+        # Return the user specified input encoding,
+        # otherwise return the alternate encoding specified for the uid,
+        # otherwise return the default encoding for the document.
         return self.options.input_encoding if self.options.input_encoding else self.uid_text_secion_encoding.get(uid, self.default_encoding)

From e690e7196e094787985dd148038d38a6d5e08163 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 24 Apr 2011 09:44:50 -0400
Subject: [PATCH 13/13] Plucker metadata reader.

---
 src/calibre/ebooks/metadata/pdb.py     |  4 +-
 src/calibre/ebooks/metadata/plucker.py | 73 ++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 1 deletion(-)
 create mode 100644 src/calibre/ebooks/metadata/plucker.py

diff --git a/src/calibre/ebooks/metadata/pdb.py b/src/calibre/ebooks/metadata/pdb.py
index ddf2b0c818..d01bb0ecdb 100644
--- a/src/calibre/ebooks/metadata/pdb.py
+++ b/src/calibre/ebooks/metadata/pdb.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 '''
-Read meta information from eReader pdb files.
+Read meta information from pdb files.
 '''
 
 __license__   = 'GPL v3'
@@ -13,10 +13,12 @@ import re
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.metadata.ereader import get_metadata as get_eReader
+from calibre.ebooks.metadata.plucker import get_metadata as get_plucker
 
 MREADER = {
     'PNPdPPrs' : get_eReader,
     'PNRdPPrs' : get_eReader,
+    'DataPlkr' : get_plucker,
 }
 
 from calibre.ebooks.metadata.ereader import set_metadata as set_eReader
diff --git a/src/calibre/ebooks/metadata/plucker.py b/src/calibre/ebooks/metadata/plucker.py
new file mode 100644
index 0000000000..991945f42b
--- /dev/null
+++ b/src/calibre/ebooks/metadata/plucker.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import (unicode_literals, division, absolute_import, print_function)
+
+'''
+Read meta information from Plucker pdb files.
+'''
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import struct
+from datetime import datetime
+
+from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.pdb.header import PdbHeaderReader
+from calibre.ebooks.pdb.plucker.reader import SectionHeader, DATATYPE_METADATA, \
+    MIBNUM_TO_NAME
+
+def get_metadata(stream, extract_cover=True):
+    '''
+    Return the metadata of the Plucker PDB in stream as a MetaInformation
+    object. extract_cover is accepted but not used by this reader. Fields
+    missing from the file keep their MetaInformation defaults.
+    '''
+    mi = MetaInformation(_('Unknown'), [_('Unknown')])
+    stream.seek(0)
+    pheader = PdbHeaderReader(stream)
+    section_data = None
+    for i in range(1, pheader.num_sections):
+        raw_data = pheader.section_data(i)
+        if SectionHeader(raw_data).type == DATATYPE_METADATA:
+            # The metadata records start after the first 8 bytes.
+            section_data = raw_data[8:]
+            break
+
+    if not section_data or len(section_data) < 2:
+        return mi
+
+    default_encoding = 'latin-1'
+    record_count, = struct.unpack('>H', section_data[0:2])
+    pos = 2
+    title = author = pubdate = None
+    for _unused in range(record_count):
+        if pos + 4 > len(section_data):
+            break  # Truncated record table; keep what was parsed so far.
+        rec_type, length = struct.unpack('>HH', section_data[pos:pos+4])
+        # length counts 2 byte words of payload after the 4 byte header.
+        data = section_data[pos+4:pos+4+2*length]
+        # CharSet
+        if rec_type == 1 and len(data) >= 2:
+            val, = struct.unpack('>H', data[:2])
+            default_encoding = MIBNUM_TO_NAME.get(val, 'latin-1')
+        # Author and Title: keep the whole payload, not a single byte.
+        elif rec_type == 4:
+            author = data
+        elif rec_type == 5:
+            title = data
+        # Publication Date
+        elif rec_type == 6 and len(data) >= 4:
+            pubdate, = struct.unpack('>I', data[:4])
+        pos += 4 + 2*length
+
+    # Strip NULs as bytes; a unicode pattern would force an ASCII decode.
+    if title:
+        mi.title = title.replace(b'\0', b'').decode(default_encoding, 'replace')
+    if author:
+        author = author.replace(b'\0', b'').decode(default_encoding, 'replace')
+        mi.authors = [a.strip() for a in author.split(',') if a.strip()] or mi.authors
+    if pubdate is not None:
+        mi.pubdate = datetime.fromtimestamp(pubdate)
+    return mi