Basic Plucker working (text, non-composite images).

John Schember 2011-04-11 19:04:56 -04:00
parent 0f62ea111c
commit 0f3228e658


@@ -10,9 +10,13 @@ import os
import struct
import zlib
from collections import OrderedDict
from calibre import CurrentDir
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ptempfile import TemporaryFile
from calibre.utils.magick import Image
DATATYPE_PHTML = 0
DATATYPE_PHTML_COMPRESSED = 1
@@ -38,6 +42,100 @@ DATATYPE_EXT_ANCHOR_INDEX = 20
DATATYPE_EXT_ANCHOR = 21
DATATYPE_EXT_ANCHOR_COMPRESSED = 22
# IETF IANA MIBenum value for the character set.
# See http://www.iana.org/assignments/character-sets for valid values.
# Not all character sets are handled by Python. This is a small subset
# mapping MIBenum values to Python standard encodings, as listed at
# http://docs.python.org/library/codecs.html#standard-encodings
MIBNUM_TO_NAME = {
3: 'ascii',
4: 'latin_1',
5: 'iso8859_2',
6: 'iso8859_3',
7: 'iso8859_4',
8: 'iso8859_5',
9: 'iso8859_6',
10: 'iso8859_7',
11: 'iso8859_8',
12: 'iso8859_9',
13: 'iso8859_10',
17: 'shift_jis',
18: 'euc_jp',
27: 'utf_7',
36: 'euc_kr',
37: 'iso2022_kr',
38: 'euc_kr',
39: 'iso2022_jp',
40: 'iso2022_jp_2',
106: 'utf-8',
109: 'iso8859_13',
110: 'iso8859_14',
111: 'iso8859_15',
112: 'iso8859_16',
1013: 'utf_16_be',
1014: 'utf_16_le',
1015: 'utf_16',
2009: 'cp850',
2010: 'cp852',
2011: 'cp437',
2013: 'cp862',
2025: 'gb2312',
2026: 'big5',
2028: 'cp037',
2043: 'cp424',
2044: 'cp500',
2046: 'cp855',
2047: 'cp857',
2048: 'cp860',
2049: 'cp861',
2050: 'cp863',
2051: 'cp864',
2052: 'cp865',
2054: 'cp869',
2063: 'cp1026',
2085: 'hz',
2086: 'cp866',
2087: 'cp775',
2089: 'cp858',
2091: 'cp1140',
2102: 'big5hkscs',
2250: 'cp1250',
2251: 'cp1251',
2252: 'cp1252',
2253: 'cp1253',
2254: 'cp1254',
2255: 'cp1255',
2256: 'cp1256',
2257: 'cp1257',
2258: 'cp1258',
}
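A minimal usage sketch (the MIBenum value and the sample bytes below are invented for illustration): the table resolves a MIBenum taken from a Plucker metadata record to a Python codec name, with values outside this subset falling back to UTF-8.

codec = MIBNUM_TO_NAME.get(4, 'utf-8')     # 4 -> 'latin_1'; unlisted values fall back to UTF-8
text = 'caf\xe9'.decode(codec, 'replace')  # Python 2 str -> unicode, u'caf\xe9'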
def decompress_doc(data):
buffer = [ord(i) for i in data]
res = []
i = 0
while i < len(buffer):
c = buffer[i]
i += 1
if c >= 1 and c <= 8:
res.extend(buffer[i:i+c])
i += c
elif c <= 0x7f:
res.append(c)
elif c >= 0xc0:
res.extend( (ord(' '), c^0x80) )
else:
c = (c << 8) + buffer[i]
i += 1
di = (c & 0x3fff) >> 3
j = len(res)
num = (c & ((1 << 3) - 1)) + 3
for k in range( num ):
res.append(res[j - di+k])
return ''.join([chr(i) for i in res])
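A hand-built byte string (values invented) exercises the three PalmDoc cases handled above: a literal run (0x01-0x08), a space-plus-character pair (0xC0-0xFF), and a two-byte back-reference into the already-decoded output.

# '\x03abc'  copy the next 3 bytes literally        -> 'abc'
# '\xc4'     space plus (0xc4 ^ 0x80) = 'D'         -> ' D'
# '\x80\x18' back-reference, distance 3, length 3   -> 'c D' copied from the output
assert decompress_doc('\x03abc\xc4\x80\x18') == 'abc Dc D'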
class HeaderRecord(object):
def __init__(self, raw):
@@ -68,14 +166,62 @@ class SectionHeader(object):
class SectionHeaderText(object):
def __init__(self, section_header, raw):
self.sizes = []
self.attributes = []
for i in xrange(section_header.paragraphs):
adv = 4*i
self.sizes.append(struct.unpack('>H', raw[adv:2+adv])[0])
self.attributes.append(struct.unpack('>H', raw[2+adv:4+adv])[0])
class SectionMetadata(object):
def __init__(self, raw):
self.default_encoding = 'utf-8'
self.exceptional_uid_encodings = {}
self.owner_id = None
record_count, = struct.unpack('>H', raw[0:2])
adv = 0
for i in xrange(record_count):
type, = struct.unpack('>H', raw[2+adv:4+adv])
length, = struct.unpack('>H', raw[4+adv:6+adv])
# CharSet
if type == 1:
val, = struct.unpack('>H', raw[6+adv:8+adv])
self.default_encoding = MIBNUM_TO_NAME.get(val, 'utf-8')
# ExceptionalCharSets
elif type == 2:
ii_adv = 0
for ii in xrange(length / 2):
uid, = struct.unpack('>H', raw[6+adv+ii_adv:8+adv+ii_adv])
mib, = struct.unpack('>H', raw[8+adv+ii_adv:10+adv+ii_adv])
self.exceptional_uid_encodings[uid] = MIBNUM_TO_NAME.get(mib, 'utf-8')
ii_adv += 4
# OwnerID
elif type == 3:
self.owner_id, = struct.unpack('>I', raw[6+adv:10+adv])
# Author, Title, PubDate
# Ignored here. The metadata reader plugin picks these up;
# when they are missing it falls back to data from
# elsewhere in the file.
elif type in (4, 5, 6):
pass
# Linked Documents
elif type == 7:
pass
adv += 2*length
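To make the record layout concrete, here is a hand-assembled metadata section (values invented): a record count of 1 followed by a single CharSet record, whose length field counts 16-bit words of payload.

raw = struct.pack('>4H', 1, 1, 1, 106)  # count=1; type=1 (CharSet); length=1 word; MIBenum 106 (UTF-8)
meta = SectionMetadata(raw)
assert meta.default_encoding == 'utf-8'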
class SectionText(object):
def __init__(self, section_header, raw):
self.header = SectionHeaderText(section_header, raw)
self.data = raw[section_header.paragraphs * 4:]
class Reader(FormatReader):
@@ -84,53 +230,302 @@ class Reader(FormatReader):
self.stream = stream
self.log = log
self.options = options
# Mapping of section uid to our internal
# list of sections.
self.uid_section_number = OrderedDict()
self.uid_text_secion_number = OrderedDict()
self.uid_text_secion_encoding = {}
self.uid_image_section_number = {}
self.metadata_section_number = None
self.default_encoding = 'utf-8'
self.owner_id = None
self.sections = []
self.header_record = HeaderRecord(header.section_data(0))
for i in range(1, header.num_sections):
section_number = i - 1
start = 8
section = None
raw_data = header.section_data(i)
section_header = SectionHeader(raw_data)
self.uid_section_number[section_header.uid] = section_number
if section_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED):
self.uid_text_secion_number[section_header.uid] = section_number
section = SectionText(section_header, raw_data[start:])
elif section_header.type in (DATATYPE_TBMP, DATATYPE_TBMP_COMPRESSED):
self.uid_image_section_number[section_header.uid] = section_number
section = raw_data[start:]
elif section_header.type == DATATYPE_METADATA:
self.metadata_section_number = section_number
section = SectionMetadata(raw_data[start:])
elif section_header.type == DATATYPE_COMPOSITE_IMAGE:
# Composite images are not supported yet; the section stays None.
pass
self.sections.append((section_header, section))
if self.metadata_section_number is not None:
mdata_section = self.sections[self.metadata_section_number][1]
for k, v in mdata_section.exceptional_uid_encodings.items():
self.uid_text_secion_encoding[k] = v
self.default_encoding = mdata_section.default_encoding
self.owner_id = mdata_section.owner_id
from calibre.ebooks.metadata.pdb import get_metadata
self.mi = get_metadata(stream, False)
def extract_content(self, output_dir):
html = u'<html><body>'
images = []
for uid, num in self.uid_text_secion_number.items():
section_header, section_data = self.sections[num]
if section_header.type == DATATYPE_PHTML:
html += self.process_phtml(section_data.header, section_data.data.decode(self.get_text_uid_encoding(section_header.uid), 'replace'))
elif section_header.type == DATATYPE_PHTML_COMPRESSED:
d = self.decompress_phtml(section_data.data).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
html += self.process_phtml(section_data.header, d)
html += '</body></html>'
with CurrentDir(output_dir):
with open('index.html', 'wb') as index:
self.log.debug('Writing text to index.html')
index.write(html.encode('utf-8'))
if not os.path.exists(os.path.join(output_dir, 'images/')):
os.makedirs(os.path.join(output_dir, 'images/'))
with CurrentDir(os.path.join(output_dir, 'images/')):
#im.read('/Users/john/Tmp/plkr/apnx.palm')
for uid, num in self.uid_image_section_number.items():
section_header, section_data = self.sections[num]
if section_data:
idata = None
if section_header.type == DATATYPE_TBMP:
idata = section_data
elif section_header.type == DATATYPE_TBMP_COMPRESSED:
if self.header_record.compression == 1:
idata = decompress_doc(section_data)
elif self.header_record.compression == 2:
idata = zlib.decompress(section_data)
try:
with TemporaryFile(suffix='.palm') as itn:
with open(itn, 'wb') as itf:
itf.write(idata)
im = Image()
im.read(itn)
im.set_compression_quality(70)
im.save('%s.jpg' % uid)
self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
except Exception as e:
self.log.error('Failed to write image with uid %s: %s' % (uid, e))
images.append('%s.jpg' % uid)
else:
self.log.error('Failed to write image with uid %s: No data.' % uid)
opf_path = self.create_opf(output_dir, images)
return opf_path
def decompress_phtml(self, data):
if self.header_record.compression == 2:
if self.owner_id:
raise NotImplementedError
return zlib.decompress(data)
elif self.header_record.compression == 1:
#from calibre.ebooks.compression.palmdoc import decompress_doc
return decompress_doc(data)
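For reference, compression type 2 is a plain zlib stream and type 1 is the PalmDoc scheme inlined above; a minimal zlib round trip (sample text invented) mirrors what the type 2 branch expects.

sample = zlib.compress('<p>Hello, Plucker</p>')
assert zlib.decompress(sample) == '<p>Hello, Plucker</p>'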
def process_phtml(self, sub_header, d):
html = u''
offset = 0
paragraph_open = False
paragraph_offsets = []
running_offset = 0
for size in sub_header.sizes:
running_offset += size
paragraph_offsets.append(running_offset)
while offset < len(d):
if not paragraph_open:
html += u'<p>'
paragraph_open = True
c = ord(d[offset])
if c == 0x0:
offset += 1
c = ord(d[offset])
# Page link begins
# 2 Bytes
# record ID
if c == 0x0a:
offset += 2
# Targeted page link begins
# 3 Bytes
# record ID, target
elif c == 0x0b:
offset += 3
# Paragraph link begins
# 4 Bytes
# record ID, paragraph number
elif c == 0x0c:
offset += 4
# Targeted paragraph link begins
# 5 Bytes
# record ID, paragraph number, target
elif c == 0x0d:
offset += 5
# Link ends
# 0 Bytes
elif c == 0x08:
pass
# Set font
# 1 Byte
# font specifier
elif c == 0x11:
offset += 1
# Embedded image
# 2 Bytes
# image record ID
elif c == 0x1a:
offset += 1
uid = struct.unpack('>H', d[offset:offset+2])[0]
html += '<img src="images/%s.jpg" />' % uid
offset += 1
# Set margin
# 2 Bytes
# left margin, right margin
elif c == 0x22:
offset += 2
# Alignment of text
# 1 Byte
# alignment
elif c == 0x29:
offset += 1
# Horizontal rule
# 3 Bytes
# 8-bit height, 8-bit width (pixels), 8-bit width (%, 1-100)
elif c == 0x33:
offset += 3
if paragraph_open:
html += u'</p>'
paragraph_open = False
html += u'<hr />'
# New line
# 0 Bytes
elif c == 0x38:
if paragraph_open:
html += u'</p>\n'
paragraph_open = False
# Italic text begins
# 0 Bytes
elif c == 0x40:
html += u'<i>'
# Italic text ends
# 0 Bytes
elif c == 0x48:
html += u'</i>'
# Set text color
# 3 Bytes
# 8-bit red, 8-bit green, 8-bit blue
elif c == 0x53:
offset += 3
# Multiple embedded image
# 4 Bytes
# alternate image record ID, image record ID
elif c == 0x5c:
offset += 4
# Underline text begins
# 0 Bytes
elif c == 0x60:
html += u'<u>'
# Underline text ends
# 0 Bytes
elif c == 0x68:
html += u'</u>'
# Strike-through text begins
# 0 Bytes
elif c == 0x70:
html += u'<s>'
# Strike-through text ends
# 0 Bytes
elif c == 0x78:
html += u'</s>'
# 16-bit Unicode character
# 3 Bytes
# alternate text length, 16-bit unicode character
elif c == 0x83:
#offset += 2
#c16 = d[offset:offset+2]
#html += c16.decode('utf-16')
#offset += 1
offset += 3
# 32-bit Unicode character
# 5 Bytes
# alternate text length, 32-bit unicode character
elif c == 0x85:
#offset += 2
#c32 = d[offset:offset+4]
#html += c32.decode('utf-32')
#offset += 3
offset += 5
# Begin custom font span
# 6 Bytes
# font page record ID, X page position, Y page position
elif c == 0x8e:
offset += 6
# Adjust custom font glyph position
# 4 Bytes
# X page position, Y page position
elif c == 0x8c:
offset += 4
# Change font page
# 2 Bytes
# font record ID
elif c == 0x8a:
offset += 2
# End custom font span
# 0 Bytes
elif c == 0x88:
pass
# Begin new table row
# 0 Bytes
elif c == 0x90:
pass
# Insert table (or table link)
# 2 Bytes
# table record ID
elif c == 0x92:
offset += 2
# Table cell data
# 7 Bytes
# 8-bit alignment, 16-bit image record ID, 8-bit columns, 8-bit rows, 16-bit text length
elif c == 0x97:
offset += 7
# Exact link modifier
# 2 Bytes
# Paragraph offset (the Exact Link Modifier modifies a Paragraph Link or
# Targeted Paragraph Link function to specify an exact byte offset within
# the paragraph; it must be followed immediately by the function it modifies).
elif c == 0x9a:
offset += 2
else:
html += unichr(c)
offset += 1
if offset in paragraph_offsets:
if paragraph_open:
html += u'</p>\n'
paragraph_open = False
if paragraph_open:
html += u'</p>'
return html
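As an illustration of the byte stream this loop consumes (input bytes and paragraph sizes invented; a stand-in object supplies the sizes list, and the underlying function is called directly instead of going through a full Reader instance): two literal characters, a 0x00 0x38 new-line function, then a second paragraph.

class FakeTextHeader(object):
    # Stand-in for SectionHeaderText; only the sizes list is consulted here.
    sizes = [4, 3]

d = 'Hi\x00\x38Bye'   # 'Hi', new-line function (0x00 0x38), then 'Bye'
html = Reader.process_phtml.__func__(None, FakeTextHeader(), d)
assert html == u'<p>Hi</p>\n<p>Bye</p>\n'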
def get_text_uid_encoding(self, uid):
return self.uid_text_secion_encoding.get(uid, self.default_encoding)
def create_opf(self, output_dir, images):
with CurrentDir(output_dir):