Basic plucker working (text, non-composite images).

2025-08-30 23:00:21 -04:00 · 2011-04-11 19:04:56 -04:00 · 2011-04-11 19:04:56 -04:00 · 0f3228e658
commit 0f3228e658
parent 0f62ea111c
1 changed files with 425 additions and 30 deletions
--- a/src/calibre/ebooks/pdb/plucker/reader.py
+++ b/src/calibre/ebooks/pdb/plucker/reader.py
@ -10,9 +10,13 @@ import os
 import struct
 import zlib
 from collections import OrderedDict
 from calibre import CurrentDir
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.pdb.formatreader import FormatReader
 from calibre.ptempfile import TemporaryFile
 from calibre.utils.magick import Image
 DATATYPE_PHTML = 0
 DATATYPE_PHTML_COMPRESSED = 1
@ -38,6 +42,100 @@ DATATYPE_EXT_ANCHOR_INDEX = 20
 DATATYPE_EXT_ANCHOR = 21
 DATATYPE_EXT_ANCHOR_COMPRESSED = 22
 # IETF IANA MIBenum value for the character set.
 # See the http://www.iana.org/assignments/character-sets for valid values.
 # Not all character sets are handled by Python. This is a small subset that
 # the MIBenum maps to Python standard encodings
 # from http://docs.python.org/library/codecs.html#standard-encodings
 #
 # Keys are the integer MIBenum values stored in the metadata record, values
 # are the Python codec names used to decode text records. Lookups elsewhere
 # in this file use .get() with a fallback, so a MIBenum that is missing from
 # this table silently selects the fallback encoding instead of failing.
 #
 # NOTE(review): entry 27 maps to 'utf_7'. The IANA registry appears to list
 # MIBenum 27 as ISO-10646-UTF-1 and UTF-7 as 1012 -- confirm against the
 # registry before relying on this entry.
 MIBNUM_TO_NAME = {
    3: 'ascii',
    4: 'latin_1',
    5: 'iso8859_2',
    6: 'iso8859_3',
    7: 'iso8859_4',
    8: 'iso8859_5',
    9: 'iso8859_6',
    10: 'iso8859_7',
    11: 'iso8859_8',
    12: 'iso8859_9',
    13: 'iso8859_10',
    17: 'shift_jis',
    18: 'euc_jp',
    27: 'utf_7',
    36: 'euc_kr',
    37: 'iso2022_kr',
    38: 'euc_kr',
    39: 'iso2022_jp',
    40: 'iso2022_jp_2',
    106: 'utf-8',
    109: 'iso8859_13',
    110: 'iso8859_14',
    111: 'iso8859_15',
    112: 'iso8859_16',
    1013: 'utf_16_be',
    1014: 'utf_16_le',
    1015: 'utf_16',
    2009: 'cp850',
    2010: 'cp852',
    2011: 'cp437',
    2013: 'cp862',
    2025: 'gb2312',
    2026: 'big5',
    2028: 'cp037',
    2043: 'cp424',
    2044: 'cp500',
    2046: 'cp855',
    2047: 'cp857',
    2048: 'cp860',
    2049: 'cp861',
    2050: 'cp863',
    2051: 'cp864',
    2052: 'cp865',
    2054: 'cp869',
    2063: 'cp1026',
    2085: 'hz',
    2086: 'cp866',
    2087: 'cp775',
    2089: 'cp858',
    2091: 'cp1140',
    2102: 'big5hkscs',
    2250: 'cp1250',
    2251: 'cp1251',
    2252: 'cp1252',
    2253: 'cp1253',
    2254: 'cp1254',
    2255: 'cp1255',
    2256: 'cp1256',
    2257: 'cp1257',
    2258: 'cp1258',    
 }
 def decompress_doc(data):
    '''
    Expand a DOC (PalmDoc style) compressed string.

    Every control value selects one of four actions:

    * 1..8         copy that many following values verbatim
    * 0, 9..0x7f   the value is a literal
    * 0xc0..0xff   a space followed by the value with its top bit cleared
    * 0x80..0xbf   first half of a 16 bit back-reference; the low 3 bits
                   hold (length - 3), the next 11 bits the distance back
                   into the output produced so far

    :param data: the compressed string
    :return: the expanded string
    '''
    src = list(map(ord, data))
    total = len(src)
    out = []
    pos = 0
    while pos < total:
        code = src[pos]
        pos += 1
        if 1 <= code <= 8:
            # Run of literals; the code itself is the run length.
            out += src[pos:pos + code]
            pos += code
        elif code < 0x80:
            out.append(code)
        elif code >= 0xc0:
            out += [ord(' '), code ^ 0x80]
        else:
            # Back-reference made of this value and the one after it.
            pair = (code << 8) + src[pos]
            pos += 1
            distance = (pair & 0x3fff) >> 3
            start = len(out) - distance
            # Copy one value at a time: the source range may overlap the
            # values that this very loop is appending.
            for step in range((pair & 0x07) + 3):
                out.append(out[start + step])
    return ''.join(map(chr, out))
 class HeaderRecord(object):
    def __init__(self, raw):
@ -68,14 +166,62 @@ class SectionHeader(object):
 class SectionHeaderText(object):
-    def __init__(self, data_header, raw):
+    def __init__(self, section_header, raw):
        self.sizes = []
        self.attributes = []
-        for i in xrange(data_header.paragraphs):
+        for i in xrange(section_header.paragraphs):
            adv = 4*i
-            self.sizes.append(struct.unpack('>H', raw[8+adv:10+adv])[0])
+            self.sizes.append(struct.unpack('>H', raw[adv:2+adv])[0])
-            self.attributes.append(struct.unpack('>H', raw[10+adv:12+adv])[0])
+            self.attributes.append(struct.unpack('>H', raw[2+adv:4+adv])[0])
 class SectionMetadata(object):
    '''
    Parsed form of the metadata record.

    The record starts with a 16 bit big-endian sub-record count. Each
    sub-record is a 16 bit type, a 16 bit length counted in 16 bit words,
    and then ``2 * length`` bytes of payload.

    Attributes set by the constructor:

    * ``default_encoding`` - codec name for text records ('utf-8' unless a
      CharSet sub-record names a MIBenum found in MIBNUM_TO_NAME).
    * ``exceptional_uid_encodings`` - dict mapping a record uid to the codec
      name that overrides the default for that record.
    * ``owner_id`` - integer from the OwnerID sub-record, or None when the
      record has no such sub-record.
    '''

    def __init__(self, raw):
        '''
        :param raw: the bytes of the metadata record, starting at the
                    sub-record count.
        :raises struct.error: if ``raw`` ends before a field that is read.
        '''
        self.default_encoding = 'utf-8'
        self.exceptional_uid_encodings = {}
        self.owner_id = None

        record_count, = struct.unpack('>H', raw[0:2])

        # Offset of the current sub-record, measured from the byte that
        # follows the 2 byte record count.
        adv = 0
        for i in range(record_count):
            rec_type, length = struct.unpack('>HH', raw[2+adv:6+adv])
            payload = 6 + adv
            # CharSet
            if rec_type == 1:
                val, = struct.unpack('>H', raw[payload:payload+2])
                self.default_encoding = MIBNUM_TO_NAME.get(val, 'utf-8')
            # ExceptionalCharSets
            elif rec_type == 2:
                # The payload is a list of (uid, MIBenum) pairs, two words
                # per pair.
                for ii in range(length // 2):
                    entry = payload + 4*ii
                    uid, mib = struct.unpack('>HH', raw[entry:entry+4])
                    self.exceptional_uid_encodings[uid] = MIBNUM_TO_NAME.get(mib, 'utf-8')
            # OwnerID
            elif rec_type == 3:
                # Unpack into the scalar; storing the 1-tuple returned by
                # struct.unpack would be truthy even for an id of 0.
                self.owner_id, = struct.unpack('>I', raw[payload:payload+4])
            # Author, Title, PubDate (types 4, 5, 6)
            # Ignored here. The metadata reader plugin
            # will get this info because if it's missing
            # the metadata reader plugin will use fall
            # back data from elsewhere in the file.
            #
            # Linked Documents (type 7) are ignored as well.

            # Step over the 4 byte type/length header as well as the
            # payload. Advancing by the payload alone makes the next
            # iteration read the middle of this sub-record.
            adv += 4 + 2*length
 class SectionText(object):
    '''
    One text record, split into its paragraph table and its body.

    ``header`` is a SectionHeaderText built from the start of ``raw``;
    ``data`` is everything in ``raw`` after the 4 bytes per paragraph
    that the table occupies.
    '''

    def __init__(self, section_header, raw):
        self.header = SectionHeaderText(section_header, raw)
        # Each paragraph contributes one 4 byte table entry.
        table_bytes = 4 * section_header.paragraphs
        self.data = raw[table_bytes:]
 class Reader(FormatReader):
@ -84,53 +230,302 @@ class Reader(FormatReader):
        self.stream = stream
        self.log = log
        self.options = options
        self.sections = []
        for i in range(1, header.num_sections):
            start = 8
            raw_data = header.section_data(i)
            data_header = SectionHeader(raw_data)
            sub_header = None
            if data_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED):
                sub_header = SectionHeaderText(data_header, raw_data)
                start += data_header.paragraphs * 4
            self.sections.append((data_header, sub_header, raw_data[start:]))
        # Mapping of section uid to our internal
        # list of sections.
        self.uid_section_number = OrderedDict()
        self.uid_text_secion_number = OrderedDict()
        self.uid_text_secion_encoding = {}
        self.uid_image_section_number = {}
        self.metadata_section_number = None
        self.default_encoding = 'utf-8'
        self.owner_id = None
        self.sections = []
        self.header_record = HeaderRecord(header.section_data(0))
        for i in range(1, header.num_sections):
            section_number = i - 1
            start = 8
            section = None
            raw_data = header.section_data(i)
            section_header = SectionHeader(raw_data)
            self.uid_section_number[section_header.uid] = section_number
            if section_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED):
                self.uid_text_secion_number[section_header.uid] = section_number
                section = SectionText(section_header, raw_data[start:])
            elif section_header.type in (DATATYPE_TBMP, DATATYPE_TBMP_COMPRESSED):
                self.uid_image_section_number[section_header.uid] = section_number
                section = raw_data[start:]
            elif section_header.type == DATATYPE_METADATA:
                self.metadata_section_number = section_number
                section = SectionMetadata(raw_data[start:])
            elif section_header.type == DATATYPE_COMPOSITE_IMAGE:
            self.sections.append((section_header, section))
        if self.metadata_section_number:
            mdata_section = self.sections[self.metadata_section_number][1]
            for k, v in mdata_section.exceptional_uid_encodings.items():
                self.uid_text_secion_encoding[k] = v
            self.default_encoding = mdata_section.default_encoding
            self.owner_id = mdata_section.owner_id
        from calibre.ebooks.metadata.pdb import get_metadata
        self.mi = get_metadata(stream, False)
    def extract_content(self, output_dir):
-        html = u''
+        html = u'<html><body>'
        images = []
-        
+
-        for header, sub_header, data in self.sections:
+        for uid, num in self.uid_text_secion_number.items():
-            if header.type == DATATYPE_PHTML:
+            section_header, section_data = self.sections[num]
-                html += data
+            if section_header.type == DATATYPE_PHTML:
-            elif header.type == DATATYPE_PHTML_COMPRESSED:
+                html += self.process_phtml(section_data.header, section_data.data.decode(self.get_text_uid_encoding(section_header.uid), 'replace'))
-                d = self.decompress_phtml(data).decode('latin-1', 'replace')
+            elif section_header.type == DATATYPE_PHTML_COMPRESSED:
-                print len(d) == header.size
+                d = self.decompress_phtml(section_data.data).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
-                html += d
+                html += self.process_phtml(section_data.header, d)
-        
+
-        print html
+        html += '</body></html>'
        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))
-        
+
        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        with CurrentDir(os.path.join(output_dir, 'images/')):
            #im.read('/Users/john/Tmp/plkr/apnx.palm')
            for uid, num in self.uid_image_section_number.items():
                section_header, section_data = self.sections[num]
                if section_data:
                    idata = None
                    if section_header.type == DATATYPE_TBMP:
                        idata = section_data
                    elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                        if self.header_record.compression == 1:
                            idata = decompress_doc(section_data)
                        elif self.header_record.compression == 2:
                            idata = zlib.decompress(section_data)
                    try:
                        with TemporaryFile(suffix='.palm') as itn:
                            with open(itn, 'wb') as itf: 
                                itf.write(idata)
                            im = Image()
                            im.read(itn)
                            im.set_compression_quality(70)
                            im.save('%s.jpg' % uid)
                            self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
                    except Exception as e:
                        self.log.error('Failed to write image with uid %s: %s' % (uid, e))
                    images.append('%s.jpg' % uid)
                else:
                    self.log.error('Failed to write image with uid %s: No data.' % uid)
        opf_path = self.create_opf(output_dir, images)
        return opf_path
    def decompress_phtml(self, data):
        if self.header_record.compression == 2:
-            raise NotImplementedError
+            if self.owner_id:
-            #return zlib.decompress(data)
+                raise NotImplementedError
            return zlib.decompress(data)
        elif self.header_record.compression == 1:
-            from calibre.ebooks.compression.palmdoc import decompress_doc
+            #from calibre.ebooks.compression.palmdoc import decompress_doc
            return decompress_doc(data)
    def process_phtml(self, sub_header, d):
        '''
        Convert the decoded body of one text record into an HTML fragment.

        The body is plain text interleaved with functions. A function is a
        0x00 character, a function code, and a fixed number of argument
        characters that depends on the code. Only some functions produce
        markup (embedded image, horizontal rule, new line, italic,
        underline, strike-through); the arguments of all other known
        functions are skipped. Unknown function codes are treated as
        taking no arguments.

        Text is wrapped in <p> elements. A paragraph is closed whenever
        the read offset reaches one of the cumulative paragraph sizes from
        ``sub_header``, and also by the new line and horizontal rule
        functions.

        :param sub_header: object with a ``sizes`` list holding the size
                           of each paragraph in the record.
        :param d: the decoded text of the record.
        :return: the HTML fragment as a unicode string.
        '''
        try:
            to_char = unichr
        except NameError:
            # Python 3: chr() already returns text.
            to_char = chr

        # Number of argument characters that follow each known function.
        arg_lengths = {
            0x0a: 2,  # Page link begins: record ID
            0x0b: 3,  # Targeted page link begins: record ID, target
            0x0c: 4,  # Paragraph link begins: record ID, paragraph number
            0x0d: 5,  # Targeted paragraph link begins: record ID, paragraph number, target
            0x08: 0,  # Link ends
            0x11: 1,  # Set font: font specifier
            0x1a: 2,  # Embedded image: image record ID
            0x22: 2,  # Set margin: left margin, right margin
            0x29: 1,  # Alignment of text: alignment
            0x33: 3,  # Horizontal rule: height, width (pixels), width (%, 1-100)
            0x38: 0,  # New line
            0x40: 0,  # Italic text begins
            0x48: 0,  # Italic text ends
            0x53: 3,  # Set text color: red, green, blue
            0x5c: 4,  # Multiple embedded image: alternate image record ID, image record ID
            0x60: 0,  # Underline text begins
            0x68: 0,  # Underline text ends
            0x70: 0,  # Strike-through text begins
            0x78: 0,  # Strike-through text ends
            0x83: 3,  # 16-bit Unicode character: alternate text length, character
            0x85: 5,  # 32-bit Unicode character: alternate text length, character
            0x8e: 6,  # Begin custom font span: font page record ID, X, Y page position
            0x8c: 4,  # Adjust custom font glyph position: X, Y page position
            0x8a: 2,  # Change font page: font record ID
            0x88: 0,  # End custom font span
            0x90: 0,  # Begin new table row
            0x92: 2,  # Insert table (or table link): table record ID
            0x97: 7,  # Table cell data: alignment, image record ID, columns, rows, text length
            0x9a: 2,  # Exact link modifier: paragraph offset
        }
        # Argument-less functions that map straight onto a tag.
        inline_markup = {
            0x40: u'<i>',
            0x48: u'</i>',
            0x60: u'<u>',
            0x68: u'</u>',
            0x70: u'<s>',
            0x78: u'</s>',
        }
        # Characters that must not reach the output verbatim, otherwise
        # book text such as "a < b" would be parsed as markup.
        escapes = {
            u'&': u'&amp;',
            u'<': u'&lt;',
            u'>': u'&gt;',
        }

        # Offsets at which a paragraph ends. A set keeps the per-character
        # boundary test constant time.
        paragraph_ends = set()
        running_offset = 0
        for size in sub_header.sizes:
            running_offset += size
            paragraph_ends.add(running_offset)

        # Pieces are collected in a list and joined once at the end.
        out = []
        paragraph_open = False
        offset = 0
        end = len(d)

        while offset < end:
            if not paragraph_open:
                out.append(u'<p>')
                paragraph_open = True

            c = ord(d[offset])
            if c == 0x0:
                offset += 1
                if offset >= end:
                    # A 0x00 with no function code after it: the record is
                    # truncated, so there is nothing left to interpret.
                    break
                c = ord(d[offset])
                first_arg = offset + 1

                if c == 0x1a:
                    # The record ID is a big-endian 16 bit value. It is
                    # computed from the two characters directly because d
                    # is text, not bytes.
                    if first_arg + 2 <= end:
                        uid = (ord(d[first_arg]) << 8) | ord(d[first_arg + 1])
                        out.append('<img src="images/%s.jpg" />' % uid)
                elif c == 0x33:
                    if paragraph_open:
                        out.append(u'</p>')
                        paragraph_open = False
                    out.append(u'<hr />')
                elif c == 0x38:
                    if paragraph_open:
                        out.append(u'</p>\n')
                        paragraph_open = False
                elif c in inline_markup:
                    out.append(inline_markup[c])

                # Leave offset on the last argument character; the shared
                # increment below then moves past the whole function.
                offset += arg_lengths.get(c, 0)
            else:
                ch = to_char(c)
                out.append(escapes.get(ch, ch))
            offset += 1

            if offset in paragraph_ends and paragraph_open:
                out.append(u'</p>\n')
                paragraph_open = False

        if paragraph_open:
            out.append(u'</p>')

        return u''.join(out)
    def get_text_uid_encoding(self, uid):
        '''
        Return the codec name to use for the text record with this uid.

        A per-record override from ``uid_text_secion_encoding`` wins;
        otherwise the document wide ``default_encoding`` is returned.
        '''
        overrides = self.uid_text_secion_encoding
        fallback = self.default_encoding
        if uid in overrides:
            return overrides[uid]
        return fallback
    def create_opf(self, output_dir, images):
        with CurrentDir(output_dir):