From b857fd3fd13a3cf57d9f6cd3231898444f00b382 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 9 Apr 2011 20:01:49 -0400 Subject: [PATCH 01/13] Start of plucker input support. --- src/calibre/ebooks/pdb/__init__.py | 4 +- src/calibre/ebooks/pdb/plucker/__init__.py | 0 src/calibre/ebooks/pdb/plucker/reader.py | 149 +++++++++++++++++++++ 3 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 src/calibre/ebooks/pdb/plucker/__init__.py create mode 100644 src/calibre/ebooks/pdb/plucker/reader.py diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py index 092c8a21bd..c8089297db 100644 --- a/src/calibre/ebooks/pdb/__init__.py +++ b/src/calibre/ebooks/pdb/__init__.py @@ -12,6 +12,7 @@ from calibre.ebooks.pdb.ereader.reader import Reader as ereader_reader from calibre.ebooks.pdb.palmdoc.reader import Reader as palmdoc_reader from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader from calibre.ebooks.pdb.pdf.reader import Reader as pdf_reader +from calibre.ebooks.pdb.plucker.reader import Reader as plucker_reader FORMAT_READERS = { 'PNPdPPrs': ereader_reader, @@ -19,6 +20,7 @@ FORMAT_READERS = { 'zTXTGPlm': ztxt_reader, 'TEXtREAd': palmdoc_reader, '.pdfADBE': pdf_reader, + 'DataPlkr': plucker_reader, } from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer @@ -37,6 +39,7 @@ IDENTITY_TO_NAME = { 'zTXTGPlm': 'zTXT', 'TEXtREAd': 'PalmDOC', '.pdfADBE': 'Adobe Reader', + 'DataPlkr': 'Plucker', 'BVokBDIC': 'BDicty', 'DB99DBOS': 'DB (Database program)', @@ -50,7 +53,6 @@ IDENTITY_TO_NAME = { 'DATALSdb': 'LIST', 'Mdb1Mdb1': 'MobileDB', 'BOOKMOBI': 'MobiPocket', - 'DataPlkr': 'Plucker', 'DataSprd': 'QuickSheet', 'SM01SMem': 'SuperMemo', 'TEXtTlDc': 'TealDoc', diff --git a/src/calibre/ebooks/pdb/plucker/__init__.py b/src/calibre/ebooks/pdb/plucker/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py new file mode 100644 index 0000000000..d1e5931580 --- /dev/null +++ b/src/calibre/ebooks/pdb/plucker/reader.py @@ -0,0 +1,149 @@ +# -*- coding: utf-8 -*- + +#from __future__ import (unicode_literals, division, absolute_import, print_function) + +__license__ = 'GPL v3' +__copyright__ = '20011, John Schember ' +__docformat__ = 'restructuredtext en' + +import os +import struct +import zlib + +from calibre import CurrentDir +from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.pdb.formatreader import FormatReader + +DATATYPE_PHTML = 0 +DATATYPE_PHTML_COMPRESSED = 1 +DATATYPE_TBMP = 2 +DATATYPE_TBMP_COMPRESSED = 3 +DATATYPE_MAILTO = 4 +DATATYPE_LINK_INDEX = 5 +DATATYPE_LINKS = 6 +DATATYPE_LINKS_COMPRESSED = 7 +DATATYPE_BOOKMARKS = 8 +DATATYPE_CATEGORY = 9 +DATATYPE_METADATA = 10 +DATATYPE_STYLE_SHEET = 11 +DATATYPE_FONT_PAGE = 12 +DATATYPE_TABLE = 13 +DATATYPE_TABLE_COMPRESSED = 14 +DATATYPE_COMPOSITE_IMAGE = 15 +DATATYPE_PAGELIST_METADATA = 16 +DATATYPE_SORTED_URL_INDEX = 17 +DATATYPE_SORTED_URL = 18 +DATATYPE_SORTED_URL_COMPRESSED = 19 +DATATYPE_EXT_ANCHOR_INDEX = 20 +DATATYPE_EXT_ANCHOR = 21 +DATATYPE_EXT_ANCHOR_COMPRESSED = 22 + +class HeaderRecord(object): + + def __init__(self, raw): + self.uid, = struct.unpack('>H', raw[0:2]) + # This is labled version in the spec. + # 2 is ZLIB compressed, + # 1 is DOC compressed + self.compression, = struct.unpack('>H', raw[2:4]) + self.records, = struct.unpack('>H', raw[4:6]) + + self.reserved = {} + for i in xrange(self.records): + adv = 4*i + name, = struct.unpack('>H', raw[6+adv:8+adv]) + id, = struct.unpack('>H', raw[8+adv:10+adv]) + self.reserved[id] = name + + +class SectionHeader(object): + + def __init__(self, raw): + self.uid, = struct.unpack('>H', raw[0:2]) + self.paragraphs, = struct.unpack('>H', raw[2:4]) + self.size, = struct.unpack('>H', raw[4:6]) + self.type, = struct.unpack('>B', raw[6]) + self.flags, = struct.unpack('>B', raw[7]) + + +class SectionHeaderText(object): + + def __init__(self, data_header, raw): + self.sizes = [] + self.attributes = [] + + for i in xrange(data_header.paragraphs): + adv = 4*i + self.sizes.append(struct.unpack('>H', raw[8+adv:10+adv])[0]) + self.attributes.append(struct.unpack('>H', raw[10+adv:12+adv])[0]) + + +class Reader(FormatReader): + + def __init__(self, header, stream, log, options): + self.stream = stream + self.log = log + self.options = options + + self.sections = [] + for i in range(1, header.num_sections): + start = 8 + raw_data = header.section_data(i) + data_header = SectionHeader(raw_data) + sub_header = None + if data_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED): + sub_header = SectionHeaderText(data_header, raw_data) + start += data_header.paragraphs * 4 + self.sections.append((data_header, sub_header, raw_data[start:])) + + self.header_record = HeaderRecord(header.section_data(0)) + + from calibre.ebooks.metadata.pdb import get_metadata + self.mi = get_metadata(stream, False) + + def extract_content(self, output_dir): + html = u'' + images = [] + + for header, sub_header, data in self.sections: + if header.type == DATATYPE_PHTML: + html += data + elif header.type == DATATYPE_PHTML_COMPRESSED: + d = self.decompress_phtml(data).decode('latin-1', 'replace') + print len(d) == header.size + html += d + + print html + with CurrentDir(output_dir): + with open('index.html', 'wb') as index: + self.log.debug('Writing text to index.html') + index.write(html.encode('utf-8')) + + opf_path = self.create_opf(output_dir, images) + + return opf_path + + def decompress_phtml(self, data): + if self.header_record.compression == 2: + raise NotImplementedError + #return zlib.decompress(data) + elif self.header_record.compression == 1: + from calibre.ebooks.compression.palmdoc import decompress_doc + return decompress_doc(data) + + + def create_opf(self, output_dir, images): + with CurrentDir(output_dir): + opf = OPFCreator(output_dir, self.mi) + + manifest = [('index.html', None)] + + for i in images: + manifest.append((os.path.join('images/', i), None)) + + opf.create_manifest(manifest) + opf.create_spine(['index.html']) + with open('metadata.opf', 'wb') as opffile: + opf.render(opffile) + + return os.path.join(output_dir, 'metadata.opf') From 0f3228e6585dadcf6f4aa6110ed3619966bbfff2 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 11 Apr 2011 19:04:56 -0400 Subject: [PATCH 02/13] Basic plucker working (text, non-composite images). --- src/calibre/ebooks/pdb/plucker/reader.py | 455 +++++++++++++++++++++-- 1 file changed, 425 insertions(+), 30 deletions(-) diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py index d1e5931580..502682baba 100644 --- a/src/calibre/ebooks/pdb/plucker/reader.py +++ b/src/calibre/ebooks/pdb/plucker/reader.py @@ -10,9 +10,13 @@ import os import struct import zlib +from collections import OrderedDict + from calibre import CurrentDir from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.pdb.formatreader import FormatReader +from calibre.ptempfile import TemporaryFile +from calibre.utils.magick import Image DATATYPE_PHTML = 0 DATATYPE_PHTML_COMPRESSED = 1 @@ -38,6 +42,100 @@ DATATYPE_EXT_ANCHOR_INDEX = 20 DATATYPE_EXT_ANCHOR = 21 DATATYPE_EXT_ANCHOR_COMPRESSED = 22 +# IETF IANA MIBenum value for the character set. +# See the http://www.iana.org/assignments/character-sets for valid values. +# Not all character sets are handled by Python. This is a small subset that +# the MIBenum maps to Python standard encodings +# from http://docs.python.org/library/codecs.html#standard-encodings +MIBNUM_TO_NAME = { + 3: 'ascii', + 4: 'latin_1', + 5: 'iso8859_2', + 6: 'iso8859_3', + 7: 'iso8859_4', + 8: 'iso8859_5', + 9: 'iso8859_6', + 10: 'iso8859_7', + 11: 'iso8859_8', + 12: 'iso8859_9', + 13: 'iso8859_10', + 17: 'shift_jis', + 18: 'euc_jp', + 27: 'utf_7', + 36: 'euc_kr', + 37: 'iso2022_kr', + 38: 'euc_kr', + 39: 'iso2022_jp', + 40: 'iso2022_jp_2', + 106: 'utf-8', + 109: 'iso8859_13', + 110: 'iso8859_14', + 111: 'iso8859_15', + 112: 'iso8859_16', + 1013: 'utf_16_be', + 1014: 'utf_16_le', + 1015: 'utf_16', + 2009: 'cp850', + 2010: 'cp852', + 2011: 'cp437', + 2013: 'cp862', + 2025: 'gb2312', + 2026: 'big5', + 2028: 'cp037', + 2043: 'cp424', + 2044: 'cp500', + 2046: 'cp855', + 2047: 'cp857', + 2048: 'cp860', + 2049: 'cp861', + 2050: 'cp863', + 2051: 'cp864', + 2052: 'cp865', + 2054: 'cp869', + 2063: 'cp1026', + 2085: 'hz', + 2086: 'cp866', + 2087: 'cp775', + 2089: 'cp858', + 2091: 'cp1140', + 2102: 'big5hkscs', + 2250: 'cp1250', + 2251: 'cp1251', + 2252: 'cp1252', + 2253: 'cp1253', + 2254: 'cp1254', + 2255: 'cp1255', + 2256: 'cp1256', + 2257: 'cp1257', + 2258: 'cp1258', +} + +def decompress_doc(data): + buffer = [ord(i) for i in data] + res = [] + i = 0 + while i < len(buffer): + c = buffer[i] + i += 1 + if c >= 1 and c <= 8: + res.extend(buffer[i:i+c]) + i += c + elif c <= 0x7f: + res.append(c) + elif c >= 0xc0: + res.extend( (ord(' '), c^0x80) ) + else: + c = (c << 8) + buffer[i] + i += 1 + di = (c & 0x3fff) >> 3 + j = len(res) + num = (c & ((1 << 3) - 1)) + 3 + + for k in range( num ): + res.append(res[j - di+k]) + + return ''.join([chr(i) for i in res]) + class HeaderRecord(object): def __init__(self, raw): @@ -68,14 +166,62 @@ class SectionHeader(object): class SectionHeaderText(object): - def __init__(self, data_header, raw): + def __init__(self, section_header, raw): self.sizes = [] self.attributes = [] - for i in xrange(data_header.paragraphs): + for i in xrange(section_header.paragraphs): adv = 4*i - self.sizes.append(struct.unpack('>H', raw[8+adv:10+adv])[0]) - self.attributes.append(struct.unpack('>H', raw[10+adv:12+adv])[0]) + self.sizes.append(struct.unpack('>H', raw[adv:2+adv])[0]) + self.attributes.append(struct.unpack('>H', raw[2+adv:4+adv])[0]) + +class SectionMetadata(object): + + def __init__(self, raw): + self.default_encoding = 'utf-8' + self.exceptional_uid_encodings = {} + self.owner_id = None + + record_count, = struct.unpack('>H', raw[0:2]) + + adv = 0 + for i in xrange(record_count): + type, = struct.unpack('>H', raw[2+adv:4+adv]) + length, = struct.unpack('>H', raw[4+adv:6+adv]) + + # CharSet + if type == 1: + val, = struct.unpack('>H', raw[6+adv:8+adv]) + self.default_encoding = MIBNUM_TO_NAME.get(val, 'utf-8') + # ExceptionalCharSets + elif type == 2: + ii_adv = 0 + for ii in xrange(length / 2): + uid, = struct.unpack('>H', raw[6+adv+ii_adv:8+adv+ii_adv]) + mib, = struct.unpack('>H', raw[8+adv+ii_adv:10+adv+ii_adv]) + self.exceptional_uid_encodings[uid] = MIBNUM_TO_NAME.get(mib, 'utf-8') + ii_adv += 4 + # OwnerID + elif type == 3: + self.owner_id = struct.unpack('>I', raw[6+adv:10+adv]) + # Author, Title, PubDate + # Ignored here. The metadata reader plugin + # will get this info because if it's missing + # the metadata reader plugin will use fall + # back data from elsewhere in the file. + elif type in (4, 5, 6): + pass + # Linked Documents + elif type == 7: + pass + + adv += 2*length + +class SectionText(object): + + def __init__(self, section_header, raw): + self.header = SectionHeaderText(section_header, raw) + self.data = raw[section_header.paragraphs * 4:] class Reader(FormatReader): @@ -84,53 +230,302 @@ class Reader(FormatReader): self.stream = stream self.log = log self.options = options - - self.sections = [] - for i in range(1, header.num_sections): - start = 8 - raw_data = header.section_data(i) - data_header = SectionHeader(raw_data) - sub_header = None - if data_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED): - sub_header = SectionHeaderText(data_header, raw_data) - start += data_header.paragraphs * 4 - self.sections.append((data_header, sub_header, raw_data[start:])) + # Mapping of section uid to our internal + # list of sections. + self.uid_section_number = OrderedDict() + self.uid_text_secion_number = OrderedDict() + self.uid_text_secion_encoding = {} + self.uid_image_section_number = {} + self.metadata_section_number = None + self.default_encoding = 'utf-8' + self.owner_id = None + self.sections = [] + self.header_record = HeaderRecord(header.section_data(0)) + + for i in range(1, header.num_sections): + section_number = i - 1 + start = 8 + section = None + + raw_data = header.section_data(i) + section_header = SectionHeader(raw_data) + + self.uid_section_number[section_header.uid] = section_number + + if section_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED): + self.uid_text_secion_number[section_header.uid] = section_number + section = SectionText(section_header, raw_data[start:]) + elif section_header.type in (DATATYPE_TBMP, DATATYPE_TBMP_COMPRESSED): + self.uid_image_section_number[section_header.uid] = section_number + section = raw_data[start:] + elif section_header.type == DATATYPE_METADATA: + self.metadata_section_number = section_number + section = SectionMetadata(raw_data[start:]) + elif section_header.type == DATATYPE_COMPOSITE_IMAGE: + + + self.sections.append((section_header, section)) + + if self.metadata_section_number: + mdata_section = self.sections[self.metadata_section_number][1] + for k, v in mdata_section.exceptional_uid_encodings.items(): + self.uid_text_secion_encoding[k] = v + self.default_encoding = mdata_section.default_encoding + self.owner_id = mdata_section.owner_id from calibre.ebooks.metadata.pdb import get_metadata self.mi = get_metadata(stream, False) def extract_content(self, output_dir): - html = u'' + html = u'' images = [] - - for header, sub_header, data in self.sections: - if header.type == DATATYPE_PHTML: - html += data - elif header.type == DATATYPE_PHTML_COMPRESSED: - d = self.decompress_phtml(data).decode('latin-1', 'replace') - print len(d) == header.size - html += d - - print html + + for uid, num in self.uid_text_secion_number.items(): + section_header, section_data = self.sections[num] + if section_header.type == DATATYPE_PHTML: + html += self.process_phtml(section_data.header, section_data.data.decode(self.get_text_uid_encoding(section_header.uid), 'replace')) + elif section_header.type == DATATYPE_PHTML_COMPRESSED: + d = self.decompress_phtml(section_data.data).decode(self.get_text_uid_encoding(section_header.uid), 'replace') + html += self.process_phtml(section_data.header, d) + + html += '' + with CurrentDir(output_dir): with open('index.html', 'wb') as index: self.log.debug('Writing text to index.html') index.write(html.encode('utf-8')) - + + if not os.path.exists(os.path.join(output_dir, 'images/')): + os.makedirs(os.path.join(output_dir, 'images/')) + with CurrentDir(os.path.join(output_dir, 'images/')): + #im.read('/Users/john/Tmp/plkr/apnx.palm') + for uid, num in self.uid_image_section_number.items(): + section_header, section_data = self.sections[num] + if section_data: + idata = None + if section_header.type == DATATYPE_TBMP: + idata = section_data + elif section_header.type == DATATYPE_TBMP_COMPRESSED: + if self.header_record.compression == 1: + idata = decompress_doc(section_data) + elif self.header_record.compression == 2: + idata = zlib.decompress(section_data) + try: + with TemporaryFile(suffix='.palm') as itn: + with open(itn, 'wb') as itf: + itf.write(idata) + im = Image() + im.read(itn) + im.set_compression_quality(70) + im.save('%s.jpg' % uid) + self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid)) + except Exception as e: + self.log.error('Failed to write image with uid %s: %s' % (uid, e)) + images.append('%s.jpg' % uid) + else: + self.log.error('Failed to write image with uid %s: No data.' % uid) + opf_path = self.create_opf(output_dir, images) return opf_path def decompress_phtml(self, data): if self.header_record.compression == 2: - raise NotImplementedError - #return zlib.decompress(data) + if self.owner_id: + raise NotImplementedError + return zlib.decompress(data) elif self.header_record.compression == 1: - from calibre.ebooks.compression.palmdoc import decompress_doc + #from calibre.ebooks.compression.palmdoc import decompress_doc return decompress_doc(data) + def process_phtml(self, sub_header, d): + html = u'' + offset = 0 + paragraph_open = False + paragraph_offsets = [] + running_offset = 0 + for size in sub_header.sizes: + running_offset += size + paragraph_offsets.append(running_offset) + + while offset < len(d): + if not paragraph_open: + html += u'

' + paragraph_open = True + + c = ord(d[offset]) + if c == 0x0: + offset += 1 + c = ord(d[offset]) + # Page link begins + # 2 Bytes + # record ID + if c == 0x0a: + offset += 2 + # Targeted page link begins + # 3 Bytes + # record ID, target + elif c == 0x0b: + offset += 3 + # Paragraph link begins + # 4 Bytes + # record ID, paragraph number + elif c == 0x0c: + offset += 4 + # Targeted paragraph link begins + # 5 Bytes + # record ID, paragraph number, target + elif c == 0x0d: + offset += 5 + # Link ends + # 0 Bytes + elif c == 0x08: + pass + # Set font + # 1 Bytes + # font specifier + elif c == 0x11: + offset += 1 + # Embedded image + # 2 Bytes + # image record ID + elif c == 0x1a: + offset += 1 + uid = struct.unpack('>H', d[offset:offset+2])[0] + html += '' % uid + offset += 1 + # Set margin + # 2 Bytes + # left margin, right margin + elif c == 0x22: + offset += 2 + # Alignment of text + # 1 Bytes + # alignment + elif c == 0x29: + offset += 1 + # Horizontal rule + # 3 Bytes + # 8-bit height, 8-bit width (pixels), 8-bit width (%, 1-100) + elif c == 0x33: + offset += 3 + if paragraph_open: + html += u'

' + paragraph_open = False + html += u'
' + # New line + # 0 Bytes + elif c == 0x38: + if paragraph_open: + html += u'

\n' + paragraph_open = False + # Italic text begins + # 0 Bytes + elif c == 0x40: + html += u'' + # Italic text ends + # 0 Bytes + elif c == 0x48: + html += u'' + # Set text color + # 3 Bytes + # 8-bit red, 8-bit green, 8-bit blue + elif c == 0x53: + offset += 3 + # Multiple embedded image + # 4 Bytes + # alternate image record ID, image record ID + elif c == 0x5c: + offset += 4 + # Underline text begins + # 0 Bytes + elif c == 0x60: + html += u'' + # Underline text ends + # 0 Bytes + elif c == 0x68: + html += u'' + # Strike-through text begins + # 0 Bytes + elif c == 0x70: + html += u'' + # Strike-through text ends + # 0 Bytes + elif c == 0x78: + html += u'' + # 16-bit Unicode character + # 3 Bytes + # alternate text length, 16-bit unicode character + elif c == 0x83: + #offset += 2 + #c16 = d[offset:offset+2] + #html += c16.decode('utf-16') + #offset += 1 + offset += 3 + # 32-bit Unicode character + # 5 Bytes + # alternate text length, 32-bit unicode character + elif c == 0x85: + #offset += 2 + #c32 = d[offset:offset+4] + #html += c32.decode('utf-32') + #offset += 3 + offset += 5 + # Begin custom font span + # 6 Bytes + # font page record ID, X page position, Y page position + elif c == 0x8e: + offset += 6 + # Adjust custom font glyph position + # 4 Bytes + # X page position, Y page position + elif c == 0x8c: + offset += 4 + # Change font page + # 2 Bytes + # font record ID + elif c == 0x8a: + offset += 2 + # End custom font span + # 0 Bytes + elif c == 0x88: + pass + # Begin new table row + # 0 Bytes + elif c == 0x90: + pass + # Insert table (or table link) + # 2 Bytes + # table record ID + elif c == 0x92: + offset += 2 + # Table cell data + # 7 Bytes + # 8-bit alignment, 16-bit image record ID, 8-bit columns, 8-bit rows, 16-bit text length + elif c == 0x97: + offset += 7 + # Exact link modifier + # 2 Bytes + # Paragraph Offset (The Exact Link Modifier modifies a Paragraph Link or Targeted Paragraph Link function to specify an exact byte offset within the paragraph. This function must be followed immediately by the function it modifies). + elif c == 0x9a: + offset += 2 + else: + html += unichr(c) + offset += 1 + if offset in paragraph_offsets: + if paragraph_open: + html += u'

\n' + paragraph_open = False + + if paragraph_open: + html += u'

' + + return html + + def get_text_uid_encoding(self, uid): + return self.uid_text_secion_encoding.get(uid, self.default_encoding) def create_opf(self, output_dir, images): with CurrentDir(output_dir): From acaa06de53fe280084c753408b682df835b1cf2d Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 16 Apr 2011 14:13:45 -0400 Subject: [PATCH 03/13] Fix decoding text. Add internal link support. --- src/calibre/ebooks/pdb/plucker/reader.py | 43 +++++++++++++++++------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py index 502682baba..13dea343a7 100644 --- a/src/calibre/ebooks/pdb/plucker/reader.py +++ b/src/calibre/ebooks/pdb/plucker/reader.py @@ -263,7 +263,7 @@ class Reader(FormatReader): elif section_header.type == DATATYPE_METADATA: self.metadata_section_number = section_number section = SectionMetadata(raw_data[start:]) - elif section_header.type == DATATYPE_COMPOSITE_IMAGE: + #elif section_header.type == DATATYPE_COMPOSITE_IMAGE: self.sections.append((section_header, section)) @@ -285,10 +285,10 @@ class Reader(FormatReader): for uid, num in self.uid_text_secion_number.items(): section_header, section_data = self.sections[num] if section_header.type == DATATYPE_PHTML: - html += self.process_phtml(section_data.header, section_data.data.decode(self.get_text_uid_encoding(section_header.uid), 'replace')) + html += self.process_phtml(section_data.header, section_data.data) elif section_header.type == DATATYPE_PHTML_COMPRESSED: - d = self.decompress_phtml(section_data.data).decode(self.get_text_uid_encoding(section_header.uid), 'replace') - html += self.process_phtml(section_data.header, d) + d = self.decompress_phtml(section_data.data) + html += self.process_phtml(section_header.uid, section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace') html += '' @@ -300,7 +300,6 @@ class Reader(FormatReader): if not os.path.exists(os.path.join(output_dir, 'images/')): os.makedirs(os.path.join(output_dir, 'images/')) with CurrentDir(os.path.join(output_dir, 'images/')): - #im.read('/Users/john/Tmp/plkr/apnx.palm') for uid, num in self.uid_image_section_number.items(): section_header, section_data = self.sections[num] if section_data: @@ -340,10 +339,12 @@ class Reader(FormatReader): #from calibre.ebooks.compression.palmdoc import decompress_doc return decompress_doc(data) - def process_phtml(self, sub_header, d): - html = u'' + def process_phtml(self, uid, sub_header, d): + html = u'

' % (uid, uid) offset = 0 - paragraph_open = False + paragraph_open = True + need_set_p_id = False + p_num = 1 paragraph_offsets = [] running_offset = 0 for size in sub_header.sizes: @@ -352,7 +353,12 @@ class Reader(FormatReader): while offset < len(d): if not paragraph_open: - html += u'

' + if need_set_p_id: + html += u'

' % (uid, p_num) + p_num += 1 + need_set_p_id = False + else: + html += u'

' paragraph_open = True c = ord(d[offset]) @@ -363,26 +369,36 @@ class Reader(FormatReader): # 2 Bytes # record ID if c == 0x0a: - offset += 2 + offset += 1 + id = struct.unpack('>H', d[offset:offset+2])[0] + html += '' % id + offset += 1 # Targeted page link begins # 3 Bytes # record ID, target elif c == 0x0b: offset += 3 + html += '' # Paragraph link begins # 4 Bytes # record ID, paragraph number elif c == 0x0c: - offset += 4 + offset += 1 + id = struct.unpack('>H', d[offset:offset+2])[0] + offset += 2 + pid = struct.unpack('>H', d[offset:offset+2])[0] + html += '' % (id, pid) + offset += 1 # Targeted paragraph link begins # 5 Bytes # record ID, paragraph number, target elif c == 0x0d: offset += 5 + html += '' # Link ends # 0 Bytes elif c == 0x08: - pass + html += '' # Set font # 1 Bytes # font specifier @@ -515,10 +531,11 @@ class Reader(FormatReader): html += unichr(c) offset += 1 if offset in paragraph_offsets: + need_set_p_id = True if paragraph_open: html += u'

\n' paragraph_open = False - + if paragraph_open: html += u'

' From 8557981a51d551907154684b7b16f4d89c56247b Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 16 Apr 2011 14:43:36 -0400 Subject: [PATCH 04/13] Don't put every PHTML record into one ordered html file. Plucker documents are groups of separate PHTML pages that are linked via hyperlinks. --- src/calibre/ebooks/pdb/plucker/reader.py | 78 ++++++++++++------------ 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py index 13dea343a7..171c051bbd 100644 --- a/src/calibre/ebooks/pdb/plucker/reader.py +++ b/src/calibre/ebooks/pdb/plucker/reader.py @@ -145,6 +145,7 @@ class HeaderRecord(object): # 1 is DOC compressed self.compression, = struct.unpack('>H', raw[2:4]) self.records, = struct.unpack('>H', raw[4:6]) + self.home_html = None self.reserved = {} for i in xrange(self.records): @@ -152,6 +153,8 @@ class HeaderRecord(object): name, = struct.unpack('>H', raw[6+adv:8+adv]) id, = struct.unpack('>H', raw[8+adv:10+adv]) self.reserved[id] = name + if name == 0: + self.home_html = id class SectionHeader(object): @@ -279,24 +282,21 @@ class Reader(FormatReader): self.mi = get_metadata(stream, False) def extract_content(self, output_dir): - html = u'' - images = [] - - for uid, num in self.uid_text_secion_number.items(): - section_header, section_data = self.sections[num] - if section_header.type == DATATYPE_PHTML: - html += self.process_phtml(section_data.header, section_data.data) - elif section_header.type == DATATYPE_PHTML_COMPRESSED: - d = self.decompress_phtml(section_data.data) - html += self.process_phtml(section_header.uid, section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace') - - html += '' - with CurrentDir(output_dir): - with open('index.html', 'wb') as index: - self.log.debug('Writing text to index.html') - index.write(html.encode('utf-8')) + for uid, num in self.uid_text_secion_number.items(): + self.log.debug(_('Writing record with uid: %s as %s.html' % (uid, uid))) + with open('%s.html' % uid, 'wb') as htmlf: + html = u'' + section_header, section_data = self.sections[num] + if section_header.type == DATATYPE_PHTML: + html += self.process_phtml(section_data.header, section_data.data) + elif section_header.type == DATATYPE_PHTML_COMPRESSED: + d = self.decompress_phtml(section_data.data) + html += self.process_phtml(section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace') + html += '' + htmlf.write(html.encode('utf-8')) + images = [] if not os.path.exists(os.path.join(output_dir, 'images/')): os.makedirs(os.path.join(output_dir, 'images/')) with CurrentDir(os.path.join(output_dir, 'images/')): @@ -326,9 +326,25 @@ class Reader(FormatReader): else: self.log.error('Failed to write image with uid %s: No data.' % uid) - opf_path = self.create_opf(output_dir, images) + # Run the HTML through the html processing plugin. + from calibre.customize.ui import plugin_for_input_format + html_input = plugin_for_input_format('html') + for opt in html_input.options: + setattr(self.options, opt.option.name, opt.recommended_value) + self.options.input_encoding = 'utf-8' + odi = self.options.debug_pipeline + self.options.debug_pipeline = None + # Generate oeb from html conversion. + try: + home_html = self.header_record.home_html + if not home_html: + home_html = self.uid_text_secion_number.items()[0][0] + except: + raise Exception(_('Could not determine home.html')) + oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {}) + self.options.debug_pipeline = odi - return opf_path + return oeb def decompress_phtml(self, data): if self.header_record.compression == 2: @@ -339,8 +355,8 @@ class Reader(FormatReader): #from calibre.ebooks.compression.palmdoc import decompress_doc return decompress_doc(data) - def process_phtml(self, uid, sub_header, d): - html = u'

' % (uid, uid) + def process_phtml(self, sub_header, d): + html = u'

' offset = 0 paragraph_open = True need_set_p_id = False @@ -354,7 +370,7 @@ class Reader(FormatReader): while offset < len(d): if not paragraph_open: if need_set_p_id: - html += u'

' % (uid, p_num) + html += u'

' % p_num p_num += 1 need_set_p_id = False else: @@ -371,7 +387,7 @@ class Reader(FormatReader): if c == 0x0a: offset += 1 id = struct.unpack('>H', d[offset:offset+2])[0] - html += '' % id + html += '' % id offset += 1 # Targeted page link begins # 3 Bytes @@ -387,7 +403,7 @@ class Reader(FormatReader): id = struct.unpack('>H', d[offset:offset+2])[0] offset += 2 pid = struct.unpack('>H', d[offset:offset+2])[0] - html += '' % (id, pid) + html += '' % (id, pid) offset += 1 # Targeted paragraph link begins # 5 Bytes @@ -543,19 +559,3 @@ class Reader(FormatReader): def get_text_uid_encoding(self, uid): return self.uid_text_secion_encoding.get(uid, self.default_encoding) - - def create_opf(self, output_dir, images): - with CurrentDir(output_dir): - opf = OPFCreator(output_dir, self.mi) - - manifest = [('index.html', None)] - - for i in images: - manifest.append((os.path.join('images/', i), None)) - - opf.create_manifest(manifest) - opf.create_spine(['index.html']) - with open('metadata.opf', 'wb') as opffile: - opf.render(opffile) - - return os.path.join(output_dir, 'metadata.opf') From 644335d97b1494ee04c3c657d435a3aeef44551c Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 16 Apr 2011 21:23:13 -0400 Subject: [PATCH 05/13] Ignore non internal links. Support composite images. --- src/calibre/ebooks/pdb/plucker/reader.py | 109 ++++++++++++++++++++--- 1 file changed, 99 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py index 171c051bbd..c6c404b125 100644 --- a/src/calibre/ebooks/pdb/plucker/reader.py +++ b/src/calibre/ebooks/pdb/plucker/reader.py @@ -13,10 +13,9 @@ import zlib from collections import OrderedDict from calibre import CurrentDir -from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ptempfile import TemporaryFile -from calibre.utils.magick import Image +from calibre.utils.magick import Image, create_canvas DATATYPE_PHTML = 0 DATATYPE_PHTML_COMPRESSED = 1 @@ -178,6 +177,7 @@ class SectionHeaderText(object): self.sizes.append(struct.unpack('>H', raw[adv:2+adv])[0]) self.attributes.append(struct.unpack('>H', raw[2+adv:4+adv])[0]) + class SectionMetadata(object): def __init__(self, raw): @@ -220,6 +220,7 @@ class SectionMetadata(object): adv += 2*length + class SectionText(object): def __init__(self, section_header, raw): @@ -227,6 +228,34 @@ class SectionText(object): self.data = raw[section_header.paragraphs * 4:] +class SectionCompositeImage(object): + + def __init__(self, raw): + self.columns, = struct.unpack('>H', raw[0:2]) + self.rows, = struct.unpack('>H', raw[2:4]) + + # [ + # row [col, col, col...], + # row [col, col, col...], + # ... + # ] + # + # Each item in the layout is in it's + # correct position in the final + # composite. + # + # Each item in the layout is a uid + # to an image record. + self.layout = [] + offset = 4 + for i in xrange(self.rows): + col = [] + for j in xrange(self.columns): + col.append(struct.unpack('>H', raw[offset:offset+2])[0]) + offset += 2 + self.layout.append(col) + + class Reader(FormatReader): def __init__(self, header, stream, log, options): @@ -240,6 +269,7 @@ class Reader(FormatReader): self.uid_text_secion_number = OrderedDict() self.uid_text_secion_encoding = {} self.uid_image_section_number = {} + self.uid_composite_image_section_number = {} self.metadata_section_number = None self.default_encoding = 'utf-8' self.owner_id = None @@ -266,8 +296,9 @@ class Reader(FormatReader): elif section_header.type == DATATYPE_METADATA: self.metadata_section_number = section_number section = SectionMetadata(raw_data[start:]) - #elif section_header.type == DATATYPE_COMPOSITE_IMAGE: - + elif section_header.type == DATATYPE_COMPOSITE_IMAGE: + self.uid_composite_image_section_number[section_header.uid] = section_number + section = SectionCompositeImage(raw_data[start:]) self.sections.append((section_header, section)) @@ -282,6 +313,9 @@ class Reader(FormatReader): self.mi = get_metadata(stream, False) def extract_content(self, output_dir): + # Each text record is independent (unless the continuation + # value is set in the previous record). Put each converted + # text recored into a separate file. with CurrentDir(output_dir): for uid, num in self.uid_text_secion_number.items(): self.log.debug(_('Writing record with uid: %s as %s.html' % (uid, uid))) @@ -297,9 +331,11 @@ class Reader(FormatReader): htmlf.write(html.encode('utf-8')) images = [] + image_sizes = {} if not os.path.exists(os.path.join(output_dir, 'images/')): os.makedirs(os.path.join(output_dir, 'images/')) with CurrentDir(os.path.join(output_dir, 'images/')): + # Single images. for uid, num in self.uid_image_section_number.items(): section_header, section_data = self.sections[num] if section_data: @@ -317,6 +353,7 @@ class Reader(FormatReader): itf.write(idata) im = Image() im.read(itn) + image_sizes[uid] = im.size im.set_compression_quality(70) im.save('%s.jpg' % uid) self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid)) @@ -325,6 +362,49 @@ class Reader(FormatReader): images.append('%s.jpg' % uid) else: self.log.error('Failed to write image with uid %s: No data.' % uid) + # Composite images. + for uid, num in self.uid_composite_image_section_number.items(): + try: + section_header, section_data = self.sections[num] + # Get the final width and height. + width = 0 + height = 0 + for row in section_data.layout: + row_width = 0 + col_height = 0 + for col in row: + if col not in image_sizes: + raise Exception('Image with uid: %s missing.' % col) + im = Image() + im.read('%s.jpg' % col) + w, h = im.size + row_width += w + if col_height < h: + col_height = h + if width < row_width: + width = row_width + height += col_height + # Create a new image the total size of all image + # parts. Put the parts into the new image. + canvas = create_canvas(width, height) + y_off = 0 + for row in section_data.layout: + x_off = 0 + largest_height = 0 + for col in row: + im = Image() + im.read('%s.jpg' % col) + canvas.compose(im, x_off, y_off) + w, h = im.size + x_off += w + if largest_height < h: + largest_height = h + y_off += largest_height + canvas.set_compression_quality(70) + canvas.save('%s.jpg' % uid) + self.log.debug('Wrote composite image with uid %s to images/%s.jpg' % (uid, uid)) + except Exception as e: + self.log.error('Failed to write composite image with uid %s: %s' % (uid, e)) # Run the HTML through the html processing plugin. from calibre.customize.ui import plugin_for_input_format @@ -334,13 +414,17 @@ class Reader(FormatReader): self.options.input_encoding = 'utf-8' odi = self.options.debug_pipeline self.options.debug_pipeline = None - # Generate oeb from html conversion. + # Determine the home.html record uid. This should be set in the + # reserved values in the metadata recored. home.html is the first + # text record (should have hyper link references to other records) + # in the document. try: home_html = self.header_record.home_html if not home_html: home_html = self.uid_text_secion_number.items()[0][0] except: raise Exception(_('Could not determine home.html')) + # Generate oeb from html conversion. oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {}) self.options.debug_pipeline = odi @@ -359,6 +443,7 @@ class Reader(FormatReader): html = u'

' offset = 0 paragraph_open = True + link_open = False need_set_p_id = False p_num = 1 paragraph_offsets = [] @@ -387,14 +472,15 @@ class Reader(FormatReader): if c == 0x0a: offset += 1 id = struct.unpack('>H', d[offset:offset+2])[0] - html += '' % id + if id in self.uid_text_secion_number: + html += '' % id + link_open = True offset += 1 # Targeted page link begins # 3 Bytes # record ID, target elif c == 0x0b: offset += 3 - html += '' # Paragraph link begins # 4 Bytes # record ID, paragraph number @@ -403,18 +489,21 @@ class Reader(FormatReader): id = struct.unpack('>H', d[offset:offset+2])[0] offset += 2 pid = struct.unpack('>H', d[offset:offset+2])[0] - html += '' % (id, pid) + if id in self.uid_text_secion_number: + html += '' % (id, pid) + link_open = True offset += 1 # Targeted paragraph link begins # 5 Bytes # record ID, paragraph number, target elif c == 0x0d: offset += 5 - html += '' # Link ends # 0 Bytes elif c == 0x08: - html += '' + if link_open: + html += '' + link_open = False # Set font # 1 Bytes # font specifier From 494c040d36b2ee5620143f5bf70600a811d2de1e Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 16 Apr 2011 21:54:28 -0400 Subject: [PATCH 06/13] Comments. --- src/calibre/ebooks/pdb/plucker/reader.py | 74 ++++++++++++++++++++---- 1 file changed, 62 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py index c6c404b125..20943be3f0 100644 --- a/src/calibre/ebooks/pdb/plucker/reader.py +++ b/src/calibre/ebooks/pdb/plucker/reader.py @@ -136,6 +136,9 @@ def decompress_doc(data): return ''.join([chr(i) for i in res]) class HeaderRecord(object): + ''' + Plucker header. PDB record 0. + ''' def __init__(self, raw): self.uid, = struct.unpack('>H', raw[0:2]) @@ -144,6 +147,8 @@ class HeaderRecord(object): # 1 is DOC compressed self.compression, = struct.unpack('>H', raw[2:4]) self.records, = struct.unpack('>H', raw[4:6]) + # uid of the first html file. This should link + # to other files which in turn may link to others. self.home_html = None self.reserved = {} @@ -157,6 +162,10 @@ class HeaderRecord(object): class SectionHeader(object): + ''' + Every sections (record) has this header. It gives + details about the section such as it's uid. + ''' def __init__(self, raw): self.uid, = struct.unpack('>H', raw[0:2]) @@ -167,9 +176,14 @@ class SectionHeader(object): class SectionHeaderText(object): + ''' + Sub header for text records. + ''' def __init__(self, section_header, raw): + # The uncompressed size of each paragraph. self.sizes = [] + # Paragraph attributes. self.attributes = [] for i in xrange(section_header.paragraphs): @@ -179,6 +193,19 @@ class SectionHeaderText(object): class SectionMetadata(object): + ''' + Metadata. + + This does not store metadata such as title, or author. + That metadata would be best retrieved with the PDB (plucker) + metdata reader. + + This stores document specific information such as the + text encoding. + + Note: There is a default encoding but each text section + can be assigned a different encoding. + ''' def __init__(self, raw): self.default_encoding = 'utf-8' @@ -222,6 +249,9 @@ class SectionMetadata(object): class SectionText(object): + ''' + Text data. Stores a text section header and the PHTML. + ''' def __init__(self, section_header, raw): self.header = SectionHeaderText(section_header, raw) @@ -229,14 +259,19 @@ class SectionText(object): class SectionCompositeImage(object): + ''' + A composite image consists of a a 2D array + of rows and columns. The entries in the array + are uid's. + ''' def __init__(self, raw): self.columns, = struct.unpack('>H', raw[0:2]) self.rows, = struct.unpack('>H', raw[2:4]) # [ - # row [col, col, col...], - # row [col, col, col...], + # [uid, uid, uid, ...], + # [uid, uid, uid, ...], # ... # ] # @@ -275,18 +310,21 @@ class Reader(FormatReader): self.owner_id = None self.sections = [] + # The Plucker record0 header self.header_record = HeaderRecord(header.section_data(0)) for i in range(1, header.num_sections): - section_number = i - 1 + section_number = len(self.sections) + # The length of the section header. + # Where the actual data in the section starts. start = 8 section = None raw_data = header.section_data(i) + # Every sections has a section header. section_header = SectionHeader(raw_data) - - self.uid_section_number[section_header.uid] = section_number - + + # Store sections we care able. if section_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED): self.uid_text_secion_number[section_header.uid] = section_number section = SectionText(section_header, raw_data[start:]) @@ -300,8 +338,13 @@ class Reader(FormatReader): self.uid_composite_image_section_number[section_header.uid] = section_number section = SectionCompositeImage(raw_data[start:]) - self.sections.append((section_header, section)) + # Store the section. + if section: + self.uid_section_number[section_header.uid] = section_number + self.sections.append((section_header, section)) + # Store useful information from the metadata section locally + # to make access easier. if self.metadata_section_number: mdata_section = self.sections[self.metadata_section_number][1] for k, v in mdata_section.exceptional_uid_encodings.items(): @@ -309,13 +352,16 @@ class Reader(FormatReader): self.default_encoding = mdata_section.default_encoding self.owner_id = mdata_section.owner_id + # Get the metadata (tile, author, ...) with the metadata reader. from calibre.ebooks.metadata.pdb import get_metadata self.mi = get_metadata(stream, False) def extract_content(self, output_dir): # Each text record is independent (unless the continuation # value is set in the previous record). Put each converted - # text recored into a separate file. + # text recored into a separate file. We will reference the + # home.html file as the first file and let the HTML input + # plugin assemble the order based on hyperlinks. with CurrentDir(output_dir): for uid, num in self.uid_text_secion_number.items(): self.log.debug(_('Writing record with uid: %s as %s.html' % (uid, uid))) @@ -329,8 +375,9 @@ class Reader(FormatReader): html += self.process_phtml(section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace') html += '' htmlf.write(html.encode('utf-8')) - - images = [] + + # Images. + # Cache the image sizes in case they are used by a composite image. image_sizes = {} if not os.path.exists(os.path.join(output_dir, 'images/')): os.makedirs(os.path.join(output_dir, 'images/')) @@ -359,10 +406,10 @@ class Reader(FormatReader): self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid)) except Exception as e: self.log.error('Failed to write image with uid %s: %s' % (uid, e)) - images.append('%s.jpg' % uid) else: self.log.error('Failed to write image with uid %s: No data.' % uid) # Composite images. + # We're going to use the already compressed .jpg images here. for uid, num in self.uid_composite_image_section_number.items(): try: section_header, section_data = self.sections[num] @@ -559,7 +606,10 @@ class Reader(FormatReader): # 4 Bytes # alternate image record ID, image record ID elif c == 0x5c: - offset += 4 + offset += 3 + uid = struct.unpack('>H', d[offset:offset+2])[0] + html += '' % uid + offset += 1 # Underline text begins # 0 Bytes elif c == 0x60: From 93492a9ec8f01233723bd8b6038a0440c738a705 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 17 Apr 2011 09:42:00 -0400 Subject: [PATCH 07/13] Add font changes. --- src/calibre/ebooks/pdb/plucker/reader.py | 50 ++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py index 20943be3f0..5c128fa3d3 100644 --- a/src/calibre/ebooks/pdb/plucker/reader.py +++ b/src/calibre/ebooks/pdb/plucker/reader.py @@ -493,6 +493,7 @@ class Reader(FormatReader): link_open = False need_set_p_id = False p_num = 1 + font_specifier_close = '' paragraph_offsets = [] running_offset = 0 for size in sub_header.sizes: @@ -556,6 +557,55 @@ class Reader(FormatReader): # font specifier elif c == 0x11: offset += 1 + specifier = d[offset] + html += font_specifier_close + # Regular text + if specifier == 0: + font_specifier_close = '' + # h1 + elif specifier == 1: + html += '

' + font_specifier_close = '

' + # h2 + elif specifier == 2: + html += '

' + font_specifier_close = '

' + # h3 + elif specifier == 3: + html += '' + font_specifier_close = '' + # h4 + elif specifier == 4: + html += '

' + font_specifier_close = '

' + # h5 + elif specifier == 5: + html += '
' + font_specifier_close = '
' + # h6 + elif specifier == 6: + html += '
' + font_specifier_close = '
' + # Bold + elif specifier == 7: + html += '' + font_specifier_close = '' + # Fixed-width + elif specifier == 8: + html += '' + font_specifier_close = '' + # Small + elif specifier == 9: + html += '' + font_specifier_close = '' + # Subscript + elif specifier == 10: + html += '' + font_specifier_close = '' + # Superscript + elif specifier == 11: + html += '' + font_specifier_close = '' # Embedded image # 2 Bytes # image record ID From 87bb34d9940a39553d4f72a056486cb20a88e587 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 17 Apr 2011 10:03:50 -0400 Subject: [PATCH 08/13] Use latin-1 instead of utf-8 for default encoding. --- src/calibre/ebooks/pdb/plucker/reader.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py index 5c128fa3d3..9ae449e579 100644 --- a/src/calibre/ebooks/pdb/plucker/reader.py +++ b/src/calibre/ebooks/pdb/plucker/reader.py @@ -208,7 +208,7 @@ class SectionMetadata(object): ''' def __init__(self, raw): - self.default_encoding = 'utf-8' + self.default_encoding = 'latin-1' self.exceptional_uid_encodings = {} self.owner_id = None @@ -222,14 +222,14 @@ class SectionMetadata(object): # CharSet if type == 1: val, = struct.unpack('>H', raw[6+adv:8+adv]) - self.default_encoding = MIBNUM_TO_NAME.get(val, 'utf-8') + self.default_encoding = MIBNUM_TO_NAME.get(val, 'latin-1') # ExceptionalCharSets elif type == 2: ii_adv = 0 for ii in xrange(length / 2): uid, = struct.unpack('>H', raw[6+adv+ii_adv:8+adv+ii_adv]) mib, = struct.unpack('>H', raw[8+adv+ii_adv:10+adv+ii_adv]) - self.exceptional_uid_encodings[uid] = MIBNUM_TO_NAME.get(mib, 'utf-8') + self.exceptional_uid_encodings[uid] = MIBNUM_TO_NAME.get(mib, 'latin-1') ii_adv += 4 # OwnerID elif type == 3: @@ -306,7 +306,7 @@ class Reader(FormatReader): self.uid_image_section_number = {} self.uid_composite_image_section_number = {} self.metadata_section_number = None - self.default_encoding = 'utf-8' + self.default_encoding = 'latin-1' self.owner_id = None self.sections = [] @@ -680,10 +680,12 @@ class Reader(FormatReader): # 3 Bytes # alternate text length, 16-bit unicode character elif c == 0x83: - #offset += 2 + #offset += 1 + #alt_len = struct.unpack('>B', str(d[offset]))[0] + #offset += 1 #c16 = d[offset:offset+2] #html += c16.decode('utf-16') - #offset += 1 + #offset += 1 + alt_len offset += 3 # 32-bit Unicode character # 5 Bytes From 15a0384481e3e8ec9ee3adce0b082c21bde51fd2 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 17 Apr 2011 10:59:15 -0400 Subject: [PATCH 09/13] .. --- src/calibre/ebooks/pdb/plucker/reader.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py index 9ae449e579..ced9dafc0f 100644 --- a/src/calibre/ebooks/pdb/plucker/reader.py +++ b/src/calibre/ebooks/pdb/plucker/reader.py @@ -680,21 +680,11 @@ class Reader(FormatReader): # 3 Bytes # alternate text length, 16-bit unicode character elif c == 0x83: - #offset += 1 - #alt_len = struct.unpack('>B', str(d[offset]))[0] - #offset += 1 - #c16 = d[offset:offset+2] - #html += c16.decode('utf-16') - #offset += 1 + alt_len offset += 3 # 32-bit Unicode character # 5 Bytes # alternate text length, 32-bit unicode character elif c == 0x85: - #offset += 2 - #c32 = d[offset:offset+4] - #html += c32.decode('utf-32') - #offset += 3 offset += 5 # Begin custom font span # 6 Bytes From 05fc3eec93fd3b05af981bc3d20a1627673aa043 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 17 Apr 2011 11:09:46 -0400 Subject: [PATCH 10/13] Add todo for non supported features. --- src/calibre/ebooks/pdb/plucker/reader.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py index ced9dafc0f..207a466178 100644 --- a/src/calibre/ebooks/pdb/plucker/reader.py +++ b/src/calibre/ebooks/pdb/plucker/reader.py @@ -292,6 +292,18 @@ class SectionCompositeImage(object): class Reader(FormatReader): + ''' + Convert a plucker archive into HTML. + + TODO: + * UTF 16 and 32 characters. + * Margins. + * Alignment. + * DATATYPE_MAILTO + * DATATYPE_TABLE(_COMPRESSED) + * DATATYPE_EXT_ANCHOR_INDEX + * DATATYPE_EXT_ANCHOR(_COMPRESSED) + ''' def __init__(self, header, stream, log, options): self.stream = stream From c0cf0e91d47b1213b2093bac4cdd1317a87b258f Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 17 Apr 2011 19:28:04 -0400 Subject: [PATCH 11/13] Allow user specify input encoding and override what is specified by the file. Turn 0xa0 character into nbsp entity. --- src/calibre/ebooks/pdb/plucker/reader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py index 207a466178..5fa66e1246 100644 --- a/src/calibre/ebooks/pdb/plucker/reader.py +++ b/src/calibre/ebooks/pdb/plucker/reader.py @@ -523,6 +523,7 @@ class Reader(FormatReader): paragraph_open = True c = ord(d[offset]) + # PHTML "functions" if c == 0x0: offset += 1 c = ord(d[offset]) @@ -736,6 +737,8 @@ class Reader(FormatReader): # Paragraph Offset (The Exact Link Modifier modifies a Paragraph Link or Targeted Paragraph Link function to specify an exact byte offset within the paragraph. This function must be followed immediately by the function it modifies). elif c == 0x9a: offset += 2 + elif c == 0xa0: + html += ' ' else: html += unichr(c) offset += 1 @@ -751,4 +754,4 @@ class Reader(FormatReader): return html def get_text_uid_encoding(self, uid): - return self.uid_text_secion_encoding.get(uid, self.default_encoding) + return self.options.input_encoding if self.options.input_encoding else self.uid_text_secion_encoding.get(uid, self.default_encoding) From 377313df7d4343ab9e4035877ac2d184f5dd73ba Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 17 Apr 2011 19:42:39 -0400 Subject: [PATCH 12/13] cleanup. --- src/calibre/ebooks/pdb/plucker/reader.py | 25 +++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py index 5fa66e1246..9f1d2ad426 100644 --- a/src/calibre/ebooks/pdb/plucker/reader.py +++ b/src/calibre/ebooks/pdb/plucker/reader.py @@ -183,6 +183,9 @@ class SectionHeaderText(object): def __init__(self, section_header, raw): # The uncompressed size of each paragraph. self.sizes = [] + # uncompressed offset of each paragraph starting + # at the beginning of the PHTML. + self.paragraph_offsets = [] # Paragraph attributes. self.attributes = [] @@ -191,6 +194,11 @@ class SectionHeaderText(object): self.sizes.append(struct.unpack('>H', raw[adv:2+adv])[0]) self.attributes.append(struct.unpack('>H', raw[2+adv:4+adv])[0]) + running_offset = 0 + for size in self.sizes: + running_offset += size + self.paragraph_offsets.append(running_offset) + class SectionMetadata(object): ''' @@ -299,6 +307,7 @@ class Reader(FormatReader): * UTF 16 and 32 characters. * Margins. * Alignment. + * Font color. * DATATYPE_MAILTO * DATATYPE_TABLE(_COMPRESSED) * DATATYPE_EXT_ANCHOR_INDEX @@ -381,13 +390,13 @@ class Reader(FormatReader): html = u'' section_header, section_data = self.sections[num] if section_header.type == DATATYPE_PHTML: - html += self.process_phtml(section_data.header, section_data.data) + html += self.process_phtml(section_data.data, section_data.header.paragraph_offsets) elif section_header.type == DATATYPE_PHTML_COMPRESSED: d = self.decompress_phtml(section_data.data) - html += self.process_phtml(section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace') + html += self.process_phtml(d, section_data.header.paragraph_offsets).decode(self.get_text_uid_encoding(section_header.uid), 'replace') html += '' htmlf.write(html.encode('utf-8')) - + # Images. # Cache the image sizes in case they are used by a composite image. image_sizes = {} @@ -498,7 +507,7 @@ class Reader(FormatReader): #from calibre.ebooks.compression.palmdoc import decompress_doc return decompress_doc(data) - def process_phtml(self, sub_header, d): + def process_phtml(self, d, paragraph_offsets=[]): html = u'

' offset = 0 paragraph_open = True @@ -506,11 +515,6 @@ class Reader(FormatReader): need_set_p_id = False p_num = 1 font_specifier_close = '' - paragraph_offsets = [] - running_offset = 0 - for size in sub_header.sizes: - running_offset += size - paragraph_offsets.append(running_offset) while offset < len(d): if not paragraph_open: @@ -754,4 +758,7 @@ class Reader(FormatReader): return html def get_text_uid_encoding(self, uid): + # Return the user sepcified input encoding, + # otherwise return the alternate encoding specified for the uid, + # otherwise retur the default encoding for the document. return self.options.input_encoding if self.options.input_encoding else self.uid_text_secion_encoding.get(uid, self.default_encoding) From e690e7196e094787985dd148038d38a6d5e08163 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 24 Apr 2011 09:44:50 -0400 Subject: [PATCH 13/13] Plucker metadata reader. --- src/calibre/ebooks/metadata/pdb.py | 4 +- src/calibre/ebooks/metadata/plucker.py | 73 ++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 src/calibre/ebooks/metadata/plucker.py diff --git a/src/calibre/ebooks/metadata/pdb.py b/src/calibre/ebooks/metadata/pdb.py index ddf2b0c818..d01bb0ecdb 100644 --- a/src/calibre/ebooks/metadata/pdb.py +++ b/src/calibre/ebooks/metadata/pdb.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- ''' -Read meta information from eReader pdb files. +Read meta information from pdb files. ''' __license__ = 'GPL v3' @@ -13,10 +13,12 @@ import re from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.metadata.ereader import get_metadata as get_eReader +from calibre.ebooks.metadata.plucker import get_metadata as get_plucker MREADER = { 'PNPdPPrs' : get_eReader, 'PNRdPPrs' : get_eReader, + 'DataPlkr' : get_plucker, } from calibre.ebooks.metadata.ereader import set_metadata as set_eReader diff --git a/src/calibre/ebooks/metadata/plucker.py b/src/calibre/ebooks/metadata/plucker.py new file mode 100644 index 0000000000..991945f42b --- /dev/null +++ b/src/calibre/ebooks/metadata/plucker.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- + +from __future__ import (unicode_literals, division, absolute_import, print_function) + +''' +Read meta information from Plucker pdb files. +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import struct +from datetime import datetime + +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.pdb.header import PdbHeaderReader +from calibre.ebooks.pdb.plucker.reader import SectionHeader, DATATYPE_METADATA, \ + MIBNUM_TO_NAME + +def get_metadata(stream, extract_cover=True): + ''' + Return metadata as a L{MetaInfo} object + ''' + mi = MetaInformation(_('Unknown'), [_('Unknown')]) + stream.seek(0) + + pheader = PdbHeaderReader(stream) + section_data = None + for i in range(1, pheader.num_sections): + raw_data = pheader.section_data(i) + section_header = SectionHeader(raw_data) + if section_header.type == DATATYPE_METADATA: + section_data = raw_data[8:] + break + + if not section_data: + return mi + + default_encoding = 'latin-1' + record_count, = struct.unpack('>H', section_data[0:2]) + adv = 0 + title = None + author = None + pubdate = 0 + for i in xrange(record_count): + type, = struct.unpack('>H', section_data[2+adv:4+adv]) + length, = struct.unpack('>H', section_data[4+adv:6+adv]) + + # CharSet + if type == 1: + val, = struct.unpack('>H', section_data[6+adv:8+adv]) + default_encoding = MIBNUM_TO_NAME.get(val, 'latin-1') + # Author + elif type == 4: + author = section_data[6+adv+(2*length)] + # Title + elif type == 5: + title = section_data[6+adv+(2*length)] + # Publication Date + elif type == 6: + pubdate, = struct.unpack('>I', section_data[6+adv:6+adv+4]) + + adv += 2*length + + if title: + mi.title = title.replace('\0', '').decode(default_encoding, 'replace') + if author: + author = author.replace('\0', '').decode(default_encoding, 'replace') + mi.author = author.split(',') + mi.pubdate = datetime.fromtimestamp(pubdate) + + return mi