Update eReader PDB code to produce files that are closer to what DropBook produces: set the text size record, write the image size, and handle the 1.5.2 sidebar and footnote changes. Fix PML output stripping the backslash character.

This commit is contained in:
John Schember 2009-10-10 23:38:00 -04:00
parent 61e8c4222a
commit 552735c41e
3 changed files with 80 additions and 37 deletions

View File

@ -34,7 +34,6 @@ class HeaderRecord(object):
self.has_metadata, = struct.unpack('>H', raw[24:26]) self.has_metadata, = struct.unpack('>H', raw[24:26])
self.footnote_rec, = struct.unpack('>H', raw[28:30]) self.footnote_rec, = struct.unpack('>H', raw[28:30])
self.sidebar_rec, = struct.unpack('>H', raw[30:32]) self.sidebar_rec, = struct.unpack('>H', raw[30:32])
self.bookmark_offset, = struct.unpack('>H', raw[32:34])
self.image_data_offset, = struct.unpack('>H', raw[40:42]) self.image_data_offset, = struct.unpack('>H', raw[40:42])
self.metadata_offset, = struct.unpack('>H', raw[44:46]) self.metadata_offset, = struct.unpack('>H', raw[44:46])
self.footnote_offset, = struct.unpack('>H', raw[48:50]) self.footnote_offset, = struct.unpack('>H', raw[48:50])

View File

@ -28,7 +28,7 @@ IDENTITY = 'PNRdPPrs'
# This is an arbitrary number that is small enough to work. The actual maximum # This is an arbitrary number that is small enough to work. The actual maximum
# record size is unknown. # record size is unknown.
MAX_RECORD_SIZE = 3560 MAX_RECORD_SIZE = 8192
class Writer(FormatWriter): class Writer(FormatWriter):
@ -37,13 +37,28 @@ class Writer(FormatWriter):
self.log = log self.log = log
def write_content(self, oeb_book, out_stream, metadata=None): def write_content(self, oeb_book, out_stream, metadata=None):
text, image_hrefs = self._text(oeb_book) text, image_hrefs, text_sizes = self._text(oeb_book)
images = self._images(oeb_book.manifest, image_hrefs) images = self._images(oeb_book.manifest, image_hrefs)
metadata = [self._metadata(metadata)] metadata = [self._metadata(metadata)]
hr = [self._header_record(len(text), len(images))] hr = [self._header_record(len(text), len(images))]
sections = hr+text+images+metadata+['MeTaInFo\x00'] '''
Record order as generated by DropBook.
1. eReader Header
2. Compressed text
3. Small font page index
4. Large font page index
5. Chapter index
6. Links index
7. Images
8. (Extrapolation: there should be one more record type here, though it has not yet been determined what it might be.)
9. Metadata
10. Sidebar records
11. Footnote records
12. Text block size record
13. "MeTaInFo\x00" word record
'''
sections = hr+text+images+metadata+[text_sizes]+['MeTaInFo\x00']
lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections] lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections]
@ -62,12 +77,38 @@ class Writer(FormatWriter):
pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace') pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
pml_pages = [] pml_pages = []
for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1): text_sizes = ''
pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE])) index = 0
while index < len(pml):
'''
Split on the space character closest to MAX_RECORD_SIZE when possible.
'''
split = pml.rfind(' ', index, MAX_RECORD_SIZE)
if split == -1:
len_end = len(pml[index:])
if len_end > MAX_RECORD_SIZE:
split = MAX_RECORD_SIZE
else:
split = len_end
if split == 0:
split = 1
pml_pages.append(zlib.compress(pml[index:index+split]))
text_sizes += struct.pack('>H', split)
index += split
return pml_pages, pmlmlizer.image_hrefs return pml_pages, pmlmlizer.image_hrefs, text_sizes
def _images(self, manifest, image_hrefs): def _images(self, manifest, image_hrefs):
'''
Image format.
0-4 : 'PNG '. There must be a space after PNG.
4-36 : Image name. Must be exactly 32 bytes long. Pad with \x00 for names shorter than 32 bytes.
36-58 : Unknown.
58-60 : Width.
60-62 : Height.
62-...: Raw image data in 8-bit PNG format.
'''
images = [] images = []
for item in manifest: for item in manifest:
@ -82,6 +123,8 @@ class Writer(FormatWriter):
header = 'PNG ' header = 'PNG '
header += image_hrefs[item.href].ljust(32, '\x00')[:32] header += image_hrefs[item.href].ljust(32, '\x00')[:32]
header = header.ljust(58, '\x00')
header += struct.pack('>HH', im.size[0], im.size[1])
header = header.ljust(62, '\x00') header = header.ljust(62, '\x00')
if len(data) + len(header) < 65505: if len(data) + len(header) < 65505:
@ -126,7 +169,7 @@ class Writer(FormatWriter):
text_items = the number of text pages text_items = the number of text pages
image_items = the number of images image_items = the number of images
''' '''
version = 10 # Zlib compression compression = 10 # zlib compression.
non_text_offset = text_items + 1 non_text_offset = text_items + 1
if image_items > 0: if image_items > 0:
@ -140,33 +183,33 @@ class Writer(FormatWriter):
record = '' record = ''
record += struct.pack('>H', version) # [0:2] # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM record += struct.pack('>H', compression) # [0:2] # Compression. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
record += struct.pack('>H', 0) # [2:4] record += struct.pack('>H', 0) # [2:4] # Unknown.
record += struct.pack('>H', 0) # [4:6] record += struct.pack('>H', 0) # [4:6] # Unknown.
record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC. Somehow represents the cp1252 encoding of the text record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC. Somehow represents the cp1252 encoding of the text
record += struct.pack('>H', 0) # [8:10] record += struct.pack('>H', 0) # [8:10] # Number of small font pages. 0 if page index is not built.
record += struct.pack('>H', 0) # [10:12] record += struct.pack('>H', 0) # [10:12] # Number of large font pages. 0 if page index is not built.
record += struct.pack('>H', non_text_offset) # [12:14] # non_text_offset record += struct.pack('>H', non_text_offset) # [12:14] # Non-Text record start.
record += struct.pack('>H', 0) # [14:16] record += struct.pack('>H', 0) # [14:16] # Number of chapter index records.
record += struct.pack('>H', 0) # [16:18] record += struct.pack('>H', 0) # [16:18] # Number of small font page index records.
record += struct.pack('>H', 0) # [18:20] record += struct.pack('>H', 0) # [18:20] # Number of large font page index records.
record += struct.pack('>H', image_items) # [20:22] # Number of images record += struct.pack('>H', image_items) # [20:22] # Number of images.
record += struct.pack('>H', 0) # [22:24] record += struct.pack('>H', 0) # [22:24] # Number of links.
record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not.
record += struct.pack('>H', 0) # [26:28] record += struct.pack('>H', 0) # [26:28] # Unknown.
record += struct.pack('>H', 0) # [28:30] # footnote_rec record += struct.pack('>H', 0) # [28:30] # Number of Footnotes.
record += struct.pack('>H', 0) # [30:32] # sidebar_rec record += struct.pack('>H', 0) # [30:32] # Number of Sidebars.
record += struct.pack('>H', last_data_offset) # [32:34] # bookmark_offset record += struct.pack('>H', last_data_offset) # [32:34] # Chapter index offset.
record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC.
record += struct.pack('>H', 0) # [36:38] record += struct.pack('>H', last_data_offset) # [36:38] # Small font page offset. This will be the last data offset if there are none.
record += struct.pack('>H', 0) # [38:40] record += struct.pack('>H', last_data_offset) # [38:40] # Large font page offset. This will be the last data offset if there are none.
record += struct.pack('>H', image_data_offset) # [40:42] # image_data_offset. This will be the last data offset if there are no images record += struct.pack('>H', image_data_offset) # [40:42] # Image offset. This will be the last data offset if there are none.
record += struct.pack('>H', 0) # [42:44] record += struct.pack('>H', image_data_offset) # [42:44] # Links offset. This will be the last data offset if there are none.
record += struct.pack('>H', meta_data_offset) # [44:46] # meta_data_offset. This will be the last data offset if there are no images record += struct.pack('>H', meta_data_offset) # [44:46] # Metadata offset. This will be the last data offset if there are none.
record += struct.pack('>H', 0) # [46:48] record += struct.pack('>H', 0) # [46:48] # Unknown.
record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset. This will be the last data offset if there are no images record += struct.pack('>H', last_data_offset) # [48:50] # Footnote offset. This will be the last data offset if there are none.
record += struct.pack('>H', last_data_offset) # [50:52] # sidebar_offset. This will be the last data offset if there are no images record += struct.pack('>H', last_data_offset) # [50:52] # Sidebar offset. This will be the last data offset if there are none.
record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset record += struct.pack('>H', last_data_offset) # [52:54] # Last data offset.
for i in range(54, 132, 2): for i in range(54, 132, 2):
record += struct.pack('>H', 0) # [54:132] record += struct.pack('>H', 0) # [54:132]

View File

@ -64,7 +64,7 @@ PML_HTML_RULES = [
(re.compile(r'(?<=[^\\])\\Sp'), lambda match: ''), (re.compile(r'(?<=[^\\])\\Sp'), lambda match: ''),
(re.compile(r'(?<=[^\\])\\Sb'), lambda match: ''), (re.compile(r'(?<=[^\\])\\Sb'), lambda match: ''),
# Remove invalid single item pml codes. # Remove invalid single item pml codes.
(re.compile(r'(?<=[^\\])\\.'), lambda match: ''), (re.compile(r'(?<=[^\\])\\[^\\]'), lambda match: ''),
# Replace \\ with \. # Replace \\ with \.
(re.compile(r'\\\\'), lambda match: '\\'), (re.compile(r'\\\\'), lambda match: '\\'),
@ -78,6 +78,7 @@ def pml_to_html(pml):
return html return html
def footnote_sidebar_to_html(id, pml):
    '''
    Build the HTML fragment for a single footnote/sidebar entry.

    id  : Entry identifier. Used both as the suffix of the element id
          ("sidebar-<id>") and as the visible <dt> label. When it starts
          with a 0x01 character, the first two characters are dropped
          before it is used.
    pml : PML markup for the entry body; converted with pml_to_html.

    Returns the assembled HTML string.
    '''
    # Drop the two-character prefix only when the 0x01 marker is present.
    label = id[2:] if id.startswith('\x01') else id
    body = pml_to_html(pml)
    return '<div id="sidebar-%s"><dt>%s</dt></div><dd>%s</dd>' % (label, label, body)