mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update eReader PDB code to produce files that are closer to what DropBook produces: set the text size record, write the image size, and handle the 1.5.2 sidebar and footnote changes. Fix PML output stripping the backslash character.
This commit is contained in:
parent
61e8c4222a
commit
552735c41e
@ -34,7 +34,6 @@ class HeaderRecord(object):
|
||||
self.has_metadata, = struct.unpack('>H', raw[24:26])
|
||||
self.footnote_rec, = struct.unpack('>H', raw[28:30])
|
||||
self.sidebar_rec, = struct.unpack('>H', raw[30:32])
|
||||
self.bookmark_offset, = struct.unpack('>H', raw[32:34])
|
||||
self.image_data_offset, = struct.unpack('>H', raw[40:42])
|
||||
self.metadata_offset, = struct.unpack('>H', raw[44:46])
|
||||
self.footnote_offset, = struct.unpack('>H', raw[48:50])
|
||||
|
@ -28,7 +28,7 @@ IDENTITY = 'PNRdPPrs'
|
||||
|
||||
# This is an arbitrary number that is small enough to work. The actual maximum
|
||||
# record size is unknown.
|
||||
MAX_RECORD_SIZE = 3560
|
||||
MAX_RECORD_SIZE = 8192
|
||||
|
||||
class Writer(FormatWriter):
|
||||
|
||||
@ -37,13 +37,28 @@ class Writer(FormatWriter):
|
||||
self.log = log
|
||||
|
||||
def write_content(self, oeb_book, out_stream, metadata=None):
|
||||
text, image_hrefs = self._text(oeb_book)
|
||||
text, image_hrefs, text_sizes = self._text(oeb_book)
|
||||
images = self._images(oeb_book.manifest, image_hrefs)
|
||||
metadata = [self._metadata(metadata)]
|
||||
|
||||
hr = [self._header_record(len(text), len(images))]
|
||||
|
||||
sections = hr+text+images+metadata+['MeTaInFo\x00']
|
||||
'''
|
||||
Record order as generated by DropBook.
|
||||
1. eReader Header
|
||||
2. Compressed text
|
||||
3. Small font page index
|
||||
4. Large font page index
|
||||
5. Chapter index
|
||||
6. Links index
|
||||
7. Images
|
||||
8. (Extrapolation: there should be one more record type here, though it has not yet been discovered what it might be).
|
||||
9. Metadata
|
||||
10. Sidebar records
|
||||
11. Footnote records
|
||||
12. Text block size record
|
||||
13. "MeTaInFo\x00" word record
|
||||
'''
|
||||
sections = hr+text+images+metadata+[text_sizes]+['MeTaInFo\x00']
|
||||
|
||||
lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections]
|
||||
|
||||
@ -62,12 +77,38 @@ class Writer(FormatWriter):
|
||||
pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
|
||||
|
||||
pml_pages = []
|
||||
for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
|
||||
pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]))
|
||||
text_sizes = ''
|
||||
index = 0
|
||||
while index < len(pml):
|
||||
'''
|
||||
Split on the space character closest to MAX_RECORD_SIZE when possible.
|
||||
'''
|
||||
split = pml.rfind(' ', index, MAX_RECORD_SIZE)
|
||||
if split == -1:
|
||||
len_end = len(pml[index:])
|
||||
if len_end > MAX_RECORD_SIZE:
|
||||
split = MAX_RECORD_SIZE
|
||||
else:
|
||||
split = len_end
|
||||
if split == 0:
|
||||
split = 1
|
||||
pml_pages.append(zlib.compress(pml[index:index+split]))
|
||||
text_sizes += struct.pack('>H', split)
|
||||
index += split
|
||||
|
||||
return pml_pages, pmlmlizer.image_hrefs
|
||||
return pml_pages, pmlmlizer.image_hrefs, text_sizes
|
||||
|
||||
def _images(self, manifest, image_hrefs):
|
||||
'''
|
||||
Image format.
|
||||
|
||||
0-4 : 'PNG '. There must be a space after PNG.
|
||||
4-36 : Image name. Must be exactly 32 bytes long. Pad with \x00 for names shorter than 32 bytes
|
||||
36-58 : Unknown.
|
||||
58-60 : Width.
|
||||
60-62 : Height.
|
||||
62-...: Raw image data in 8 bit PNG format.
|
||||
'''
|
||||
images = []
|
||||
|
||||
for item in manifest:
|
||||
@ -82,6 +123,8 @@ class Writer(FormatWriter):
|
||||
|
||||
header = 'PNG '
|
||||
header += image_hrefs[item.href].ljust(32, '\x00')[:32]
|
||||
header = header.ljust(58, '\x00')
|
||||
header += struct.pack('>HH', im.size[0], im.size[1])
|
||||
header = header.ljust(62, '\x00')
|
||||
|
||||
if len(data) + len(header) < 65505:
|
||||
@ -126,7 +169,7 @@ class Writer(FormatWriter):
|
||||
text_items = the number of text pages
|
||||
image_items = the number of images
|
||||
'''
|
||||
version = 10 # Zlib compression
|
||||
compression = 10 # zlib compression.
|
||||
non_text_offset = text_items + 1
|
||||
|
||||
if image_items > 0:
|
||||
@ -140,33 +183,33 @@ class Writer(FormatWriter):
|
||||
|
||||
record = ''
|
||||
|
||||
record += struct.pack('>H', version) # [0:2] # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
|
||||
record += struct.pack('>H', 0) # [2:4]
|
||||
record += struct.pack('>H', 0) # [4:6]
|
||||
record += struct.pack('>H', compression) # [0:2] # Compression. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
|
||||
record += struct.pack('>H', 0) # [2:4] # Unknown.
|
||||
record += struct.pack('>H', 0) # [4:6] # Unknown.
|
||||
record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC. Somehow represents the cp1252 encoding of the text
|
||||
record += struct.pack('>H', 0) # [8:10]
|
||||
record += struct.pack('>H', 0) # [10:12]
|
||||
record += struct.pack('>H', non_text_offset) # [12:14] # non_text_offset
|
||||
record += struct.pack('>H', 0) # [14:16]
|
||||
record += struct.pack('>H', 0) # [16:18]
|
||||
record += struct.pack('>H', 0) # [18:20]
|
||||
record += struct.pack('>H', image_items) # [20:22] # Number of images
|
||||
record += struct.pack('>H', 0) # [22:24]
|
||||
record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not
|
||||
record += struct.pack('>H', 0) # [26:28]
|
||||
record += struct.pack('>H', 0) # [28:30] # footnote_rec
|
||||
record += struct.pack('>H', 0) # [30:32] # sidebar_rec
|
||||
record += struct.pack('>H', last_data_offset) # [32:34] # bookmark_offset
|
||||
record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC
|
||||
record += struct.pack('>H', 0) # [36:38]
|
||||
record += struct.pack('>H', 0) # [38:40]
|
||||
record += struct.pack('>H', image_data_offset) # [40:42] # image_data_offset. This will be the last data offset if there are no images
|
||||
record += struct.pack('>H', 0) # [42:44]
|
||||
record += struct.pack('>H', meta_data_offset) # [44:46] # meta_data_offset. This will be the last data offset if there are no images
|
||||
record += struct.pack('>H', 0) # [46:48]
|
||||
record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset. This will be the last data offset if there are no images
|
||||
record += struct.pack('>H', last_data_offset) # [50:52] # sidebar_offset. This will be the last data offset if there are no images
|
||||
record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset
|
||||
record += struct.pack('>H', 0) # [8:10] # Number of small font pages. 0 if page index is not built.
|
||||
record += struct.pack('>H', 0) # [10:12] # Number of large font pages. 0 if page index is not built.
|
||||
record += struct.pack('>H', non_text_offset) # [12:14] # Non-Text record start.
|
||||
record += struct.pack('>H', 0) # [14:16] # Number of chapter index records.
|
||||
record += struct.pack('>H', 0) # [16:18] # Number of small font page index records.
|
||||
record += struct.pack('>H', 0) # [18:20] # Number of large font page index records.
|
||||
record += struct.pack('>H', image_items) # [20:22] # Number of images.
|
||||
record += struct.pack('>H', 0) # [22:24] # Number of links.
|
||||
record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not.
|
||||
record += struct.pack('>H', 0) # [26:28] # Unknown.
|
||||
record += struct.pack('>H', 0) # [28:30] # Number of Footnotes.
|
||||
record += struct.pack('>H', 0) # [30:32] # Number of Sidebars.
|
||||
record += struct.pack('>H', last_data_offset) # [32:34] # Chapter index offset.
|
||||
record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC.
|
||||
record += struct.pack('>H', last_data_offset) # [36:38] # Small font page offset. This will be the last data offset if there are none.
|
||||
record += struct.pack('>H', last_data_offset) # [38:40] # Large font page offset. This will be the last data offset if there are none.
|
||||
record += struct.pack('>H', image_data_offset) # [40:42] # Image offset. This will be the last data offset if there are none.
|
||||
record += struct.pack('>H', image_data_offset) # [42:44] # Links offset. This will be the last data offset if there are none.
|
||||
record += struct.pack('>H', meta_data_offset) # [44:46] # Metadata offset. This will be the last data offset if there are none.
|
||||
record += struct.pack('>H', 0) # [46:48] # Unknown.
|
||||
record += struct.pack('>H', last_data_offset) # [48:50] # Footnote offset. This will be the last data offset if there are none.
|
||||
record += struct.pack('>H', last_data_offset) # [50:52] # Sidebar offset. This will be the last data offset if there are none.
|
||||
record += struct.pack('>H', last_data_offset) # [52:54] # Last data offset.
|
||||
|
||||
for i in range(54, 132, 2):
|
||||
record += struct.pack('>H', 0) # [54:132]
|
||||
|
@ -64,7 +64,7 @@ PML_HTML_RULES = [
|
||||
(re.compile(r'(?<=[^\\])\\Sp'), lambda match: ''),
|
||||
(re.compile(r'(?<=[^\\])\\Sb'), lambda match: ''),
|
||||
# Remove invalid single item pml codes.
|
||||
(re.compile(r'(?<=[^\\])\\.'), lambda match: ''),
|
||||
(re.compile(r'(?<=[^\\])\\[^\\]'), lambda match: ''),
|
||||
|
||||
# Replace \\ with \.
|
||||
(re.compile(r'\\\\'), lambda match: '\\'),
|
||||
@ -78,6 +78,7 @@ def pml_to_html(pml):
|
||||
return html
|
||||
|
||||
def footnote_sidebar_to_html(id, pml):
|
||||
if id.startswith('\x01'):
|
||||
id = id[2:]
|
||||
html = '<div id="sidebar-%s"><dt>%s</dt></div><dd>%s</dd>' % (id, id, pml_to_html(pml))
|
||||
return html
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user