eReader PDB output: use the proper length for indexes and do not try to add them if they are not available. PML output: cleanup. PML input: read unicode and entity PML tags correctly.

Kovid Goyal 2009-10-12 07:35:27 -06:00
parent 4865f55e40
commit b3ad9f0160
5 changed files with 149 additions and 57 deletions

View File

@@ -75,7 +75,6 @@ class FB2MLizer(object):
output.append(self.fb2mlize_images())
output.append(self.fb2_footer())
output = ''.join(output).replace(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk', self.get_toc())
return output
return u'<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
def fb2_header(self):

View File

@@ -34,7 +34,6 @@ class HeaderRecord(object):
self.has_metadata, = struct.unpack('>H', raw[24:26])
self.footnote_rec, = struct.unpack('>H', raw[28:30])
self.sidebar_rec, = struct.unpack('>H', raw[30:32])
self.bookmark_offset, = struct.unpack('>H', raw[32:34])
self.image_data_offset, = struct.unpack('>H', raw[40:42])
self.metadata_offset, = struct.unpack('>H', raw[44:46])
self.footnote_offset, = struct.unpack('>H', raw[48:50])

View File

@@ -8,6 +8,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
import struct
import zlib
@@ -28,7 +29,7 @@ IDENTITY = 'PNRdPPrs'
# This is an arbitrary number that is small enough to work. The actual maximum
# record size is unknown.
MAX_RECORD_SIZE = 3560
MAX_RECORD_SIZE = 8192
class Writer(FormatWriter):
@@ -37,13 +38,33 @@ class Writer(FormatWriter):
self.log = log
def write_content(self, oeb_book, out_stream, metadata=None):
text, image_hrefs = self._text(oeb_book)
images = self._images(oeb_book.manifest, image_hrefs)
pmlmlizer = PMLMLizer(self.log)
pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
text, text_sizes = self._text(pml)
chapter_index = self._chapter_index(pml)
link_index = self._link_index(pml)
images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs)
metadata = [self._metadata(metadata)]
hr = [self._header_record(len(text), len(chapter_index), len(link_index), len(images))]
hr = [self._header_record(len(text), len(images))]
sections = hr+text+images+metadata+['MeTaInFo\x00']
'''
Record order as generated by Dropbook.
1. eReader Header
2. Compressed text
3. Small font page index
4. Large font page index
5. Chapter index
6. Links index
7. Images
8. (Extrapolation: there should be one more record type here, though what it is has not yet been determined).
9. Metadata
10. Sidebar records
11. Footnote records
12. Text block size record
13. "MeTaInFo\x00" word record
'''
sections = hr+text+chapter_index+link_index+images+metadata+[text_sizes]+['MeTaInFo\x00']
lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections]
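(Editorial aside, not part of the commit: the offsets written into the eReader header follow directly from the Dropbook record order in the docstring above, since each index occupies a contiguous run of records after the compressed text. A minimal sketch of that arithmetic, mirroring _header_record further down and assuming at least one image is present; the function name is illustrative only.)

def layout_offsets(text_count, chapter_count, link_count, image_count):
    # Record 0 is the eReader header, so the compressed text occupies
    # records 1..text_count and everything else follows in Dropbook order.
    non_text_offset = text_count + 1
    chapter_offset = non_text_offset                  # chapter index records
    link_offset = chapter_offset + chapter_count      # link index records
    image_offset = link_offset + link_count           # image records
    metadata_offset = image_offset + image_count      # single metadata record
    last_data_offset = metadata_offset + 1            # text-sizes record
    return (non_text_offset, chapter_offset, link_offset,
            image_offset, metadata_offset, last_data_offset)

# Example: 4 text records, 2 chapters, 1 link, 3 images
assert layout_offsets(4, 2, 1, 3) == (5, 5, 7, 8, 11, 12)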
@@ -57,17 +78,74 @@ class Writer(FormatWriter):
else:
out_stream.write(item)
def _text(self, oeb_book):
pmlmlizer = PMLMLizer(self.log)
pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
def _text(self, pml):
pml_pages = []
for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]))
text_sizes = ''
index = 0
while index < len(pml):
'''
Split on the space character closest to MAX_RECORD_SIZE when possible.
'''
split = pml.rfind(' ', index, MAX_RECORD_SIZE)
if split == -1:
len_end = len(pml[index:])
if len_end > MAX_RECORD_SIZE:
split = MAX_RECORD_SIZE
else:
split = len_end
if split == 0:
split = 1
pml_pages.append(zlib.compress(pml[index:index+split]))
text_sizes += struct.pack('>H', split)
index += split
return pml_pages, pmlmlizer.image_hrefs
return pml_pages, text_sizes
def _index_item(self, mo):
index = ''
if 'text' in mo.groupdict().keys():
index += struct.pack('>L', mo.start())
text = mo.group('text')
# Strip all PML tags from text
text = re.sub(r'\\U[0-9a-z]{4}', '', text)
text = re.sub(r'\\a\d{3}', '', text)
text = re.sub(r'\\.', '', text)
# Add appropriate spacing to denote the various levels of headings
if 'val' in mo.groupdict().keys():
text = '%s%s' % (' ' * 4 * int(mo.group('val')), text)
index += text
index += '\x00'
return index
def _chapter_index(self, pml):
chapter_marks = [
r'(?s)\\x(?P<text>.+?)\\x',
r'(?s)\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]',
r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"',
]
index = []
for chapter_mark in chapter_marks:
for mo in re.finditer(chapter_mark, pml):
index.append(self._index_item(mo))
return index
def _link_index(self, pml):
index = []
for mo in re.finditer(r'(?s)\\Q="(?P<text>.+?)"', pml):
index.append(self._index_item(mo))
return index
def _images(self, manifest, image_hrefs):
'''
Image format.
0-4 : 'PNG '. There must be a space after PNG.
4-36 : Image name. Must be exactly 32 bytes long. Pad with \x00 for names shorter than 32 bytes
36-58 : Unknown.
58-60 : Width.
60-62 : Height.
62-...: Raw image data in 8 bit PNG format.
'''
images = []
for item in manifest:
@@ -82,6 +160,8 @@ class Writer(FormatWriter):
header = 'PNG '
header += image_hrefs[item.href].ljust(32, '\x00')[:32]
header = header.ljust(58, '\x00')
header += struct.pack('>HH', im.size[0], im.size[1])
header = header.ljust(62, '\x00')
if len(data) + len(header) < 65505:
@@ -121,52 +201,60 @@ class Writer(FormatWriter):
return '%s\x00%s\x00%s\x00%s\x00%s\x00' % (title, author, copyright, publisher, isbn)
def _header_record(self, text_items, image_items):
def _header_record(self, text_count, chapter_count, link_count, image_count):
'''
text_items = the number of text pages
image_items = the number of images
text_count = the number of text pages
image_count = the number of images
'''
version = 10 # Zlib compression
non_text_offset = text_items + 1
compression = 10 # zlib compression.
non_text_offset = text_count + 1
if image_items > 0:
image_data_offset = text_items + 1
meta_data_offset = image_data_offset + image_items
chapter_offset = non_text_offset
link_offset = chapter_offset + chapter_count
if image_count > 0:
image_data_offset = link_offset + link_count
meta_data_offset = image_data_offset + image_count
last_data_offset = meta_data_offset + 1
else:
meta_data_offset = text_items + 1
meta_data_offset = link_offset + link_count
last_data_offset = meta_data_offset + 1
image_data_offset = last_data_offset
if chapter_count == 0:
chapter_offset = last_data_offset
if link_count == 0:
link_offset = last_data_offset
record = ''
record += struct.pack('>H', version) # [0:2] # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
record += struct.pack('>H', 0) # [2:4]
record += struct.pack('>H', 0) # [4:6]
record += struct.pack('>H', compression) # [0:2] # Compression. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
record += struct.pack('>H', 0) # [2:4] # Unknown.
record += struct.pack('>H', 0) # [4:6] # Unknown.
record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC. Somehow represents the cp1252 encoding of the text
record += struct.pack('>H', 0) # [8:10]
record += struct.pack('>H', 0) # [10:12]
record += struct.pack('>H', non_text_offset) # [12:14] # non_text_offset
record += struct.pack('>H', 0) # [14:16]
record += struct.pack('>H', 0) # [16:18]
record += struct.pack('>H', 0) # [18:20]
record += struct.pack('>H', image_items) # [20:22] # Number of images
record += struct.pack('>H', 0) # [22:24]
record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not
record += struct.pack('>H', 0) # [26:28]
record += struct.pack('>H', 0) # [28:30] # footnote_rec
record += struct.pack('>H', 0) # [30:32] # sidebar_rec
record += struct.pack('>H', last_data_offset) # [32:34] # bookmark_offset
record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC
record += struct.pack('>H', 0) # [36:38]
record += struct.pack('>H', 0) # [38:40]
record += struct.pack('>H', image_data_offset) # [40:42] # image_data_offset. This will be the last data offset if there are no images
record += struct.pack('>H', 0) # [42:44]
record += struct.pack('>H', meta_data_offset) # [44:46] # meta_data_offset. This will be the last data offset if there are no images
record += struct.pack('>H', 0) # [46:48]
record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset. This will be the last data offset if there are no images
record += struct.pack('>H', last_data_offset) # [50:52] # sidebar_offset. This will be the last data offset if there are no images
record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset
record += struct.pack('>H', 0) # [8:10] # Number of small font pages. 0 if page index is not built.
record += struct.pack('>H', 0) # [10:12] # Number of large font pages. 0 if page index is not built.
record += struct.pack('>H', non_text_offset) # [12:14] # Non-Text record start.
record += struct.pack('>H', chapter_count) # [14:16] # Number of chapter index records.
record += struct.pack('>H', 0) # [16:18] # Number of small font page index records.
record += struct.pack('>H', 0) # [18:20] # Number of large font page index records.
record += struct.pack('>H', image_count) # [20:22] # Number of images.
record += struct.pack('>H', link_count) # [22:24] # Number of links.
record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not.
record += struct.pack('>H', 0) # [26:28] # Unknown.
record += struct.pack('>H', 0) # [28:30] # Number of Footnotes.
record += struct.pack('>H', 0) # [30:32] # Number of Sidebars.
record += struct.pack('>H', chapter_offset) # [32:34] # Chapter index offset.
record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC.
record += struct.pack('>H', last_data_offset) # [36:38] # Small font page offset. This will be the last data offset if there are none.
record += struct.pack('>H', last_data_offset) # [38:40] # Large font page offset. This will be the last data offset if there are none.
record += struct.pack('>H', image_data_offset) # [40:42] # Image offset. This will be the last data offset if there are none.
record += struct.pack('>H', link_offset) # [42:44] # Links offset. This will be the last data offset if there are none.
record += struct.pack('>H', meta_data_offset) # [44:46] # Metadata offset. This will be the last data offset if there are none.
record += struct.pack('>H', 0) # [46:48] # Unknown.
record += struct.pack('>H', last_data_offset) # [48:50] # Footnote offset. This will be the last data offset if there are none.
record += struct.pack('>H', last_data_offset) # [50:52] # Sidebar offset. This will be the last data offset if there are none.
record += struct.pack('>H', last_data_offset) # [52:54] # Last data offset.
for i in range(54, 132, 2):
record += struct.pack('>H', 0) # [54:132]
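(Editorial aside: the image-record layout described in the _images docstring above can be packed as shown below; a minimal sketch in the same Python 2 byte-string style as the writer, with an illustrative helper name.)

import struct

def pack_image_header(name, width, height):
    # 0-4   : 'PNG ' (the trailing space is required)
    # 4-36  : image name, exactly 32 bytes, padded with \x00
    # 36-58 : unknown, left as \x00
    # 58-62 : width and height as big-endian unsigned shorts
    header = 'PNG '
    header += name.ljust(32, '\x00')[:32]
    header = header.ljust(58, '\x00')
    header += struct.pack('>HH', width, height)
    return header.ljust(62, '\x00')

# The raw 8-bit PNG data follows this 62 byte header, and the whole
# record must stay under the 65505 byte limit checked by the writer.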

View File

@@ -18,10 +18,10 @@ PML_HTML_RULES = [
(re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', re.DOTALL), lambda match: '<h%s style="page-break-before: always;">%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''),
(re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry
(re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<div style="text-align: center; display: block; margin: auto;">%s</div>' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<div style="text-align: right; display: block;">%s</div>' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<span style="text-align: center; display: block; margin: auto;">%s</span>' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<span style="text-align: right; display: block;">%s</span>' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\i(?P<text>.*?)\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<div style="text-decoration: underline;">%s</div>' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<span style="text-decoration: underline;">%s</span>' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\o(?P<text>.*?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text') if match.group('text') else ''),
@@ -35,8 +35,8 @@ PML_HTML_RULES = [
(re.compile(r'\\Sp(?P<text>.*?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\Sb(?P<text>.*?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<span style="font-size: 50%%">%s</span>' % match.group('text').upper() if match.group('text') else ''),
(re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
(re.compile(r'\\U(?P<num>\d\d\d\d)'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
(re.compile(r'\\a(?P<num>\d{3})'), lambda match: '&#%s;' % match.group('num')),
(re.compile(r'\\U(?P<num>[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
(re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
(re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.*?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
(re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')),
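(Editorial sketch, not part of the commit: the two corrected rules above distinguish the entity tag \addd, which carries three decimal digits and becomes an HTML character entity, from the unicode tag \Uxxxx, which carries four lowercase hex digits naming a code point. A stand-alone Python 2 illustration with hypothetical helper names; the converter itself does this through my_unichr inside PML_HTML_RULES.)

import re

def pml_entity_to_html(pml):
    # \a233 -> &#233; (decimal character entity)
    return re.sub(r'\\a(?P<num>\d{3})', lambda m: '&#%s;' % m.group('num'), pml)

def pml_unicode_to_text(pml):
    # \U00e9 -> u'\xe9' (hexadecimal code point)
    return re.sub(r'\\U(?P<num>[0-9a-f]{4})',
                  lambda m: unichr(int(m.group('num'), 16)), pml)

print pml_entity_to_html(r'caf\a233')         # caf&#233;
print repr(pml_unicode_to_text(r'caf\U00e9')) # u'caf\xe9'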
@@ -64,7 +64,7 @@ PML_HTML_RULES = [
(re.compile(r'(?<=[^\\])\\Sp'), lambda match: ''),
(re.compile(r'(?<=[^\\])\\Sb'), lambda match: ''),
# Remove invalid single item pml codes.
(re.compile(r'(?<=[^\\])\\.'), lambda match: ''),
(re.compile(r'(?<=[^\\])\\[^\\]'), lambda match: ''),
# Replace \\ with \.
(re.compile(r'\\\\'), lambda match: '\\'),
@@ -78,6 +78,7 @@ def pml_to_html(pml):
return html
def footnote_sidebar_to_html(id, pml):
if id.startswith('\x01'):
id = id[2:]
html = '<div id="sidebar-%s"><dt>%s</dt></div><dd>%s</dd>' % (id, id, pml_to_html(pml))
return html

View File

@@ -154,10 +154,15 @@ class PMLMLizer(object):
for unused in anchors.difference(links):
text = text.replace('\\Q="%s"' % unused, '')
# Turn all html entities into unicode. This should not be necessary as
# lxml should have already done this but we want to be sure it happens.
for entity in set(re.findall('&.+?;', text)):
mo = re.search('(%s)' % entity[1:-1], text)
text = text.replace(entity, entity_to_unicode(mo))
# Turn all non-ASCII characters into their PML hex equivalent
text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text)
return text
def dump_text(self, elem, stylizer, page, tag_stack=[]):
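(Editorial sketch: the last hunk is the output-side counterpart of the input fix. HTML entities are resolved to Unicode, and every character outside the ASCII range is then re-encoded as a PML \Uxxxx tag. A minimal stand-alone illustration of that final substitution, with an illustrative function name.)

import re

def unicode_to_pml(text):
    # Replace each non-ASCII character with a \Uxxxx tag carrying its
    # code point as four lowercase hex digits.
    return re.sub(u'[^\x00-\x7f]',
                  lambda m: '\\U%04x' % ord(m.group()), text)

# unicode_to_pml(u'caf\xe9') == u'caf\\U00e9'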