From b3ad9f0160839ecc1115a038608128f594261ee3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 12 Oct 2009 07:35:27 -0600
Subject: [PATCH] eReader PDB output: proper length of indexes and do not try to add them if they are not available. PML Output: cleanup. PML Input: read unicode and entity PML tags correctly.

---
 src/calibre/ebooks/fb2/fb2ml.py             |   1 -
 src/calibre/ebooks/pdb/ereader/reader132.py |   1 -
 src/calibre/ebooks/pdb/ereader/writer.py    | 184 +++++++++++++++-----
 src/calibre/ebooks/pml/pmlconverter.py      |  15 +-
 src/calibre/ebooks/pml/pmlml.py             |   5 +
 5 files changed, 149 insertions(+), 57 deletions(-)

diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index ff914568d2..aaf8361b99 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -75,7 +75,6 @@ class FB2MLizer(object):
         output.append(self.fb2mlize_images())
         output.append(self.fb2_footer())
         output = ''.join(output).replace(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk', self.get_toc())
-        return output
         return u'<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
 
     def fb2_header(self):
diff --git a/src/calibre/ebooks/pdb/ereader/reader132.py b/src/calibre/ebooks/pdb/ereader/reader132.py
index 98dbe13790..49fdfb8980 100644
--- a/src/calibre/ebooks/pdb/ereader/reader132.py
+++ b/src/calibre/ebooks/pdb/ereader/reader132.py
@@ -34,7 +34,6 @@ class HeaderRecord(object):
         self.has_metadata, = struct.unpack('>H', raw[24:26])
         self.footnote_rec, = struct.unpack('>H', raw[28:30])
         self.sidebar_rec, = struct.unpack('>H', raw[30:32])
-        self.bookmark_offset, = struct.unpack('>H', raw[32:34])
         self.image_data_offset, = struct.unpack('>H', raw[40:42])
         self.metadata_offset, = struct.unpack('>H', raw[44:46])
         self.footnote_offset, = struct.unpack('>H', raw[48:50])
diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py
index 2f4e3bf16f..263f6964bf 100644
--- a/src/calibre/ebooks/pdb/ereader/writer.py
+++ b/src/calibre/ebooks/pdb/ereader/writer.py
@@ -8,6 +8,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
+import re
 import struct
 import zlib
 
@@ -28,7 +29,7 @@ IDENTITY = 'PNRdPPrs'
 
 # This is an arbitrary number that is small enough to work. The actual maximum
 # record size is unknown.
-MAX_RECORD_SIZE = 3560
+MAX_RECORD_SIZE = 8192
 
 class Writer(FormatWriter):
 
@@ -37,13 +38,33 @@ class Writer(FormatWriter):
         self.log = log
 
     def write_content(self, oeb_book, out_stream, metadata=None):
-        text, image_hrefs = self._text(oeb_book)
-        images = self._images(oeb_book.manifest, image_hrefs)
+        pmlmlizer = PMLMLizer(self.log)
+        pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
+
+        text, text_sizes = self._text(pml)
+        chapter_index = self._chapter_index(pml)
+        link_index = self._link_index(pml)
+        images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs)
         metadata = [self._metadata(metadata)]
+        hr = [self._header_record(len(text), len(chapter_index), len(link_index), len(images))]
 
-        hr = [self._header_record(len(text), len(images))]
-
-        sections = hr+text+images+metadata+['MeTaInFo\x00']
+        '''
+        Record order as generated by Dropbook.
+            1. eReader Header
+            2. Compressed text
+            3. Small font page index
+            4. Large font page index
+            5. Chapter index
+            6. Links index
+            7. Images
+            8. (Extrapolation: there should be one more record type here though yet uncovered what it might be).
+            9. Metadata
+            10. Sidebar records
+            11. Footnote records
+            12. Text block size record
+            13. "MeTaInFo\x00" word record
+        '''
+        sections = hr+text+chapter_index+link_index+images+metadata+[text_sizes]+['MeTaInFo\x00']
 
         lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections]
 
@@ -57,17 +78,74 @@ class Writer(FormatWriter):
             else:
                 out_stream.write(item)
 
-    def _text(self, oeb_book):
-        pmlmlizer = PMLMLizer(self.log)
-        pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
-
+    def _text(self, pml):
         pml_pages = []
-        for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
-            pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]))
+        text_sizes = ''
+        index = 0
+        while index < len(pml):
+            '''
+            Split on the space character closest to MAX_RECORD_SIZE when possible.
+            '''
+            split = pml.rfind(' ', index, MAX_RECORD_SIZE)
+            if split == -1:
+                len_end = len(pml[index:])
+                if len_end > MAX_RECORD_SIZE:
+                    split = MAX_RECORD_SIZE
+                else:
+                    split = len_end
+            if split == 0:
+                split = 1
+            pml_pages.append(zlib.compress(pml[index:index+split]))
+            text_sizes += struct.pack('>H', split)
+            index += split
 
-        return pml_pages, pmlmlizer.image_hrefs
+        return pml_pages, text_sizes
+
+    def _index_item(self, mo):
+        index = ''
+        if 'text' in mo.groupdict().keys():
+            index += struct.pack('>L', mo.start())
+            text = mo.group('text')
+            # Strip all PML tags from text
+            text = re.sub(r'\\U[0-9a-z]{4}', '', text)
+            text = re.sub(r'\\a\d{3}', '', text)
+            text = re.sub(r'\\.', '', text)
+            # Add appropriate spacing to denote the various levels of headings
+            if 'val' in mo.groupdict().keys():
+                text = '%s%s' % (' ' * 4 * int(mo.group('val')), text)
+            index += text
+            index += '\x00'
+        return index
+
+    def _chapter_index(self, pml):
+        chapter_marks = [
+            r'(?s)\\x(?P<text>.+?)\\x',
+            r'(?s)\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]',
+            r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"',
+        ]
+        index = []
+        for chapter_mark in chapter_marks:
+            for mo in re.finditer(chapter_mark, pml):
+                index.append(self._index_item(mo))
+        return index
+
+    def _link_index(self, pml):
+        index = []
+        for mo in re.finditer(r'(?s)\\Q="(?P<text>.+?)"', pml):
+            index.append(self._index_item(mo))
+        return index
 
     def _images(self, manifest, image_hrefs):
+        '''
+        Image format.
+
+        0-4   : 'PNG '. There must be a space after PNG.
+        4-36  : Image name. Must be exactly 32 bytes long. Pad with \x00 for names shorter than 32 bytes
+        36-58 : Unknown.
+        58-60 : Width.
+        60-62 : Height.
+        62-...: Raw image data in 8 bit PNG format.
+        '''
         images = []
 
         for item in manifest:
@@ -82,6 +160,8 @@
 
                 header = 'PNG '
                 header += image_hrefs[item.href].ljust(32, '\x00')[:32]
+                header = header.ljust(58, '\x00')
+                header += struct.pack('>HH', im.size[0], im.size[1])
                 header = header.ljust(62, '\x00')
 
                 if len(data) + len(header) < 65505:
@@ -121,52 +201,60 @@
 
         return '%s\x00%s\x00%s\x00%s\x00%s\x00' % (title, author, copyright, publisher, isbn)
 
-    def _header_record(self, text_items, image_items):
+    def _header_record(self, text_count, chapter_count, link_count, image_count):
         '''
-        text_items = the number of text pages
-        image_items = the number of images
+        text_count = the number of text pages
+        image_count = the number of images
         '''
-        version = 10 # Zlib compression
-        non_text_offset = text_items + 1
+        compression = 10 # zlib compression.
+        non_text_offset = text_count + 1
 
-        if image_items > 0:
-            image_data_offset = text_items + 1
-            meta_data_offset = image_data_offset + image_items
+        chapter_offset = non_text_offset
+        link_offset = chapter_offset + chapter_count
+
+        if image_count > 0:
+            image_data_offset = link_offset + link_count
+            meta_data_offset = image_data_offset + image_count
             last_data_offset = meta_data_offset + 1
         else:
-            meta_data_offset = text_items + 1
+            meta_data_offset = link_offset + link_count
             last_data_offset = meta_data_offset + 1
             image_data_offset = last_data_offset
 
+        if chapter_count == 0:
+            chapter_offset = last_data_offset
+        if link_count == 0:
+            link_offset = last_data_offset
+
         record = ''
 
-        record += struct.pack('>H', version)           # [0:2]   # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
-        record += struct.pack('>H', 0)                 # [2:4]
-        record += struct.pack('>H', 0)                 # [4:6]
+        record += struct.pack('>H', compression)       # [0:2]   # Compression. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
+        record += struct.pack('>H', 0)                 # [2:4]   # Unknown.
+        record += struct.pack('>H', 0)                 # [4:6]   # Unknown.
         record += struct.pack('>H', 25152)             # [6:8]   # 25152 is MAGIC. Somehow represents the cp1252 encoding of the text
-        record += struct.pack('>H', 0)                 # [8:10]
-        record += struct.pack('>H', 0)                 # [10:12]
-        record += struct.pack('>H', non_text_offset)   # [12:14] # non_text_offset
-        record += struct.pack('>H', 0)                 # [14:16]
-        record += struct.pack('>H', 0)                 # [16:18]
-        record += struct.pack('>H', 0)                 # [18:20]
-        record += struct.pack('>H', image_items)       # [20:22] # Number of images
-        record += struct.pack('>H', 0)                 # [22:24]
-        record += struct.pack('>H', 1)                 # [24:26] # 1 if has metadata, 0 if not
-        record += struct.pack('>H', 0)                 # [26:28]
-        record += struct.pack('>H', 0)                 # [28:30] # footnote_rec
-        record += struct.pack('>H', 0)                 # [30:32] # sidebar_rec
-        record += struct.pack('>H', last_data_offset)  # [32:34] # bookmark_offset
-        record += struct.pack('>H', 2560)              # [34:36] # 2560 is MAGIC
-        record += struct.pack('>H', 0)                 # [36:38]
-        record += struct.pack('>H', 0)                 # [38:40]
-        record += struct.pack('>H', image_data_offset) # [40:42] # image_data_offset. This will be the last data offset if there are no images
-        record += struct.pack('>H', 0)                 # [42:44]
-        record += struct.pack('>H', meta_data_offset)  # [44:46] # meta_data_offset. This will be the last data offset if there are no images
-        record += struct.pack('>H', 0)                 # [46:48]
-        record += struct.pack('>H', last_data_offset)  # [48:50] # footnote_offset. This will be the last data offset if there are no images
-        record += struct.pack('>H', last_data_offset)  # [50:52] # sidebar_offset. This will be the last data offset if there are no images
-        record += struct.pack('>H', last_data_offset)  # [52:54] # last_data_offset
+        record += struct.pack('>H', 0)                 # [8:10]  # Number of small font pages. 0 if page index is not built.
+        record += struct.pack('>H', 0)                 # [10:12] # Number of large font pages. 0 if page index is not built.
+        record += struct.pack('>H', non_text_offset)   # [12:14] # Non-Text record start.
+        record += struct.pack('>H', chapter_count)     # [14:16] # Number of chapter index records.
+        record += struct.pack('>H', 0)                 # [16:18] # Number of small font page index records.
+        record += struct.pack('>H', 0)                 # [18:20] # Number of large font page index records.
+        record += struct.pack('>H', image_count)       # [20:22] # Number of images.
+        record += struct.pack('>H', link_count)        # [22:24] # Number of links.
+        record += struct.pack('>H', 1)                 # [24:26] # 1 if has metadata, 0 if not.
+        record += struct.pack('>H', 0)                 # [26:28] # Unknown.
+        record += struct.pack('>H', 0)                 # [28:30] # Number of Footnotes.
+        record += struct.pack('>H', 0)                 # [30:32] # Number of Sidebars.
+        record += struct.pack('>H', chapter_offset)    # [32:34] # Chapter index offset.
+        record += struct.pack('>H', 2560)              # [34:36] # 2560 is MAGIC.
+        record += struct.pack('>H', last_data_offset)  # [36:38] # Small font page offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', last_data_offset)  # [38:40] # Large font page offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', image_data_offset) # [40:42] # Image offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', link_offset)       # [42:44] # Links offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', meta_data_offset)  # [44:46] # Metadata offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', 0)                 # [46:48] # Unknown.
+        record += struct.pack('>H', last_data_offset)  # [48:50] # Footnote offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', last_data_offset)  # [50:52] # Sidebar offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', last_data_offset)  # [52:54] # Last data offset.
 
         for i in range(54, 132, 2):
             record += struct.pack('>H', 0)             # [54:132]
diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index b4ab238da9..c72a21a5f9 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -18,10 +18,10 @@ PML_HTML_RULES = [
     (re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', re.DOTALL), lambda match: '<h%s>%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''),
     (re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry
-    (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\i(?P<text>.*?)\\i', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\o(?P<text>.*?)\\o', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
@@ -35,8 +35,8 @@ PML_HTML_RULES = [
     (re.compile(r'\\Sp(?P<text>.*?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\Sb(?P<text>.*?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '%s' % match.group('text').upper() if match.group('text') else ''),
-    (re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
-    (re.compile(r'\\U(?P<num>\d\d\d\d)'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
+    (re.compile(r'\\a(?P<num>\d{3})'), lambda match: '&#%s;' % match.group('num')),
+    (re.compile(r'\\U(?P<num>[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
     (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
     (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.*?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
     (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')),
@@ -64,7 +64,7 @@ PML_HTML_RULES = [
     (re.compile(r'(?<=[^\\])\\Sp'), lambda match: ''),
     (re.compile(r'(?<=[^\\])\\Sb'), lambda match: ''),
     # Remove invalid single item pml codes.
-    (re.compile(r'(?<=[^\\])\\.'), lambda match: ''),
+    (re.compile(r'(?<=[^\\])\\[^\\]'), lambda match: ''),
 
     # Replace \\ with \.
     (re.compile(r'\\\\'), lambda match: '\\'),
@@ -78,6 +78,7 @@ def pml_to_html(pml):
     return html
 
 def footnote_sidebar_to_html(id, pml):
+    if id.startswith('\x01'):
+        id = id[2:]
     html = '%s' % (id, id, pml_to_html(pml))
     return html
-
diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
index 2438fd9bef..9582d2bfbb 100644
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -154,10 +154,15 @@ class PMLMLizer(object):
         for unused in anchors.difference(links):
             text = text.replace('\\Q="%s"' % unused, '')
 
+        # Turn all html entities into unicode. This should not be necessary as
+        # lxml should have already done this but we want to be sure it happens.
        for entity in set(re.findall('&.+?;', text)):
             mo = re.search('(%s)' % entity[1:-1], text)
             text = text.replace(entity, entity_to_unicode(mo))
 
+        # Turn all unicode characters into their PML hex equivalent
+        text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text)
+
         return text
 
     def dump_text(self, elem, stylizer, page, tag_stack=[]):
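
For orientation, a minimal, hypothetical sketch (Python 2, matching the code above; it is not part of the patch) of the unicode round trip this change sets up: on the output side pmlml.py rewrites every non-ASCII character as a PML \Uxxxx code, and on the input side pmlconverter.py turns \Uxxxx codes back into characters. The helper names below are invented for illustration; calibre itself uses the PML_HTML_RULES table and my_unichr.

import re

def pml_escape_unicode(text):
    # Output side (as in pmlml.py above): any character outside 0x00-0x7f
    # becomes a \Uxxxx code with four lowercase hex digits.
    return re.sub(u'[^\x00-\x7f]', lambda mo: '\\U%04x' % ord(mo.group()), text)

def pml_unescape_unicode(text):
    # Input side (as in pmlconverter.py above): \U followed by four hex
    # digits is converted back into the corresponding unicode character.
    return re.sub(r'\\U(?P<num>[0-9a-f]{4})',
                  lambda mo: unichr(int(mo.group('num'), 16)), text)

if __name__ == '__main__':
    sample = u'caf\xe9 na\xefve'
    escaped = pml_escape_unicode(sample)    # u'caf\\U00e9 na\\U00efve'
    assert pml_unescape_unicode(escaped) == sample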