From 4b2f26f123b5d0ee0172c4960e3442adedeecb07 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sun, 11 Oct 2009 08:48:07 -0400
Subject: [PATCH] PML input cleanup. Generate chapter and link index with
 eReader PDB output.

---
 src/calibre/ebooks/pdb/ereader/writer.py | 93 ++++++++++++++++++------
 src/calibre/ebooks/pml/pmlconverter.py   |  6 +-
 2 files changed, 74 insertions(+), 25 deletions(-)

diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py
index 8a88c6a689..1a172ea07d 100644
--- a/src/calibre/ebooks/pdb/ereader/writer.py
+++ b/src/calibre/ebooks/pdb/ereader/writer.py
@@ -8,6 +8,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, John Schember '
 __docformat__ = 'restructuredtext en'
 
+import re
 import struct
 import zlib
 
@@ -37,10 +38,15 @@ class Writer(FormatWriter):
         self.log = log
 
     def write_content(self, oeb_book, out_stream, metadata=None):
-        text, image_hrefs, text_sizes = self._text(oeb_book)
-        images = self._images(oeb_book.manifest, image_hrefs)
+        pmlmlizer = PMLMLizer(self.log)
+        pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
+
+        text, text_sizes = self._text(pml)
+        chapter_index = self._chapter_index(pml)
+        link_index = self._link_index(pml)
+        images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs)
         metadata = [self._metadata(metadata)]
-        hr = [self._header_record(len(text), len(images))]
+        hr = [self._header_record(len(text), len(chapter_index.split('\x00')), len(link_index.split('\x00')), len(images))]
 
         '''
         Record order as generated by Dropbook.
@@ -58,7 +64,7 @@ class Writer(FormatWriter):
         12. Text block size record
         13. "MeTaInFo\x00" word record
         '''
-        sections = hr+text+images+metadata+[text_sizes]+['MeTaInFo\x00']
+        sections = hr+text+[chapter_index]+[link_index]+images+metadata+[text_sizes]+['MeTaInFo\x00']
 
         lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections]
 
@@ -72,10 +78,7 @@ class Writer(FormatWriter):
             else:
                 out_stream.write(item)
 
-    def _text(self, oeb_book):
-        pmlmlizer = PMLMLizer(self.log)
-        pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
-
+    def _text(self, pml):
         pml_pages = []
         text_sizes = ''
         index = 0
@@ -96,7 +99,38 @@ class Writer(FormatWriter):
             text_sizes += struct.pack('>H', split)
             index += split
 
-        return pml_pages, pmlmlizer.image_hrefs, text_sizes
+        return pml_pages, text_sizes
+
+    def _index_item(self, mo):
+        index = ''
+        if 'text' in mo.groupdict().keys():
+            index += struct.pack('>L', mo.start('text'))
+            # Strip all PML tags from text
+            text = re.sub(r'[^\\]\\[^\\]', '', mo.group('text'))
+            text = re.sub(r'\\\\', r'\\', text)
+            if 'val' in mo.groupdict().keys():
+                text = '%s%s' % ('\x20' * 4 * int(mo.group('val')), text)
+            index += text
+            index += '\x00'
+        return index
+
+    def _chapter_index(self, pml):
+        chapter_marks = [
+            r'(?s)\\x(?P<text>.+?)\\x',
+            r'(?s)\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]',
+            r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"',
+        ]
+        index = ''
+        for chapter_mark in chapter_marks:
+            for mo in re.finditer(chapter_mark, pml):
+                index += self._index_item(mo)
+        return index
+
+    def _link_index(self, pml):
+        index = ''
+        for mo in re.finditer(r'(?s)\\Q="(?P<text>.+?)"', pml):
+            index += self._index_item(mo)
+        return index
 
     def _images(self, manifest, image_hrefs):
         '''
@@ -164,23 +198,38 @@ class Writer(FormatWriter):
         return '%s\x00%s\x00%s\x00%s\x00%s\x00' % (title, author, copyright, publisher, isbn)
 
-    def _header_record(self, text_items, image_items):
+    def _header_record(self, text_count, chapter_count, link_count, image_count):
         '''
-        text_items = the number of text pages
-        image_items = the number of images
+        text_count = the number of text pages
+        image_count = the number of images
         '''
 
         compression = 10 # zlib compression.
-        non_text_offset = text_items + 1
+        non_text_offset = text_count + 1
 
-        if image_items > 0:
-            image_data_offset = text_items + 1
-            meta_data_offset = image_data_offset + image_items
+        if chapter_count > 0:
+            chapter_offset = text_count + 1
+        else:
+            chapter_offset = text_count
+
+        if link_count > 0:
+            link_offset = chapter_offset + 1
+        else:
+            link_offset = chapter_offset
+
+        if image_count > 0:
+            image_data_offset = link_offset + 1
+            meta_data_offset = image_data_offset + image_count
             last_data_offset = meta_data_offset + 1
         else:
-            meta_data_offset = text_items + 1
+            meta_data_offset = link_offset + 1
             last_data_offset = meta_data_offset + 1
             image_data_offset = last_data_offset
 
+        if chapter_count <= 0:
+            chapter_offset = last_data_offset
+        if link_count <= 0:
+            link_offset = last_data_offset
+
         record = ''
 
         record += struct.pack('>H', compression) # [0:2] # Compression. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
@@ -190,21 +239,21 @@ class Writer(FormatWriter):
         record += struct.pack('>H', 0) # [8:10] # Number of small font pages. 0 if page index is not built.
         record += struct.pack('>H', 0) # [10:12] # Number of large font pages. 0 if page index is not built.
         record += struct.pack('>H', non_text_offset) # [12:14] # Non-Text record start.
-        record += struct.pack('>H', 0) # [14:16] # Number of chapter index records.
+        record += struct.pack('>H', chapter_count) # [14:16] # Number of chapter index records.
         record += struct.pack('>H', 0) # [16:18] # Number of small font page index records.
         record += struct.pack('>H', 0) # [18:20] # Number of large font page index records.
-        record += struct.pack('>H', image_items) # [20:22] # Number of images.
-        record += struct.pack('>H', 0) # [22:24] # Number of links.
+        record += struct.pack('>H', image_count) # [20:22] # Number of images.
+        record += struct.pack('>H', link_count) # [22:24] # Number of links.
         record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not.
         record += struct.pack('>H', 0) # [26:28] # Unknown.
         record += struct.pack('>H', 0) # [28:30] # Number of Footnotes.
         record += struct.pack('>H', 0) # [30:32] # Number of Sidebars.
-        record += struct.pack('>H', last_data_offset) # [32:34] # Chapter index offset.
+        record += struct.pack('>H', chapter_offset) # [32:34] # Chapter index offset.
         record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC.
         record += struct.pack('>H', last_data_offset) # [36:38] # Small font page offset. This will be the last data offset if there are none.
         record += struct.pack('>H', last_data_offset) # [38:40] # Large font page offset. This will be the last data offset if there are none.
         record += struct.pack('>H', image_data_offset) # [40:42] # Image offset. This will be the last data offset if there are none.
-        record += struct.pack('>H', image_data_offset) # [42:44] # Links offset. This will be the last data offset if there are none.
+        record += struct.pack('>H', link_offset) # [42:44] # Links offset. This will be the last data offset if there are none.
         record += struct.pack('>H', meta_data_offset) # [44:46] # Metadata offset. This will be the last data offset if there are none.
         record += struct.pack('>H', 0) # [46:48] # Unknown.
         record += struct.pack('>H', last_data_offset) # [48:50] # Footnote offset. This will be the last data offset if there are none.
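
[Reviewer's note, not part of the patch] For readers unfamiliar with the eReader header record that _header_record() assembles, the sketch below shows how the fields this patch now fills in could be read back with struct.unpack, using the byte offsets documented in the comments above ([14:16] chapter index record count, [22:24] link count, [32:34] chapter index offset, [42:44] links offset). read_ereader_header() is a hypothetical helper written only for illustration; it is not part of calibre.

    import struct

    def read_ereader_header(record):
        # record is the raw header record (PDB record 0) as built by _header_record().
        header = {}
        header['compression'] = struct.unpack('>H', record[0:2])[0]        # 10 = zlib
        header['non_text_offset'] = struct.unpack('>H', record[12:14])[0]
        header['chapter_count'] = struct.unpack('>H', record[14:16])[0]
        header['image_count'] = struct.unpack('>H', record[20:22])[0]
        header['link_count'] = struct.unpack('>H', record[22:24])[0]
        header['chapter_offset'] = struct.unpack('>H', record[32:34])[0]
        header['image_data_offset'] = struct.unpack('>H', record[40:42])[0]
        header['link_offset'] = struct.unpack('>H', record[42:44])[0]
        header['meta_data_offset'] = struct.unpack('>H', record[44:46])[0]
        return header
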
diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index ca7721350c..3e1b3b4828 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -18,10 +18,10 @@ PML_HTML_RULES = [
     (re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', re.DOTALL), lambda match: '<h%s>%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''),
     (re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry
-    (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<div style="text-align: center; margin: auto;">%s</div>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<div style="text-align: right;">%s</div>' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<span style="display: block; text-align: center; margin: auto;">%s</span>' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<span style="display: block; text-align: right;">%s</span>' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\i(?P<text>.*?)\\i', re.DOTALL), lambda match: '<span style="font-style: italic;">%s</span>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<div style="text-decoration: underline;">%s</div>' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<span style="text-decoration: underline;">%s</span>' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\o(?P<text>.*?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text') if match.group('text') else ''),
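
[Reviewer's note, not part of the patch] Each entry written by _index_item() in the writer.py hunk above is a 4-byte big-endian offset into the PML text (struct.pack('>L', ...)), followed by the stripped heading or link text and a NUL terminator; _chapter_index() and _link_index() simply concatenate those entries into one record. The helper below, parse_index(), is hypothetical and only illustrates that layout by decoding such a record back into (offset, text) pairs.

    import struct

    def parse_index(record):
        # Decode a chapter or link index record into (offset, text) tuples.
        entries = []
        pos = 0
        while pos + 4 < len(record):
            offset = struct.unpack('>L', record[pos:pos + 4])[0]
            end = record.index('\x00', pos + 4)  # each entry is NUL terminated
            entries.append((offset, record[pos + 4:end]))
            pos = end + 1
        return entries

For a PML source containing '\\xChapter One\\x', the chapter index record would decode to a single entry whose offset points at 'Chapter One' in the text.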