mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
PML input cleanup. Generate chapter and link index with eReader PDB output.
This commit is contained in:
parent
552735c41e
commit
4b2f26f123
@ -8,6 +8,7 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
import struct
|
||||
import zlib
|
||||
|
||||
@ -37,10 +38,15 @@ class Writer(FormatWriter):
|
||||
self.log = log
|
||||
|
||||
def write_content(self, oeb_book, out_stream, metadata=None):
|
||||
text, image_hrefs, text_sizes = self._text(oeb_book)
|
||||
images = self._images(oeb_book.manifest, image_hrefs)
|
||||
pmlmlizer = PMLMLizer(self.log)
|
||||
pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
|
||||
|
||||
text, text_sizes = self._text(pml)
|
||||
chapter_index = self._chapter_index(pml)
|
||||
link_index = self._link_index(pml)
|
||||
images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs)
|
||||
metadata = [self._metadata(metadata)]
|
||||
hr = [self._header_record(len(text), len(images))]
|
||||
hr = [self._header_record(len(text), len(chapter_index.split('\x00')), len(link_index.split('\x00')), len(images))]
|
||||
|
||||
'''
|
||||
Record order as generated by Dropbook.
|
||||
@ -58,7 +64,7 @@ class Writer(FormatWriter):
|
||||
12. Text block size record
|
||||
13. "MeTaInFo\x00" word record
|
||||
'''
|
||||
sections = hr+text+images+metadata+[text_sizes]+['MeTaInFo\x00']
|
||||
sections = hr+text+[chapter_index]+[link_index]+images+metadata+[text_sizes]+['MeTaInFo\x00']
|
||||
|
||||
lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections]
|
||||
|
||||
@ -72,10 +78,7 @@ class Writer(FormatWriter):
|
||||
else:
|
||||
out_stream.write(item)
|
||||
|
||||
def _text(self, oeb_book):
|
||||
pmlmlizer = PMLMLizer(self.log)
|
||||
pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
|
||||
|
||||
def _text(self, pml):
|
||||
pml_pages = []
|
||||
text_sizes = ''
|
||||
index = 0
|
||||
@ -96,7 +99,38 @@ class Writer(FormatWriter):
|
||||
text_sizes += struct.pack('>H', split)
|
||||
index += split
|
||||
|
||||
return pml_pages, pmlmlizer.image_hrefs, text_sizes
|
||||
return pml_pages, text_sizes
|
||||
|
||||
def _index_item(self, mo):
|
||||
index = ''
|
||||
if 'text' in mo.groupdict().keys():
|
||||
index += struct.pack('>L', mo.start('text'))
|
||||
# Strip all PML tags from text
|
||||
text = re.sub(r'[^\\]\\[^\\]', '', mo.group('text'))
|
||||
text = re.sub(r'\\\\', r'\\', mo.group('text'))
|
||||
if 'val' in mo.groupdict().keys():
|
||||
text = '%s%s' % ('\x20' * 4 * int(mo.group('val')), text)
|
||||
index += text
|
||||
index += '\x00'
|
||||
return index
|
||||
|
||||
def _chapter_index(self, pml):
|
||||
chapter_marks = [
|
||||
r'(?s)\\x(?P<text>.+?)\\x',
|
||||
r'(?s)\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]',
|
||||
r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"',
|
||||
]
|
||||
index = ''
|
||||
for chapter_mark in chapter_marks:
|
||||
for mo in re.finditer(chapter_mark, pml):
|
||||
index += self._index_item(mo)
|
||||
return index
|
||||
|
||||
def _link_index(self, pml):
|
||||
index = ''
|
||||
for mo in re.finditer(r'(?s)\\Q="(?P<text>.+?)"', pml):
|
||||
index += self._index_item(mo)
|
||||
return index
|
||||
|
||||
def _images(self, manifest, image_hrefs):
|
||||
'''
|
||||
@ -164,23 +198,38 @@ class Writer(FormatWriter):
|
||||
|
||||
return '%s\x00%s\x00%s\x00%s\x00%s\x00' % (title, author, copyright, publisher, isbn)
|
||||
|
||||
def _header_record(self, text_items, image_items):
|
||||
def _header_record(self, text_count, chapter_count, link_count, image_count):
|
||||
'''
|
||||
text_items = the number of text pages
|
||||
image_items = the number of images
|
||||
text_count = the number of text pages
|
||||
image_count = the number of images
|
||||
'''
|
||||
compression = 10 # zlib compression.
|
||||
non_text_offset = text_items + 1
|
||||
non_text_offset = text_count + 1
|
||||
|
||||
if image_items > 0:
|
||||
image_data_offset = text_items + 1
|
||||
meta_data_offset = image_data_offset + image_items
|
||||
if chapter_count > 0:
|
||||
chapter_offset = text_count + 1
|
||||
else:
|
||||
chapter_offset = text_count
|
||||
|
||||
if link_count > 0:
|
||||
link_offset = chapter_offset + 1
|
||||
else:
|
||||
link_offset = chapter_offset
|
||||
|
||||
if image_count > 0:
|
||||
image_data_offset = link_offset + 1
|
||||
meta_data_offset = image_data_offset + image_count
|
||||
last_data_offset = meta_data_offset + 1
|
||||
else:
|
||||
meta_data_offset = text_items + 1
|
||||
meta_data_offset = link_offset + 1
|
||||
last_data_offset = meta_data_offset + 1
|
||||
image_data_offset = last_data_offset
|
||||
|
||||
if chapter_count <= 0:
|
||||
chapter_offset = last_data_offset
|
||||
if link_count <= 0:
|
||||
link_offset = last_data_offset
|
||||
|
||||
record = ''
|
||||
|
||||
record += struct.pack('>H', compression) # [0:2] # Compression. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
|
||||
@ -190,21 +239,21 @@ class Writer(FormatWriter):
|
||||
record += struct.pack('>H', 0) # [8:10] # Number of small font pages. 0 if page index is not built.
|
||||
record += struct.pack('>H', 0) # [10:12] # Number of large font pages. 0 if page index is not built.
|
||||
record += struct.pack('>H', non_text_offset) # [12:14] # Non-Text record start.
|
||||
record += struct.pack('>H', 0) # [14:16] # Number of chapter index records.
|
||||
record += struct.pack('>H', chapter_count) # [14:16] # Number of chapter index records.
|
||||
record += struct.pack('>H', 0) # [16:18] # Number of small font page index records.
|
||||
record += struct.pack('>H', 0) # [18:20] # Number of large font page index records.
|
||||
record += struct.pack('>H', image_items) # [20:22] # Number of images.
|
||||
record += struct.pack('>H', 0) # [22:24] # Number of links.
|
||||
record += struct.pack('>H', image_count) # [20:22] # Number of images.
|
||||
record += struct.pack('>H', link_count) # [22:24] # Number of links.
|
||||
record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not.
|
||||
record += struct.pack('>H', 0) # [26:28] # Unknown.
|
||||
record += struct.pack('>H', 0) # [28:30] # Number of Footnotes.
|
||||
record += struct.pack('>H', 0) # [30:32] # Number of Sidebars.
|
||||
record += struct.pack('>H', last_data_offset) # [32:34] # Chapter index offset.
|
||||
record += struct.pack('>H', chapter_offset) # [32:34] # Chapter index offset.
|
||||
record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC.
|
||||
record += struct.pack('>H', last_data_offset) # [36:38] # Small font page offset. This will be the last data offset if there are none.
|
||||
record += struct.pack('>H', last_data_offset) # [38:40] # Large font page offset. This will be the last data offset if there are none.
|
||||
record += struct.pack('>H', image_data_offset) # [40:42] # Image offset. This will be the last data offset if there are none.
|
||||
record += struct.pack('>H', image_data_offset) # [42:44] # Links offset. This will be the last data offset if there are none.
|
||||
record += struct.pack('>H', link_offset) # [42:44] # Links offset. This will be the last data offset if there are none.
|
||||
record += struct.pack('>H', meta_data_offset) # [44:46] # Metadata offset. This will be the last data offset if there are none.
|
||||
record += struct.pack('>H', 0) # [46:48] # Unknown.
|
||||
record += struct.pack('>H', last_data_offset) # [48:50] # Footnote offset. This will be the last data offset if there are none.
|
||||
|
@ -18,10 +18,10 @@ PML_HTML_RULES = [
|
||||
(re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text') if match.group('text') else ''),
|
||||
(re.compile(r'\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', re.DOTALL), lambda match: '<h%s style="page-break-before: always;">%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''),
|
||||
(re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry
|
||||
(re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<div style="text-align: center; display: block; margin: auto;">%s</div>' % match.group('text') if match.group('text') else ''),
|
||||
(re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<div style="text-align: right; display: block;">%s</div>' % match.group('text') if match.group('text') else ''),
|
||||
(re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<span style="text-align: center; display: block; margin: auto;">%s</span>' % match.group('text') if match.group('text') else ''),
|
||||
(re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<span style="text-align: right; display: block;">%s</span>' % match.group('text') if match.group('text') else ''),
|
||||
(re.compile(r'\\i(?P<text>.*?)\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text') if match.group('text') else ''),
|
||||
(re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<div style="text-decoration: underline;">%s</div>' % match.group('text') if match.group('text') else ''),
|
||||
(re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<span style="text-decoration: underline;">%s</span>' % match.group('text') if match.group('text') else ''),
|
||||
(re.compile(r'\\o(?P<text>.*?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text') if match.group('text') else ''),
|
||||
(re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if match.group('text') else ''),
|
||||
(re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text') if match.group('text') else ''),
|
||||
|
Loading…
x
Reference in New Issue
Block a user