PML Output: Generate \CX tags as chapter anchors. PDB eReader Output: Use \CX tags to generate the chapter index.

This commit is contained in:
John Schember 2009-12-15 18:11:36 -05:00
parent 288b64529c
commit 35fc570d24
2 changed files with 43 additions and 38 deletions

View File

@ -42,8 +42,8 @@ class Writer(FormatWriter):
pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace') pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
text, text_sizes = self._text(pml) text, text_sizes = self._text(pml)
chapter_index = self._chapter_index(pml) chapter_index = self._index_item(r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"', pml)
link_index = self._link_index(pml) link_index = self._index_item(r'(?s)\\Q="(?P<text>.+?)"', pml)
images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs) images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs)
metadata = [self._metadata(metadata)] metadata = [self._metadata(metadata)]
hr = [self._header_record(len(text), len(chapter_index), len(link_index), len(images))] hr = [self._header_record(len(text), len(chapter_index), len(link_index), len(images))]
@ -101,38 +101,24 @@ class Writer(FormatWriter):
return pml_pages, text_sizes return pml_pages, text_sizes
def _index_item(self, mo): def _index_item(self, regex, pml):
index = ''
if 'text' in mo.groupdict().keys():
index += struct.pack('>L', mo.start())
text = mo.group('text')
# Strip all PML tags from text
text = re.sub(r'\\U[0-9a-z]{4}', '', text)
text = re.sub(r'\\a\d{3}', '', text)
text = re.sub(r'\\.', '', text)
# Add appropriate spacing to denote the various levels of headings
if 'val' in mo.groupdict().keys():
text = '%s%s' % (' ' * 4 * int(mo.group('val')), text)
index += text
index += '\x00'
return index
def _chapter_index(self, pml):
chapter_marks = [
r'(?s)\\x(?P<text>.+?)\\x',
r'(?s)\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]',
r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"',
]
index = [] index = []
for chapter_mark in chapter_marks: for mo in re.finditer(regex, pml):
for mo in re.finditer(chapter_mark, pml): item = ''
index.append(self._index_item(mo)) if 'text' in mo.groupdict().keys():
return index item += struct.pack('>L', mo.start())
text = mo.group('text')
def _link_index(self, pml): # Strip all PML tags from text
index = [] text = re.sub(r'\\U[0-9a-z]{4}', '', text)
for mo in re.finditer(r'(?s)\\Q="(?P<text>.+?)"', pml): text = re.sub(r'\\a\d{3}', '', text)
index.append(self._index_item(mo)) text = re.sub(r'\\.', '', text)
# Add appropriate spacing to denote the various levels of headings
if 'val' in mo.groupdict().keys():
text = '%s%s' % (' ' * 4 * int(mo.group('val')), text)
item += text
item += '\x00'
if item:
index.append(item)
return index return index
def _images(self, manifest, image_hrefs): def _images(self, manifest, image_hrefs):

View File

@ -79,6 +79,16 @@ class PMLMLizer(object):
self.log.info('Converting XHTML to PML markup...') self.log.info('Converting XHTML to PML markup...')
self.oeb_book = oeb_book self.oeb_book = oeb_book
self.opts = opts self.opts = opts
# This is used for adding \CX tags as chapter markers. It is separate
# from the optional inline TOC.
self.toc = {}
for item in oeb_book.toc:
page, mid, id = item.href.partition('#')
if not self.toc.get(page, None):
self.toc[page] = {}
self.toc[page][id] = item.title
return self.pmlmlize_spine() return self.pmlmlize_spine()
def pmlmlize_spine(self): def pmlmlize_spine(self):
@ -107,7 +117,11 @@ class PMLMLizer(object):
return output return output
def get_toc(self): def get_toc(self):
toc = [u''] '''
Generation of inline TOC
'''
toc = []
if self.opts.inline_toc: if self.opts.inline_toc:
self.log.debug('Generating table of contents...') self.log.debug('Generating table of contents...')
toc.append(u'\\X0%s\\X0\n\n' % _('Table of Contents:')) toc.append(u'\\X0%s\\X0\n\n' % _('Table of Contents:'))
@ -177,14 +191,14 @@ class PMLMLizer(object):
def dump_text(self, elem, stylizer, page, tag_stack=[]): def dump_text(self, elem, stylizer, page, tag_stack=[]):
if not isinstance(elem.tag, basestring) \ if not isinstance(elem.tag, basestring) \
or namespace(elem.tag) != XHTML_NS: or namespace(elem.tag) != XHTML_NS:
return [u''] return []
text = [u''] text = []
style = stylizer.style(elem) style = stylizer.style(elem)
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
or style['visibility'] == 'hidden': or style['visibility'] == 'hidden':
return [u''] return []
tag = barename(elem.tag) tag = barename(elem.tag)
tag_count = 0 tag_count = 0
@ -213,6 +227,12 @@ class PMLMLizer(object):
else: else:
w += '="50%"' w += '="50%"'
text.append(w) text.append(w)
toc_id = elem.attrib.get('id', None)
if toc_id:
if self.toc.get(page.href, None):
toc_title = self.toc[page.href].get(toc_id, None)
if toc_title:
text.append('\\C1="%s"' % toc_title)
# Process style information that needs holds a single tag # Process style information that needs holds a single tag
# Commented out because every page in an OEB book starts with this style # Commented out because every page in an OEB book starts with this style
@ -287,4 +307,3 @@ class PMLMLizer(object):
if tag != 'block': if tag != 'block':
text.append('\\%s' % tag) text.append('\\%s' % tag)
return text return text