eReader PDB output: proper length of indexes and do not try to add them if they are not available. PML Output: cleanup. PML Input: read unicode and entity PML tags correctly.

This commit is contained in:
John Schember 2009-10-11 10:32:38 -04:00
parent 4b2f26f123
commit 599de056d0
3 changed files with 15 additions and 6 deletions

View File

@ -43,10 +43,14 @@ class Writer(FormatWriter):
text, text_sizes = self._text(pml)
chapter_index = self._chapter_index(pml)
chapter_index = [chapter_index] if chapter_index != '' else []
link_index = self._link_index(pml)
link_index = [link_index] if link_index != '' else []
images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs)
metadata = [self._metadata(metadata)]
hr = [self._header_record(len(text), len(chapter_index.split('\x00')), len(link_index.split('\x00')), len(images))]
chapter_index_count = len(chapter_index[0].split('\x00')) - 1 if len(chapter_index) >= 1 else 0
link_index_count = len(link_index[0].split('\x00')) - 1 if len(link_index) >= 1 else 0
hr = [self._header_record(len(text), chapter_index_count, link_index_count, len(images))]
'''
Record order as generated by Dropbook.
@ -64,7 +68,7 @@ class Writer(FormatWriter):
12. Text block size record
13. "MeTaInFo\x00" word record
'''
sections = hr+text+[chapter_index]+[link_index]+images+metadata+[text_sizes]+['MeTaInFo\x00']
sections = hr+text+chapter_index+link_index+images+metadata+[text_sizes]+['MeTaInFo\x00']
lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections]
@ -106,8 +110,8 @@ class Writer(FormatWriter):
if 'text' in mo.groupdict().keys():
index += struct.pack('>L', mo.start('text'))
# Strip all PML tags from text
text = re.sub(r'[^\\]\\[^\\]', '', mo.group('text'))
text = re.sub(r'\\\\', r'\\', mo.group('text'))
text = re.sub(r'\\.', '', mo.group('text'))
# Add appropriate spacing to denote the various levels of headings
if 'val' in mo.groupdict().keys():
text = '%s%s' % ('\x20' * 4 * int(mo.group('val')), text)
index += text

View File

@ -35,8 +35,8 @@ PML_HTML_RULES = [
(re.compile(r'\\Sp(?P<text>.*?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\Sb(?P<text>.*?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<span style="font-size: 50%%">%s</span>' % match.group('text').upper() if match.group('text') else ''),
(re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
(re.compile(r'\\U(?P<num>\d\d\d\d)'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
(re.compile(r'\\a(?P<num>\d{3})'), lambda match: '&#%s;' % match.group('num')),
(re.compile(r'\\U(?P<num>[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
(re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
(re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.*?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
(re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')),

View File

@ -154,10 +154,15 @@ class PMLMLizer(object):
for unused in anchors.difference(links):
text = text.replace('\\Q="%s"' % unused, '')
# Turn all html entities into unicode. This should not be necessary as
# lxml should have already done this but we want to be sure it happens.
for entity in set(re.findall('&.+?;', text)):
mo = re.search('(%s)' % entity[1:-1], text)
text = text.replace(entity, entity_to_unicode(mo))
# Turn all unicode characters into their PML hex equivalent
text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text)
return text
def dump_text(self, elem, stylizer, page, tag_stack=[]):