mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
eReader PDB output: proper length of indexes and do not try to add them if they are not available. PML Output: cleanup. PML Input: read unicode and entity PML tags correctly.
This commit is contained in:
parent
4b2f26f123
commit
599de056d0
@ -43,10 +43,14 @@ class Writer(FormatWriter):
|
||||
|
||||
text, text_sizes = self._text(pml)
|
||||
chapter_index = self._chapter_index(pml)
|
||||
chapter_index = [chapter_index] if chapter_index != '' else []
|
||||
link_index = self._link_index(pml)
|
||||
link_index = [link_index] if link_index != '' else []
|
||||
images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs)
|
||||
metadata = [self._metadata(metadata)]
|
||||
hr = [self._header_record(len(text), len(chapter_index.split('\x00')), len(link_index.split('\x00')), len(images))]
|
||||
chapter_index_count = len(chapter_index[0].split('\x00')) - 1 if len(chapter_index) >= 1 else 0
|
||||
link_index_count = len(link_index[0].split('\x00')) - 1 if len(link_index) >= 1 else 0
|
||||
hr = [self._header_record(len(text), chapter_index_count, link_index_count, len(images))]
|
||||
|
||||
'''
|
||||
Record order as generated by Dropbook.
|
||||
@ -64,7 +68,7 @@ class Writer(FormatWriter):
|
||||
12. Text block size record
|
||||
13. "MeTaInFo\x00" word record
|
||||
'''
|
||||
sections = hr+text+[chapter_index]+[link_index]+images+metadata+[text_sizes]+['MeTaInFo\x00']
|
||||
sections = hr+text+chapter_index+link_index+images+metadata+[text_sizes]+['MeTaInFo\x00']
|
||||
|
||||
lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections]
|
||||
|
||||
@ -106,8 +110,8 @@ class Writer(FormatWriter):
|
||||
if 'text' in mo.groupdict().keys():
|
||||
index += struct.pack('>L', mo.start('text'))
|
||||
# Strip all PML tags from text
|
||||
text = re.sub(r'[^\\]\\[^\\]', '', mo.group('text'))
|
||||
text = re.sub(r'\\\\', r'\\', mo.group('text'))
|
||||
text = re.sub(r'\\.', '', mo.group('text'))
|
||||
# Add appropriate spacing to denote the various levels of headings
|
||||
if 'val' in mo.groupdict().keys():
|
||||
text = '%s%s' % ('\x20' * 4 * int(mo.group('val')), text)
|
||||
index += text
|
||||
|
@ -35,8 +35,8 @@ PML_HTML_RULES = [
|
||||
(re.compile(r'\\Sp(?P<text>.*?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text') if match.group('text') else ''),
|
||||
(re.compile(r'\\Sb(?P<text>.*?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text') if match.group('text') else ''),
|
||||
(re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<span style="font-size: 50%%">%s</span>' % match.group('text').upper() if match.group('text') else ''),
|
||||
(re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
|
||||
(re.compile(r'\\U(?P<num>\d\d\d\d)'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
|
||||
(re.compile(r'\\a(?P<num>\d{3})'), lambda match: '&#%s;' % match.group('num')),
|
||||
(re.compile(r'\\U(?P<num>[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
|
||||
(re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
|
||||
(re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.*?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
|
||||
(re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')),
|
||||
|
@ -154,10 +154,15 @@ class PMLMLizer(object):
|
||||
for unused in anchors.difference(links):
|
||||
text = text.replace('\\Q="%s"' % unused, '')
|
||||
|
||||
# Turn all html entities into unicode. This should not be necessary as
|
||||
# lxml should have already done this but we want to be sure it happens.
|
||||
for entity in set(re.findall('&.+?;', text)):
|
||||
mo = re.search('(%s)' % entity[1:-1], text)
|
||||
text = text.replace(entity, entity_to_unicode(mo))
|
||||
|
||||
# Turn all unicode characters into their PML hex equivalent
|
||||
text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text)
|
||||
|
||||
return text
|
||||
|
||||
def dump_text(self, elem, stylizer, page, tag_stack=[]):
|
||||
|
Loading…
x
Reference in New Issue
Block a user