eReader PDB output: proper length of indexes and do not try to add them if they are not available. PML Output: cleanup. PML Input: read unicode and entity PML tags correctly.

2025-07-09 03:04:10 -04:00 · 2009-10-11 10:32:38 -04:00 · 2009-10-11 10:32:38 -04:00 · 599de056d0
commit 599de056d0
parent 4b2f26f123
3 changed files with 15 additions and 6 deletions
--- a/src/calibre/ebooks/pdb/ereader/writer.py
+++ b/src/calibre/ebooks/pdb/ereader/writer.py
@ -43,10 +43,14 @@ class Writer(FormatWriter):

        text, text_sizes = self._text(pml)
        chapter_index = self._chapter_index(pml)
+        chapter_index = [chapter_index] if chapter_index != '' else []
        link_index = self._link_index(pml)
+        link_index = [link_index] if link_index != '' else []
        images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs)
        metadata = [self._metadata(metadata)]
-        hr = [self._header_record(len(text), len(chapter_index.split('\x00')), len(link_index.split('\x00')), len(images))]
+        chapter_index_count = len(chapter_index[0].split('\x00')) - 1 if len(chapter_index) >= 1 else 0
+        link_index_count = len(link_index[0].split('\x00')) - 1 if len(link_index) >= 1 else 0
+        hr = [self._header_record(len(text), chapter_index_count, link_index_count, len(images))]

        '''
        Record order as generated by Dropbook.
@ -64,7 +68,7 @@ class Writer(FormatWriter):
           12. Text block size record
           13. "MeTaInFo\x00" word record
        '''
-        sections = hr+text+[chapter_index]+[link_index]+images+metadata+[text_sizes]+['MeTaInFo\x00']
+        sections = hr+text+chapter_index+link_index+images+metadata+[text_sizes]+['MeTaInFo\x00']

        lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections]

@ -106,8 +110,8 @@ class Writer(FormatWriter):
        if 'text' in mo.groupdict().keys():
            index += struct.pack('>L', mo.start('text'))
            # Strip all PML tags from text
-            text = re.sub(r'[^\\]\\[^\\]', '', mo.group('text'))
-            text = re.sub(r'\\\\', r'\\', mo.group('text'))
+            text = re.sub(r'\\.', '', mo.group('text'))
+            # Add appropriate spacing to denote the various levels of headings
            if 'val' in mo.groupdict().keys():
                text = '%s%s' % ('\x20' * 4 * int(mo.group('val')), text)
            index += text
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@ -35,8 +35,8 @@ PML_HTML_RULES = [
    (re.compile(r'\\Sp(?P<text>.*?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text') if match.group('text') else ''),
    (re.compile(r'\\Sb(?P<text>.*?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text') if match.group('text') else ''),
    (re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<span style="font-size: 50%%">%s</span>' % match.group('text').upper() if match.group('text') else ''),
-    (re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
-    (re.compile(r'\\U(?P<num>\d\d\d\d)'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
+    (re.compile(r'\\a(?P<num>\d{3})'), lambda match: '&#%s;' % match.group('num')),
+    (re.compile(r'\\U(?P<num>[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
    (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
    (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.*?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
    (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')),
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@ -154,10 +154,15 @@ class PMLMLizer(object):
        for unused in anchors.difference(links):
            text = text.replace('\\Q="%s"' % unused, '')

+        # Turn all html entities into unicode. This should not be necessary as
+        # lxml should have already done this but we want to be sure it happens.
        for entity in set(re.findall('&.+?;', text)):
            mo = re.search('(%s)' % entity[1:-1], text)
            text = text.replace(entity, entity_to_unicode(mo))

+        # Turn all unicode characters into their PML hex equivalent
+        text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text)
+
        return text

    def dump_text(self, elem, stylizer, page, tag_stack=[]):