diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 1a172ea07d..1e108d113b 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -43,10 +43,14 @@ class Writer(FormatWriter): text, text_sizes = self._text(pml) chapter_index = self._chapter_index(pml) + chapter_index = [chapter_index] if chapter_index != '' else [] link_index = self._link_index(pml) + link_index = [link_index] if link_index != '' else [] images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs) metadata = [self._metadata(metadata)] - hr = [self._header_record(len(text), len(chapter_index.split('\x00')), len(link_index.split('\x00')), len(images))] + chapter_index_count = len(chapter_index[0].split('\x00')) - 1 if len(chapter_index) >= 1 else 0 + link_index_count = len(link_index[0].split('\x00')) - 1 if len(link_index) >= 1 else 0 + hr = [self._header_record(len(text), chapter_index_count, link_index_count, len(images))] ''' Record order as generated by Dropbook. @@ -64,7 +68,7 @@ class Writer(FormatWriter): 12. Text block size record 13. 
"MeTaInFo\x00" word record ''' - sections = hr+text+[chapter_index]+[link_index]+images+metadata+[text_sizes]+['MeTaInFo\x00'] + sections = hr+text+chapter_index+link_index+images+metadata+[text_sizes]+['MeTaInFo\x00'] lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections] @@ -106,8 +110,8 @@ class Writer(FormatWriter): if 'text' in mo.groupdict().keys(): index += struct.pack('>L', mo.start('text')) # Strip all PML tags from text - text = re.sub(r'[^\\]\\[^\\]', '', mo.group('text')) - text = re.sub(r'\\\\', r'\\', mo.group('text')) + text = re.sub(r'\\.', '', mo.group('text')) + # Add appropriate spacing to denote the various levels of headings if 'val' in mo.groupdict().keys(): text = '%s%s' % ('\x20' * 4 * int(mo.group('val')), text) index += text diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 3e1b3b4828..c72a21a5f9 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -35,8 +35,8 @@ PML_HTML_RULES = [ (re.compile(r'\\Sp(?P.*?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), (re.compile(r'\\Sb(?P.*?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), (re.compile(r'\\k(?P.*?)\\k', re.DOTALL), lambda match: '%s' % match.group('text').upper() if match.group('text') else ''), - (re.compile(r'\\a(?P\d\d\d)'), lambda match: '&#%s;' % match.group('num')), - (re.compile(r'\\U(?P\d\d\d\d)'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))), + (re.compile(r'\\a(?P\d{3})'), lambda match: '&#%s;' % match.group('num')), + (re.compile(r'\\U(?P[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))), (re.compile(r'\\m="(?P.+?)"'), lambda match: '' % image_name(match.group('name')).strip('\x00')), (re.compile(r'\\q="(?P#.+?)"(?P.*?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text')) if match.group('text') 
else ''), (re.compile(r'\\Q="(?P.+?)"'), lambda match: '' % match.group('target')), diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index 72b55d00b1..b6a62e7c1f 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -154,10 +154,15 @@ class PMLMLizer(object): for unused in anchors.difference(links): text = text.replace('\\Q="%s"' % unused, '') + # Turn all html entities into unicode. This should not be necessary as + # lxml should have already done this but we want to be sure it happens. for entity in set(re.findall('&.+?;', text)): mo = re.search('(%s)' % entity[1:-1], text) text = text.replace(entity, entity_to_unicode(mo)) + # Turn all unicode characters into their PML hex equivalent + text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text) + return text def dump_text(self, elem, stylizer, page, tag_stack=[]):