From 91b7cbc5808cb65aa69f51a59ccb0b5cbb604291 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 19 May 2009 18:57:07 -0400 Subject: [PATCH] PML: turn html entities into characters, internal links produced properly. --- src/calibre/ebooks/pdb/ereader/writer.py | 4 ++-- src/calibre/ebooks/pml/pmlml.py | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index c99c75a929..875aae764a 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -41,7 +41,7 @@ class Writer(FormatWriter): lengths = [len(i) for i in sections] - pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].parition()[0]) + pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].partition('\x00')[0]) pdbHeaderBuilder.build_header(lengths, out_stream) for item in sections: @@ -49,7 +49,7 @@ class Writer(FormatWriter): def _text(self, oeb_book): pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables) - pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252') + pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace') pml_pages = [] for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1): diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index d32d391004..cdf3bf69e8 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -13,6 +13,7 @@ import os, re from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.pdb.ereader import image_name +from calibre import entity_to_unicode TAG_MAP = { 'b' : 'B', @@ -78,9 +79,12 @@ class PMLMLizer(object): # Remove anchors that do not have links anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text)) - links = set(re.findall(r'(?<=\\q=").+?(?=")', text)) + links = set(re.findall(r'(?<=\\q="#).+?(?=")', text)) for unused in anchors.difference(links): text = text.replace('\\Q="%s"' % unused, '') + + for entity in set(re.findall('&.+?;', text)): + text = text.replace(entity, entity_to_unicode(entity[1:-1])) return text @@ -136,10 +140,10 @@ class PMLMLizer(object): href = elem.get('href') if href and '://' not in href: if '#' in href: - href = href.partition('#')[2][1:] + href = href.partition('#')[2] href = os.path.splitext(os.path.basename(href))[0] tag_count += 1 - text += '\\q="%s"' % href + text += '\\q="#%s"' % href tag_stack.append('q') # Anchor ids id_name = elem.get('id')