use entity_to_unicode properly

2025-07-09 03:04:10 -04:00 · 2009-05-24 11:43:53 -04:00 · 2009-05-24 11:43:53 -04:00 · bebf905648
commit bebf905648
parent 4ac041cace
3 changed files with 25 additions and 9 deletions
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -9,8 +9,10 @@ Transform OEB content into FB2 markup
 '''
 import os
 import re
 from base64 import b64encode
 from calibre import entity_to_unicode
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
 from calibre.ebooks.oeb.stylizer import Stylizer
 from calibre.ebooks.oeb.base import OEB_IMAGES
@@ -75,7 +77,13 @@ class FB2MLizer(object):
        return images
    def clean_text(self, text):
-        return text.replace('&', '')
+        for entity in set(re.findall('&.+?;', text)):
            mo = re.search('(%s)' % entity[1:-1], text)
            text = text.replace(entity, entity_to_unicode(mo))
        text = text.replace('&', '')
        return text
    def dump_text(self, elem, stylizer, tag_stack=[]):
        if not isinstance(elem.tag, basestring) \
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -88,7 +88,7 @@ class PMLMLizer(object):
    def add_page_anchor(self, href):
        href = os.path.splitext(os.path.basename(href))[0]
-        return '\\Q="%s"' % href
+        return u'\\Q="%s"' % href
    def clean_text(self, text):
        # Remove excess spaces at beginning and end of lines
@@ -110,7 +110,8 @@ class PMLMLizer(object):
            text = text.replace('\\Q="%s"' % unused, '')
        for entity in set(re.findall('&.+?;', text)):
-            text = text.replace(entity, entity_to_unicode(entity[1:-1]))
+            mo = re.search('(%s)' % entity[1:-1], text)
            text = text.replace(entity, entity_to_unicode(mo))
        return text
--- a/src/calibre/ebooks/txt/writer.py
+++ b/src/calibre/ebooks/txt/writer.py
@@ -1,15 +1,17 @@
 # -*- coding: utf-8 -*-
 from __future__ import with_statement
 '''
 Write content to TXT.
 '''
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
-import os, re, sys
+'''
 Write content to TXT.
 '''
 import os
 import re
 from calibre import entity_to_unicode
 from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
 from BeautifulSoup import BeautifulSoup
@@ -83,6 +85,11 @@ class TxtWriter(object):
        for symbol in HTML_SYMBOLS:
            for code in HTML_SYMBOLS[symbol]:
                content = content.replace(code, symbol)
        for entity in set(re.findall('&.+?;', content)):
            mo = re.search('(%s)' % entity[1:-1], content)
            content = content.replace(entity, entity_to_unicode(mo))
        return content
    def cleanup_text(self, text):