use entity_to_unicode properly

2025-07-09 03:04:10 -04:00 · 2009-05-24 11:43:53 -04:00 · 2009-05-24 11:43:53 -04:00 · bebf905648
commit bebf905648
parent 4ac041cace
3 changed files with 25 additions and 9 deletions
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -9,8 +9,10 @@ Transform OEB content into FB2 markup
 '''

 import os
+import re
 from base64 import b64encode

+from calibre import entity_to_unicode
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
 from calibre.ebooks.oeb.stylizer import Stylizer
 from calibre.ebooks.oeb.base import OEB_IMAGES
@@ -75,7 +77,13 @@ class FB2MLizer(object):
        return images

    def clean_text(self, text):
-        return text.replace('&', '')
+        for entity in set(re.findall('&.+?;', text)):
+            mo = re.search('(%s)' % entity[1:-1], text)
+            text = text.replace(entity, entity_to_unicode(mo))
+
+        text = text.replace('&', '')
+
+        return text

    def dump_text(self, elem, stylizer, tag_stack=[]):
        if not isinstance(elem.tag, basestring) \
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -88,7 +88,7 @@ class PMLMLizer(object):

    def add_page_anchor(self, href):
        href = os.path.splitext(os.path.basename(href))[0]
-        return '\\Q="%s"' % href
+        return u'\\Q="%s"' % href

    def clean_text(self, text):
        # Remove excess spaces at beginning and end of lines
@@ -108,9 +108,10 @@ class PMLMLizer(object):
        links = set(re.findall(r'(?<=\\q="#).+?(?=")', text))
        for unused in anchors.difference(links):
            text = text.replace('\\Q="%s"' % unused, '')
-            
+
        for entity in set(re.findall('&.+?;', text)):
-            text = text.replace(entity, entity_to_unicode(entity[1:-1]))
+            mo = re.search('(%s)' % entity[1:-1], text)
+            text = text.replace(entity, entity_to_unicode(mo))
        
        return text

--- a/src/calibre/ebooks/txt/writer.py
+++ b/src/calibre/ebooks/txt/writer.py
@@ -1,15 +1,17 @@
 # -*- coding: utf-8 -*-
-from __future__ import with_statement
-'''
-Write content to TXT.
-'''

 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'

-import os, re, sys
+'''
+Write content to TXT.
+'''

+import os
+import re
+
+from calibre import entity_to_unicode
 from calibre.ebooks.htmlsymbols import HTML_SYMBOLS

 from BeautifulSoup import BeautifulSoup
@@ -83,6 +85,11 @@ class TxtWriter(object):
        for symbol in HTML_SYMBOLS:
            for code in HTML_SYMBOLS[symbol]:
                content = content.replace(code, symbol)
+
+        for entity in set(re.findall('&.+?;', content)):
+            mo = re.search('(%s)' % entity[1:-1], content)
+            content = content.replace(entity, entity_to_unicode(mo))
+
        return content
        
    def cleanup_text(self, text):