diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index b6893e395d..f10cf95e87 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -9,8 +9,10 @@ Transform OEB content into FB2 markup ''' import os +import re from base64 import b64encode +from calibre import entity_to_unicode from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.base import OEB_IMAGES @@ -75,7 +77,13 @@ class FB2MLizer(object): return images def clean_text(self, text): - return text.replace('&', '') + for entity in set(re.findall('&.+?;', text)): + mo = re.search('(%s)' % entity[1:-1], text) + text = text.replace(entity, entity_to_unicode(mo)) + + text = text.replace('&', '') + + return text def dump_text(self, elem, stylizer, tag_stack=[]): if not isinstance(elem.tag, basestring) \ diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index c34ada3317..01f777caae 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -88,7 +88,7 @@ class PMLMLizer(object): def add_page_anchor(self, href): href = os.path.splitext(os.path.basename(href))[0] - return '\\Q="%s"' % href + return u'\\Q="%s"' % href def clean_text(self, text): # Remove excess spaces at beginning and end of lines @@ -108,9 +108,10 @@ class PMLMLizer(object): links = set(re.findall(r'(?<=\\q="#).+?(?=")', text)) for unused in anchors.difference(links): text = text.replace('\\Q="%s"' % unused, '') - + for entity in set(re.findall('&.+?;', text)): - text = text.replace(entity, entity_to_unicode(entity[1:-1])) + mo = re.search('(%s)' % entity[1:-1], text) + text = text.replace(entity, entity_to_unicode(mo)) return text diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index 09a79d322d..313250bcf2 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -1,15 +1,17 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement -''' -Write content to TXT. -''' __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, re, sys +''' +Write content to TXT. +''' +import os +import re + +from calibre import entity_to_unicode from calibre.ebooks.htmlsymbols import HTML_SYMBOLS from BeautifulSoup import BeautifulSoup @@ -83,6 +85,11 @@ class TxtWriter(object): for symbol in HTML_SYMBOLS: for code in HTML_SYMBOLS[symbol]: content = content.replace(code, symbol) + + for entity in set(re.findall('&.+?;', content)): + mo = re.search('(%s)' % entity[1:-1], content) + content = content.replace(entity, entity_to_unicode(mo)) + return content def cleanup_text(self, text):