use entity_to_unicode properly

This commit is contained in:
John Schember 2009-05-24 11:43:53 -04:00
parent 4ac041cace
commit bebf905648
3 changed files with 25 additions and 9 deletions

View File

@@ -9,8 +9,10 @@ Transform OEB content into FB2 markup
''' '''
import os import os
import re
from base64 import b64encode from base64 import b64encode
from calibre import entity_to_unicode
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.base import OEB_IMAGES from calibre.ebooks.oeb.base import OEB_IMAGES
@@ -75,7 +77,13 @@ class FB2MLizer(object):
return images return images
def clean_text(self, text): def clean_text(self, text):
return text.replace('&', '') for entity in set(re.findall('&.+?;', text)):
mo = re.search('(%s)' % entity[1:-1], text)
text = text.replace(entity, entity_to_unicode(mo))
text = text.replace('&', '')
return text
def dump_text(self, elem, stylizer, tag_stack=[]): def dump_text(self, elem, stylizer, tag_stack=[]):
if not isinstance(elem.tag, basestring) \ if not isinstance(elem.tag, basestring) \

View File

@@ -88,7 +88,7 @@ class PMLMLizer(object):
def add_page_anchor(self, href): def add_page_anchor(self, href):
href = os.path.splitext(os.path.basename(href))[0] href = os.path.splitext(os.path.basename(href))[0]
return '\\Q="%s"' % href return u'\\Q="%s"' % href
def clean_text(self, text): def clean_text(self, text):
# Remove excess spaces at beginning and end of lines # Remove excess spaces at beginning and end of lines
@@ -110,7 +110,8 @@ class PMLMLizer(object):
text = text.replace('\\Q="%s"' % unused, '') text = text.replace('\\Q="%s"' % unused, '')
for entity in set(re.findall('&.+?;', text)): for entity in set(re.findall('&.+?;', text)):
text = text.replace(entity, entity_to_unicode(entity[1:-1])) mo = re.search('(%s)' % entity[1:-1], text)
text = text.replace(entity, entity_to_unicode(mo))
return text return text

View File

@@ -1,15 +1,17 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import with_statement
'''
Write content to TXT.
'''
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, re, sys '''
Write content to TXT.
'''
import os
import re
from calibre import entity_to_unicode
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
@@ -83,6 +85,11 @@ class TxtWriter(object):
for symbol in HTML_SYMBOLS: for symbol in HTML_SYMBOLS:
for code in HTML_SYMBOLS[symbol]: for code in HTML_SYMBOLS[symbol]:
content = content.replace(code, symbol) content = content.replace(code, symbol)
for entity in set(re.findall('&.+?;', content)):
mo = re.search('(%s)' % entity[1:-1], content)
content = content.replace(entity, entity_to_unicode(mo))
return content return content
def cleanup_text(self, text): def cleanup_text(self, text):