Use entity_to_unicode properly: pass it a regex match object instead of the bare entity name

This commit is contained in:
John Schember 2009-05-24 11:43:53 -04:00
parent 4ac041cace
commit bebf905648
3 changed files with 25 additions and 9 deletions

View File

@ -9,8 +9,10 @@ Transform OEB content into FB2 markup
'''
import os
import re
from base64 import b64encode
from calibre import entity_to_unicode
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.base import OEB_IMAGES
@ -75,7 +77,13 @@ class FB2MLizer(object):
return images
def clean_text(self, text):
    '''
    Replace entities in text with unicode characters and drop leftover "&".

    Every "&...;" run is passed to entity_to_unicode as a match object whose
    group(1) is the text between the "&" and the ";", and is replaced by
    the value entity_to_unicode returns. Any ampersand still present
    afterwards is removed.
    '''
    # re.sub hands entity_to_unicode the match for each entity directly.
    # Re-searching the text with the entity name as a pattern treats the
    # name as a regular expression, so metacharacters in it can raise
    # re.error or capture the wrong text.
    text = re.sub(r'&(.+?);', entity_to_unicode, text)
    return text.replace('&', '')
def dump_text(self, elem, stylizer, tag_stack=[]):
if not isinstance(elem.tag, basestring) \

View File

@ -88,7 +88,7 @@ class PMLMLizer(object):
def add_page_anchor(self, href):
    '''
    Build the \\Q anchor markup for the page referenced by href.

    The anchor name is the base name of href with its extension removed.
    '''
    name = os.path.splitext(os.path.basename(href))[0]
    # Unicode literal: the markup is unicode even when href is a byte string.
    return u'\\Q="%s"' % name
def clean_text(self, text):
# Remove excess spaces at beginning and end of lines
@ -108,9 +108,10 @@ class PMLMLizer(object):
links = set(re.findall(r'(?<=\\q="#).+?(?=")', text))
for unused in anchors.difference(links):
text = text.replace('\\Q="%s"' % unused, '')
for entity in set(re.findall('&.+?;', text)):
text = text.replace(entity, entity_to_unicode(entity[1:-1]))
mo = re.search('(%s)' % entity[1:-1], text)
text = text.replace(entity, entity_to_unicode(mo))
return text

View File

@ -1,15 +1,17 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
'''
Write content to TXT.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, re, sys
'''
Write content to TXT.
'''
import os
import re
from calibre import entity_to_unicode
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
from BeautifulSoup import BeautifulSoup
@ -83,6 +85,11 @@ class TxtWriter(object):
for symbol in HTML_SYMBOLS:
for code in HTML_SYMBOLS[symbol]:
content = content.replace(code, symbol)
for entity in set(re.findall('&.+?;', content)):
mo = re.search('(%s)' % entity[1:-1], content)
content = content.replace(entity, entity_to_unicode(mo))
return content
def cleanup_text(self, text):