mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
use entity_to_unicode properly
This commit is contained in:
parent
4ac041cace
commit
bebf905648
@ -9,8 +9,10 @@ Transform OEB content into FB2 markup
|
||||
'''
|
||||
|
||||
import os
|
||||
import re
|
||||
from base64 import b64encode
|
||||
|
||||
from calibre import entity_to_unicode
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from calibre.ebooks.oeb.base import OEB_IMAGES
|
||||
@ -75,7 +77,13 @@ class FB2MLizer(object):
|
||||
return images
|
||||
|
||||
def clean_text(self, text):
|
||||
return text.replace('&', '')
|
||||
for entity in set(re.findall('&.+?;', text)):
|
||||
mo = re.search('(%s)' % entity[1:-1], text)
|
||||
text = text.replace(entity, entity_to_unicode(mo))
|
||||
|
||||
text = text.replace('&', '')
|
||||
|
||||
return text
|
||||
|
||||
def dump_text(self, elem, stylizer, tag_stack=[]):
|
||||
if not isinstance(elem.tag, basestring) \
|
||||
|
@ -88,7 +88,7 @@ class PMLMLizer(object):
|
||||
|
||||
def add_page_anchor(self, href):
|
||||
href = os.path.splitext(os.path.basename(href))[0]
|
||||
return '\\Q="%s"' % href
|
||||
return u'\\Q="%s"' % href
|
||||
|
||||
def clean_text(self, text):
|
||||
# Remove excess spaces at beginning and end of lines
|
||||
@ -108,9 +108,10 @@ class PMLMLizer(object):
|
||||
links = set(re.findall(r'(?<=\\q="#).+?(?=")', text))
|
||||
for unused in anchors.difference(links):
|
||||
text = text.replace('\\Q="%s"' % unused, '')
|
||||
|
||||
|
||||
for entity in set(re.findall('&.+?;', text)):
|
||||
text = text.replace(entity, entity_to_unicode(entity[1:-1]))
|
||||
mo = re.search('(%s)' % entity[1:-1], text)
|
||||
text = text.replace(entity, entity_to_unicode(mo))
|
||||
|
||||
return text
|
||||
|
||||
|
@ -1,15 +1,17 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import with_statement
|
||||
'''
|
||||
Write content to TXT.
|
||||
'''
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re, sys
|
||||
'''
|
||||
Write content to TXT.
|
||||
'''
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
from calibre import entity_to_unicode
|
||||
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
@ -83,6 +85,11 @@ class TxtWriter(object):
|
||||
for symbol in HTML_SYMBOLS:
|
||||
for code in HTML_SYMBOLS[symbol]:
|
||||
content = content.replace(code, symbol)
|
||||
|
||||
for entity in set(re.findall('&.+?;', content)):
|
||||
mo = re.search('(%s)' % entity[1:-1], content)
|
||||
content = content.replace(entity, entity_to_unicode(mo))
|
||||
|
||||
return content
|
||||
|
||||
def cleanup_text(self, text):
|
||||
|
Loading…
x
Reference in New Issue
Block a user