From a16d26a1166445db8b84d9c43ef7e1b32a97d944 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 3 Dec 2007 04:40:27 +0000 Subject: [PATCH] Fix #355 --- src/libprs500/ebooks/lrf/__init__.py | 6 ++++-- src/libprs500/ebooks/lrf/html/convert_from.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/libprs500/ebooks/lrf/__init__.py b/src/libprs500/ebooks/lrf/__init__.py index b7e260a04e..052c4abf19 100644 --- a/src/libprs500/ebooks/lrf/__init__.py +++ b/src/libprs500/ebooks/lrf/__init__.py @@ -313,8 +313,10 @@ def Book(options, logger, font_delta=0, header=None, raise ConversionError, 'Could not find the normal version of the ' + family + ' font' return book, fonts -def entity_to_unicode(match): +def entity_to_unicode(match, exceptions=[]): ent = match.group(1) + if ent in exceptions: + return '&'+ent+';' if ent.startswith(u'#x'): return unichr(int(ent[2:], 16)) if ent.startswith(u'#'): @@ -322,4 +324,4 @@ def entity_to_unicode(match): try: return unichr(name2codepoint[ent]) except KeyError: - return ent + return '&'+ent+';' diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index 8e40ba3a19..f24c2abedf 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -25,6 +25,8 @@ from collections import deque from urllib import unquote from urlparse import urlparse from math import ceil, floor +from functools import partial + try: from PIL import Image as PILImage except ImportError: @@ -63,7 +65,6 @@ def munge_paths(basepath, url): path = os.path.join(os.path.dirname(basepath), path) return os.path.normpath(path), fragment - class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) @@ -84,7 +85,7 @@ class HTMLConverter(object): # Workaround bug in BeautifulSoup   handling (re.compile(u' | | |\xa0', re.IGNORECASE), lambda match : u'\uffff'), # Replace entities - (re.compile(ur'&(\S+?);'), entity_to_unicode), + (re.compile(ur'&(\S+?);'), partial(entity_to_unicode, exceptions=['lt', 'gt'])), ] # Fix Baen markup BAEN = [