Fix #355

2025-07-08 10:44:09 -04:00 · 2007-12-03 04:40:27 +00:00 · 2007-12-03 04:40:27 +00:00 · a16d26a116
commit a16d26a116
parent 67c0062b03
2 changed files with 7 additions and 4 deletions
--- a/src/libprs500/ebooks/lrf/__init__.py
+++ b/src/libprs500/ebooks/lrf/__init__.py
@@ -313,8 +313,10 @@ def Book(options, logger, font_delta=0, header=None,
            raise ConversionError, 'Could not find the normal version of the ' + family + ' font'
    return book, fonts

-def entity_to_unicode(match):
+def entity_to_unicode(match, exceptions=[]):
    ent = match.group(1)
+    if ent in exceptions:
+        return '&'+ent+';'
    if ent.startswith(u'#x'):
        return unichr(int(ent[2:], 16))
    if ent.startswith(u'#'):
@@ -322,4 +324,4 @@ def entity_to_unicode(match):
    try:
        return unichr(name2codepoint[ent])
    except KeyError:
-        return ent
+        return '&'+ent+';'
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -25,6 +25,8 @@ from collections import deque
 from urllib import unquote
 from urlparse import urlparse
 from math import ceil, floor
+from functools import partial
+
 try:
    from PIL import Image as PILImage
 except ImportError:
@@ -63,7 +65,6 @@ def munge_paths(basepath, url):
        path = os.path.join(os.path.dirname(basepath), path)
    return os.path.normpath(path), fragment

-                              
 class HTMLConverter(object):
    SELECTOR_PAT   = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
    PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
@@ -84,7 +85,7 @@ class HTMLConverter(object):
                        # Workaround bug in BeautifulSoup &nbsp; handling
                        (re.compile(u'&nbsp;|&#160;|&#xa0;|\xa0', re.IGNORECASE), lambda match : u'\uffff'),
                        # Replace entities
-                        (re.compile(ur'&(\S+?);'), entity_to_unicode),
+                        (re.compile(ur'&(\S+?);'), partial(entity_to_unicode, exceptions=['lt', 'gt'])),
                        ]
    # Fix Baen markup
    BAEN = [