Replace all entities by their unicode equivalents in pre-processing stage.

2025-07-09 03:04:10 -04:00 · 2007-11-21 17:15:43 +00:00 · 2007-11-21 17:15:43 +00:00 · 0daa63e395
commit 0daa63e395
parent 893863a670
1 changed files with 5 additions and 15 deletions
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@ -22,7 +22,6 @@ and to Falstaff for pylrs.
 """
 import os, re, sys, copy, glob, logging, tempfile
 from collections import deque
 from htmlentitydefs import name2codepoint
 from urllib import unquote
 from urlparse import urlparse
 from math import ceil, floor
@ -38,7 +37,7 @@ from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream,
                Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
                LrsError, Sup, Sub, properties_different, EmpLine
 from libprs500.ebooks.lrf.pylrs.pylrs import Span 
-from libprs500.ebooks.lrf import Book
+from libprs500.ebooks.lrf import Book, entity_to_unicode
 from libprs500.ebooks.lrf import option_parser as lrf_option_parser
 from libprs500.ebooks import ConversionError
 from libprs500.ebooks.lrf.html.table import Table 
@ -65,16 +64,10 @@ def munge_paths(basepath, url):
    return os.path.normpath(path), fragment
 class HTMLConverter(object):
    SELECTOR_PAT   = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
    PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
    IGNORED_TAGS   = (Comment, Declaration, ProcessingInstruction)
    replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo']
    patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
    targets  = [ unichr(name2codepoint[i]) for i in replaced_entities ]
    ENTITY_RULES = zip(patterns, targets) + [(re.compile('&apos;'), "'")]
    MARKUP_MASSAGE   = [
@ -89,7 +82,9 @@ class HTMLConverter(object):
                        (re.compile(r'<a.*?>(.*?)</a\s*>', re.DOTALL|re.IGNORECASE),
                         lambda match: re.compile(r'<\s*?p.*?>', re.IGNORECASE).sub('', match.group())),
                        # Workaround bug in BeautifulSoup &nbsp; handling
-                        (re.compile(u'&nbsp;|&#160;|&#xa0;|\xa0', re.IGNORECASE), lambda match : u'\uffff')
+                        (re.compile(u'&nbsp;|&#160;|&#xa0;|\xa0', re.IGNORECASE), lambda match : u'\uffff'),
                        # Replace entities
                        (re.compile(ur'&(\S+?);'), entity_to_unicode),
                        ]
    # Fix Baen markup
    BAEN = [ 
@ -523,9 +518,6 @@ class HTMLConverter(object):
                        text += c['alt']
                        return text
                    text += self.get_text(c)
            if text:
                for rule, sub in self.__class__.ENTITY_RULES:
                    text = rule.sub(sub, text)
            return text
    def process_links(self):
@ -740,8 +732,6 @@ class HTMLConverter(object):
        def append_text(src):
            fp, key, variant = self.font_properties(css)
            for pat, repl in self.__class__.ENTITY_RULES:
                src = pat.sub(repl, src)
            src = src.replace(u'\uffff', ' ') # &nbsp; becomes u'\uffff'
            normal_font_size = int(fp['fontsize'])
            if variant == 'small-caps':