From 0daa63e3958e382b72b20d547a437d04230eb1b3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 21 Nov 2007 17:15:43 +0000 Subject: [PATCH] Replace all entities by their unicode equivalents in pre-processing stage. --- src/libprs500/ebooks/lrf/html/convert_from.py | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index d2f39c104c..9332a6dfe2 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -22,7 +22,6 @@ and to Falstaff for pylrs. """ import os, re, sys, copy, glob, logging, tempfile from collections import deque -from htmlentitydefs import name2codepoint from urllib import unquote from urlparse import urlparse from math import ceil, floor @@ -38,7 +37,7 @@ from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \ LrsError, Sup, Sub, properties_different, EmpLine from libprs500.ebooks.lrf.pylrs.pylrs import Span -from libprs500.ebooks.lrf import Book +from libprs500.ebooks.lrf import Book, entity_to_unicode from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks import ConversionError from libprs500.ebooks.lrf.html.table import Table @@ -65,16 +64,10 @@ def munge_paths(basepath, url): return os.path.normpath(path), fragment - - class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) - replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo'] - patterns = [ re.compile('&'+i+';') for i in replaced_entities ] - targets = [ unichr(name2codepoint[i]) for i in replaced_entities ] - ENTITY_RULES = zip(patterns, targets) + [(re.compile('''), "'")] MARKUP_MASSAGE = [ @@ -89,8 +82,10 @@ class HTMLConverter(object): (re.compile(r'(.*?)', re.DOTALL|re.IGNORECASE), lambda match: re.compile(r'<\s*?p.*?>', re.IGNORECASE).sub('', match.group())), # Workaround bug in BeautifulSoup   handling - (re.compile(u' | | |\xa0', re.IGNORECASE), lambda match : u'\uffff') - ] + (re.compile(u' | | |\xa0', re.IGNORECASE), lambda match : u'\uffff'), + # Replace entities + (re.compile(ur'&(\S+?);'), entity_to_unicode), + ] # Fix Baen markup BAEN = [ (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE), @@ -523,9 +518,6 @@ class HTMLConverter(object): text += c['alt'] return text text += self.get_text(c) - if text: - for rule, sub in self.__class__.ENTITY_RULES: - text = rule.sub(sub, text) return text def process_links(self): @@ -740,8 +732,6 @@ class HTMLConverter(object): def append_text(src): fp, key, variant = self.font_properties(css) - for pat, repl in self.__class__.ENTITY_RULES: - src = pat.sub(repl, src) src = src.replace(u'\uffff', ' ') #   becomes u'\uffff' normal_font_size = int(fp['fontsize']) if variant == 'small-caps':