Replace all entities with their Unicode equivalents in the pre-processing stage.

This commit is contained in:
Kovid Goyal 2007-11-21 17:15:43 +00:00
parent 893863a670
commit 0daa63e395

View File

@ -22,7 +22,6 @@ and to Falstaff for pylrs.
""" """
import os, re, sys, copy, glob, logging, tempfile import os, re, sys, copy, glob, logging, tempfile
from collections import deque from collections import deque
from htmlentitydefs import name2codepoint
from urllib import unquote from urllib import unquote
from urlparse import urlparse from urlparse import urlparse
from math import ceil, floor from math import ceil, floor
@ -38,7 +37,7 @@ from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream,
Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \ Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
LrsError, Sup, Sub, properties_different, EmpLine LrsError, Sup, Sub, properties_different, EmpLine
from libprs500.ebooks.lrf.pylrs.pylrs import Span from libprs500.ebooks.lrf.pylrs.pylrs import Span
from libprs500.ebooks.lrf import Book from libprs500.ebooks.lrf import Book, entity_to_unicode
from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.table import Table from libprs500.ebooks.lrf.html.table import Table
@ -65,16 +64,10 @@ def munge_paths(basepath, url):
return os.path.normpath(path), fragment return os.path.normpath(path), fragment
class HTMLConverter(object): class HTMLConverter(object):
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo']
patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
targets = [ unichr(name2codepoint[i]) for i in replaced_entities ]
ENTITY_RULES = zip(patterns, targets) + [(re.compile('&apos;'), "'")]
MARKUP_MASSAGE = [ MARKUP_MASSAGE = [
@ -89,7 +82,9 @@ class HTMLConverter(object):
(re.compile(r'<a.*?>(.*?)</a\s*>', re.DOTALL|re.IGNORECASE), (re.compile(r'<a.*?>(.*?)</a\s*>', re.DOTALL|re.IGNORECASE),
lambda match: re.compile(r'<\s*?p.*?>', re.IGNORECASE).sub('', match.group())), lambda match: re.compile(r'<\s*?p.*?>', re.IGNORECASE).sub('', match.group())),
# Workaround bug in BeautifulSoup &nbsp; handling # Workaround bug in BeautifulSoup &nbsp; handling
(re.compile(u'&nbsp;|&#160;|&#xa0;|\xa0', re.IGNORECASE), lambda match : u'\uffff') (re.compile(u'&nbsp;|&#160;|&#xa0;|\xa0', re.IGNORECASE), lambda match : u'\uffff'),
# Replace entities
(re.compile(ur'&(\S+?);'), entity_to_unicode),
] ]
# Fix Baen markup # Fix Baen markup
BAEN = [ BAEN = [
@ -523,9 +518,6 @@ class HTMLConverter(object):
text += c['alt'] text += c['alt']
return text return text
text += self.get_text(c) text += self.get_text(c)
if text:
for rule, sub in self.__class__.ENTITY_RULES:
text = rule.sub(sub, text)
return text return text
def process_links(self): def process_links(self):
@ -740,8 +732,6 @@ class HTMLConverter(object):
def append_text(src): def append_text(src):
fp, key, variant = self.font_properties(css) fp, key, variant = self.font_properties(css)
for pat, repl in self.__class__.ENTITY_RULES:
src = pat.sub(repl, src)
src = src.replace(u'\uffff', ' ') # &nbsp; becomes u'\uffff' src = src.replace(u'\uffff', ' ') # &nbsp; becomes u'\uffff'
normal_font_size = int(fp['fontsize']) normal_font_size = int(fp['fontsize'])
if variant == 'small-caps': if variant == 'small-caps':