This commit is contained in:
Kovid Goyal 2007-12-03 04:40:27 +00:00
parent 67c0062b03
commit a16d26a116
2 changed files with 7 additions and 4 deletions

View File

@ -313,8 +313,10 @@ def Book(options, logger, font_delta=0, header=None,
raise ConversionError, 'Could not find the normal version of the ' + family + ' font'
return book, fonts
def entity_to_unicode(match):
def entity_to_unicode(match, exceptions=[]):
ent = match.group(1)
if ent in exceptions:
return '&'+ent+';'
if ent.startswith(u'#x'):
return unichr(int(ent[2:], 16))
if ent.startswith(u'#'):
@ -322,4 +324,4 @@ def entity_to_unicode(match):
try:
return unichr(name2codepoint[ent])
except KeyError:
return ent
return '&'+ent+';'

View File

@ -25,6 +25,8 @@ from collections import deque
from urllib import unquote
from urlparse import urlparse
from math import ceil, floor
from functools import partial
try:
from PIL import Image as PILImage
except ImportError:
@ -63,7 +65,6 @@ def munge_paths(basepath, url):
path = os.path.join(os.path.dirname(basepath), path)
return os.path.normpath(path), fragment
class HTMLConverter(object):
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
@ -84,7 +85,7 @@ class HTMLConverter(object):
# Workaround bug in BeautifulSoup &nbsp; handling
(re.compile(u' | | |\xa0', re.IGNORECASE), lambda match : u'\uffff'),
# Replace entities
(re.compile(ur'&(\S+?);'), entity_to_unicode),
(re.compile(ur'&(\S+?);'), partial(entity_to_unicode, exceptions=['lt', 'gt'])),
]
# Fix Baen markup
BAEN = [