This commit is contained in:
Kovid Goyal 2007-12-03 04:40:27 +00:00
parent 67c0062b03
commit a16d26a116
2 changed files with 7 additions and 4 deletions

View File

@ -313,8 +313,10 @@ def Book(options, logger, font_delta=0, header=None,
raise ConversionError, 'Could not find the normal version of the ' + family + ' font' raise ConversionError, 'Could not find the normal version of the ' + family + ' font'
return book, fonts return book, fonts
def entity_to_unicode(match): def entity_to_unicode(match, exceptions=[]):
ent = match.group(1) ent = match.group(1)
if ent in exceptions:
return '&'+ent+';'
if ent.startswith(u'#x'): if ent.startswith(u'#x'):
return unichr(int(ent[2:], 16)) return unichr(int(ent[2:], 16))
if ent.startswith(u'#'): if ent.startswith(u'#'):
@ -322,4 +324,4 @@ def entity_to_unicode(match):
try: try:
return unichr(name2codepoint[ent]) return unichr(name2codepoint[ent])
except KeyError: except KeyError:
return ent return '&'+ent+';'

View File

@ -25,6 +25,8 @@ from collections import deque
from urllib import unquote from urllib import unquote
from urlparse import urlparse from urlparse import urlparse
from math import ceil, floor from math import ceil, floor
from functools import partial
try: try:
from PIL import Image as PILImage from PIL import Image as PILImage
except ImportError: except ImportError:
@ -63,7 +65,6 @@ def munge_paths(basepath, url):
path = os.path.join(os.path.dirname(basepath), path) path = os.path.join(os.path.dirname(basepath), path)
return os.path.normpath(path), fragment return os.path.normpath(path), fragment
class HTMLConverter(object): class HTMLConverter(object):
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
@ -84,7 +85,7 @@ class HTMLConverter(object):
# Workaround bug in BeautifulSoup &nbsp; handling # Workaround bug in BeautifulSoup &nbsp; handling
(re.compile(u' | | |\xa0', re.IGNORECASE), lambda match : u'\uffff'), (re.compile(u' | | |\xa0', re.IGNORECASE), lambda match : u'\uffff'),
# Replace entities # Replace entities
(re.compile(ur'&(\S+?);'), entity_to_unicode), (re.compile(ur'&(\S+?);'), partial(entity_to_unicode, exceptions=['lt', 'gt'])),
] ]
# Fix Baen markup # Fix Baen markup
BAEN = [ BAEN = [