Switch to RE-based unescaped ampersand post-processing

This commit is contained in:
Marshall T. Vandegrift 2008-07-21 19:42:40 -04:00
parent 318de321f1
commit 675510e428

View File

@ -10,6 +10,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
import sys, struct, cStringIO, os import sys, struct, cStringIO, os
import functools import functools
import codecs import codecs
import re
from itertools import repeat from itertools import repeat
from calibre import relpath from calibre import relpath
@ -98,33 +99,21 @@ def read_utf8_char(bytes, pos):
return unichr(c), pos+elsize return unichr(c), pos+elsize
class UnBinary(object): class UnBinary(object):
# Matches a bare '&' that does NOT start a well-formed reference, i.e. one
# that is not followed by a decimal character reference (#123), a hex
# character reference (#x1F), or an entity name, terminated by ';'.
# The hyphen is placed last in the name character class so that it is a
# literal '-' rather than forming the range '.'..'_' (which would admit
# characters such as '<', '=', ';' and exclude '-' itself).
AMPERSAND_RE = re.compile(
    r'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9._:-]+);)')
def __init__(self, bin, manifest, map=OPF_MAP): def __init__(self, bin, manifest, map=OPF_MAP):
self.manifest = manifest self.manifest = manifest
self.tag_map, self.attr_map, self.tag_to_attr_map = map self.tag_map, self.attr_map, self.tag_to_attr_map = map
self.opf = map is OPF_MAP self.opf = map is OPF_MAP
self.bin = bin self.bin = bin
self.buf = cStringIO.StringIO() self.buf = cStringIO.StringIO()
self.ampersands = []
self.binary_to_text() self.binary_to_text()
self.raw = self.buf.getvalue().lstrip().decode('utf-8') self.raw = self.buf.getvalue().lstrip().decode('utf-8')
self.escape_ampersands() self.escape_ampersands()
def escape_ampersands(self): def escape_ampersands(self):
offset = 0 self.raw = self.AMPERSAND_RE.sub('&amp;', self.raw)
for pos in self.ampersands:
test = self.raw[pos+offset:pos+offset+6]
if test.startswith('&#') and ';' in test:
continue
escape = True
for ent in XML_ENTITIES:
if test.startswith(ent):
escape = False
break
if not escape:
continue
self.raw = '&amp;'.join(
(self.raw[:pos+offset], self.raw[pos+offset+1:]))
offset += 4
def item_path(self, internal_id): def item_path(self, internal_id):
try: try:
@ -153,8 +142,6 @@ class UnBinary(object):
continue continue
elif c == '\v': elif c == '\v':
c = '\n' c = '\n'
elif c == '&':
self.ampersands.append(self.buf.tell()-1)
self.buf.write(c.encode('utf-8')) self.buf.write(c.encode('utf-8'))
elif state == 'get flags': elif state == 'get flags':