This commit is contained in:
Kovid Goyal 2008-07-21 20:45:22 -07:00
commit 280561dc2b

View File

@ -9,6 +9,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
import sys, struct, cStringIO, os import sys, struct, cStringIO, os
import functools import functools
import re
from calibre.ebooks.lit import LitError from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
import calibre.ebooks.lit.mssha1 as mssha1 import calibre.ebooks.lit.mssha1 as mssha1
@ -93,33 +94,21 @@ def read_utf8_char(bytes, pos):
return unichr(c), pos+elsize return unichr(c), pos+elsize
class UnBinary(object): class UnBinary(object):
# Matches a bare '&' that does NOT begin a well-formed reference, i.e. one
# that is not followed by a decimal character reference (#123;), a hex
# character reference (#x1F;) or a named entity reference (name;).
# The hyphen is placed last in the name character class so it is a literal
# '-'; written as '.-_' it would form a range from '.' to '_' that admits
# characters such as '/', ';', '<', '=', '>' and '?' while excluding '-'.
AMPERSAND_RE = re.compile(
    r'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9._:-]+);)')
def __init__(self, bin, manifest, map=OPF_MAP): def __init__(self, bin, manifest, map=OPF_MAP):
self.manifest = manifest self.manifest = manifest
self.tag_map, self.attr_map, self.tag_to_attr_map = map self.tag_map, self.attr_map, self.tag_to_attr_map = map
self.opf = map is OPF_MAP self.opf = map is OPF_MAP
self.bin = bin self.bin = bin
self.buf = cStringIO.StringIO() self.buf = cStringIO.StringIO()
self.ampersands = []
self.binary_to_text() self.binary_to_text()
self.raw = self.buf.getvalue().lstrip().decode('utf-8') self.raw = self.buf.getvalue().lstrip().decode('utf-8')
self.escape_ampersands() self.escape_ampersands()
def escape_ampersands(self): def escape_ampersands(self):
offset = 0 self.raw = self.AMPERSAND_RE.sub('&amp;', self.raw)
for pos in self.ampersands:
test = self.raw[pos+offset:pos+offset+6]
if test.startswith('&#') and ';' in test:
continue
escape = True
for ent in XML_ENTITIES:
if test.startswith(ent):
escape = False
break
if not escape:
continue
self.raw = '&amp;'.join(
(self.raw[:pos+offset], self.raw[pos+offset+1:]))
offset += 4
def item_path(self, internal_id): def item_path(self, internal_id):
try: try:
@ -148,8 +137,6 @@ class UnBinary(object):
continue continue
elif c == '\v': elif c == '\v':
c = '\n' c = '\n'
elif c == '&':
self.ampersands.append(self.buf.tell()-1)
self.buf.write(c.encode('utf-8')) self.buf.write(c.encode('utf-8'))
elif state == 'get flags': elif state == 'get flags':