Refactor the LitReader:

- Protect whitespace-only nodes with <span/> tags so they can be safely
    pretty-printed.
  - Separate the LitReader class into a low-level LitFile class and a
    high-level LitReader class.
This commit is contained in:
Marshall T. Vandegrift 2008-12-07 21:30:28 -05:00
parent efbe7d9a2d
commit ca37c601ec

View File

@ -7,9 +7,10 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
'and Marshall T. Vandegrift <llasram@gmail.com>'
import sys, struct, cStringIO, os
import sys, struct, os
import functools
import re
from cStringIO import StringIO
from lxml import etree
from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
@ -19,6 +20,8 @@ from calibre import plugins
lzx, lxzerror = plugins['lzx']
msdes, msdeserror = plugins['msdes']
__all__ = ["LitReader"]
XML_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
"""
OPF_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
@ -106,31 +109,54 @@ def consume_sized_utf8_string(bytes, zpad=False):
pos += 1
return u''.join(result), bytes[pos:]
class UnBinary(object):
AMPERSAND_RE = re.compile(
r'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9.-_:]+);)')
OPEN_ANGLE_RE = re.compile(r'<<(?![!]--)')
CLOSE_ANGLE_RE = re.compile(r'(?<!--)>>')
CLOSE_ANGLE_RE = re.compile(r'(?<!--)>>(?=>>|[^>])')
DOUBLE_ANGLE_RE = re.compile(r'([<>])\1')
def __init__(self, bin, path, manifest={}, map=HTML_MAP):
self.manifest = manifest
self.tag_map, self.attr_map, self.tag_to_attr_map = map
self.opf = map is OPF_MAP
self.bin = bin
self.is_html = map is HTML_MAP
self.dir = os.path.dirname(path)
self.buf = cStringIO.StringIO()
self.binary_to_text()
self.raw = self.buf.getvalue().lstrip().decode('utf-8')
self.escape_reserved()
buf = StringIO()
self.binary_to_text(bin, buf)
raw = buf.getvalue().lstrip().decode('utf-8')
raw = self.escape_reserved(raw)
self.tree = self.fixup_tree(raw)
def escape_reserved(self):
raw = self.raw
def fixup_node(self, node, in_head=False):
in_head = in_head or (node.tag == 'head')
if self.is_html and not in_head:
text = node.text
if text and text.isspace() and len(node) > 0:
node.text = None
span = etree.SubElement(node, 'span')
span.text = text
text = node.tail
if text and text.isspace():
node.tail = None
if self.is_html and not in_head:
span = etree.Element('span')
span.text = text
node.addnext(span)
for child in node.iterchildren():
if isinstance(child.tag, basestring):
self.fixup_node(child, in_head)
return node
def fixup_tree(self, raw):
return self.fixup_node(etree.fromstring(raw))
def escape_reserved(self, raw):
raw = self.AMPERSAND_RE.sub(r'&amp;', raw)
raw = self.OPEN_ANGLE_RE.sub(r'&lt;', raw)
raw = self.CLOSE_ANGLE_RE.sub(r'&gt;', raw)
raw = self.DOUBLE_ANGLE_RE.sub(r'\1', raw)
self.raw = raw
return raw
def item_path(self, internal_id):
try:
@ -148,19 +174,15 @@ class UnBinary(object):
relpath = (['..'] * (len(base) - index)) + target[index:]
return '/'.join(relpath)
def __unicode__(self):
return self.raw
def binary_to_text(self, base=0, depth=0):
def binary_to_text(self, bin, buf, index=0, depth=0):
tag_name = current_map = None
dynamic_tag = errors = 0
in_censorship = is_goingdown = False
state = 'text'
index = base
flags = 0
while index < len(self.bin):
c, index = read_utf8_char(self.bin, index)
while index < len(bin):
c, index = read_utf8_char(bin, index)
oc = ord(c)
if state == 'text':
@ -173,7 +195,7 @@ class UnBinary(object):
c = '>>'
elif c == '<':
c = '<<'
self.buf.write(c.encode('ascii', 'xmlcharrefreplace'))
buf.write(c.encode('ascii', 'xmlcharrefreplace'))
elif state == 'get flags':
if oc == 0:
@ -186,7 +208,7 @@ class UnBinary(object):
state = 'text' if oc == 0 else 'get attr'
if flags & FLAG_OPENING:
tag = oc
self.buf.write('<')
buf.write('<')
if not (flags & FLAG_CLOSING):
is_goingdown = True
if tag == 0x8000:
@ -203,7 +225,7 @@ class UnBinary(object):
tag_name = '?'+unichr(tag)+'?'
current_map = self.tag_to_attr_map[tag]
print 'WARNING: tag %s unknown' % unichr(tag)
self.buf.write(unicode(tag_name).encode('utf-8'))
buf.write(unicode(tag_name).encode('utf-8'))
elif flags & FLAG_CLOSING:
if depth == 0:
raise LitError('Extra closing tag')
@ -215,14 +237,14 @@ class UnBinary(object):
if not is_goingdown:
tag_name = None
dynamic_tag = 0
self.buf.write(' />')
buf.write(' />')
else:
self.buf.write('>')
index = self.binary_to_text(base=index, depth=depth+1)
buf.write('>')
index = self.binary_to_text(bin, buf, index, depth+1)
is_goingdown = False
if not tag_name:
raise LitError('Tag ends before it begins.')
self.buf.write(u''.join(
buf.write(u''.join(
('</', tag_name, '>')).encode('utf-8'))
dynamic_tag = 0
tag_name = None
@ -243,7 +265,7 @@ class UnBinary(object):
in_censorship = True
state = 'get value length'
continue
self.buf.write(' ' + unicode(attr).encode('utf-8') + '=')
buf.write(' ' + unicode(attr).encode('utf-8') + '=')
if attr in ['href', 'src']:
state = 'get href length'
else:
@ -251,40 +273,40 @@ class UnBinary(object):
elif state == 'get value length':
if not in_censorship:
self.buf.write('"')
buf.write('"')
count = oc - 1
if count == 0:
if not in_censorship:
self.buf.write('"')
buf.write('"')
in_censorship = False
state = 'get attr'
continue
state = 'get value'
if oc == 0xffff:
continue
if count < 0 or count > (len(self.bin) - index):
if count < 0 or count > (len(bin) - index):
raise LitError('Invalid character count %d' % count)
elif state == 'get value':
if count == 0xfffe:
if not in_censorship:
self.buf.write('%s"' % (oc - 1))
buf.write('%s"' % (oc - 1))
in_censorship = False
state = 'get attr'
elif count > 0:
if not in_censorship:
self.buf.write(c.encode(
buf.write(c.encode(
'ascii', 'xmlcharrefreplace'))
count -= 1
if count == 0:
if not in_censorship:
self.buf.write('"')
buf.write('"')
in_censorship = False
state = 'get attr'
elif state == 'get custom length':
count = oc - 1
if count <= 0 or count > len(self.bin)-index:
if count <= 0 or count > len(bin)-index:
raise LitError('Invalid character count %d' % count)
dynamic_tag += 1
state = 'get custom'
@ -294,26 +316,26 @@ class UnBinary(object):
tag_name += c
count -= 1
if count == 0:
self.buf.write(unicode(tag_name).encode('utf-8'))
buf.write(unicode(tag_name).encode('utf-8'))
state = 'get attr'
elif state == 'get attr length':
count = oc - 1
if count <= 0 or count > (len(self.bin) - index):
if count <= 0 or count > (len(bin) - index):
raise LitError('Invalid character count %d' % count)
self.buf.write(' ')
buf.write(' ')
state = 'get custom attr'
elif state == 'get custom attr':
self.buf.write(unicode(c).encode('utf-8'))
buf.write(unicode(c).encode('utf-8'))
count -= 1
if count == 0:
self.buf.write('=')
buf.write('=')
state = 'get value length'
elif state == 'get href length':
count = oc - 1
if count <= 0 or count > (len(self.bin) - index):
if count <= 0 or count > (len(bin) - index):
raise LitError('Invalid character count %d' % count)
href = ''
state = 'get href'
@ -326,11 +348,12 @@ class UnBinary(object):
path = self.item_path(doc)
if m and frag:
path += m + frag
self.buf.write((u'"%s"' % path).encode(
buf.write((u'"%s"' % path).encode(
'ascii', 'xmlcharrefreplace'))
state = 'get attr'
return index
class DirectoryEntry(object):
def __init__(self, name, section, offset, size):
self.name = name
@ -345,6 +368,7 @@ class DirectoryEntry(object):
def __str__(self):
return repr(self)
class ManifestItem(object):
def __init__(self, original, internal, mime_type, offset, root, state):
self.original = original
@ -372,65 +396,87 @@ class ManifestItem(object):
% (self.internal, self.path, self.mime_type, self.offset,
self.root, self.state)
def preserve(function):
def wrapper(self, *args, **kwargs):
opos = self._stream.tell()
opos = self.stream.tell()
try:
return function(self, *args, **kwargs)
finally:
self._stream.seek(opos)
self.stream.seek(opos)
functools.update_wrapper(wrapper, function)
return wrapper
class LitReader(object):
class LitFile(object):
PIECE_SIZE = 16
XML_PARSER = etree.XMLParser(
remove_blank_text=True, resolve_entities=False)
def __init__(self, filename_or_stream):
if hasattr(filename_or_stream, 'read'):
self.stream = filename_or_stream
else:
self.stream = open(filename_or_stream, 'rb')
try:
self.opf_path = os.path.splitext(
os.path.basename(self.stream.name))[0] + '.opf'
except AttributeError:
self.opf_path = 'content.opf'
if self.magic != 'ITOLITLS':
raise LitError('Not a valid LIT file')
if self.version != 1:
raise LitError('Unknown LIT version %d' % (self.version,))
self.read_secondary_header()
self.read_header_pieces()
self.read_section_names()
self.read_manifest()
self.read_drm()
def warn(self, msg):
print "WARNING: %s" % (msg,)
def magic():
@preserve
def fget(self):
self._stream.seek(0)
return self._stream.read(8)
self.stream.seek(0)
return self.stream.read(8)
return property(fget=fget)
magic = magic()
def version():
def fget(self):
self._stream.seek(8)
return u32(self._stream.read(4))
self.stream.seek(8)
return u32(self.stream.read(4))
return property(fget=fget)
version = version()
def hdr_len():
@preserve
def fget(self):
self._stream.seek(12)
return int32(self._stream.read(4))
self.stream.seek(12)
return int32(self.stream.read(4))
return property(fget=fget)
hdr_len = hdr_len()
def num_pieces():
@preserve
def fget(self):
self._stream.seek(16)
return int32(self._stream.read(4))
self.stream.seek(16)
return int32(self.stream.read(4))
return property(fget=fget)
num_pieces = num_pieces()
def sec_hdr_len():
@preserve
def fget(self):
self._stream.seek(20)
return int32(self._stream.read(4))
self.stream.seek(20)
return int32(self.stream.read(4))
return property(fget=fget)
sec_hdr_len = sec_hdr_len()
def guid():
@preserve
def fget(self):
self._stream.seek(24)
return self._stream.read(16)
self.stream.seek(24)
return self.stream.read(16)
return property(fget=fget)
guid = guid()
@ -440,44 +486,27 @@ class LitReader(object):
size = self.hdr_len \
+ (self.num_pieces * self.PIECE_SIZE) \
+ self.sec_hdr_len
self._stream.seek(0)
return self._stream.read(size)
self.stream.seek(0)
return self.stream.read(size)
return property(fget=fget)
header = header()
def __init__(self, filename_or_stream):
if hasattr(filename_or_stream, 'read'):
self._stream = filename_or_stream
else:
self._stream = open(filename_or_stream, 'rb')
if self.magic != 'ITOLITLS':
raise LitError('Not a valid LIT file')
if self.version != 1:
raise LitError('Unknown LIT version %d' % (self.version,))
self.entries = {}
self._read_secondary_header()
self._read_header_pieces()
self._read_section_names()
self._read_manifest()
self._read_meta()
self._read_drm()
@preserve
def __len__(self):
self._stream.seek(0, 2)
return self._stream.tell()
self.stream.seek(0, 2)
return self.stream.tell()
@preserve
def _read_raw(self, offset, size):
self._stream.seek(offset)
return self._stream.read(size)
def read_raw(self, offset, size):
self.stream.seek(offset)
return self.stream.read(size)
def _read_content(self, offset, size):
return self._read_raw(self.content_offset + offset, size)
def read_content(self, offset, size):
return self.read_raw(self.content_offset + offset, size)
def _read_secondary_header(self):
def read_secondary_header(self):
offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE)
bytes = self._read_raw(offset, self.sec_hdr_len)
bytes = self.read_raw(offset, self.sec_hdr_len)
offset = int32(bytes[4:])
while offset < len(bytes):
blocktype = bytes[offset:offset+4]
@ -505,21 +534,21 @@ class LitReader(object):
if not hasattr(self, 'content_offset'):
raise LitError('Could not figure out the content offset')
def _read_header_pieces(self):
def read_header_pieces(self):
src = self.header[self.hdr_len:]
for i in xrange(self.num_pieces):
piece = src[i * self.PIECE_SIZE:(i + 1) * self.PIECE_SIZE]
if u32(piece[4:]) != 0 or u32(piece[12:]) != 0:
raise LitError('Piece %s has 64bit value' % repr(piece))
offset, size = u32(piece), int32(piece[8:])
piece = self._read_raw(offset, size)
piece = self.read_raw(offset, size)
if i == 0:
continue # Dont need this piece
elif i == 1:
if u32(piece[8:]) != self.entry_chunklen or \
u32(piece[12:]) != self.entry_unknown:
raise LitError('Secondary header does not match piece')
self._read_directory(piece)
self.read_directory(piece)
elif i == 2:
if u32(piece[8:]) != self.count_chunklen or \
u32(piece[12:]) != self.count_unknown:
@ -530,12 +559,13 @@ class LitReader(object):
elif i == 4:
self.piece4_guid = piece
def _read_directory(self, piece):
def read_directory(self, piece):
if not piece.startswith('IFCM'):
raise LitError('Header piece #1 is not main directory.')
chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28])
if (32 + (num_chunks * chunk_size)) != len(piece):
raise LitError('IFCM HEADER has incorrect length')
raise LitError('IFCM header has incorrect length')
self.entries = {}
for i in xrange(num_chunks):
offset = 32 + (i * chunk_size)
chunk = piece[offset:offset + chunk_size]
@ -569,17 +599,17 @@ class LitReader(object):
entry = DirectoryEntry(name, section, offset, size)
self.entries[name] = entry
def _read_section_names(self):
def read_section_names(self):
if '::DataSpace/NameList' not in self.entries:
raise LitError('Lit file does not have a valid NameList')
raw = self.get_file('::DataSpace/NameList')
if len(raw) < 4:
raise LitError('Invalid Namelist section')
pos = 4
self.num_sections = u16(raw[2:pos])
self.section_names = [""]*self.num_sections
self.section_data = [None]*self.num_sections
for section in xrange(self.num_sections):
num_sections = u16(raw[2:pos])
self.section_names = [""] * num_sections
self.section_data = [None] * num_sections
for section in xrange(num_sections):
size = u16(raw[pos:pos+2])
pos += 2
size = size*2 + 2
@ -589,11 +619,12 @@ class LitReader(object):
raw[pos:pos+size].decode('utf-16-le').rstrip('\000')
pos += size
def _read_manifest(self):
def read_manifest(self):
if '/manifest' not in self.entries:
raise LitError('Lit file does not have a valid manifest')
raw = self.get_file('/manifest')
self.manifest = {}
self.paths = {self.opf_path: None}
while raw:
slen, raw = ord(raw[0]), raw[1:]
if slen == 0: break
@ -632,28 +663,9 @@ class LitReader(object):
for item in mlist:
if item.path[0] == '/':
item.path = os.path.basename(item.path)
self.paths[item.path] = item
def _pretty_print(self, xml):
f = cStringIO.StringIO(xml.encode('utf-8'))
doc = etree.parse(f, parser=self.XML_PARSER)
pretty = etree.tostring(doc, encoding='ascii', pretty_print=True)
return XML_DECL + unicode(pretty)
def _read_meta(self):
path = 'content.opf'
raw = self.get_file('/meta')
xml = OPF_DECL
try:
xml += unicode(UnBinary(raw, path, self.manifest, OPF_MAP))
except LitError:
if 'PENGUIN group' not in raw: raise
print "WARNING: attempting PENGUIN malformed OPF fix"
raw = raw.replace(
'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1)
xml += unicode(UnBinary(raw, path, self.manifest, OPF_MAP))
self.meta = xml
def _read_drm(self):
def read_drm(self):
self.drmlevel = 0
if '/DRMStorage/Licenses/EUL' in self.entries:
self.drmlevel = 5
@ -664,7 +676,7 @@ class LitReader(object):
else:
return
if self.drmlevel < 5:
msdes.deskey(self._calculate_deskey(), msdes.DE1)
msdes.deskey(self.calculate_deskey(), msdes.DE1)
bookkey = msdes.des(self.get_file('/DRMStorage/DRMSealed'))
if bookkey[0] != '\000':
raise LitError('Unable to decrypt title key!')
@ -672,7 +684,7 @@ class LitReader(object):
else:
raise DRMError("Cannot access DRM-protected book")
def _calculate_deskey(self):
def calculate_deskey(self):
hashfiles = ['/meta', '/DRMStorage/DRMSource']
if self.drmlevel == 3:
hashfiles.append('/DRMStorage/DRMBookplate')
@ -696,18 +708,18 @@ class LitReader(object):
def get_file(self, name):
entry = self.entries[name]
if entry.section == 0:
return self._read_content(entry.offset, entry.size)
return self.read_content(entry.offset, entry.size)
section = self.get_section(entry.section)
return section[entry.offset:entry.offset+entry.size]
def get_section(self, section):
data = self.section_data[section]
if not data:
data = self._get_section(section)
data = self.get_section_uncached(section)
self.section_data[section] = data
return data
def _get_section(self, section):
def get_section_uncached(self, section):
name = self.section_names[section]
path = '::DataSpace/Storage/' + name
transform = self.get_file(path + '/Transform/List')
@ -719,29 +731,29 @@ class LitReader(object):
raise LitError("ControlData is too short")
guid = msguid(transform)
if guid == DESENCRYPT_GUID:
content = self._decrypt(content)
content = self.decrypt(content)
control = control[csize:]
elif guid == LZXCOMPRESS_GUID:
reset_table = self.get_file(
'/'.join(('::DataSpace/Storage', name, 'Transform',
LZXCOMPRESS_GUID, 'InstanceData/ResetTable')))
content = self._decompress(content, control, reset_table)
content = self.decompress(content, control, reset_table)
control = control[csize:]
else:
raise LitError("Unrecognized transform: %s." % repr(guid))
transform = transform[16:]
return content
def _decrypt(self, content):
def decrypt(self, content):
length = len(content)
extra = length & 0x7
if extra > 0:
self._warn("content length not a multiple of block size")
self.warn("content length not a multiple of block size")
content += "\0" * (8 - extra)
msdes.deskey(self.bookkey, msdes.DE1)
return msdes.des(content)
def _decompress(self, content, control, reset_table):
def decompress(self, content, control, reset_table):
if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG+4] != "LZXC":
raise LitError("Invalid ControlData tag value")
if len(reset_table) < (RESET_INTERVAL + 8):
@ -782,7 +794,7 @@ class LitReader(object):
result.append(
lzx.decompress(content[base:size], window_bytes))
except lzx.LzxError:
self._warn("LZX decompression error; skipping chunk")
self.warn("LZX decompression error; skipping chunk")
bytes_remaining -= window_bytes
base = size
accum += int32(reset_table[RESET_INTERVAL:])
@ -792,55 +804,81 @@ class LitReader(object):
try:
result.append(lzx.decompress(content[base:], bytes_remaining))
except lzx.LzxError:
self._warn("LZX decompression error; skipping chunk")
self.warn("LZX decompression error; skipping chunk")
bytes_remaining = 0
if bytes_remaining > 0:
raise LitError("Failed to completely decompress section")
return ''.join(result)
def get_entry_content(self, entry, pretty_print=False):
if 'spine' in entry.state:
name = '/'.join(('/data', entry.internal, 'content'))
path = entry.path
raw = self.get_file(name)
decl, map = (OPF_DECL, OPF_MAP) \
if name == '/meta' else (HTML_DECL, HTML_MAP)
content = decl + unicode(UnBinary(raw, path, self.manifest, map))
if pretty_print:
content = self._pretty_print(content)
content = content.encode('utf-8')
else:
name = '/'.join(('/data', entry.internal))
content = self.get_file(name)
return content
def extract_content(self, output_dir=os.getcwdu(), pretty_print=False):
output_dir = os.path.abspath(output_dir)
try:
opf_path = os.path.splitext(
os.path.basename(self._stream.name))[0] + '.opf'
except AttributeError:
opf_path = 'content.opf'
opf_path = os.path.join(output_dir, opf_path)
self._ensure_dir(opf_path)
with open(opf_path, 'wb') as f:
xml = self.meta
if pretty_print:
xml = self._pretty_print(xml)
f.write(xml.encode('utf-8'))
for entry in self.manifest.values():
path = os.path.join(output_dir, entry.path)
self._ensure_dir(path)
with open(path, 'wb') as f:
f.write(self.get_entry_content(entry, pretty_print))
class LitReader(object):
def __init__(self, filename_or_stream):
self._litfile = LitFile(filename_or_stream)
def namelist(self):
return self._litfile.paths.keys()
def _read_meta(self):
path = 'content.opf'
raw = self._litfile.get_file('/meta')
try:
unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP)
except LitError:
if 'PENGUIN group' not in raw: raise
print "WARNING: attempting PENGUIN malformed OPF fix"
raw = raw.replace(
'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1)
unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP)
return unbin.tree
def read_xml(self, name):
entry = self._litfile.paths[name] if name else None
if entry is None:
content = self._read_meta()
elif 'spine' in entry.state:
internal = '/'.join(('/data', entry.internal, 'content'))
raw = self._litfile.get_file(internal)
unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP)
content = unbin.tree
else:
raise LitError('Requested non-XML content as XML')
return content
def read(self, name, pretty_print=False):
entry = self._litfile.paths[name] if name else None
if entry is None:
meta = self._read_meta()
content = OPF_DECL + etree.tostring(
meta, encoding='ascii', pretty_print=pretty_print)
elif 'spine' in entry.state:
internal = '/'.join(('/data', entry.internal, 'content'))
raw = self._litfile.get_file(internal)
unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP)
content = HTML_DECL + etree.tostring(
unbin.tree, encoding='ascii', pretty_print=pretty_print)
else:
internal = '/'.join(('/data', entry.internal))
content = self._litfile.get_file(internal)
return content
def meta():
def fget(self):
return self.read(self._litfile.opf_path)
return property(fget=fget)
meta = meta()
def _ensure_dir(self, path):
dir = os.path.dirname(path)
if not os.path.isdir(dir):
os.makedirs(dir)
def _warn(self, msg):
print "WARNING: %s" % (msg,)
def extract_content(self, output_dir=os.getcwdu(), pretty_print=False):
for name in self.namelist():
path = os.path.join(output_dir, name)
self._ensure_dir(path)
with open(path, 'wb') as f:
f.write(self.read(name, pretty_print=pretty_print))
def option_parser():
from calibre.utils.config import OptionParser
@ -850,7 +888,8 @@ def option_parser():
help=_('Output directory. Defaults to current directory.'))
parser.add_option(
'-p', '--pretty-print', default=False, action='store_true',
help=_('Legibly format extracted markup. May modify meaningful whitespace.'))
help=_('Legibly format extracted markup.' \
' May modify meaningful whitespace.'))
parser.add_option(
'--verbose', default=False, action='store_true',
help=_('Useful for debugging.'))