Remove some of the crazier aspects of the LitReader refactoring.

This commit is contained in:
Marshall T. Vandegrift 2008-12-16 12:21:58 -05:00
parent 391e94a075
commit 83470f3d6b
2 changed files with 84 additions and 89 deletions

View File

@@ -516,7 +516,9 @@ class TOC(object):
class OEBBook(object): class OEBBook(object):
def __init__(self, opfpath, container=None): def __init__(self, opfpath=None, container=None):
if not opfpath:
opfpath = 'content.opf'
if not container: if not container:
container = DirContainer(os.path.dirname(opfpath)) container = DirContainer(os.path.dirname(opfpath))
opfpath = os.path.basename(opfpath) opfpath = os.path.basename(opfpath)

View File

@@ -16,7 +16,7 @@ from lxml import etree
from calibre.ebooks.lit import LitError from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
import calibre.ebooks.lit.mssha1 as mssha1 import calibre.ebooks.lit.mssha1 as mssha1
from calibre.ebooks.lit.oeb import urlnormalize from calibre.ebooks.lit.oeb import XML_PARSER, urlnormalize
from calibre.ebooks import DRMError from calibre.ebooks import DRMError
from calibre import plugins from calibre import plugins
lzx, lxzerror = plugins['lzx'] lzx, lxzerror = plugins['lzx']
@@ -111,6 +111,8 @@ def consume_sized_utf8_string(bytes, zpad=False):
pos += 1 pos += 1
return u''.join(result), bytes[pos:] return u''.join(result), bytes[pos:]
def encode(string):
return unicode(string).encode('ascii', 'xmlcharrefreplace')
class UnBinary(object): class UnBinary(object):
AMPERSAND_RE = re.compile( AMPERSAND_RE = re.compile(
@@ -126,39 +128,17 @@ class UnBinary(object):
self.dir = os.path.dirname(path) self.dir = os.path.dirname(path)
buf = StringIO() buf = StringIO()
self.binary_to_text(bin, buf) self.binary_to_text(bin, buf)
raw = buf.getvalue().lstrip().decode('utf-8') self.raw = buf.getvalue().lstrip().decode('utf-8')
raw = self.escape_reserved(raw) self.escape_reserved()
self.tree = self.fixup_tree(raw) self._tree = None
def fixup_node(self, node, in_head=False): def escape_reserved(self):
in_head = in_head or (node.tag == 'head') raw = self.raw
if self.is_html and not in_head:
text = node.text
if text and text.isspace() and len(node) > 0:
node.text = None
span = etree.SubElement(node, 'span')
span.text = text
text = node.tail
if text and text.isspace():
node.tail = None
if self.is_html and not in_head:
span = etree.Element('span')
span.text = text
node.addnext(span)
for child in node.iterchildren():
if isinstance(child.tag, basestring):
self.fixup_node(child, in_head)
return node
def fixup_tree(self, raw):
return self.fixup_node(etree.fromstring(raw))
def escape_reserved(self, raw):
raw = self.AMPERSAND_RE.sub(r'&amp;', raw) raw = self.AMPERSAND_RE.sub(r'&amp;', raw)
raw = self.OPEN_ANGLE_RE.sub(r'&lt;', raw) raw = self.OPEN_ANGLE_RE.sub(r'&lt;', raw)
raw = self.CLOSE_ANGLE_RE.sub(r'&gt;', raw) raw = self.CLOSE_ANGLE_RE.sub(r'&gt;', raw)
raw = self.DOUBLE_ANGLE_RE.sub(r'\1', raw) raw = self.DOUBLE_ANGLE_RE.sub(r'\1', raw)
return raw self.raw = raw
def item_path(self, internal_id): def item_path(self, internal_id):
try: try:
@@ -176,6 +156,17 @@ class UnBinary(object):
relpath = (['..'] * (len(base) - index)) + target[index:] relpath = (['..'] * (len(base) - index)) + target[index:]
return '/'.join(relpath) return '/'.join(relpath)
def __unicode__(self):
return self.raw
def tree():
def fget(self):
if not self._tree:
self._tree = etree.fromstring(self.raw, parser=XML_PARSER)
return self._tree
return property(fget=fget)
tree = tree()
def binary_to_text(self, bin, buf, index=0, depth=0): def binary_to_text(self, bin, buf, index=0, depth=0):
tag_name = current_map = None tag_name = current_map = None
dynamic_tag = errors = 0 dynamic_tag = errors = 0
@@ -197,7 +188,7 @@ class UnBinary(object):
c = '>>' c = '>>'
elif c == '<': elif c == '<':
c = '<<' c = '<<'
buf.write(c.encode('ascii', 'xmlcharrefreplace')) buf.write(encode(c))
elif state == 'get flags': elif state == 'get flags':
if oc == 0: if oc == 0:
@@ -227,7 +218,7 @@ class UnBinary(object):
tag_name = '?'+unichr(tag)+'?' tag_name = '?'+unichr(tag)+'?'
current_map = self.tag_to_attr_map[tag] current_map = self.tag_to_attr_map[tag]
print 'WARNING: tag %s unknown' % unichr(tag) print 'WARNING: tag %s unknown' % unichr(tag)
buf.write(unicode(tag_name).encode('utf-8')) buf.write(encode(tag_name))
elif flags & FLAG_CLOSING: elif flags & FLAG_CLOSING:
if depth == 0: if depth == 0:
raise LitError('Extra closing tag') raise LitError('Extra closing tag')
@@ -246,8 +237,7 @@ class UnBinary(object):
is_goingdown = False is_goingdown = False
if not tag_name: if not tag_name:
raise LitError('Tag ends before it begins.') raise LitError('Tag ends before it begins.')
buf.write(u''.join( buf.write(encode(u''.join(('</', tag_name, '>'))))
('</', tag_name, '>')).encode('utf-8'))
dynamic_tag = 0 dynamic_tag = 0
tag_name = None tag_name = None
state = 'text' state = 'text'
@@ -267,7 +257,7 @@ class UnBinary(object):
in_censorship = True in_censorship = True
state = 'get value length' state = 'get value length'
continue continue
buf.write(' ' + unicode(attr).encode('utf-8') + '=') buf.write(' ' + encode(attr) + '=')
if attr in ['href', 'src']: if attr in ['href', 'src']:
state = 'get href length' state = 'get href length'
else: else:
@@ -297,8 +287,7 @@ class UnBinary(object):
state = 'get attr' state = 'get attr'
elif count > 0: elif count > 0:
if not in_censorship: if not in_censorship:
buf.write(c.encode( buf.write(encode(c))
'ascii', 'xmlcharrefreplace'))
count -= 1 count -= 1
if count == 0: if count == 0:
if not in_censorship: if not in_censorship:
@@ -318,7 +307,7 @@ class UnBinary(object):
tag_name += c tag_name += c
count -= 1 count -= 1
if count == 0: if count == 0:
buf.write(unicode(tag_name).encode('utf-8')) buf.write(encode(tag_name))
state = 'get attr' state = 'get attr'
elif state == 'get attr length': elif state == 'get attr length':
@@ -329,7 +318,7 @@ class UnBinary(object):
state = 'get custom attr' state = 'get custom attr'
elif state == 'get custom attr': elif state == 'get custom attr':
buf.write(unicode(c).encode('utf-8')) buf.write(encode(c))
count -= 1 count -= 1
if count == 0: if count == 0:
buf.write('=') buf.write('=')
@@ -351,7 +340,7 @@ class UnBinary(object):
if frag: if frag:
path = '#'.join((path, frag)) path = '#'.join((path, frag))
path = urlnormalize(path) path = urlnormalize(path)
self.buf.write((u'"%s"' % path).encode('utf-8')) buf.write(encode(u'"%s"' % path))
state = 'get attr' state = 'get attr'
return index return index
@@ -816,10 +805,62 @@ class LitFile(object):
class LitReader(object): class LitReader(object):
def __init__(self, filename_or_stream): def __init__(self, filename_or_stream):
self._litfile = LitFile(filename_or_stream) self._litfile = LitFile(filename_or_stream)
def namelist(self): def namelist(self):
return self._litfile.paths.keys() return self._litfile.paths.keys()
def read_xml(self, name):
entry = self._litfile.paths[name] if name else None
if entry is None:
content = self._read_meta()
elif 'spine' in entry.state:
internal = '/'.join(('/data', entry.internal, 'content'))
raw = self._litfile.get_file(internal)
unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP)
content = unbin.tree
else:
raise LitError('Requested non-XML content as XML')
return content
def read(self, name, pretty_print=False):
entry = self._litfile.paths[name] if name else None
if entry is None:
meta = self._read_meta()
content = OPF_DECL + etree.tostring(
meta, encoding='ascii', pretty_print=pretty_print)
elif 'spine' in entry.state:
internal = '/'.join(('/data', entry.internal, 'content'))
raw = self._litfile.get_file(internal)
unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP)
content = HTML_DECL
if pretty_print:
content += etree.tostring(unbin.tree,
encoding='ascii', pretty_print=True)
else:
content += unicode(unbin)
else:
internal = '/'.join(('/data', entry.internal))
content = self._litfile.get_file(internal)
return content
def meta():
def fget(self):
return self.read(self._litfile.opf_path)
return property(fget=fget)
meta = meta()
def _ensure_dir(self, path):
dir = os.path.dirname(path)
if not os.path.isdir(dir):
os.makedirs(dir)
def extract_content(self, output_dir=os.getcwdu(), pretty_print=False):
for name in self.namelist():
path = os.path.join(output_dir, name)
self._ensure_dir(path)
with open(path, 'wb') as f:
f.write(self.read(name, pretty_print=pretty_print))
def _read_meta(self): def _read_meta(self):
path = 'content.opf' path = 'content.opf'
raw = self._litfile.get_file('/meta') raw = self._litfile.get_file('/meta')
@@ -833,54 +874,6 @@ class LitReader(object):
unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP) unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP)
return unbin.tree return unbin.tree
def read_xml(self, name):
entry = self._litfile.paths[name] if name else None
if entry is None:
content = self._read_meta()
elif 'spine' in entry.state:
internal = '/'.join(('/data', entry.internal, 'content'))
raw = self._litfile.get_file(internal)
unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP)
content = unbin.tree
else:
raise LitError('Requested non-XML content as XML')
return content
def read(self, name, pretty_print=False):
entry = self._litfile.paths[name] if name else None
if entry is None:
meta = self._read_meta()
content = OPF_DECL + etree.tostring(
meta, encoding='ascii', pretty_print=pretty_print)
elif 'spine' in entry.state:
internal = '/'.join(('/data', entry.internal, 'content'))
raw = self._litfile.get_file(internal)
unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP)
content = HTML_DECL + etree.tostring(
unbin.tree, encoding='ascii', pretty_print=pretty_print)
else:
internal = '/'.join(('/data', entry.internal))
content = self._litfile.get_file(internal)
return content
def meta():
def fget(self):
return self.read(self._litfile.opf_path)
return property(fget=fget)
meta = meta()
def _ensure_dir(self, path):
dir = os.path.dirname(path)
if not os.path.isdir(dir):
os.makedirs(dir)
def extract_content(self, output_dir=os.getcwdu(), pretty_print=False):
for name in self.namelist():
path = os.path.join(output_dir, name)
self._ensure_dir(path)
with open(path, 'wb') as f:
f.write(self.read(name, pretty_print=pretty_print))
def option_parser(): def option_parser():
from calibre.utils.config import OptionParser from calibre.utils.config import OptionParser