From 83470f3d6bd49f794437897d9587ce41e337e57a Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Tue, 16 Dec 2008 12:21:58 -0500 Subject: [PATCH] Remove some of the crazier aspects of the LitReader refactoring. --- src/calibre/ebooks/lit/oeb.py | 4 +- src/calibre/ebooks/lit/reader.py | 169 +++++++++++++++---------------- 2 files changed, 84 insertions(+), 89 deletions(-) diff --git a/src/calibre/ebooks/lit/oeb.py b/src/calibre/ebooks/lit/oeb.py index 85299b0c78..35497dfb25 100644 --- a/src/calibre/ebooks/lit/oeb.py +++ b/src/calibre/ebooks/lit/oeb.py @@ -516,7 +516,9 @@ class TOC(object): class OEBBook(object): - def __init__(self, opfpath, container=None): + def __init__(self, opfpath=None, container=None): + if not opfpath: + opfpath = 'content.opf' if not container: container = DirContainer(os.path.dirname(opfpath)) opfpath = os.path.basename(opfpath) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index e50436ee4f..e872cad85b 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -16,7 +16,7 @@ from lxml import etree from calibre.ebooks.lit import LitError from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP import calibre.ebooks.lit.mssha1 as mssha1 -from calibre.ebooks.lit.oeb import urlnormalize +from calibre.ebooks.lit.oeb import XML_PARSER, urlnormalize from calibre.ebooks import DRMError from calibre import plugins lzx, lxzerror = plugins['lzx'] @@ -111,6 +111,8 @@ def consume_sized_utf8_string(bytes, zpad=False): pos += 1 return u''.join(result), bytes[pos:] +def encode(string): + return unicode(string).encode('ascii', 'xmlcharrefreplace') class UnBinary(object): AMPERSAND_RE = re.compile( @@ -126,39 +128,17 @@ class UnBinary(object): self.dir = os.path.dirname(path) buf = StringIO() self.binary_to_text(bin, buf) - raw = buf.getvalue().lstrip().decode('utf-8') - raw = self.escape_reserved(raw) - self.tree = self.fixup_tree(raw) + self.raw = buf.getvalue().lstrip().decode('utf-8') + self.escape_reserved() + self._tree = None - def fixup_node(self, node, in_head=False): - in_head = in_head or (node.tag == 'head') - if self.is_html and not in_head: - text = node.text - if text and text.isspace() and len(node) > 0: - node.text = None - span = etree.SubElement(node, 'span') - span.text = text - text = node.tail - if text and text.isspace(): - node.tail = None - if self.is_html and not in_head: - span = etree.Element('span') - span.text = text - node.addnext(span) - for child in node.iterchildren(): - if isinstance(child.tag, basestring): - self.fixup_node(child, in_head) - return node - - def fixup_tree(self, raw): - return self.fixup_node(etree.fromstring(raw)) - - def escape_reserved(self, raw): + def escape_reserved(self): + raw = self.raw raw = self.AMPERSAND_RE.sub(r'&', raw) raw = self.OPEN_ANGLE_RE.sub(r'<', raw) raw = self.CLOSE_ANGLE_RE.sub(r'>', raw) raw = self.DOUBLE_ANGLE_RE.sub(r'\1', raw) - return raw + self.raw = raw def item_path(self, internal_id): try: @@ -176,6 +156,17 @@ class UnBinary(object): relpath = (['..'] * (len(base) - index)) + target[index:] return '/'.join(relpath) + def __unicode__(self): + return self.raw + + def tree(): + def fget(self): + if not self._tree: + self._tree = etree.fromstring(self.raw, parser=XML_PARSER) + return self._tree + return property(fget=fget) + tree = tree() + def binary_to_text(self, bin, buf, index=0, depth=0): tag_name = current_map = None dynamic_tag = errors = 0 @@ -197,7 +188,7 @@ class UnBinary(object): c = '>>' elif c == '<': c = '<<' - buf.write(c.encode('ascii', 'xmlcharrefreplace')) + buf.write(encode(c)) elif state == 'get flags': if oc == 0: @@ -227,7 +218,7 @@ class UnBinary(object): tag_name = '?'+unichr(tag)+'?' current_map = self.tag_to_attr_map[tag] print 'WARNING: tag %s unknown' % unichr(tag) - buf.write(unicode(tag_name).encode('utf-8')) + buf.write(encode(tag_name)) elif flags & FLAG_CLOSING: if depth == 0: raise LitError('Extra closing tag') @@ -246,8 +237,7 @@ class UnBinary(object): is_goingdown = False if not tag_name: raise LitError('Tag ends before it begins.') - buf.write(u''.join( - ('')).encode('utf-8')) + buf.write(encode(u''.join(('')))) dynamic_tag = 0 tag_name = None state = 'text' @@ -267,7 +257,7 @@ class UnBinary(object): in_censorship = True state = 'get value length' continue - buf.write(' ' + unicode(attr).encode('utf-8') + '=') + buf.write(' ' + encode(attr) + '=') if attr in ['href', 'src']: state = 'get href length' else: @@ -297,8 +287,7 @@ class UnBinary(object): state = 'get attr' elif count > 0: if not in_censorship: - buf.write(c.encode( - 'ascii', 'xmlcharrefreplace')) + buf.write(encode(c)) count -= 1 if count == 0: if not in_censorship: @@ -318,7 +307,7 @@ class UnBinary(object): tag_name += c count -= 1 if count == 0: - buf.write(unicode(tag_name).encode('utf-8')) + buf.write(encode(tag_name)) state = 'get attr' elif state == 'get attr length': @@ -329,7 +318,7 @@ class UnBinary(object): state = 'get custom attr' elif state == 'get custom attr': - buf.write(unicode(c).encode('utf-8')) + buf.write(encode(c)) count -= 1 if count == 0: buf.write('=') @@ -351,7 +340,7 @@ class UnBinary(object): if frag: path = '#'.join((path, frag)) path = urlnormalize(path) - self.buf.write((u'"%s"' % path).encode('utf-8')) + buf.write(encode(u'"%s"' % path)) state = 'get attr' return index @@ -816,10 +805,62 @@ class LitFile(object): class LitReader(object): def __init__(self, filename_or_stream): self._litfile = LitFile(filename_or_stream) - + def namelist(self): return self._litfile.paths.keys() + def read_xml(self, name): + entry = self._litfile.paths[name] if name else None + if entry is None: + content = self._read_meta() + elif 'spine' in entry.state: + internal = '/'.join(('/data', entry.internal, 'content')) + raw = self._litfile.get_file(internal) + unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) + content = unbin.tree + else: + raise LitError('Requested non-XML content as XML') + return content + + def read(self, name, pretty_print=False): + entry = self._litfile.paths[name] if name else None + if entry is None: + meta = self._read_meta() + content = OPF_DECL + etree.tostring( + meta, encoding='ascii', pretty_print=pretty_print) + elif 'spine' in entry.state: + internal = '/'.join(('/data', entry.internal, 'content')) + raw = self._litfile.get_file(internal) + unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) + content = HTML_DECL + if pretty_print: + content += etree.tostring(unbin.tree, + encoding='ascii', pretty_print=True) + else: + content += unicode(unbin) + else: + internal = '/'.join(('/data', entry.internal)) + content = self._litfile.get_file(internal) + return content + + def meta(): + def fget(self): + return self.read(self._litfile.opf_path) + return property(fget=fget) + meta = meta() + + def _ensure_dir(self, path): + dir = os.path.dirname(path) + if not os.path.isdir(dir): + os.makedirs(dir) + + def extract_content(self, output_dir=os.getcwdu(), pretty_print=False): + for name in self.namelist(): + path = os.path.join(output_dir, name) + self._ensure_dir(path) + with open(path, 'wb') as f: + f.write(self.read(name, pretty_print=pretty_print)) + def _read_meta(self): path = 'content.opf' raw = self._litfile.get_file('/meta') @@ -833,54 +874,6 @@ class LitReader(object): unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP) return unbin.tree - def read_xml(self, name): - entry = self._litfile.paths[name] if name else None - if entry is None: - content = self._read_meta() - elif 'spine' in entry.state: - internal = '/'.join(('/data', entry.internal, 'content')) - raw = self._litfile.get_file(internal) - unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) - content = unbin.tree - else: - raise LitError('Requested non-XML content as XML') - return content - - def read(self, name, pretty_print=False): - entry = self._litfile.paths[name] if name else None - if entry is None: - meta = self._read_meta() - content = OPF_DECL + etree.tostring( - meta, encoding='ascii', pretty_print=pretty_print) - elif 'spine' in entry.state: - internal = '/'.join(('/data', entry.internal, 'content')) - raw = self._litfile.get_file(internal) - unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) - content = HTML_DECL + etree.tostring( - unbin.tree, encoding='ascii', pretty_print=pretty_print) - else: - internal = '/'.join(('/data', entry.internal)) - content = self._litfile.get_file(internal) - return content - - def meta(): - def fget(self): - return self.read(self._litfile.opf_path) - return property(fget=fget) - meta = meta() - - def _ensure_dir(self, path): - dir = os.path.dirname(path) - if not os.path.isdir(dir): - os.makedirs(dir) - - def extract_content(self, output_dir=os.getcwdu(), pretty_print=False): - for name in self.namelist(): - path = os.path.join(output_dir, name) - self._ensure_dir(path) - with open(path, 'wb') as f: - f.write(self.read(name, pretty_print=pretty_print)) - def option_parser(): from calibre.utils.config import OptionParser