Refactor parts of OEBBook to sanely handle much more broken OPF.

2025-07-09 03:04:10 -04:00 · 2009-01-26 23:08:04 -05:00 · 2009-01-26 23:08:04 -05:00 · 076ec9bbad
commit 076ec9bbad
parent a3ad3a07dd
2 changed files with 337 additions and 54 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -10,7 +10,7 @@ import os
 import sys
 from collections import defaultdict
 from types import StringTypes
-from itertools import izip, count
+from itertools import izip, count, chain
 from urlparse import urldefrag, urlparse, urlunparse
 from urllib import unquote as urlunquote
 import logging
@@ -22,6 +22,7 @@ from lxml import html
 from calibre import LoggingInterface
 from calibre.translations.dynamic import translate
 from calibre.startup import get_lang
 from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
 XML_NS = 'http://www.w3.org/XML/1998/namespace'
 XHTML_NS = 'http://www.w3.org/1999/xhtml'
@@ -40,6 +41,7 @@ XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS,
           'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,
           'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS,
           'svg': SVG_NS, 'xl': XLINK_NS}
 DC_PREFIXES = ('d11', 'd10', 'd09')
 def XML(name): return '{%s}%s' % (XML_NS, name)
 def XHTML(name): return '{%s}%s' % (XHTML_NS, name)
@@ -61,6 +63,7 @@ GIF_MIME = 'image/gif'
 JPEG_MIME = 'image/jpeg'
 PNG_MIME = 'image/png'
 SVG_MIME = 'image/svg+xml'
 BINARY_MIME = 'application/octet-stream'
 OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
 OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'])
@@ -69,6 +72,8 @@ OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME])
 MS_COVER_TYPE = 'other.ms-coverimage-standard'
 ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
 COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
 def element(parent, *args, **kwargs):
    if parent is not None:
@@ -191,11 +196,8 @@ class Metadata(object):
        def __init__(self, term, value, fq_attrib={}, **kwargs):
            self.fq_attrib = fq_attrib = dict(fq_attrib)
            fq_attrib.update(kwargs)
-            if term == OPF('meta') and not value:
+            if barename(term).lower() in Metadata.TERMS and \
-                term = self.fq_attrib.pop('name')
+               (not namespace(term) or namespace(term) in DC_NSES):
                value = self.fq_attrib.pop('content')
            elif barename(term).lower() in Metadata.TERMS and \
                 (not namespace(term) or namespace(term) in DC_NSES):
                # Anything looking like Dublin Core is coerced
                term = DC(barename(term).lower())
            elif namespace(term) == OPF2_NS:
@@ -329,8 +331,11 @@ class Manifest(object):
                % (self.id, self.href, self.media_type)
        def _force_xhtml(self, data):
            # Possibly decode in user-specified encoding
            if self.oeb.encoding is not None:
                data = data.decode(self.oeb.encoding, 'replace')
            # Force to UNIX line encodings
            data = data.replace('\r\n', '\n').replace('\r', '\n')
            # Handle broken XHTML w/ SVG (ugh)
            if 'svg:' in data and SVG_NS not in data:
                data = data.replace(
@@ -338,18 +343,29 @@ class Manifest(object):
            if 'xlink:' in data and XLINK_NS not in data:
                data = data.replace(
                    '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
            # Try with more & more drastic measures to parse
            try:
                data = etree.fromstring(data)
            except etree.XMLSyntaxError:
-                data = html.fromstring(data)
+                repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
-                data = etree.tostring(data, encoding=unicode)
+                data = ENTITY_RE.sub(repl, data)
-                data = etree.fromstring(data)
+                try:
                    data = etree.fromstring(data)
                except etree.XMLSyntaxError:
                    self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
                    data = html.fromstring(data)
                    data.attrib.pop('xmlns', None)
                    data = etree.tostring(data, encoding=unicode)
                    data = etree.fromstring(data)
            # Force into the XHTML namespace
            if namespace(data.tag) != XHTML_NS:
                data.attrib['xmlns'] = XHTML_NS
                data = etree.tostring(data, encoding=unicode)
                data = etree.fromstring(data)
            # Remove any encoding-specifying <meta/> elements
            for meta in self.META_XP(data):
                meta.getparent().remove(meta)
            # Ensure has a <head/>
            head = xpath(data, '/h:html/h:head')
            head = head[0] if head else None
            if head is None:
@@ -364,6 +380,7 @@ class Manifest(object):
                    'File %r missing <title/> element' % self.href)
                title = etree.SubElement(head, XHTML('title'))
                title.text = self.oeb.translate(__('Unknown'))
            # Ensure has a <body/>
            if not xpath(data, '/h:html/h:body'):
                self.oeb.logger.warn(
                    'File %r missing <body/> element' % self.href)
@@ -494,9 +511,9 @@ class Manifest(object):
        elem = element(parent, 'manifest')
        for item in self.ids.values():
            media_type = item.media_type
-            if media_type == XHTML_MIME:
+            if media_type in OEB_DOCS:
                media_type = OEB_DOC_MIME
-            elif media_type == CSS_MIME:
+            elif media_type in OEB_STYLES:
                media_type = OEB_CSS_MIME
            attrib = {'id': item.id, 'href': item.href,
                      'media-type': media_type}
@@ -508,6 +525,11 @@ class Manifest(object):
    def to_opf2(self, parent=None):
        elem = element(parent, OPF('manifest'))
        for item in self.ids.values():
            media_type = item.media_type
            if media_type in OEB_DOCS:
                media_type = XHTML_MIME
            elif media_type in OEB_STYLES:
                media_type = CSS_MIME
            attrib = {'id': item.id, 'href': item.href,
                      'media-type': item.media_type}
            if item.fallback:
@@ -771,25 +793,19 @@ class OEBBook(object):
            opf = self._read_opf(opfpath)
            self._all_from_opf(opf)
-    def _convert_opf1(self, opf):
+    def _clean_opf(self, opf):
-        # Seriously, seriously wrong
+        for elem in opf.iter():
-        if namespace(opf.tag) == OPF1_NS:
+            if isinstance(elem.tag, basestring) \
-            opf.tag = barename(opf.tag)
+               and namespace(elem.tag) in ('', OPF1_NS):
-            for elem in opf.iterdescendants():
+                elem.tag = OPF(barename(elem.tag))
                if isinstance(elem.tag, basestring) \
                   and namespace(elem.tag) == OPF1_NS:
                    elem.tag = barename(elem.tag)
        attrib = dict(opf.attrib)
        attrib['version'] = '2.0'
        nroot = etree.Element(OPF('package'),
            nsmap={None: OPF2_NS}, attrib=attrib)
        metadata = etree.SubElement(nroot, OPF('metadata'),
            nsmap={'opf': OPF2_NS, 'dc': DC11_NS,
                   'xsi': XSI_NS, 'dcterms': DCTERMS_NS})
-        for prefix in ('d11', 'd10', 'd09'):
+        dc = lambda prefix: xpath(opf, 'o2:metadata//%s:*' % prefix)
-            elements = xpath(opf, 'metadata//%s:*' % prefix)
+        for element in chain(*(dc(prefix) for prefix in DC_PREFIXES)):
            if elements: break
        for element in elements:
            if not element.text: continue
            tag = barename(element.tag).lower()
            element.tag = '{%s}%s' % (DC11_NS, tag)
@@ -799,28 +815,27 @@ class OEBBook(object):
                    element.attrib[nsname] = element.attrib[name]
                    del element.attrib[name]
            metadata.append(element)
-        for element in opf.xpath('metadata//meta'):
+        for element in xpath(opf, 'o2:metadata//o2:meta'):
            metadata.append(element)
-        for item in opf.xpath('manifest/item'):
+        for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
-            media_type = item.attrib['media-type'].lower()
+            for element in xpath(opf, tag):
            if media_type in OEB_DOCS:
                media_type = XHTML_MIME
            elif media_type in OEB_STYLES:
                media_type = CSS_MIME
            item.attrib['media-type'] = media_type
        for tag in ('manifest', 'spine', 'tours', 'guide'):
            for element in opf.xpath(tag):
                nroot.append(element)
-        return etree.fromstring(etree.tostring(nroot))
+        return nroot
    def _read_opf(self, opfpath):
-        opf = self.container.read_xml(opfpath)
+        opf = self.container.read(opfpath)
-        version = float(opf.get('version', 1.0))
+        opf = opf.replace('\r\n', '\n').replace('\r', '\n')
        try:
            opf = etree.fromstring(opf)
        except etree.XMLSyntaxError:
            repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
            opf = ENTITY_RE.sub(repl, opf)
            opf = etree.fromstring(opf)
            self.logger.warn('OPF contains invalid HTML named entities')
        ns = namespace(opf.tag)
        if ns not in ('', OPF1_NS, OPF2_NS):
            raise OEBError('Invalid namespace %r for OPF document' % ns)
-        if ns != OPF2_NS or version < 2.0:
+        opf = self._clean_opf(opf)
            opf = self._convert_opf1(opf)
        return opf
    def _metadata_from_opf(self, opf):
@@ -829,8 +844,16 @@ class OEBBook(object):
        self.metadata = metadata = Metadata(self)
        ignored = (OPF('dc-metadata'), OPF('x-metadata'))
        for elem in xpath(opf, '/o2:package/o2:metadata//*'):
-            if elem.tag not in ignored and (elem.text or elem.attrib):
+            if elem.tag in ignored: continue
-                metadata.add(elem.tag, elem.text, elem.attrib)
+            term = elem.tag
            value = elem.text
            if term == OPF('meta'):
                term = elem.attrib.pop('name', None)
                value = elem.attrib.pop('content', None)
            if value:
                value = COLLAPSE_RE.sub(' ', value.strip())
            if term and (value or elem.attrib):
                metadata.add(term, value, elem.attrib)
        haveuuid = haveid = False
        for ident in metadata.identifier:
            if unicode(ident).startswith('urn:uuid:'):
@@ -845,36 +868,38 @@ class OEBBook(object):
                self.uid = item
                break
        else:
-            self.logger.warn(u'Unique-identifier %r not found.' % uid)
+            self.logger.warn(u'Unique-identifier %r not found' % uid)
            for ident in metadata.identifier:
                if 'id' in ident.attrib:
                    self.uid = metadata.identifier[0]
                    break
        if not metadata.language:
-            self.logger.warn(u'Language not specified.')
+            self.logger.warn(u'Language not specified')
            metadata.add('language', get_lang())
        if not metadata.creator:
-            self.logger.warn(u'Creator not specified.')
+            self.logger.warn('Creator not specified')
-            metadata.add('creator', _('Unknown'))
+            metadata.add('creator', self.translate(__('Unknown')))
        if not metadata.title:
-            self.logger.warn(u'Title not specified.')
+            self.logger.warn('Title not specified')
-            metadata.add('title', _('Unknown'))
+            metadata.add('title', self.translate(__('Unknown')))
    def _manifest_from_opf(self, opf):
        self.manifest = manifest = Manifest(self)
        for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
            id = elem.get('id')
            href = elem.get('href')
-            media_type = elem.get('media-type')
+            media_type = elem.get('media-type', None)
            if media_type is None:
                media_type = elem.get('mediatype', BINARY_MIME)
            fallback = elem.get('fallback')
            if href in manifest.hrefs:
-                self.logger.warn(u'Duplicate manifest entry for %r.' % href)
+                self.logger.warn(u'Duplicate manifest entry for %r' % href)
                continue
            if not self.container.exists(href):
-                self.logger.warn(u'Manifest item %r not found.' % href)
+                self.logger.warn(u'Manifest item %r not found' % href)
                continue
            if id in manifest.ids:
-                self.logger.warn(u'Duplicate manifest id %r.' % id)
+                self.logger.warn(u'Duplicate manifest id %r' % id)
                id, href = manifest.generate(id, href)
            manifest.add(id, href, media_type, fallback)
@@ -883,7 +908,7 @@ class OEBBook(object):
        for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
            idref = elem.get('idref')
            if idref not in self.manifest:
-                self.logger.warn(u'Spine item %r not found.' % idref)
+                self.logger.warn(u'Spine item %r not found' % idref)
                continue
            item = self.manifest[idref]
            spine.add(item, elem.get('linear'))
@@ -931,7 +956,8 @@ class OEBBook(object):
        item = self.manifest.ids[id]
        ncx = item.data
        self.manifest.remove(item)
-        title = xpath(ncx, 'ncx:docTitle/ncx:text/text()')[0]
+        title = xpath(ncx, 'ncx:docTitle/ncx:text/text()')
        title = title[0].strip() if title else unicode(self.metadata.title)
        self.toc = toc = TOC(title)
        navmaps = xpath(ncx, 'ncx:navMap')
        for navmap in navmaps:
@@ -988,7 +1014,8 @@ class OEBBook(object):
            if not item.linear: continue
            html = item.data
            title = xpath(html, '/h:html/h:head/h:title/text()')
-            if title: titles.append(title[0])
+            title = title[0].strip() if title else None
            if title: titles.append(title)
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()' % (tag,)
--- a/src/calibre/ebooks/oeb/entitydefs.py
+++ b/src/calibre/ebooks/oeb/entitydefs.py
@@ -0,0 +1,256 @@
 """
 Replacement for htmlentitydefs which uses purely numeric entities.
 """
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
 ENTITYDEFS = \
    {'AElig': '&#198;',
     'Aacute': '&#193;',
     'Acirc': '&#194;',
     'Agrave': '&#192;',
     'Alpha': '&#913;',
     'Aring': '&#197;',
     'Atilde': '&#195;',
     'Auml': '&#196;',
     'Beta': '&#914;',
     'Ccedil': '&#199;',
     'Chi': '&#935;',
     'Dagger': '&#8225;',
     'Delta': '&#916;',
     'ETH': '&#208;',
     'Eacute': '&#201;',
     'Ecirc': '&#202;',
     'Egrave': '&#200;',
     'Epsilon': '&#917;',
     'Eta': '&#919;',
     'Euml': '&#203;',
     'Gamma': '&#915;',
     'Iacute': '&#205;',
     'Icirc': '&#206;',
     'Igrave': '&#204;',
     'Iota': '&#921;',
     'Iuml': '&#207;',
     'Kappa': '&#922;',
     'Lambda': '&#923;',
     'Mu': '&#924;',
     'Ntilde': '&#209;',
     'Nu': '&#925;',
     'OElig': '&#338;',
     'Oacute': '&#211;',
     'Ocirc': '&#212;',
     'Ograve': '&#210;',
     'Omega': '&#937;',
     'Omicron': '&#927;',
     'Oslash': '&#216;',
     'Otilde': '&#213;',
     'Ouml': '&#214;',
     'Phi': '&#934;',
     'Pi': '&#928;',
     'Prime': '&#8243;',
     'Psi': '&#936;',
     'Rho': '&#929;',
     'Scaron': '&#352;',
     'Sigma': '&#931;',
     'THORN': '&#222;',
     'Tau': '&#932;',
     'Theta': '&#920;',
     'Uacute': '&#218;',
     'Ucirc': '&#219;',
     'Ugrave': '&#217;',
     'Upsilon': '&#933;',
     'Uuml': '&#220;',
     'Xi': '&#926;',
     'Yacute': '&#221;',
     'Yuml': '&#376;',
     'Zeta': '&#918;',
     'aacute': '&#225;',
     'acirc': '&#226;',
     'acute': '&#180;',
     'aelig': '&#230;',
     'agrave': '&#224;',
     'alefsym': '&#8501;',
     'alpha': '&#945;',
     'and': '&#8743;',
     'ang': '&#8736;',
     'aring': '&#229;',
     'asymp': '&#8776;',
     'atilde': '&#227;',
     'auml': '&#228;',
     'bdquo': '&#8222;',
     'beta': '&#946;',
     'brvbar': '&#166;',
     'bull': '&#8226;',
     'cap': '&#8745;',
     'ccedil': '&#231;',
     'cedil': '&#184;',
     'cent': '&#162;',
     'chi': '&#967;',
     'circ': '&#710;',
     'clubs': '&#9827;',
     'cong': '&#8773;',
     'copy': '&#169;',
     'crarr': '&#8629;',
     'cup': '&#8746;',
     'curren': '&#164;',
     'dArr': '&#8659;',
     'dagger': '&#8224;',
     'darr': '&#8595;',
     'deg': '&#176;',
     'delta': '&#948;',
     'diams': '&#9830;',
     'divide': '&#247;',
     'eacute': '&#233;',
     'ecirc': '&#234;',
     'egrave': '&#232;',
     'empty': '&#8709;',
     'emsp': '&#8195;',
     'ensp': '&#8194;',
     'epsilon': '&#949;',
     'equiv': '&#8801;',
     'eta': '&#951;',
     'eth': '&#240;',
     'euml': '&#235;',
     'euro': '&#8364;',
     'exist': '&#8707;',
     'fnof': '&#402;',
     'forall': '&#8704;',
     'frac12': '&#189;',
     'frac14': '&#188;',
     'frac34': '&#190;',
     'frasl': '&#8260;',
     'gamma': '&#947;',
     'ge': '&#8805;',
     'hArr': '&#8660;',
     'harr': '&#8596;',
     'hearts': '&#9829;',
     'hellip': '&#8230;',
     'iacute': '&#237;',
     'icirc': '&#238;',
     'iexcl': '&#161;',
     'igrave': '&#236;',
     'image': '&#8465;',
     'infin': '&#8734;',
     'int': '&#8747;',
     'iota': '&#953;',
     'iquest': '&#191;',
     'isin': '&#8712;',
     'iuml': '&#239;',
     'kappa': '&#954;',
     'lArr': '&#8656;',
     'lambda': '&#955;',
     'lang': '&#9001;',
     'laquo': '&#171;',
     'larr': '&#8592;',
     'lceil': '&#8968;',
     'ldquo': '&#8220;',
     'le': '&#8804;',
     'lfloor': '&#8970;',
     'lowast': '&#8727;',
     'loz': '&#9674;',
     'lrm': '&#8206;',
     'lsaquo': '&#8249;',
     'lsquo': '&#8216;',
     'macr': '&#175;',
     'mdash': '&#8212;',
     'micro': '&#181;',
     'middot': '&#183;',
     'minus': '&#8722;',
     'mu': '&#956;',
     'nabla': '&#8711;',
     'nbsp': '&#160;',
     'ndash': '&#8211;',
     'ne': '&#8800;',
     'ni': '&#8715;',
     'not': '&#172;',
     'notin': '&#8713;',
     'nsub': '&#8836;',
     'ntilde': '&#241;',
     'nu': '&#957;',
     'oacute': '&#243;',
     'ocirc': '&#244;',
     'oelig': '&#339;',
     'ograve': '&#242;',
     'oline': '&#8254;',
     'omega': '&#969;',
     'omicron': '&#959;',
     'oplus': '&#8853;',
     'or': '&#8744;',
     'ordf': '&#170;',
     'ordm': '&#186;',
     'oslash': '&#248;',
     'otilde': '&#245;',
     'otimes': '&#8855;',
     'ouml': '&#246;',
     'para': '&#182;',
     'part': '&#8706;',
     'permil': '&#8240;',
     'perp': '&#8869;',
     'phi': '&#966;',
     'pi': '&#960;',
     'piv': '&#982;',
     'plusmn': '&#177;',
     'pound': '&#163;',
     'prime': '&#8242;',
     'prod': '&#8719;',
     'prop': '&#8733;',
     'psi': '&#968;',
     'rArr': '&#8658;',
     'radic': '&#8730;',
     'rang': '&#9002;',
     'raquo': '&#187;',
     'rarr': '&#8594;',
     'rceil': '&#8969;',
     'rdquo': '&#8221;',
     'real': '&#8476;',
     'reg': '&#174;',
     'rfloor': '&#8971;',
     'rho': '&#961;',
     'rlm': '&#8207;',
     'rsaquo': '&#8250;',
     'rsquo': '&#8217;',
     'sbquo': '&#8218;',
     'scaron': '&#353;',
     'sdot': '&#8901;',
     'sect': '&#167;',
     'shy': '&#173;',
     'sigma': '&#963;',
     'sigmaf': '&#962;',
     'sim': '&#8764;',
     'spades': '&#9824;',
     'sub': '&#8834;',
     'sube': '&#8838;',
     'sum': '&#8721;',
     'sup': '&#8835;',
     'sup1': '&#185;',
     'sup2': '&#178;',
     'sup3': '&#179;',
     'supe': '&#8839;',
     'szlig': '&#223;',
     'tau': '&#964;',
     'there4': '&#8756;',
     'theta': '&#952;',
     'thetasym': '&#977;',
     'thinsp': '&#8201;',
     'thorn': '&#254;',
     'tilde': '&#732;',
     'times': '&#215;',
     'trade': '&#8482;',
     'uArr': '&#8657;',
     'uacute': '&#250;',
     'uarr': '&#8593;',
     'ucirc': '&#251;',
     'ugrave': '&#249;',
     'uml': '&#168;',
     'upsih': '&#978;',
     'upsilon': '&#965;',
     'uuml': '&#252;',
     'weierp': '&#8472;',
     'xi': '&#958;',
     'yacute': '&#253;',
     'yen': '&#165;',
     'yuml': '&#255;',
     'zeta': '&#950;',
     'zwj': '&#8205;',
     'zwnj': '&#8204;'}