Refactor parts of OEBBook to sanely handle much more broken OPF.

2026-04-25 10:19:54 -04:00 · 2009-01-26 23:08:04 -05:00 · 2009-01-26 23:08:04 -05:00 · 076ec9bbad
commit 076ec9bbad
parent a3ad3a07dd
2 changed files with 337 additions and 54 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -10,7 +10,7 @@ import os
 import sys
 from collections import defaultdict
 from types import StringTypes
-from itertools import izip, count
+from itertools import izip, count, chain
 from urlparse import urldefrag, urlparse, urlunparse
 from urllib import unquote as urlunquote
 import logging
@ -22,6 +22,7 @@ from lxml import html
 from calibre import LoggingInterface
 from calibre.translations.dynamic import translate
 from calibre.startup import get_lang
+from calibre.ebooks.oeb.entitydefs import ENTITYDEFS

 XML_NS = 'http://www.w3.org/XML/1998/namespace'
 XHTML_NS = 'http://www.w3.org/1999/xhtml'
@ -40,6 +41,7 @@ XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS,
           'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,
           'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS,
           'svg': SVG_NS, 'xl': XLINK_NS}
+DC_PREFIXES = ('d11', 'd10', 'd09')

 def XML(name): return '{%s}%s' % (XML_NS, name)
 def XHTML(name): return '{%s}%s' % (XHTML_NS, name)
@ -61,6 +63,7 @@ GIF_MIME = 'image/gif'
 JPEG_MIME = 'image/jpeg'
 PNG_MIME = 'image/png'
 SVG_MIME = 'image/svg+xml'
+BINARY_MIME = 'application/octet-stream'

 OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
 OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'])
@ -69,6 +72,8 @@ OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME])

 MS_COVER_TYPE = 'other.ms-coverimage-standard'

+ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9._:-]+);')  # '-' last: literal
+COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')

 def element(parent, *args, **kwargs):
    if parent is not None:
@ -191,11 +196,8 @@ class Metadata(object):
        def __init__(self, term, value, fq_attrib={}, **kwargs):
            self.fq_attrib = fq_attrib = dict(fq_attrib)
            fq_attrib.update(kwargs)
-            if term == OPF('meta') and not value:
-                term = self.fq_attrib.pop('name')
-                value = self.fq_attrib.pop('content')
-            elif barename(term).lower() in Metadata.TERMS and \
-                 (not namespace(term) or namespace(term) in DC_NSES):
+            if barename(term).lower() in Metadata.TERMS and \
+               (not namespace(term) or namespace(term) in DC_NSES):
                # Anything looking like Dublin Core is coerced
                term = DC(barename(term).lower())
            elif namespace(term) == OPF2_NS:
@ -329,8 +331,11 @@ class Manifest(object):
                % (self.id, self.href, self.media_type)

        def _force_xhtml(self, data):
+            # Possibly decode in user-specified encoding
            if self.oeb.encoding is not None:
                data = data.decode(self.oeb.encoding, 'replace')
+            # Force to UNIX line encodings
+            data = data.replace('\r\n', '\n').replace('\r', '\n')
            # Handle broken XHTML w/ SVG (ugh)
            if 'svg:' in data and SVG_NS not in data:
                data = data.replace(
@ -338,18 +343,29 @@ class Manifest(object):
            if 'xlink:' in data and XLINK_NS not in data:
                data = data.replace(
                    '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
+            # Try with more & more drastic measures to parse
            try:
                data = etree.fromstring(data)
            except etree.XMLSyntaxError:
-                data = html.fromstring(data)
-                data = etree.tostring(data, encoding=unicode)
-                data = etree.fromstring(data)
+                repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
+                data = ENTITY_RE.sub(repl, data)
+                try:
+                    data = etree.fromstring(data)
+                except etree.XMLSyntaxError:
+                    self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
+                    data = html.fromstring(data)
+                    data.attrib.pop('xmlns', None)
+                    data = etree.tostring(data, encoding=unicode)
+                    data = etree.fromstring(data)
+            # Force into the XHTML namespace
            if namespace(data.tag) != XHTML_NS:
                data.attrib['xmlns'] = XHTML_NS
                data = etree.tostring(data, encoding=unicode)
                data = etree.fromstring(data)
+            # Remove any encoding-specifying <meta/> elements
            for meta in self.META_XP(data):
                meta.getparent().remove(meta)
+            # Ensure has a <head/>
            head = xpath(data, '/h:html/h:head')
            head = head[0] if head else None
            if head is None:
@ -364,6 +380,7 @@ class Manifest(object):
                    'File %r missing <title/> element' % self.href)
                title = etree.SubElement(head, XHTML('title'))
                title.text = self.oeb.translate(__('Unknown'))
+            # Ensure has a <body/>
            if not xpath(data, '/h:html/h:body'):
                self.oeb.logger.warn(
                    'File %r missing <body/> element' % self.href)
@ -494,9 +511,9 @@ class Manifest(object):
        elem = element(parent, 'manifest')
        for item in self.ids.values():
            media_type = item.media_type
-            if media_type == XHTML_MIME:
+            if media_type in OEB_DOCS:
                media_type = OEB_DOC_MIME
-            elif media_type == CSS_MIME:
+            elif media_type in OEB_STYLES:
                media_type = OEB_CSS_MIME
            attrib = {'id': item.id, 'href': item.href,
                      'media-type': media_type}
@ -508,6 +525,11 @@ class Manifest(object):
    def to_opf2(self, parent=None):
        elem = element(parent, OPF('manifest'))
        for item in self.ids.values():
+            media_type = item.media_type
+            if media_type in OEB_DOCS:
+                media_type = XHTML_MIME
+            elif media_type in OEB_STYLES:
+                media_type = CSS_MIME
            attrib = {'id': item.id, 'href': item.href,
                      'media-type': item.media_type}
            if item.fallback:
@ -771,25 +793,19 @@ class OEBBook(object):
            opf = self._read_opf(opfpath)
            self._all_from_opf(opf)
    
-    def _convert_opf1(self, opf):
-        # Seriously, seriously wrong
-        if namespace(opf.tag) == OPF1_NS:
-            opf.tag = barename(opf.tag)
-            for elem in opf.iterdescendants():
-                if isinstance(elem.tag, basestring) \
-                   and namespace(elem.tag) == OPF1_NS:
-                    elem.tag = barename(elem.tag)
+    def _clean_opf(self, opf):
+        for elem in opf.iter():
+            if isinstance(elem.tag, basestring) \
+               and namespace(elem.tag) in ('', OPF1_NS):
+                elem.tag = OPF(barename(elem.tag))
        attrib = dict(opf.attrib)
-        attrib['version'] = '2.0'
        nroot = etree.Element(OPF('package'),
            nsmap={None: OPF2_NS}, attrib=attrib)
        metadata = etree.SubElement(nroot, OPF('metadata'),
            nsmap={'opf': OPF2_NS, 'dc': DC11_NS,
                   'xsi': XSI_NS, 'dcterms': DCTERMS_NS})
-        for prefix in ('d11', 'd10', 'd09'):
-            elements = xpath(opf, 'metadata//%s:*' % prefix)
-            if elements: break
-        for element in elements:
+        dc = lambda prefix: xpath(opf, 'o2:metadata//%s:*' % prefix)
+        for element in chain(*(dc(prefix) for prefix in DC_PREFIXES)):
            if not element.text: continue
            tag = barename(element.tag).lower()
            element.tag = '{%s}%s' % (DC11_NS, tag)
@ -799,28 +815,27 @@ class OEBBook(object):
                    element.attrib[nsname] = element.attrib[name]
                    del element.attrib[name]
            metadata.append(element)
-        for element in opf.xpath('metadata//meta'):
+        for element in xpath(opf, 'o2:metadata//o2:meta'):
            metadata.append(element)
-        for item in opf.xpath('manifest/item'):
-            media_type = item.attrib['media-type'].lower()
-            if media_type in OEB_DOCS:
-                media_type = XHTML_MIME
-            elif media_type in OEB_STYLES:
-                media_type = CSS_MIME
-            item.attrib['media-type'] = media_type
-        for tag in ('manifest', 'spine', 'tours', 'guide'):
-            for element in opf.xpath(tag):
+        for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
+            for element in xpath(opf, tag):
                nroot.append(element)
-        return etree.fromstring(etree.tostring(nroot))
+        return nroot
    
    def _read_opf(self, opfpath):
-        opf = self.container.read_xml(opfpath)
-        version = float(opf.get('version', 1.0))
+        opf = self.container.read(opfpath)
+        opf = opf.replace('\r\n', '\n').replace('\r', '\n')
+        try:
+            opf = etree.fromstring(opf)
+        except etree.XMLSyntaxError:
+            repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
+            opf = ENTITY_RE.sub(repl, opf)
+            opf = etree.fromstring(opf)
+            self.logger.warn('OPF contains invalid HTML named entities')
        ns = namespace(opf.tag)
        if ns not in ('', OPF1_NS, OPF2_NS):
            raise OEBError('Invalid namespace %r for OPF document' % ns)
-        if ns != OPF2_NS or version < 2.0:
-            opf = self._convert_opf1(opf)
+        opf = self._clean_opf(opf)
        return opf
    
    def _metadata_from_opf(self, opf):
@ -829,8 +844,16 @@ class OEBBook(object):
        self.metadata = metadata = Metadata(self)
        ignored = (OPF('dc-metadata'), OPF('x-metadata'))
        for elem in xpath(opf, '/o2:package/o2:metadata//*'):
-            if elem.tag not in ignored and (elem.text or elem.attrib):
-                metadata.add(elem.tag, elem.text, elem.attrib)
+            if elem.tag in ignored: continue
+            term = elem.tag
+            value = elem.text
+            if term == OPF('meta'):
+                term = elem.attrib.pop('name', None)
+                value = elem.attrib.pop('content', None)
+            if value:
+                value = COLLAPSE_RE.sub(' ', value.strip())
+            if term and (value or elem.attrib):
+                metadata.add(term, value, elem.attrib)
        haveuuid = haveid = False
        for ident in metadata.identifier:
            if unicode(ident).startswith('urn:uuid:'):
@ -845,36 +868,38 @@ class OEBBook(object):
                self.uid = item
                break
        else:
-            self.logger.warn(u'Unique-identifier %r not found.' % uid)
+            self.logger.warn(u'Unique-identifier %r not found' % uid)
            for ident in metadata.identifier:
                if 'id' in ident.attrib:
                    self.uid = metadata.identifier[0]
                    break
        if not metadata.language:
-            self.logger.warn(u'Language not specified.')
+            self.logger.warn(u'Language not specified')
            metadata.add('language', get_lang())
        if not metadata.creator:
-            self.logger.warn(u'Creator not specified.')
-            metadata.add('creator', _('Unknown'))
+            self.logger.warn('Creator not specified')
+            metadata.add('creator', self.translate(__('Unknown')))
        if not metadata.title:
-            self.logger.warn(u'Title not specified.')
-            metadata.add('title', _('Unknown'))
+            self.logger.warn('Title not specified')
+            metadata.add('title', self.translate(__('Unknown')))
    
    def _manifest_from_opf(self, opf):
        self.manifest = manifest = Manifest(self)
        for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
            id = elem.get('id')
            href = elem.get('href')
-            media_type = elem.get('media-type')
+            media_type = elem.get('media-type', None)
+            if media_type is None:
+                media_type = elem.get('mediatype', BINARY_MIME)
            fallback = elem.get('fallback')
            if href in manifest.hrefs:
-                self.logger.warn(u'Duplicate manifest entry for %r.' % href)
+                self.logger.warn(u'Duplicate manifest entry for %r' % href)
                continue
            if not self.container.exists(href):
-                self.logger.warn(u'Manifest item %r not found.' % href)
+                self.logger.warn(u'Manifest item %r not found' % href)
                continue
            if id in manifest.ids:
-                self.logger.warn(u'Duplicate manifest id %r.' % id)
+                self.logger.warn(u'Duplicate manifest id %r' % id)
                id, href = manifest.generate(id, href)
            manifest.add(id, href, media_type, fallback)
    
@ -883,7 +908,7 @@ class OEBBook(object):
        for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
            idref = elem.get('idref')
            if idref not in self.manifest:
-                self.logger.warn(u'Spine item %r not found.' % idref)
+                self.logger.warn(u'Spine item %r not found' % idref)
                continue
            item = self.manifest[idref]
            spine.add(item, elem.get('linear'))
@ -931,7 +956,8 @@ class OEBBook(object):
        item = self.manifest.ids[id]
        ncx = item.data
        self.manifest.remove(item)
-        title = xpath(ncx, 'ncx:docTitle/ncx:text/text()')[0]
+        title = xpath(ncx, 'ncx:docTitle/ncx:text/text()')
+        title = title[0].strip() if title else unicode(self.metadata.title[0])
        self.toc = toc = TOC(title)
        navmaps = xpath(ncx, 'ncx:navMap')
        for navmap in navmaps:
@ -988,7 +1014,8 @@ class OEBBook(object):
            if not item.linear: continue
            html = item.data
            title = xpath(html, '/h:html/h:head/h:title/text()')
-            if title: titles.append(title[0])
+            title = title[0].strip() if title else None
+            if title: titles.append(title)
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()' % (tag,)
--- a/src/calibre/ebooks/oeb/entitydefs.py
+++ b/src/calibre/ebooks/oeb/entitydefs.py
@ -0,0 +1,256 @@
+"""
+Replacement for htmlentitydefs mapping entity names to numeric character references.
+"""
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
+
+ENTITYDEFS = \
+    {'AElig': '&#198;',
+     'Aacute': '&#193;',
+     'Acirc': '&#194;',
+     'Agrave': '&#192;',
+     'Alpha': '&#913;',
+     'Aring': '&#197;',
+     'Atilde': '&#195;',
+     'Auml': '&#196;',
+     'Beta': '&#914;',
+     'Ccedil': '&#199;',
+     'Chi': '&#935;',
+     'Dagger': '&#8225;',
+     'Delta': '&#916;',
+     'ETH': '&#208;',
+     'Eacute': '&#201;',
+     'Ecirc': '&#202;',
+     'Egrave': '&#200;',
+     'Epsilon': '&#917;',
+     'Eta': '&#919;',
+     'Euml': '&#203;',
+     'Gamma': '&#915;',
+     'Iacute': '&#205;',
+     'Icirc': '&#206;',
+     'Igrave': '&#204;',
+     'Iota': '&#921;',
+     'Iuml': '&#207;',
+     'Kappa': '&#922;',
+     'Lambda': '&#923;',
+     'Mu': '&#924;',
+     'Ntilde': '&#209;',
+     'Nu': '&#925;',
+     'OElig': '&#338;',
+     'Oacute': '&#211;',
+     'Ocirc': '&#212;',
+     'Ograve': '&#210;',
+     'Omega': '&#937;',
+     'Omicron': '&#927;',
+     'Oslash': '&#216;',
+     'Otilde': '&#213;',
+     'Ouml': '&#214;',
+     'Phi': '&#934;',
+     'Pi': '&#928;',
+     'Prime': '&#8243;',
+     'Psi': '&#936;',
+     'Rho': '&#929;',
+     'Scaron': '&#352;',
+     'Sigma': '&#931;',
+     'THORN': '&#222;',
+     'Tau': '&#932;',
+     'Theta': '&#920;',
+     'Uacute': '&#218;',
+     'Ucirc': '&#219;',
+     'Ugrave': '&#217;',
+     'Upsilon': '&#933;',
+     'Uuml': '&#220;',
+     'Xi': '&#926;',
+     'Yacute': '&#221;',
+     'Yuml': '&#376;',
+     'Zeta': '&#918;',
+     'aacute': '&#225;',
+     'acirc': '&#226;',
+     'acute': '&#180;',
+     'aelig': '&#230;',
+     'agrave': '&#224;',
+     'alefsym': '&#8501;',
+     'alpha': '&#945;',
+     'and': '&#8743;',
+     'ang': '&#8736;',
+     'aring': '&#229;',
+     'asymp': '&#8776;',
+     'atilde': '&#227;',
+     'auml': '&#228;',
+     'bdquo': '&#8222;',
+     'beta': '&#946;',
+     'brvbar': '&#166;',
+     'bull': '&#8226;',
+     'cap': '&#8745;',
+     'ccedil': '&#231;',
+     'cedil': '&#184;',
+     'cent': '&#162;',
+     'chi': '&#967;',
+     'circ': '&#710;',
+     'clubs': '&#9827;',
+     'cong': '&#8773;',
+     'copy': '&#169;',
+     'crarr': '&#8629;',
+     'cup': '&#8746;',
+     'curren': '&#164;',
+     'dArr': '&#8659;',
+     'dagger': '&#8224;',
+     'darr': '&#8595;',
+     'deg': '&#176;',
+     'delta': '&#948;',
+     'diams': '&#9830;',
+     'divide': '&#247;',
+     'eacute': '&#233;',
+     'ecirc': '&#234;',
+     'egrave': '&#232;',
+     'empty': '&#8709;',
+     'emsp': '&#8195;',
+     'ensp': '&#8194;',
+     'epsilon': '&#949;',
+     'equiv': '&#8801;',
+     'eta': '&#951;',
+     'eth': '&#240;',
+     'euml': '&#235;',
+     'euro': '&#8364;',
+     'exist': '&#8707;',
+     'fnof': '&#402;',
+     'forall': '&#8704;',
+     'frac12': '&#189;',
+     'frac14': '&#188;',
+     'frac34': '&#190;',
+     'frasl': '&#8260;',
+     'gamma': '&#947;',
+     'ge': '&#8805;',
+     'hArr': '&#8660;',
+     'harr': '&#8596;',
+     'hearts': '&#9829;',
+     'hellip': '&#8230;',
+     'iacute': '&#237;',
+     'icirc': '&#238;',
+     'iexcl': '&#161;',
+     'igrave': '&#236;',
+     'image': '&#8465;',
+     'infin': '&#8734;',
+     'int': '&#8747;',
+     'iota': '&#953;',
+     'iquest': '&#191;',
+     'isin': '&#8712;',
+     'iuml': '&#239;',
+     'kappa': '&#954;',
+     'lArr': '&#8656;',
+     'lambda': '&#955;',
+     'lang': '&#9001;',
+     'laquo': '&#171;',
+     'larr': '&#8592;',
+     'lceil': '&#8968;',
+     'ldquo': '&#8220;',
+     'le': '&#8804;',
+     'lfloor': '&#8970;',
+     'lowast': '&#8727;',
+     'loz': '&#9674;',
+     'lrm': '&#8206;',
+     'lsaquo': '&#8249;',
+     'lsquo': '&#8216;',
+     'macr': '&#175;',
+     'mdash': '&#8212;',
+     'micro': '&#181;',
+     'middot': '&#183;',
+     'minus': '&#8722;',
+     'mu': '&#956;',
+     'nabla': '&#8711;',
+     'nbsp': '&#160;',
+     'ndash': '&#8211;',
+     'ne': '&#8800;',
+     'ni': '&#8715;',
+     'not': '&#172;',
+     'notin': '&#8713;',
+     'nsub': '&#8836;',
+     'ntilde': '&#241;',
+     'nu': '&#957;',
+     'oacute': '&#243;',
+     'ocirc': '&#244;',
+     'oelig': '&#339;',
+     'ograve': '&#242;',
+     'oline': '&#8254;',
+     'omega': '&#969;',
+     'omicron': '&#959;',
+     'oplus': '&#8853;',
+     'or': '&#8744;',
+     'ordf': '&#170;',
+     'ordm': '&#186;',
+     'oslash': '&#248;',
+     'otilde': '&#245;',
+     'otimes': '&#8855;',
+     'ouml': '&#246;',
+     'para': '&#182;',
+     'part': '&#8706;',
+     'permil': '&#8240;',
+     'perp': '&#8869;',
+     'phi': '&#966;',
+     'pi': '&#960;',
+     'piv': '&#982;',
+     'plusmn': '&#177;',
+     'pound': '&#163;',
+     'prime': '&#8242;',
+     'prod': '&#8719;',
+     'prop': '&#8733;',
+     'psi': '&#968;',
+     'rArr': '&#8658;',
+     'radic': '&#8730;',
+     'rang': '&#9002;',
+     'raquo': '&#187;',
+     'rarr': '&#8594;',
+     'rceil': '&#8969;',
+     'rdquo': '&#8221;',
+     'real': '&#8476;',
+     'reg': '&#174;',
+     'rfloor': '&#8971;',
+     'rho': '&#961;',
+     'rlm': '&#8207;',
+     'rsaquo': '&#8250;',
+     'rsquo': '&#8217;',
+     'sbquo': '&#8218;',
+     'scaron': '&#353;',
+     'sdot': '&#8901;',
+     'sect': '&#167;',
+     'shy': '&#173;',
+     'sigma': '&#963;',
+     'sigmaf': '&#962;',
+     'sim': '&#8764;',
+     'spades': '&#9824;',
+     'sub': '&#8834;',
+     'sube': '&#8838;',
+     'sum': '&#8721;',
+     'sup': '&#8835;',
+     'sup1': '&#185;',
+     'sup2': '&#178;',
+     'sup3': '&#179;',
+     'supe': '&#8839;',
+     'szlig': '&#223;',
+     'tau': '&#964;',
+     'there4': '&#8756;',
+     'theta': '&#952;',
+     'thetasym': '&#977;',
+     'thinsp': '&#8201;',
+     'thorn': '&#254;',
+     'tilde': '&#732;',
+     'times': '&#215;',
+     'trade': '&#8482;',
+     'uArr': '&#8657;',
+     'uacute': '&#250;',
+     'uarr': '&#8593;',
+     'ucirc': '&#251;',
+     'ugrave': '&#249;',
+     'uml': '&#168;',
+     'upsih': '&#978;',
+     'upsilon': '&#965;',
+     'uuml': '&#252;',
+     'weierp': '&#8472;',
+     'xi': '&#958;',
+     'yacute': '&#253;',
+     'yen': '&#165;',
+     'yuml': '&#255;',
+     'zeta': '&#950;',
+     'zwj': '&#8205;',
+     'zwnj': '&#8204;'}