diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 1510cb6c32..f61b88c4d4 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -10,7 +10,7 @@ import os import sys from collections import defaultdict from types import StringTypes -from itertools import izip, count +from itertools import izip, count, chain from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote import logging @@ -22,6 +22,7 @@ from lxml import html from calibre import LoggingInterface from calibre.translations.dynamic import translate from calibre.startup import get_lang +from calibre.ebooks.oeb.entitydefs import ENTITYDEFS XML_NS = 'http://www.w3.org/XML/1998/namespace' XHTML_NS = 'http://www.w3.org/1999/xhtml' @@ -40,6 +41,7 @@ XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS, 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, 'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS, 'svg': SVG_NS, 'xl': XLINK_NS} +DC_PREFIXES = ('d11', 'd10', 'd09') def XML(name): return '{%s}%s' % (XML_NS, name) def XHTML(name): return '{%s}%s' % (XHTML_NS, name) @@ -61,6 +63,7 @@ GIF_MIME = 'image/gif' JPEG_MIME = 'image/jpeg' PNG_MIME = 'image/png' SVG_MIME = 'image/svg+xml' +BINARY_MIME = 'application/octet-stream' OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document']) @@ -69,6 +72,8 @@ OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME]) MS_COVER_TYPE = 'other.ms-coverimage-standard' +ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);') +COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') def element(parent, *args, **kwargs): if parent is not None: @@ -191,11 +196,8 @@ class Metadata(object): def __init__(self, term, value, fq_attrib={}, **kwargs): self.fq_attrib = fq_attrib = dict(fq_attrib) fq_attrib.update(kwargs) - if term == OPF('meta') and not value: - term = self.fq_attrib.pop('name') - value = self.fq_attrib.pop('content') - elif barename(term).lower() in Metadata.TERMS and \ - (not namespace(term) or namespace(term) in DC_NSES): + if barename(term).lower() in Metadata.TERMS and \ + (not namespace(term) or namespace(term) in DC_NSES): # Anything looking like Dublin Core is coerced term = DC(barename(term).lower()) elif namespace(term) == OPF2_NS: @@ -329,8 +331,11 @@ class Manifest(object): % (self.id, self.href, self.media_type) def _force_xhtml(self, data): + # Possibly decode in user-specified encoding if self.oeb.encoding is not None: data = data.decode(self.oeb.encoding, 'replace') + # Force to UNIX line encodings + data = data.replace('\r\n', '\n').replace('\r', '\n') # Handle broken XHTML w/ SVG (ugh) if 'svg:' in data and SVG_NS not in data: data = data.replace( @@ -338,18 +343,29 @@ class Manifest(object): if 'xlink:' in data and XLINK_NS not in data: data = data.replace( ' elements for meta in self.META_XP(data): meta.getparent().remove(meta) + # Ensure has a head = xpath(data, '/h:html/h:head') head = head[0] if head else None if head is None: @@ -364,6 +380,7 @@ class Manifest(object): 'File %r missing element' % self.href) title = etree.SubElement(head, XHTML('title')) title.text = self.oeb.translate(__('Unknown')) + # Ensure has a <body/> if not xpath(data, '/h:html/h:body'): self.oeb.logger.warn( 'File %r missing <body/> element' % self.href) @@ -494,9 +511,9 @@ class Manifest(object): elem = element(parent, 'manifest') for item in self.ids.values(): media_type = item.media_type - if media_type == XHTML_MIME: + if media_type in OEB_DOCS: media_type = OEB_DOC_MIME - elif media_type == CSS_MIME: + elif media_type in OEB_STYLES: media_type = OEB_CSS_MIME attrib = {'id': item.id, 'href': item.href, 'media-type': media_type} @@ -508,6 +525,11 @@ class Manifest(object): def to_opf2(self, parent=None): elem = element(parent, OPF('manifest')) for item in self.ids.values(): + media_type = item.media_type + if media_type in OEB_DOCS: + media_type = XHTML_MIME + elif media_type in OEB_STYLES: + media_type = CSS_MIME attrib = {'id': item.id, 'href': item.href, 'media-type': item.media_type} if item.fallback: @@ -771,25 +793,19 @@ class OEBBook(object): opf = self._read_opf(opfpath) self._all_from_opf(opf) - def _convert_opf1(self, opf): - # Seriously, seriously wrong - if namespace(opf.tag) == OPF1_NS: - opf.tag = barename(opf.tag) - for elem in opf.iterdescendants(): - if isinstance(elem.tag, basestring) \ - and namespace(elem.tag) == OPF1_NS: - elem.tag = barename(elem.tag) + def _clean_opf(self, opf): + for elem in opf.iter(): + if isinstance(elem.tag, basestring) \ + and namespace(elem.tag) in ('', OPF1_NS): + elem.tag = OPF(barename(elem.tag)) attrib = dict(opf.attrib) - attrib['version'] = '2.0' nroot = etree.Element(OPF('package'), nsmap={None: OPF2_NS}, attrib=attrib) metadata = etree.SubElement(nroot, OPF('metadata'), nsmap={'opf': OPF2_NS, 'dc': DC11_NS, 'xsi': XSI_NS, 'dcterms': DCTERMS_NS}) - for prefix in ('d11', 'd10', 'd09'): - elements = xpath(opf, 'metadata//%s:*' % prefix) - if elements: break - for element in elements: + dc = lambda prefix: xpath(opf, 'o2:metadata//%s:*' % prefix) + for element in chain(*(dc(prefix) for prefix in DC_PREFIXES)): if not element.text: continue tag = barename(element.tag).lower() element.tag = '{%s}%s' % (DC11_NS, tag) @@ -799,28 +815,27 @@ class OEBBook(object): element.attrib[nsname] = element.attrib[name] del element.attrib[name] metadata.append(element) - for element in opf.xpath('metadata//meta'): + for element in xpath(opf, 'o2:metadata//o2:meta'): metadata.append(element) - for item in opf.xpath('manifest/item'): - media_type = item.attrib['media-type'].lower() - if media_type in OEB_DOCS: - media_type = XHTML_MIME - elif media_type in OEB_STYLES: - media_type = CSS_MIME - item.attrib['media-type'] = media_type - for tag in ('manifest', 'spine', 'tours', 'guide'): - for element in opf.xpath(tag): + for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'): + for element in xpath(opf, tag): nroot.append(element) - return etree.fromstring(etree.tostring(nroot)) + return nroot def _read_opf(self, opfpath): - opf = self.container.read_xml(opfpath) - version = float(opf.get('version', 1.0)) + opf = self.container.read(opfpath) + opf = opf.replace('\r\n', '\n').replace('\r', '\n') + try: + opf = etree.fromstring(opf) + except etree.XMLSyntaxError: + repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) + opf = ENTITY_RE.sub(repl, opf) + opf = etree.fromstring(opf) + self.logger.warn('OPF contains invalid HTML named entities') ns = namespace(opf.tag) if ns not in ('', OPF1_NS, OPF2_NS): raise OEBError('Invalid namespace %r for OPF document' % ns) - if ns != OPF2_NS or version < 2.0: - opf = self._convert_opf1(opf) + opf = self._clean_opf(opf) return opf def _metadata_from_opf(self, opf): @@ -829,8 +844,16 @@ class OEBBook(object): self.metadata = metadata = Metadata(self) ignored = (OPF('dc-metadata'), OPF('x-metadata')) for elem in xpath(opf, '/o2:package/o2:metadata//*'): - if elem.tag not in ignored and (elem.text or elem.attrib): - metadata.add(elem.tag, elem.text, elem.attrib) + if elem.tag in ignored: continue + term = elem.tag + value = elem.text + if term == OPF('meta'): + term = elem.attrib.pop('name', None) + value = elem.attrib.pop('content', None) + if value: + value = COLLAPSE_RE.sub(' ', value.strip()) + if term and (value or elem.attrib): + metadata.add(term, value, elem.attrib) haveuuid = haveid = False for ident in metadata.identifier: if unicode(ident).startswith('urn:uuid:'): @@ -845,36 +868,38 @@ class OEBBook(object): self.uid = item break else: - self.logger.warn(u'Unique-identifier %r not found.' % uid) + self.logger.warn(u'Unique-identifier %r not found' % uid) for ident in metadata.identifier: if 'id' in ident.attrib: self.uid = metadata.identifier[0] break if not metadata.language: - self.logger.warn(u'Language not specified.') + self.logger.warn(u'Language not specified') metadata.add('language', get_lang()) if not metadata.creator: - self.logger.warn(u'Creator not specified.') - metadata.add('creator', _('Unknown')) + self.logger.warn('Creator not specified') + metadata.add('creator', self.translate(__('Unknown'))) if not metadata.title: - self.logger.warn(u'Title not specified.') - metadata.add('title', _('Unknown')) + self.logger.warn('Title not specified') + metadata.add('title', self.translate(__('Unknown'))) def _manifest_from_opf(self, opf): self.manifest = manifest = Manifest(self) for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): id = elem.get('id') href = elem.get('href') - media_type = elem.get('media-type') + media_type = elem.get('media-type', None) + if media_type is None: + media_type = elem.get('mediatype', BINARY_MIME) fallback = elem.get('fallback') if href in manifest.hrefs: - self.logger.warn(u'Duplicate manifest entry for %r.' % href) + self.logger.warn(u'Duplicate manifest entry for %r' % href) continue if not self.container.exists(href): - self.logger.warn(u'Manifest item %r not found.' % href) + self.logger.warn(u'Manifest item %r not found' % href) continue if id in manifest.ids: - self.logger.warn(u'Duplicate manifest id %r.' % id) + self.logger.warn(u'Duplicate manifest id %r' % id) id, href = manifest.generate(id, href) manifest.add(id, href, media_type, fallback) @@ -883,7 +908,7 @@ class OEBBook(object): for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): idref = elem.get('idref') if idref not in self.manifest: - self.logger.warn(u'Spine item %r not found.' % idref) + self.logger.warn(u'Spine item %r not found' % idref) continue item = self.manifest[idref] spine.add(item, elem.get('linear')) @@ -931,7 +956,8 @@ class OEBBook(object): item = self.manifest.ids[id] ncx = item.data self.manifest.remove(item) - title = xpath(ncx, 'ncx:docTitle/ncx:text/text()')[0] + title = xpath(ncx, 'ncx:docTitle/ncx:text/text()') + title = title[0].strip() if title else unicode(self.metadata.title) self.toc = toc = TOC(title) navmaps = xpath(ncx, 'ncx:navMap') for navmap in navmaps: @@ -988,7 +1014,8 @@ class OEBBook(object): if not item.linear: continue html = item.data title = xpath(html, '/h:html/h:head/h:title/text()') - if title: titles.append(title[0]) + title = title[0].strip() if title else None + if title: titles.append(title) headers.append('(unlabled)') for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): expr = '/h:html/h:body//h:%s[position()=1]/text()' % (tag,) diff --git a/src/calibre/ebooks/oeb/entitydefs.py b/src/calibre/ebooks/oeb/entitydefs.py new file mode 100644 index 0000000000..69fc16116c --- /dev/null +++ b/src/calibre/ebooks/oeb/entitydefs.py @@ -0,0 +1,256 @@ +""" +Replacement for htmlentitydefs which uses purely numeric entities. +""" + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' + +ENTITYDEFS = \ + {'AElig': 'Æ', + 'Aacute': 'Á', + 'Acirc': 'Â', + 'Agrave': 'À', + 'Alpha': 'Α', + 'Aring': 'Å', + 'Atilde': 'Ã', + 'Auml': 'Ä', + 'Beta': 'Β', + 'Ccedil': 'Ç', + 'Chi': 'Χ', + 'Dagger': '‡', + 'Delta': 'Δ', + 'ETH': 'Ð', + 'Eacute': 'É', + 'Ecirc': 'Ê', + 'Egrave': 'È', + 'Epsilon': 'Ε', + 'Eta': 'Η', + 'Euml': 'Ë', + 'Gamma': 'Γ', + 'Iacute': 'Í', + 'Icirc': 'Î', + 'Igrave': 'Ì', + 'Iota': 'Ι', + 'Iuml': 'Ï', + 'Kappa': 'Κ', + 'Lambda': 'Λ', + 'Mu': 'Μ', + 'Ntilde': 'Ñ', + 'Nu': 'Ν', + 'OElig': 'Œ', + 'Oacute': 'Ó', + 'Ocirc': 'Ô', + 'Ograve': 'Ò', + 'Omega': 'Ω', + 'Omicron': 'Ο', + 'Oslash': 'Ø', + 'Otilde': 'Õ', + 'Ouml': 'Ö', + 'Phi': 'Φ', + 'Pi': 'Π', + 'Prime': '″', + 'Psi': 'Ψ', + 'Rho': 'Ρ', + 'Scaron': 'Š', + 'Sigma': 'Σ', + 'THORN': 'Þ', + 'Tau': 'Τ', + 'Theta': 'Θ', + 'Uacute': 'Ú', + 'Ucirc': 'Û', + 'Ugrave': 'Ù', + 'Upsilon': 'Υ', + 'Uuml': 'Ü', + 'Xi': 'Ξ', + 'Yacute': 'Ý', + 'Yuml': 'Ÿ', + 'Zeta': 'Ζ', + 'aacute': 'á', + 'acirc': 'â', + 'acute': '´', + 'aelig': 'æ', + 'agrave': 'à', + 'alefsym': 'ℵ', + 'alpha': 'α', + 'and': '∧', + 'ang': '∠', + 'aring': 'å', + 'asymp': '≈', + 'atilde': 'ã', + 'auml': 'ä', + 'bdquo': '„', + 'beta': 'β', + 'brvbar': '¦', + 'bull': '•', + 'cap': '∩', + 'ccedil': 'ç', + 'cedil': '¸', + 'cent': '¢', + 'chi': 'χ', + 'circ': 'ˆ', + 'clubs': '♣', + 'cong': '≅', + 'copy': '©', + 'crarr': '↵', + 'cup': '∪', + 'curren': '¤', + 'dArr': '⇓', + 'dagger': '†', + 'darr': '↓', + 'deg': '°', + 'delta': 'δ', + 'diams': '♦', + 'divide': '÷', + 'eacute': 'é', + 'ecirc': 'ê', + 'egrave': 'è', + 'empty': '∅', + 'emsp': ' ', + 'ensp': ' ', + 'epsilon': 'ε', + 'equiv': '≡', + 'eta': 'η', + 'eth': 'ð', + 'euml': 'ë', + 'euro': '€', + 'exist': '∃', + 'fnof': 'ƒ', + 'forall': '∀', + 'frac12': '½', + 'frac14': '¼', + 'frac34': '¾', + 'frasl': '⁄', + 'gamma': 'γ', + 'ge': '≥', + 'hArr': '⇔', + 'harr': '↔', + 'hearts': '♥', + 'hellip': '…', + 'iacute': 'í', + 'icirc': 'î', + 'iexcl': '¡', + 'igrave': 'ì', + 'image': 'ℑ', + 'infin': '∞', + 'int': '∫', + 'iota': 'ι', + 'iquest': '¿', + 'isin': '∈', + 'iuml': 'ï', + 'kappa': 'κ', + 'lArr': '⇐', + 'lambda': 'λ', + 'lang': '〈', + 'laquo': '«', + 'larr': '←', + 'lceil': '⌈', + 'ldquo': '“', + 'le': '≤', + 'lfloor': '⌊', + 'lowast': '∗', + 'loz': '◊', + 'lrm': '‎', + 'lsaquo': '‹', + 'lsquo': '‘', + 'macr': '¯', + 'mdash': '—', + 'micro': 'µ', + 'middot': '·', + 'minus': '−', + 'mu': 'μ', + 'nabla': '∇', + 'nbsp': ' ', + 'ndash': '–', + 'ne': '≠', + 'ni': '∋', + 'not': '¬', + 'notin': '∉', + 'nsub': '⊄', + 'ntilde': 'ñ', + 'nu': 'ν', + 'oacute': 'ó', + 'ocirc': 'ô', + 'oelig': 'œ', + 'ograve': 'ò', + 'oline': '‾', + 'omega': 'ω', + 'omicron': 'ο', + 'oplus': '⊕', + 'or': '∨', + 'ordf': 'ª', + 'ordm': 'º', + 'oslash': 'ø', + 'otilde': 'õ', + 'otimes': '⊗', + 'ouml': 'ö', + 'para': '¶', + 'part': '∂', + 'permil': '‰', + 'perp': '⊥', + 'phi': 'φ', + 'pi': 'π', + 'piv': 'ϖ', + 'plusmn': '±', + 'pound': '£', + 'prime': '′', + 'prod': '∏', + 'prop': '∝', + 'psi': 'ψ', + 'rArr': '⇒', + 'radic': '√', + 'rang': '〉', + 'raquo': '»', + 'rarr': '→', + 'rceil': '⌉', + 'rdquo': '”', + 'real': 'ℜ', + 'reg': '®', + 'rfloor': '⌋', + 'rho': 'ρ', + 'rlm': '‏', + 'rsaquo': '›', + 'rsquo': '’', + 'sbquo': '‚', + 'scaron': 'š', + 'sdot': '⋅', + 'sect': '§', + 'shy': '­', + 'sigma': 'σ', + 'sigmaf': 'ς', + 'sim': '∼', + 'spades': '♠', + 'sub': '⊂', + 'sube': '⊆', + 'sum': '∑', + 'sup': '⊃', + 'sup1': '¹', + 'sup2': '²', + 'sup3': '³', + 'supe': '⊇', + 'szlig': 'ß', + 'tau': 'τ', + 'there4': '∴', + 'theta': 'θ', + 'thetasym': 'ϑ', + 'thinsp': ' ', + 'thorn': 'þ', + 'tilde': '˜', + 'times': '×', + 'trade': '™', + 'uArr': '⇑', + 'uacute': 'ú', + 'uarr': '↑', + 'ucirc': 'û', + 'ugrave': 'ù', + 'uml': '¨', + 'upsih': 'ϒ', + 'upsilon': 'υ', + 'uuml': 'ü', + 'weierp': '℘', + 'xi': 'ξ', + 'yacute': 'ý', + 'yen': '¥', + 'yuml': 'ÿ', + 'zeta': 'ζ', + 'zwj': '‍', + 'zwnj': '‌'}