Refactor parts of OEBBook to sanely handle much more broken OPF.

This commit is contained in:
Marshall T. Vandegrift 2009-01-26 23:08:04 -05:00
parent a3ad3a07dd
commit 076ec9bbad
2 changed files with 337 additions and 54 deletions

View File

@ -10,7 +10,7 @@ import os
import sys
from collections import defaultdict
from types import StringTypes
from itertools import izip, count
from itertools import izip, count, chain
from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote
import logging
@ -22,6 +22,7 @@ from lxml import html
from calibre import LoggingInterface
from calibre.translations.dynamic import translate
from calibre.startup import get_lang
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml'
@ -40,6 +41,7 @@ XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS,
'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,
'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS,
'svg': SVG_NS, 'xl': XLINK_NS}
DC_PREFIXES = ('d11', 'd10', 'd09')
def XML(name): return '{%s}%s' % (XML_NS, name)
def XHTML(name): return '{%s}%s' % (XHTML_NS, name)
@ -61,6 +63,7 @@ GIF_MIME = 'image/gif'
JPEG_MIME = 'image/jpeg'
PNG_MIME = 'image/png'
SVG_MIME = 'image/svg+xml'
BINARY_MIME = 'application/octet-stream'
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'])
@ -69,6 +72,8 @@ OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME])
MS_COVER_TYPE = 'other.ms-coverimage-standard'
ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def element(parent, *args, **kwargs):
if parent is not None:
@ -191,11 +196,8 @@ class Metadata(object):
def __init__(self, term, value, fq_attrib={}, **kwargs):
self.fq_attrib = fq_attrib = dict(fq_attrib)
fq_attrib.update(kwargs)
if term == OPF('meta') and not value:
term = self.fq_attrib.pop('name')
value = self.fq_attrib.pop('content')
elif barename(term).lower() in Metadata.TERMS and \
(not namespace(term) or namespace(term) in DC_NSES):
if barename(term).lower() in Metadata.TERMS and \
(not namespace(term) or namespace(term) in DC_NSES):
# Anything looking like Dublin Core is coerced
term = DC(barename(term).lower())
elif namespace(term) == OPF2_NS:
@ -329,8 +331,11 @@ class Manifest(object):
% (self.id, self.href, self.media_type)
def _force_xhtml(self, data):
# Possibly decode in user-specified encoding
if self.oeb.encoding is not None:
data = data.decode(self.oeb.encoding, 'replace')
# Force to UNIX line encodings
data = data.replace('\r\n', '\n').replace('\r', '\n')
# Handle broken XHTML w/ SVG (ugh)
if 'svg:' in data and SVG_NS not in data:
data = data.replace(
@ -338,18 +343,29 @@ class Manifest(object):
if 'xlink:' in data and XLINK_NS not in data:
data = data.replace(
'<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
# Try with more & more drastic measures to parse
try:
data = etree.fromstring(data)
except etree.XMLSyntaxError:
data = html.fromstring(data)
data = etree.tostring(data, encoding=unicode)
data = etree.fromstring(data)
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
data = ENTITY_RE.sub(repl, data)
try:
data = etree.fromstring(data)
except etree.XMLSyntaxError:
self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
data = html.fromstring(data)
data.attrib.pop('xmlns', None)
data = etree.tostring(data, encoding=unicode)
data = etree.fromstring(data)
# Force into the XHTML namespace
if namespace(data.tag) != XHTML_NS:
data.attrib['xmlns'] = XHTML_NS
data = etree.tostring(data, encoding=unicode)
data = etree.fromstring(data)
# Remove any encoding-specifying <meta/> elements
for meta in self.META_XP(data):
meta.getparent().remove(meta)
# Ensure has a <head/>
head = xpath(data, '/h:html/h:head')
head = head[0] if head else None
if head is None:
@ -364,6 +380,7 @@ class Manifest(object):
'File %r missing <title/> element' % self.href)
title = etree.SubElement(head, XHTML('title'))
title.text = self.oeb.translate(__('Unknown'))
# Ensure has a <body/>
if not xpath(data, '/h:html/h:body'):
self.oeb.logger.warn(
'File %r missing <body/> element' % self.href)
@ -494,9 +511,9 @@ class Manifest(object):
elem = element(parent, 'manifest')
for item in self.ids.values():
media_type = item.media_type
if media_type == XHTML_MIME:
if media_type in OEB_DOCS:
media_type = OEB_DOC_MIME
elif media_type == CSS_MIME:
elif media_type in OEB_STYLES:
media_type = OEB_CSS_MIME
attrib = {'id': item.id, 'href': item.href,
'media-type': media_type}
@ -508,6 +525,11 @@ class Manifest(object):
def to_opf2(self, parent=None):
elem = element(parent, OPF('manifest'))
for item in self.ids.values():
media_type = item.media_type
if media_type in OEB_DOCS:
media_type = XHTML_MIME
elif media_type in OEB_STYLES:
media_type = CSS_MIME
attrib = {'id': item.id, 'href': item.href,
'media-type': item.media_type}
if item.fallback:
@ -771,25 +793,19 @@ class OEBBook(object):
opf = self._read_opf(opfpath)
self._all_from_opf(opf)
def _convert_opf1(self, opf):
# Seriously, seriously wrong
if namespace(opf.tag) == OPF1_NS:
opf.tag = barename(opf.tag)
for elem in opf.iterdescendants():
if isinstance(elem.tag, basestring) \
and namespace(elem.tag) == OPF1_NS:
elem.tag = barename(elem.tag)
def _clean_opf(self, opf):
for elem in opf.iter():
if isinstance(elem.tag, basestring) \
and namespace(elem.tag) in ('', OPF1_NS):
elem.tag = OPF(barename(elem.tag))
attrib = dict(opf.attrib)
attrib['version'] = '2.0'
nroot = etree.Element(OPF('package'),
nsmap={None: OPF2_NS}, attrib=attrib)
metadata = etree.SubElement(nroot, OPF('metadata'),
nsmap={'opf': OPF2_NS, 'dc': DC11_NS,
'xsi': XSI_NS, 'dcterms': DCTERMS_NS})
for prefix in ('d11', 'd10', 'd09'):
elements = xpath(opf, 'metadata//%s:*' % prefix)
if elements: break
for element in elements:
dc = lambda prefix: xpath(opf, 'o2:metadata//%s:*' % prefix)
for element in chain(*(dc(prefix) for prefix in DC_PREFIXES)):
if not element.text: continue
tag = barename(element.tag).lower()
element.tag = '{%s}%s' % (DC11_NS, tag)
@ -799,28 +815,27 @@ class OEBBook(object):
element.attrib[nsname] = element.attrib[name]
del element.attrib[name]
metadata.append(element)
for element in opf.xpath('metadata//meta'):
for element in xpath(opf, 'o2:metadata//o2:meta'):
metadata.append(element)
for item in opf.xpath('manifest/item'):
media_type = item.attrib['media-type'].lower()
if media_type in OEB_DOCS:
media_type = XHTML_MIME
elif media_type in OEB_STYLES:
media_type = CSS_MIME
item.attrib['media-type'] = media_type
for tag in ('manifest', 'spine', 'tours', 'guide'):
for element in opf.xpath(tag):
for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
for element in xpath(opf, tag):
nroot.append(element)
return etree.fromstring(etree.tostring(nroot))
return nroot
def _read_opf(self, opfpath):
opf = self.container.read_xml(opfpath)
version = float(opf.get('version', 1.0))
opf = self.container.read(opfpath)
opf = opf.replace('\r\n', '\n').replace('\r', '\n')
try:
opf = etree.fromstring(opf)
except etree.XMLSyntaxError:
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
opf = ENTITY_RE.sub(repl, opf)
opf = etree.fromstring(opf)
self.logger.warn('OPF contains invalid HTML named entities')
ns = namespace(opf.tag)
if ns not in ('', OPF1_NS, OPF2_NS):
raise OEBError('Invalid namespace %r for OPF document' % ns)
if ns != OPF2_NS or version < 2.0:
opf = self._convert_opf1(opf)
opf = self._clean_opf(opf)
return opf
def _metadata_from_opf(self, opf):
@ -829,8 +844,16 @@ class OEBBook(object):
self.metadata = metadata = Metadata(self)
ignored = (OPF('dc-metadata'), OPF('x-metadata'))
for elem in xpath(opf, '/o2:package/o2:metadata//*'):
if elem.tag not in ignored and (elem.text or elem.attrib):
metadata.add(elem.tag, elem.text, elem.attrib)
if elem.tag in ignored: continue
term = elem.tag
value = elem.text
if term == OPF('meta'):
term = elem.attrib.pop('name', None)
value = elem.attrib.pop('content', None)
if value:
value = COLLAPSE_RE.sub(' ', value.strip())
if term and (value or elem.attrib):
metadata.add(term, value, elem.attrib)
haveuuid = haveid = False
for ident in metadata.identifier:
if unicode(ident).startswith('urn:uuid:'):
@ -845,36 +868,38 @@ class OEBBook(object):
self.uid = item
break
else:
self.logger.warn(u'Unique-identifier %r not found.' % uid)
self.logger.warn(u'Unique-identifier %r not found' % uid)
for ident in metadata.identifier:
if 'id' in ident.attrib:
self.uid = metadata.identifier[0]
break
if not metadata.language:
self.logger.warn(u'Language not specified.')
self.logger.warn(u'Language not specified')
metadata.add('language', get_lang())
if not metadata.creator:
self.logger.warn(u'Creator not specified.')
metadata.add('creator', _('Unknown'))
self.logger.warn('Creator not specified')
metadata.add('creator', self.translate(__('Unknown')))
if not metadata.title:
self.logger.warn(u'Title not specified.')
metadata.add('title', _('Unknown'))
self.logger.warn('Title not specified')
metadata.add('title', self.translate(__('Unknown')))
def _manifest_from_opf(self, opf):
self.manifest = manifest = Manifest(self)
for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
id = elem.get('id')
href = elem.get('href')
media_type = elem.get('media-type')
media_type = elem.get('media-type', None)
if media_type is None:
media_type = elem.get('mediatype', BINARY_MIME)
fallback = elem.get('fallback')
if href in manifest.hrefs:
self.logger.warn(u'Duplicate manifest entry for %r.' % href)
self.logger.warn(u'Duplicate manifest entry for %r' % href)
continue
if not self.container.exists(href):
self.logger.warn(u'Manifest item %r not found.' % href)
self.logger.warn(u'Manifest item %r not found' % href)
continue
if id in manifest.ids:
self.logger.warn(u'Duplicate manifest id %r.' % id)
self.logger.warn(u'Duplicate manifest id %r' % id)
id, href = manifest.generate(id, href)
manifest.add(id, href, media_type, fallback)
@ -883,7 +908,7 @@ class OEBBook(object):
for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
idref = elem.get('idref')
if idref not in self.manifest:
self.logger.warn(u'Spine item %r not found.' % idref)
self.logger.warn(u'Spine item %r not found' % idref)
continue
item = self.manifest[idref]
spine.add(item, elem.get('linear'))
@ -931,7 +956,8 @@ class OEBBook(object):
item = self.manifest.ids[id]
ncx = item.data
self.manifest.remove(item)
title = xpath(ncx, 'ncx:docTitle/ncx:text/text()')[0]
title = xpath(ncx, 'ncx:docTitle/ncx:text/text()')
title = title[0].strip() if title else unicode(self.metadata.title)
self.toc = toc = TOC(title)
navmaps = xpath(ncx, 'ncx:navMap')
for navmap in navmaps:
@ -988,7 +1014,8 @@ class OEBBook(object):
if not item.linear: continue
html = item.data
title = xpath(html, '/h:html/h:head/h:title/text()')
if title: titles.append(title[0])
title = title[0].strip() if title else None
if title: titles.append(title)
headers.append('(unlabled)')
for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
expr = '/h:html/h:body//h:%s[position()=1]/text()' % (tag,)

View File

@ -0,0 +1,256 @@
"""
Replacement for htmlentitydefs which uses purely numeric entities.
"""
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
ENTITYDEFS = \
{'AElig': '&#198;',
'Aacute': '&#193;',
'Acirc': '&#194;',
'Agrave': '&#192;',
'Alpha': '&#913;',
'Aring': '&#197;',
'Atilde': '&#195;',
'Auml': '&#196;',
'Beta': '&#914;',
'Ccedil': '&#199;',
'Chi': '&#935;',
'Dagger': '&#8225;',
'Delta': '&#916;',
'ETH': '&#208;',
'Eacute': '&#201;',
'Ecirc': '&#202;',
'Egrave': '&#200;',
'Epsilon': '&#917;',
'Eta': '&#919;',
'Euml': '&#203;',
'Gamma': '&#915;',
'Iacute': '&#205;',
'Icirc': '&#206;',
'Igrave': '&#204;',
'Iota': '&#921;',
'Iuml': '&#207;',
'Kappa': '&#922;',
'Lambda': '&#923;',
'Mu': '&#924;',
'Ntilde': '&#209;',
'Nu': '&#925;',
'OElig': '&#338;',
'Oacute': '&#211;',
'Ocirc': '&#212;',
'Ograve': '&#210;',
'Omega': '&#937;',
'Omicron': '&#927;',
'Oslash': '&#216;',
'Otilde': '&#213;',
'Ouml': '&#214;',
'Phi': '&#934;',
'Pi': '&#928;',
'Prime': '&#8243;',
'Psi': '&#936;',
'Rho': '&#929;',
'Scaron': '&#352;',
'Sigma': '&#931;',
'THORN': '&#222;',
'Tau': '&#932;',
'Theta': '&#920;',
'Uacute': '&#218;',
'Ucirc': '&#219;',
'Ugrave': '&#217;',
'Upsilon': '&#933;',
'Uuml': '&#220;',
'Xi': '&#926;',
'Yacute': '&#221;',
'Yuml': '&#376;',
'Zeta': '&#918;',
'aacute': '&#225;',
'acirc': '&#226;',
'acute': '&#180;',
'aelig': '&#230;',
'agrave': '&#224;',
'alefsym': '&#8501;',
'alpha': '&#945;',
'and': '&#8743;',
'ang': '&#8736;',
'aring': '&#229;',
'asymp': '&#8776;',
'atilde': '&#227;',
'auml': '&#228;',
'bdquo': '&#8222;',
'beta': '&#946;',
'brvbar': '&#166;',
'bull': '&#8226;',
'cap': '&#8745;',
'ccedil': '&#231;',
'cedil': '&#184;',
'cent': '&#162;',
'chi': '&#967;',
'circ': '&#710;',
'clubs': '&#9827;',
'cong': '&#8773;',
'copy': '&#169;',
'crarr': '&#8629;',
'cup': '&#8746;',
'curren': '&#164;',
'dArr': '&#8659;',
'dagger': '&#8224;',
'darr': '&#8595;',
'deg': '&#176;',
'delta': '&#948;',
'diams': '&#9830;',
'divide': '&#247;',
'eacute': '&#233;',
'ecirc': '&#234;',
'egrave': '&#232;',
'empty': '&#8709;',
'emsp': '&#8195;',
'ensp': '&#8194;',
'epsilon': '&#949;',
'equiv': '&#8801;',
'eta': '&#951;',
'eth': '&#240;',
'euml': '&#235;',
'euro': '&#8364;',
'exist': '&#8707;',
'fnof': '&#402;',
'forall': '&#8704;',
'frac12': '&#189;',
'frac14': '&#188;',
'frac34': '&#190;',
'frasl': '&#8260;',
'gamma': '&#947;',
'ge': '&#8805;',
'hArr': '&#8660;',
'harr': '&#8596;',
'hearts': '&#9829;',
'hellip': '&#8230;',
'iacute': '&#237;',
'icirc': '&#238;',
'iexcl': '&#161;',
'igrave': '&#236;',
'image': '&#8465;',
'infin': '&#8734;',
'int': '&#8747;',
'iota': '&#953;',
'iquest': '&#191;',
'isin': '&#8712;',
'iuml': '&#239;',
'kappa': '&#954;',
'lArr': '&#8656;',
'lambda': '&#955;',
'lang': '&#9001;',
'laquo': '&#171;',
'larr': '&#8592;',
'lceil': '&#8968;',
'ldquo': '&#8220;',
'le': '&#8804;',
'lfloor': '&#8970;',
'lowast': '&#8727;',
'loz': '&#9674;',
'lrm': '&#8206;',
'lsaquo': '&#8249;',
'lsquo': '&#8216;',
'macr': '&#175;',
'mdash': '&#8212;',
'micro': '&#181;',
'middot': '&#183;',
'minus': '&#8722;',
'mu': '&#956;',
'nabla': '&#8711;',
'nbsp': '&#160;',
'ndash': '&#8211;',
'ne': '&#8800;',
'ni': '&#8715;',
'not': '&#172;',
'notin': '&#8713;',
'nsub': '&#8836;',
'ntilde': '&#241;',
'nu': '&#957;',
'oacute': '&#243;',
'ocirc': '&#244;',
'oelig': '&#339;',
'ograve': '&#242;',
'oline': '&#8254;',
'omega': '&#969;',
'omicron': '&#959;',
'oplus': '&#8853;',
'or': '&#8744;',
'ordf': '&#170;',
'ordm': '&#186;',
'oslash': '&#248;',
'otilde': '&#245;',
'otimes': '&#8855;',
'ouml': '&#246;',
'para': '&#182;',
'part': '&#8706;',
'permil': '&#8240;',
'perp': '&#8869;',
'phi': '&#966;',
'pi': '&#960;',
'piv': '&#982;',
'plusmn': '&#177;',
'pound': '&#163;',
'prime': '&#8242;',
'prod': '&#8719;',
'prop': '&#8733;',
'psi': '&#968;',
'rArr': '&#8658;',
'radic': '&#8730;',
'rang': '&#9002;',
'raquo': '&#187;',
'rarr': '&#8594;',
'rceil': '&#8969;',
'rdquo': '&#8221;',
'real': '&#8476;',
'reg': '&#174;',
'rfloor': '&#8971;',
'rho': '&#961;',
'rlm': '&#8207;',
'rsaquo': '&#8250;',
'rsquo': '&#8217;',
'sbquo': '&#8218;',
'scaron': '&#353;',
'sdot': '&#8901;',
'sect': '&#167;',
'shy': '&#173;',
'sigma': '&#963;',
'sigmaf': '&#962;',
'sim': '&#8764;',
'spades': '&#9824;',
'sub': '&#8834;',
'sube': '&#8838;',
'sum': '&#8721;',
'sup': '&#8835;',
'sup1': '&#185;',
'sup2': '&#178;',
'sup3': '&#179;',
'supe': '&#8839;',
'szlig': '&#223;',
'tau': '&#964;',
'there4': '&#8756;',
'theta': '&#952;',
'thetasym': '&#977;',
'thinsp': '&#8201;',
'thorn': '&#254;',
'tilde': '&#732;',
'times': '&#215;',
'trade': '&#8482;',
'uArr': '&#8657;',
'uacute': '&#250;',
'uarr': '&#8593;',
'ucirc': '&#251;',
'ugrave': '&#249;',
'uml': '&#168;',
'upsih': '&#978;',
'upsilon': '&#965;',
'uuml': '&#252;',
'weierp': '&#8472;',
'xi': '&#958;',
'yacute': '&#253;',
'yen': '&#165;',
'yuml': '&#255;',
'zeta': '&#950;',
'zwj': '&#8205;',
'zwnj': '&#8204;'}