Conversion pipeline: Don't choke on HTML/CSS files that fail to parse correctly. Instead remove them from the manifest. Preprocessing code migrated from epub layer to OEBBook.

This commit is contained in:
Kovid Goyal 2009-03-29 21:09:04 -07:00
parent b98ada75f7
commit 44799e05ef
8 changed files with 242 additions and 107 deletions

View File

@ -1,5 +1,5 @@
" Project wide builtins " Project wide builtins
let g:pyflakes_builtins += ["dynamic_property"] let g:pyflakes_builtins += ["dynamic_property", '__']
python << EOFPY python << EOFPY
import os import os

View File

@ -4,8 +4,6 @@ Defines the plugin system for conversions.
''' '''
import re, os, shutil import re, os, shutil
from lxml import html
from calibre import CurrentDir from calibre import CurrentDir
from calibre.customize import Plugin from calibre.customize import Plugin
@ -121,7 +119,7 @@ class InputFormatPlugin(Plugin):
#: (option_name, recommended_value, recommendation_level) #: (option_name, recommended_value, recommendation_level)
recommendations = set([]) recommendations = set([])
def convert(self, stream, options, file_ext, parse_cache, log, accelerators): def convert(self, stream, options, file_ext, log, accelerators):
''' '''
This method must be implemented in sub-classes. It must return This method must be implemented in sub-classes. It must return
the path to the created OPF file. All output should be contained in the path to the created OPF file. All output should be contained in
@ -144,17 +142,6 @@ class InputFormatPlugin(Plugin):
is guaranteed to be one of the `file_types` supported is guaranteed to be one of the `file_types` supported
by this plugin. by this plugin.
:param parse_cache: A dictionary that maps absolute file paths to
parsed representations of their contents. For
HTML the representation is an lxml element of
the root of the tree. For CSS it is a cssutils
stylesheet. If this plugin parses any of the
output files, it should add them to the cache
so that later stages of the conversion wont
have to re-parse them. If a parsed representation
is in the cache, there is no need to actually
write the file to disk.
:param log: A :class:`calibre.utils.logging.Log` object. All output :param log: A :class:`calibre.utils.logging.Log` object. All output
should use this object. should use this object.
@ -165,7 +152,7 @@ class InputFormatPlugin(Plugin):
''' '''
raise NotImplementedError raise NotImplementedError
def __call__(self, stream, options, file_ext, parse_cache, log, def __call__(self, stream, options, file_ext, log,
accelerators, output_dir): accelerators, output_dir):
log('InputFormatPlugin: %s running'%self.name, end=' ') log('InputFormatPlugin: %s running'%self.name, end=' ')
if hasattr(stream, 'name'): if hasattr(stream, 'name'):
@ -176,33 +163,15 @@ class InputFormatPlugin(Plugin):
shutil.rmtree(x) if os.path.isdir(x) else os.remove(x) shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
ret = self.convert(stream, options, file_ext, parse_cache, ret = self.convert(stream, options, file_ext,
log, accelerators) log, accelerators)
for key in list(parse_cache.keys()):
if os.path.abspath(key) != key:
log.warn(('InputFormatPlugin: %s returned a '
'relative path: %s')%(self.name, key)
)
parse_cache[os.path.abspath(key)] = parse_cache.pop(key)
if options.debug_input is not None: if options.debug_input is not None:
options.debug_input = os.path.abspath(options.debug_input) options.debug_input = os.path.abspath(options.debug_input)
if not os.path.exists(options.debug_input): if not os.path.exists(options.debug_input):
os.makedirs(options.debug_input) os.makedirs(options.debug_input)
shutil.rmtree(options.debug_input) shutil.rmtree(options.debug_input)
for f, obj in parse_cache.items():
if hasattr(obj, 'cssText'):
raw = obj.cssText
else:
raw = html.tostring(obj, encoding='utf-8', method='xml',
include_meta_content_type=True, pretty_print=True)
if isinstance(raw, unicode):
raw = raw.encode('utf-8')
open(f, 'wb').write(raw)
shutil.copytree('.', options.debug_input) shutil.copytree('.', options.debug_input)
return ret return ret

View File

@ -7,7 +7,7 @@ import sys, re
from calibre.customize import Plugin from calibre.customize import Plugin
class InputProfile(Plugin): class InputProfile(Plugin):
author = 'Kovid Goyal' author = 'Kovid Goyal'
supported_platforms = set(['windows', 'osx', 'linux']) supported_platforms = set(['windows', 'osx', 'linux'])
can_be_disabled = False can_be_disabled = False
@ -20,40 +20,40 @@ class InputProfile(Plugin):
short_name = 'default' # Used in the CLI so dont use spaces etc. in it short_name = 'default' # Used in the CLI so dont use spaces etc. in it
description = _('This profile tries to provide sane defaults and is useful ' description = _('This profile tries to provide sane defaults and is useful '
'if you know nothing about the input document.') 'if you know nothing about the input document.')
input_profiles = [InputProfile] input_profiles = [InputProfile]
class OutputProfile(Plugin): class OutputProfile(Plugin):
author = 'Kovid Goyal' author = 'Kovid Goyal'
supported_platforms = set(['windows', 'osx', 'linux']) supported_platforms = set(['windows', 'osx', 'linux'])
can_be_disabled = False can_be_disabled = False
type = _('Output profile') type = _('Output profile')
name = 'Default Output Profile' name = 'Default Output Profile'
short_name = 'default' # Used in the CLI so dont use spaces etc. in it short_name = 'default' # Used in the CLI so dont use spaces etc. in it
description = _('This profile tries to provide sane defaults and is useful ' description = _('This profile tries to provide sane defaults and is useful '
'if you want to produce a document intended to be read at a ' 'if you want to produce a document intended to be read at a '
'computer or on a range of devices.') 'computer or on a range of devices.')
epub_flow_size = sys.maxint epub_flow_size = sys.maxint
screen_size = None screen_size = None
remove_special_chars = False remove_special_chars = None
remove_object_tags = False remove_object_tags = False
class SonyReader(OutputProfile): class SonyReader(OutputProfile):
name = 'Sony Reader' name = 'Sony Reader'
short_name = 'sony' short_name = 'sony'
description = _('This profile is intended for the SONY PRS line. ' description = _('This profile is intended for the SONY PRS line. '
'The 500/505/700 etc.') 'The 500/505/700 etc.')
epub_flow_size = 270000 epub_flow_size = 270000
screen_size = (590, 765) screen_size = (590, 765)
remove_special_chars = re.compile(u'[\u200b\u00ad]') remove_special_chars = re.compile(u'[\u200b\u00ad]')
remove_object_tags = True remove_object_tags = True
output_profiles = [OutputProfile, SonyReader] output_profiles = [OutputProfile, SonyReader]

View File

@ -8,6 +8,7 @@ import os
from calibre.customize.conversion import OptionRecommendation from calibre.customize.conversion import OptionRecommendation
from calibre.customize.ui import input_profiles, output_profiles, \ from calibre.customize.ui import input_profiles, output_profiles, \
plugin_for_input_format, plugin_for_output_format plugin_for_input_format, plugin_for_output_format
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
class OptionValues(object): class OptionValues(object):
pass pass
@ -258,16 +259,17 @@ OptionRecommendation(name='language',
# heavy lifting. # heavy lifting.
from calibre.ebooks.oeb.reader import OEBReader from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.base import OEBBook
parse_cache, accelerators = {}, {} accelerators = {}
opfpath = self.input_plugin(open(self.input, 'rb'), self.opts, opfpath = self.input_plugin(open(self.input, 'rb'), self.opts,
self.input_fmt, parse_cache, self.log, self.input_fmt, self.log,
accelerators) accelerators)
html_preprocessor = HTMLPreProcessor()
self.reader = OEBReader() self.reader = OEBReader()
self.oeb = OEBBook(self.log, parse_cache=parse_cache) self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor)
# Read OEB Book into OEBBook # Read OEB Book into OEBBook
self.reader(self.oeb, opfpath) self.reader(self.oeb, opfpath)

View File

@ -0,0 +1,123 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, functools
from calibre import entity_to_unicode
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
# Matches a complete <span>...</span> element (non-greedy, across newlines,
# case-insensitive).
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)


def sanitize_head(match):
    '''
    Regex substitution callback that rebuilds a ``<head>`` element.

    :param match: A match object whose first group holds the markup found
                  between the opening and closing head tags.
    :return: A plain ``<head>`` element (attributes of the original opening
             tag are not carried over) wrapping the captured markup with
             every ``<span>...</span>`` element removed.
    '''
    cleaned = _span_pat.sub('', match.group(1))
    return '\n'.join(('<head>', cleaned, '</head>'))
class CSSPreProcessor(object):
    '''
    Callable that cleans up raw CSS text before it is handed to a parser.
    '''

    # An ``@page`` at-rule together with its (non-nested) declaration block
    PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')

    def __call__(self, data):
        '''
        Strip all ``@page {...}`` rules from the stylesheet text.

        :param data: The stylesheet source as a string.
        :return: `data` with every match of :attr:`PAGE_PAT` deleted.
        '''
        return self.PAGE_PAT.sub('', data)
class HTMLPreProcessor(object):
    '''
    Callable that applies a series of regular expression based clean-ups to
    raw HTML markup before it is parsed.

    Each rule list below holds ``(compiled_pattern, replacement)`` pairs that
    are applied in order with ``pattern.sub(replacement, html)``.
    '''

    # Rules applied to every document
    PREPROCESS = [
        # Some idiotic HTML generators (Frontpage I'm looking at you)
        # Put all sorts of crap into <head>. This messes up lxml
        (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
         sanitize_head),
        # Convert all entities, since lxml doesn't handle them well
        (re.compile(r'&(\S+?);'), convert_entities),
        # Remove the <![if/endif tags inserted by everybody's darling, MS Word
        (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
         lambda match: ''),
    ]

    # Fix pdftohtml markup
    PDFTOHTML = [
        # Replace <hr> tags with line breaks
        (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
        # Remove page numbers
        (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
        # Remove <br> and replace <br><br> with <p>
        (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
        # Keep the <br> only when the text before it starts with a tag or is
        # shorter than 40 characters; otherwise drop the <br>
        (re.compile(r'(.*)<br.*?>', re.IGNORECASE),
         lambda match: match.group() if \
                 re.match('<', match.group(1).lstrip()) or \
                 len(match.group(1)) < 40 else match.group(1)),
        # Remove hyphenation
        (re.compile(r'-\n\r?'), lambda match: ''),
        # Remove gray background
        (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
        # Remove non breaking spaces (``u''`` instead of the Python 2 only
        # ``ur''`` prefix; both denote the single character U+00A0)
        (re.compile(u'\u00a0'), lambda match : ' '),
    ]

    # Fix Book Designer markup
    BOOK_DESIGNER = [
        # HR
        (re.compile('<hr>', re.IGNORECASE),
         lambda match : '<span style="page-break-after:always"> </span>'),
        # Create header tags. The heading text is captured as group 3 so that
        # the replacement can re-emit it; without that capture group
        # match.group(3) raises IndexError on every matching heading.
        (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
         lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
        (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
         lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
        (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
         lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
        (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
         lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
    ]

    def is_baen(self, src):
        '''
        Return True if `src` contains a ``<meta name="Publisher">`` tag whose
        content mentions Baen (case-insensitive).
        '''
        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
                          re.IGNORECASE).search(src) is not None

    def is_book_designer(self, raw):
        '''
        Return True if `raw` contains an ``<H2 ... id=BookTitle`` tag. Note
        that this check is case-sensitive.
        '''
        return re.search('<H2[^><]*id=BookTitle', raw) is not None

    def is_pdftohtml(self, src):
        '''
        Return True if the pdftohtml marker comment occurs within the first
        1000 characters of `src`.
        '''
        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

    def __call__(self, html, remove_special_chars=None):
        '''
        Run all applicable clean-up rules over `html`.

        :param html: The markup to clean, as a string.
        :param remove_special_chars: Optional object with a ``sub`` method
                                     (e.g. a compiled regular expression). If
                                     not None, everything it matches is
                                     deleted before any other rule runs.
        :return: The cleaned markup, with missing ``svg``/``xlink`` namespace
                 declarations added to the ``<html`` tag and a leading XML
                 declaration removed.
        '''
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)
        # Source specific rules. Documents detected as Baen get no extra
        # rules, even if they would also match one of the later checks.
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = self.BOOK_DESIGNER
        elif self.is_pdftohtml(html):
            rules = self.PDFTOHTML
        else:
            rules = []
        for pattern, replacement in self.PREPROCESS + rules:
            html = pattern.sub(replacement, html)

        # Handle broken XHTML w/ SVG (ugh)
        if 'svg:' in html and SVG_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
        if 'xlink:' in html and XLINK_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

        html = XMLDECL_RE.sub('', html)

        return html

View File

@ -6,25 +6,28 @@ __docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
class MOBIInput(InputFormatPlugin): class MOBIInput(InputFormatPlugin):
name = 'MOBI Input' name = 'MOBI Input'
author = 'Kovid Goyal' author = 'Kovid Goyal'
description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML' description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
file_types = set(['mobi', 'prc', 'azw']) file_types = set(['mobi', 'prc', 'azw'])
def convert(self, stream, options, file_ext, parse_cache, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
from calibre.ebooks.mobi.reader import MobiReader from calibre.ebooks.mobi.reader import MobiReader
mr = MobiReader(stream, log, options.input_encoding, from lxml import html
mr = MobiReader(stream, log, options.input_encoding,
options.debug_input) options.debug_input)
parse_cache = {}
mr.extract_content('.', parse_cache) mr.extract_content('.', parse_cache)
raw = parse_cache.get('calibre_raw_mobi_markup', False) raw = parse_cache.pop('calibre_raw_mobi_markup', False)
if raw: if raw:
if isinstance(raw, unicode): if isinstance(raw, unicode):
raw = raw.encode('utf-8') raw = raw.encode('utf-8')
open('debug-raw.html', 'wb').write(raw) open('debug-raw.html', 'wb').write(raw)
for f, root in parse_cache.items(): for f, root in parse_cache.items():
if '.' in f: with open(f, 'wb') as q:
accelerators[f] = {'pagebreaks':root.xpath( q.write(html.tostring(root, encoding='utf-8', method='xml',
'//div[@class="mbp_pagebreak"]')} include_meta_content_type=False))
return mr.created_opf_path accelerators['pagebreaks'] = {f: '//div[@class="mbp_pagebreak"]'}
return mr.created_opf_path

View File

@ -20,6 +20,8 @@ from cssutils import CSSParser
from calibre.translations.dynamic import translate from calibre.translations.dynamic import translate
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, \
CSSPreProcessor
XML_NS = 'http://www.w3.org/XML/1998/namespace' XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml' XHTML_NS = 'http://www.w3.org/1999/xhtml'
@ -207,6 +209,10 @@ class OEBError(Exception):
"""Generic OEB-processing error.""" """Generic OEB-processing error."""
pass pass
class NotHTML(OEBError):
'''Raised when a file that should be HTML (as per manifest) is not'''
pass
class NullContainer(object): class NullContainer(object):
"""An empty container. """An empty container.
@ -575,14 +581,7 @@ class Manifest(object):
def _parse_xhtml(self, data): def _parse_xhtml(self, data):
# Convert to Unicode and normalize line endings # Convert to Unicode and normalize line endings
data = self.oeb.decode(data) data = self.oeb.decode(data)
data = XMLDECL_RE.sub('', data) data = self.oeb.html_preprocessor(data)
# Handle broken XHTML w/ SVG (ugh)
if 'svg:' in data and SVG_NS not in data:
data = data.replace(
'<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
if 'xlink:' in data and XLINK_NS not in data:
data = data.replace(
'<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
# Try with more & more drastic measures to parse # Try with more & more drastic measures to parse
try: try:
data = etree.fromstring(data) data = etree.fromstring(data)
@ -606,7 +605,7 @@ class Manifest(object):
data = etree.fromstring(data, parser=RECOVER_PARSER) data = etree.fromstring(data, parser=RECOVER_PARSER)
# Force into the XHTML namespace # Force into the XHTML namespace
if barename(data.tag) != 'html': if barename(data.tag) != 'html':
raise OEBError( raise NotHTML(
'File %r does not appear to be (X)HTML' % self.href) 'File %r does not appear to be (X)HTML' % self.href)
elif not namespace(data.tag): elif not namespace(data.tag):
data.attrib['xmlns'] = XHTML_NS data.attrib['xmlns'] = XHTML_NS
@ -659,6 +658,7 @@ class Manifest(object):
def _parse_css(self, data): def _parse_css(self, data):
data = self.oeb.decode(data) data = self.oeb.decode(data)
data = self.CSSPreProcessor(data)
data = XHTML_CSS_NAMESPACE + data data = XHTML_CSS_NAMESPACE + data
parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING, parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING,
fetcher=self._fetch_css) fetcher=self._fetch_css)
@ -793,7 +793,7 @@ class Manifest(object):
MIME type which is not one of the OPS core media types. Either the MIME type which is not one of the OPS core media types. Either the
item's data itself may be provided with :param:`data`, or a loader item's data itself may be provided with :param:`data`, or a loader
function for the data may be provided with :param:`loader`, or the function for the data may be provided with :param:`loader`, or the
item's data may latter be set manually via the :attr:`data` attribute. item's data may later be set manually via the :attr:`data` attribute.
""" """
item = self.Item( item = self.Item(
self.oeb, id, href, media_type, fallback, loader, data) self.oeb, id, href, media_type, fallback, loader, data)
@ -840,6 +840,9 @@ class Manifest(object):
for item in self.items: for item in self.items:
yield item yield item
def __len__(self):
return len(self.items)
def values(self): def values(self):
return list(self.items) return list(self.items)
@ -1255,17 +1258,22 @@ class OEBBook(object):
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
def __init__(self, logger, parse_cache={}, encoding='utf-8', def __init__(self, logger,
pretty_print=False): html_preprocessor=HTMLPreProcessor(),
"""Create empty book. Optional arguments: css_preprocessor=CSSPreProcessor(),
encoding='utf-8', pretty_print=False):
"""Create empty book. Arguments:
:param parse_cache: A cache of parsed XHTML/CSS. Keys are absolute
paths to the cached files and values are lxml root objects and
cssutils stylesheets.
:param:`encoding`: Default encoding for textual content read :param:`encoding`: Default encoding for textual content read
from an external container. from an external container.
:param:`pretty_print`: Whether or not the canonical string form :param:`pretty_print`: Whether or not the canonical string form
of XML markup is pretty-printed. of XML markup is pretty-printed.
:param html_preprocessor: A callable that takes a unicode object
and returns a unicode object. Will be called on all html files
before they are parsed.
:param css_preprocessor: A callable that takes a unicode object
and returns a unicode object. Will be called on all CSS files
before they are parsed.
:param:`logger`: A Log object to use for logging all messages :param:`logger`: A Log object to use for logging all messages
related to the processing of this book. It is accessible related to the processing of this book. It is accessible
via the instance data members :attr:`logger,log`. via the instance data members :attr:`logger,log`.
@ -1286,6 +1294,8 @@ class OEBBook(object):
""" """
self.encoding = encoding self.encoding = encoding
self.html_preprocessor = html_preprocessor
self.css_preprocessor = css_preprocessor
self.pretty_print = pretty_print self.pretty_print = pretty_print
self.logger = self.log = logger self.logger = self.log = logger
self.version = '2.0' self.version = '2.0'

View File

@ -32,13 +32,13 @@ __all__ = ['OEBReader']
class OEBReader(object): class OEBReader(object):
"""Read an OEBPS 1.x or OPF/OPS 2.0 file collection.""" """Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
Container = DirContainer Container = DirContainer
"""Container type used to access book files. Override in sub-classes.""" """Container type used to access book files. Override in sub-classes."""
DEFAULT_PROFILE = 'PRS505' DEFAULT_PROFILE = 'PRS505'
"""Default renderer profile for content read with this Reader.""" """Default renderer profile for content read with this Reader."""
@ -67,7 +67,7 @@ class OEBReader(object):
opf = self._read_opf() opf = self._read_opf()
self._all_from_opf(opf) self._all_from_opf(opf)
return oeb return oeb
def _clean_opf(self, opf): def _clean_opf(self, opf):
nsmap = {} nsmap = {}
for elem in opf.iter(tag=etree.Element): for elem in opf.iter(tag=etree.Element):
@ -94,7 +94,7 @@ class OEBReader(object):
for element in xpath(opf, tag): for element in xpath(opf, tag):
nroot.append(element) nroot.append(element)
return nroot return nroot
def _read_opf(self): def _read_opf(self):
data = self.oeb.container.read(None) data = self.oeb.container.read(None)
data = self.oeb.decode(data) data = self.oeb.decode(data)
@ -111,7 +111,7 @@ class OEBReader(object):
raise OEBError('Invalid namespace %r for OPF document' % ns) raise OEBError('Invalid namespace %r for OPF document' % ns)
opf = self._clean_opf(opf) opf = self._clean_opf(opf)
return opf return opf
def _metadata_from_opf(self, opf): def _metadata_from_opf(self, opf):
uid = opf.get('unique-identifier', None) uid = opf.get('unique-identifier', None)
self.oeb.uid = None self.oeb.uid = None
@ -161,10 +161,30 @@ class OEBReader(object):
self.logger.warn('Title not specified') self.logger.warn('Title not specified')
metadata.add('title', self.oeb.translate(__('Unknown'))) metadata.add('title', self.oeb.translate(__('Unknown')))
def _manifest_add_missing(self): def _manifest_prune_invalid(self):
'''
Remove items from manifest that contain invalid data. This prevents
catastrophic conversion failure, when a few files contain corrupted
data.
'''
bad = []
check = OEB_DOCS+OEB_STYLES
for item in list(self.oeb.manifest.values()):
if item.media_type in check:
try:
item.data
except:
self.logger.exception('Failed to parse content in %s'%
item.href)
bad.append(item)
self.oeb.manifest.remove(item)
return bad
def _manifest_add_missing(self, invalid):
manifest = self.oeb.manifest manifest = self.oeb.manifest
known = set(manifest.hrefs) known = set(manifest.hrefs)
unchecked = set(manifest.values()) unchecked = set(manifest.values())
bad = []
while unchecked: while unchecked:
new = set() new = set()
for item in unchecked: for item in unchecked:
@ -190,6 +210,13 @@ class OEBReader(object):
unchecked.clear() unchecked.clear()
for href in new: for href in new:
known.add(href) known.add(href)
is_invalid = False
for item in invalid:
if href == item.abshref(urlnormalize(href)):
is_invalid = True
break
if is_invalid:
continue
if not self.oeb.container.exists(href): if not self.oeb.container.exists(href):
self.logger.warn('Referenced file %r not found' % href) self.logger.warn('Referenced file %r not found' % href)
continue continue
@ -199,7 +226,7 @@ class OEBReader(object):
media_type = guessed or BINARY_MIME media_type = guessed or BINARY_MIME
added = manifest.add(id, href, media_type) added = manifest.add(id, href, media_type)
unchecked.add(added) unchecked.add(added)
def _manifest_from_opf(self, opf): def _manifest_from_opf(self, opf):
manifest = self.oeb.manifest manifest = self.oeb.manifest
for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
@ -222,8 +249,9 @@ class OEBReader(object):
self.logger.warn(u'Duplicate manifest id %r' % id) self.logger.warn(u'Duplicate manifest id %r' % id)
id, href = manifest.generate(id, href) id, href = manifest.generate(id, href)
manifest.add(id, href, media_type, fallback) manifest.add(id, href, media_type, fallback)
self._manifest_add_missing() invalid = self._manifest_prune_invalid()
self._manifest_add_missing(invalid)
def _spine_add_extra(self): def _spine_add_extra(self):
manifest = self.oeb.manifest manifest = self.oeb.manifest
spine = self.oeb.spine spine = self.oeb.spine
@ -256,7 +284,7 @@ class OEBReader(object):
self.logger.warn( self.logger.warn(
'Spine-referenced file %r not in spine' % item.href) 'Spine-referenced file %r not in spine' % item.href)
spine.add(item, linear=False) spine.add(item, linear=False)
def _spine_from_opf(self, opf): def _spine_from_opf(self, opf):
spine = self.oeb.spine spine = self.oeb.spine
manifest = self.oeb.manifest manifest = self.oeb.manifest
@ -270,7 +298,7 @@ class OEBReader(object):
if len(spine) == 0: if len(spine) == 0:
raise OEBError("Spine is empty") raise OEBError("Spine is empty")
self._spine_add_extra() self._spine_add_extra()
def _guide_from_opf(self, opf): def _guide_from_opf(self, opf):
guide = self.oeb.guide guide = self.oeb.guide
manifest = self.oeb.manifest manifest = self.oeb.manifest
@ -281,7 +309,7 @@ class OEBReader(object):
self.logger.warn(u'Guide reference %r not found' % href) self.logger.warn(u'Guide reference %r not found' % href)
continue continue
guide.add(elem.get('type'), elem.get('title'), href) guide.add(elem.get('type'), elem.get('title'), href)
def _find_ncx(self, opf): def _find_ncx(self, opf):
result = xpath(opf, '/o2:package/o2:spine/@toc') result = xpath(opf, '/o2:package/o2:spine/@toc')
if result: if result:
@ -294,9 +322,9 @@ class OEBReader(object):
for item in self.oeb.manifest.values(): for item in self.oeb.manifest.values():
if item.media_type == NCX_MIME: if item.media_type == NCX_MIME:
self.oeb.manifest.remove(item) self.oeb.manifest.remove(item)
return item return item
return None return None
def _toc_from_navpoint(self, item, toc, navpoint): def _toc_from_navpoint(self, item, toc, navpoint):
children = xpath(navpoint, 'ncx:navPoint') children = xpath(navpoint, 'ncx:navPoint')
for child in children: for child in children:
@ -314,7 +342,7 @@ class OEBReader(object):
klass = child.get('class') klass = child.get('class')
node = toc.add(title, href, id=id, klass=klass) node = toc.add(title, href, id=id, klass=klass)
self._toc_from_navpoint(item, node, child) self._toc_from_navpoint(item, node, child)
def _toc_from_ncx(self, item): def _toc_from_ncx(self, item):
if item is None: if item is None:
return False return False
@ -328,7 +356,7 @@ class OEBReader(object):
for navmap in navmaps: for navmap in navmaps:
self._toc_from_navpoint(item, toc, navmap) self._toc_from_navpoint(item, toc, navmap)
return True return True
def _toc_from_tour(self, opf): def _toc_from_tour(self, opf):
result = xpath(opf, 'o2:tours/o2:tour') result = xpath(opf, 'o2:tours/o2:tour')
if not result: if not result:
@ -345,11 +373,11 @@ class OEBReader(object):
path, _ = urldefrag(urlnormalize(href)) path, _ = urldefrag(urlnormalize(href))
if path not in self.oeb.manifest.hrefs: if path not in self.oeb.manifest.hrefs:
self.logger.warn('TOC reference %r not found' % href) self.logger.warn('TOC reference %r not found' % href)
continue continue
id = site.get('id') id = site.get('id')
toc.add(title, href, id=id) toc.add(title, href, id=id)
return True return True
def _toc_from_html(self, opf): def _toc_from_html(self, opf):
if 'toc' not in self.oeb.guide: if 'toc' not in self.oeb.guide:
return False return False
@ -381,7 +409,7 @@ class OEBReader(object):
for href in order: for href in order:
toc.add(' '.join(titles[href]), href) toc.add(' '.join(titles[href]), href)
return True return True
def _toc_from_spine(self, opf): def _toc_from_spine(self, opf):
toc = self.oeb.toc toc = self.oeb.toc
titles = [] titles = []
@ -408,14 +436,14 @@ class OEBReader(object):
if not item.linear: continue if not item.linear: continue
toc.add(title, item.href) toc.add(title, item.href)
return True return True
def _toc_from_opf(self, opf, item): def _toc_from_opf(self, opf, item):
if self._toc_from_ncx(item): return if self._toc_from_ncx(item): return
if self._toc_from_tour(opf): return if self._toc_from_tour(opf): return
self.logger.warn('No metadata table of contents found') self.logger.warn('No metadata table of contents found')
if self._toc_from_html(opf): return if self._toc_from_html(opf): return
self._toc_from_spine(opf) self._toc_from_spine(opf)
def _pages_from_ncx(self, opf, item): def _pages_from_ncx(self, opf, item):
if item is None: if item is None:
return False return False
@ -436,7 +464,7 @@ class OEBReader(object):
klass = ptarget.get('class') klass = ptarget.get('class')
pages.add(name, href, type=type, id=id, klass=klass) pages.add(name, href, type=type, id=id, klass=klass)
return True return True
def _find_page_map(self, opf): def _find_page_map(self, opf):
result = xpath(opf, '/o2:package/o2:spine/@page-map') result = xpath(opf, '/o2:package/o2:spine/@page-map')
if result: if result:
@ -451,7 +479,7 @@ class OEBReader(object):
self.oeb.manifest.remove(item) self.oeb.manifest.remove(item)
return item return item
return None return None
def _pages_from_page_map(self, opf): def _pages_from_page_map(self, opf):
item = self._find_page_map(opf) item = self._find_page_map(opf)
if item is None: if item is None:
@ -472,12 +500,12 @@ class OEBReader(object):
type = 'front' type = 'front'
pages.add(name, href, type=type) pages.add(name, href, type=type)
return True return True
def _pages_from_opf(self, opf, item): def _pages_from_opf(self, opf, item):
if self._pages_from_ncx(opf, item): return if self._pages_from_ncx(opf, item): return
if self._pages_from_page_map(opf): return if self._pages_from_page_map(opf): return
return return
def _cover_from_html(self, hcover): def _cover_from_html(self, hcover):
with TemporaryDirectory('_html_cover') as tdir: with TemporaryDirectory('_html_cover') as tdir:
writer = OEBWriter() writer = OEBWriter()
@ -488,7 +516,7 @@ class OEBReader(object):
id, href = self.oeb.manifest.generate('cover', 'cover.jpeg') id, href = self.oeb.manifest.generate('cover', 'cover.jpeg')
item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data) item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
return item return item
def _locate_cover_image(self): def _locate_cover_image(self):
if self.oeb.metadata.cover: if self.oeb.metadata.cover:
id = str(self.oeb.metadata.cover[0]) id = str(self.oeb.metadata.cover[0])
@ -525,14 +553,14 @@ class OEBReader(object):
if item is not None and item.media_type in OEB_IMAGES: if item is not None and item.media_type in OEB_IMAGES:
return item return item
return self._cover_from_html(hcover) return self._cover_from_html(hcover)
def _ensure_cover_image(self): def _ensure_cover_image(self):
cover = self._locate_cover_image() cover = self._locate_cover_image()
if self.oeb.metadata.cover: if self.oeb.metadata.cover:
self.oeb.metadata.cover[0].value = cover.id self.oeb.metadata.cover[0].value = cover.id
return return
self.oeb.metadata.add('cover', cover.id) self.oeb.metadata.add('cover', cover.id)
def _all_from_opf(self, opf): def _all_from_opf(self, opf):
self.oeb.version = opf.get('version', '1.2') self.oeb.version = opf.get('version', '1.2')
self._metadata_from_opf(opf) self._metadata_from_opf(opf)
@ -543,7 +571,7 @@ class OEBReader(object):
self._toc_from_opf(opf, item) self._toc_from_opf(opf, item)
self._pages_from_opf(opf, item) self._pages_from_opf(opf, item)
self._ensure_cover_image() self._ensure_cover_image()
def main(argv=sys.argv): def main(argv=sys.argv):
reader = OEBReader() reader = OEBReader()