mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Conversion pipeline: Don't choke on HTML/CSS files that fail to parse correctly. Instead, remove them from the manifest. Preprocessing code migrated from the epub layer to OEBBook.
This commit is contained in:
parent
b98ada75f7
commit
44799e05ef
@ -1,5 +1,5 @@
|
|||||||
" Project wide builtins
|
" Project wide builtins
|
||||||
let g:pyflakes_builtins += ["dynamic_property"]
|
let g:pyflakes_builtins += ["dynamic_property", '__']
|
||||||
|
|
||||||
python << EOFPY
|
python << EOFPY
|
||||||
import os
|
import os
|
||||||
|
@ -4,8 +4,6 @@ Defines the plugin system for conversions.
|
|||||||
'''
|
'''
|
||||||
import re, os, shutil
|
import re, os, shutil
|
||||||
|
|
||||||
from lxml import html
|
|
||||||
|
|
||||||
from calibre import CurrentDir
|
from calibre import CurrentDir
|
||||||
from calibre.customize import Plugin
|
from calibre.customize import Plugin
|
||||||
|
|
||||||
@ -121,7 +119,7 @@ class InputFormatPlugin(Plugin):
|
|||||||
#: (option_name, recommended_value, recommendation_level)
|
#: (option_name, recommended_value, recommendation_level)
|
||||||
recommendations = set([])
|
recommendations = set([])
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, parse_cache, log, accelerators):
|
def convert(self, stream, options, file_ext, log, accelerators):
|
||||||
'''
|
'''
|
||||||
This method must be implemented in sub-classes. It must return
|
This method must be implemented in sub-classes. It must return
|
||||||
the path to the created OPF file. All output should be contained in
|
the path to the created OPF file. All output should be contained in
|
||||||
@ -144,17 +142,6 @@ class InputFormatPlugin(Plugin):
|
|||||||
is guaranteed to be one of the `file_types` supported
|
is guaranteed to be one of the `file_types` supported
|
||||||
by this plugin.
|
by this plugin.
|
||||||
|
|
||||||
:param parse_cache: A dictionary that maps absolute file paths to
|
|
||||||
parsed representations of their contents. For
|
|
||||||
HTML the representation is an lxml element of
|
|
||||||
the root of the tree. For CSS it is a cssutils
|
|
||||||
stylesheet. If this plugin parses any of the
|
|
||||||
output files, it should add them to the cache
|
|
||||||
so that later stages of the conversion wont
|
|
||||||
have to re-parse them. If a parsed representation
|
|
||||||
is in the cache, there is no need to actually
|
|
||||||
write the file to disk.
|
|
||||||
|
|
||||||
:param log: A :class:`calibre.utils.logging.Log` object. All output
|
:param log: A :class:`calibre.utils.logging.Log` object. All output
|
||||||
should use this object.
|
should use this object.
|
||||||
|
|
||||||
@ -165,7 +152,7 @@ class InputFormatPlugin(Plugin):
|
|||||||
'''
|
'''
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def __call__(self, stream, options, file_ext, parse_cache, log,
|
def __call__(self, stream, options, file_ext, log,
|
||||||
accelerators, output_dir):
|
accelerators, output_dir):
|
||||||
log('InputFormatPlugin: %s running'%self.name, end=' ')
|
log('InputFormatPlugin: %s running'%self.name, end=' ')
|
||||||
if hasattr(stream, 'name'):
|
if hasattr(stream, 'name'):
|
||||||
@ -176,33 +163,15 @@ class InputFormatPlugin(Plugin):
|
|||||||
shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
|
shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
|
||||||
|
|
||||||
|
|
||||||
ret = self.convert(stream, options, file_ext, parse_cache,
|
ret = self.convert(stream, options, file_ext,
|
||||||
log, accelerators)
|
log, accelerators)
|
||||||
for key in list(parse_cache.keys()):
|
|
||||||
if os.path.abspath(key) != key:
|
|
||||||
log.warn(('InputFormatPlugin: %s returned a '
|
|
||||||
'relative path: %s')%(self.name, key)
|
|
||||||
)
|
|
||||||
parse_cache[os.path.abspath(key)] = parse_cache.pop(key)
|
|
||||||
|
|
||||||
if options.debug_input is not None:
|
if options.debug_input is not None:
|
||||||
options.debug_input = os.path.abspath(options.debug_input)
|
options.debug_input = os.path.abspath(options.debug_input)
|
||||||
if not os.path.exists(options.debug_input):
|
if not os.path.exists(options.debug_input):
|
||||||
os.makedirs(options.debug_input)
|
os.makedirs(options.debug_input)
|
||||||
shutil.rmtree(options.debug_input)
|
shutil.rmtree(options.debug_input)
|
||||||
for f, obj in parse_cache.items():
|
|
||||||
if hasattr(obj, 'cssText'):
|
|
||||||
raw = obj.cssText
|
|
||||||
else:
|
|
||||||
raw = html.tostring(obj, encoding='utf-8', method='xml',
|
|
||||||
include_meta_content_type=True, pretty_print=True)
|
|
||||||
if isinstance(raw, unicode):
|
|
||||||
raw = raw.encode('utf-8')
|
|
||||||
open(f, 'wb').write(raw)
|
|
||||||
shutil.copytree('.', options.debug_input)
|
shutil.copytree('.', options.debug_input)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@ import sys, re
|
|||||||
from calibre.customize import Plugin
|
from calibre.customize import Plugin
|
||||||
|
|
||||||
class InputProfile(Plugin):
|
class InputProfile(Plugin):
|
||||||
|
|
||||||
author = 'Kovid Goyal'
|
author = 'Kovid Goyal'
|
||||||
supported_platforms = set(['windows', 'osx', 'linux'])
|
supported_platforms = set(['windows', 'osx', 'linux'])
|
||||||
can_be_disabled = False
|
can_be_disabled = False
|
||||||
@ -20,40 +20,40 @@ class InputProfile(Plugin):
|
|||||||
short_name = 'default' # Used in the CLI so dont use spaces etc. in it
|
short_name = 'default' # Used in the CLI so dont use spaces etc. in it
|
||||||
description = _('This profile tries to provide sane defaults and is useful '
|
description = _('This profile tries to provide sane defaults and is useful '
|
||||||
'if you know nothing about the input document.')
|
'if you know nothing about the input document.')
|
||||||
|
|
||||||
input_profiles = [InputProfile]
|
input_profiles = [InputProfile]
|
||||||
|
|
||||||
|
|
||||||
class OutputProfile(Plugin):
|
class OutputProfile(Plugin):
|
||||||
|
|
||||||
author = 'Kovid Goyal'
|
author = 'Kovid Goyal'
|
||||||
supported_platforms = set(['windows', 'osx', 'linux'])
|
supported_platforms = set(['windows', 'osx', 'linux'])
|
||||||
can_be_disabled = False
|
can_be_disabled = False
|
||||||
type = _('Output profile')
|
type = _('Output profile')
|
||||||
|
|
||||||
name = 'Default Output Profile'
|
name = 'Default Output Profile'
|
||||||
short_name = 'default' # Used in the CLI so dont use spaces etc. in it
|
short_name = 'default' # Used in the CLI so dont use spaces etc. in it
|
||||||
description = _('This profile tries to provide sane defaults and is useful '
|
description = _('This profile tries to provide sane defaults and is useful '
|
||||||
'if you want to produce a document intended to be read at a '
|
'if you want to produce a document intended to be read at a '
|
||||||
'computer or on a range of devices.')
|
'computer or on a range of devices.')
|
||||||
|
|
||||||
epub_flow_size = sys.maxint
|
epub_flow_size = sys.maxint
|
||||||
screen_size = None
|
screen_size = None
|
||||||
remove_special_chars = False
|
remove_special_chars = None
|
||||||
remove_object_tags = False
|
remove_object_tags = False
|
||||||
|
|
||||||
class SonyReader(OutputProfile):
|
class SonyReader(OutputProfile):
|
||||||
|
|
||||||
name = 'Sony Reader'
|
name = 'Sony Reader'
|
||||||
short_name = 'sony'
|
short_name = 'sony'
|
||||||
description = _('This profile is intended for the SONY PRS line. '
|
description = _('This profile is intended for the SONY PRS line. '
|
||||||
'The 500/505/700 etc.')
|
'The 500/505/700 etc.')
|
||||||
|
|
||||||
epub_flow_size = 270000
|
epub_flow_size = 270000
|
||||||
screen_size = (590, 765)
|
screen_size = (590, 765)
|
||||||
remove_special_chars = re.compile(u'[\u200b\u00ad]')
|
remove_special_chars = re.compile(u'[\u200b\u00ad]')
|
||||||
remove_object_tags = True
|
remove_object_tags = True
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
output_profiles = [OutputProfile, SonyReader]
|
output_profiles = [OutputProfile, SonyReader]
|
||||||
|
@ -8,6 +8,7 @@ import os
|
|||||||
from calibre.customize.conversion import OptionRecommendation
|
from calibre.customize.conversion import OptionRecommendation
|
||||||
from calibre.customize.ui import input_profiles, output_profiles, \
|
from calibre.customize.ui import input_profiles, output_profiles, \
|
||||||
plugin_for_input_format, plugin_for_output_format
|
plugin_for_input_format, plugin_for_output_format
|
||||||
|
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
|
||||||
|
|
||||||
class OptionValues(object):
|
class OptionValues(object):
|
||||||
pass
|
pass
|
||||||
@ -258,16 +259,17 @@ OptionRecommendation(name='language',
|
|||||||
# heavy lifting.
|
# heavy lifting.
|
||||||
from calibre.ebooks.oeb.reader import OEBReader
|
from calibre.ebooks.oeb.reader import OEBReader
|
||||||
from calibre.ebooks.oeb.base import OEBBook
|
from calibre.ebooks.oeb.base import OEBBook
|
||||||
parse_cache, accelerators = {}, {}
|
accelerators = {}
|
||||||
|
|
||||||
opfpath = self.input_plugin(open(self.input, 'rb'), self.opts,
|
opfpath = self.input_plugin(open(self.input, 'rb'), self.opts,
|
||||||
self.input_fmt, parse_cache, self.log,
|
self.input_fmt, self.log,
|
||||||
accelerators)
|
accelerators)
|
||||||
|
html_preprocessor = HTMLPreProcessor()
|
||||||
self.reader = OEBReader()
|
self.reader = OEBReader()
|
||||||
self.oeb = OEBBook(self.log, parse_cache=parse_cache)
|
self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor)
|
||||||
# Read OEB Book into OEBBook
|
# Read OEB Book into OEBBook
|
||||||
self.reader(self.oeb, opfpath)
|
self.reader(self.oeb, opfpath)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
123
src/calibre/ebooks/conversion/preprocess.py
Normal file
123
src/calibre/ebooks/conversion/preprocess.py
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
from __future__ import with_statement
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re, functools
|
||||||
|
|
||||||
|
from calibre import entity_to_unicode
|
||||||
|
|
||||||
|
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
||||||
|
SVG_NS = 'http://www.w3.org/2000/svg'
|
||||||
|
XLINK_NS = 'http://www.w3.org/1999/xlink'
|
||||||
|
|
||||||
|
convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
|
||||||
|
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_head(match):
|
||||||
|
x = match.group(1)
|
||||||
|
x = _span_pat.sub('', x)
|
||||||
|
return '<head>\n'+x+'\n</head>'
|
||||||
|
|
||||||
|
|
||||||
|
class CSSPreProcessor(object):
|
||||||
|
|
||||||
|
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
|
||||||
|
|
||||||
|
def __call__(self, data):
|
||||||
|
data = self.PAGE_PAT.sub('', data)
|
||||||
|
return data
|
||||||
|
|
||||||
|
class HTMLPreProcessor(object):
|
||||||
|
|
||||||
|
PREPROCESS = [
|
||||||
|
# Some idiotic HTML generators (Frontpage I'm looking at you)
|
||||||
|
# Put all sorts of crap into <head>. This messes up lxml
|
||||||
|
(re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
|
||||||
|
sanitize_head),
|
||||||
|
# Convert all entities, since lxml doesn't handle them well
|
||||||
|
(re.compile(r'&(\S+?);'), convert_entities),
|
||||||
|
# Remove the <![if/endif tags inserted by everybody's darling, MS Word
|
||||||
|
(re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
|
||||||
|
lambda match: ''),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Fix pdftohtml markup
|
||||||
|
PDFTOHTML = [
|
||||||
|
# Remove <hr> tags
|
||||||
|
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
|
||||||
|
# Remove page numbers
|
||||||
|
(re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
|
||||||
|
# Remove <br> and replace <br><br> with <p>
|
||||||
|
(re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
|
||||||
|
(re.compile(r'(.*)<br.*?>', re.IGNORECASE),
|
||||||
|
lambda match: match.group() if \
|
||||||
|
re.match('<', match.group(1).lstrip()) or \
|
||||||
|
len(match.group(1)) < 40 else match.group(1)),
|
||||||
|
# Remove hyphenation
|
||||||
|
(re.compile(r'-\n\r?'), lambda match: ''),
|
||||||
|
|
||||||
|
# Remove gray background
|
||||||
|
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
||||||
|
|
||||||
|
# Remove non breaking spaces
|
||||||
|
(re.compile(ur'\u00a0'), lambda match : ' '),
|
||||||
|
|
||||||
|
]
|
||||||
|
|
||||||
|
# Fix Book Designer markup
|
||||||
|
BOOK_DESIGNER = [
|
||||||
|
# HR
|
||||||
|
(re.compile('<hr>', re.IGNORECASE),
|
||||||
|
lambda match : '<span style="page-break-after:always"> </span>'),
|
||||||
|
# Create header tags
|
||||||
|
(re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
|
||||||
|
lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
|
||||||
|
(re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
|
||||||
|
lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
|
||||||
|
(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||||
|
lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
|
||||||
|
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||||
|
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
||||||
|
]
|
||||||
|
|
||||||
|
def is_baen(self, src):
|
||||||
|
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
|
||||||
|
re.IGNORECASE).search(src) is not None
|
||||||
|
|
||||||
|
def is_book_designer(self, raw):
|
||||||
|
return re.search('<H2[^><]*id=BookTitle', raw) is not None
|
||||||
|
|
||||||
|
def is_pdftohtml(self, src):
|
||||||
|
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
||||||
|
|
||||||
|
def __call__(self, html, remove_special_chars=None):
|
||||||
|
if remove_special_chars is not None:
|
||||||
|
html = remove_special_chars.sub('', html)
|
||||||
|
if self.is_baen(html):
|
||||||
|
rules = []
|
||||||
|
elif self.is_book_designer(html):
|
||||||
|
rules = self.BOOK_DESIGNER
|
||||||
|
elif self.is_pdftohtml(html):
|
||||||
|
rules = self.PDFTOHTML
|
||||||
|
else:
|
||||||
|
rules = []
|
||||||
|
for rule in self.PREPROCESS + rules:
|
||||||
|
html = rule[0].sub(rule[1], html)
|
||||||
|
|
||||||
|
# Handle broken XHTML w/ SVG (ugh)
|
||||||
|
if 'svg:' in html and SVG_NS not in html:
|
||||||
|
html = html.replace(
|
||||||
|
'<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
|
||||||
|
if 'xlink:' in html and XLINK_NS not in html:
|
||||||
|
html = html.replace(
|
||||||
|
'<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
|
||||||
|
|
||||||
|
html = XMLDECL_RE.sub('', html)
|
||||||
|
|
||||||
|
return html
|
||||||
|
|
@ -6,25 +6,28 @@ __docformat__ = 'restructuredtext en'
|
|||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
|
|
||||||
class MOBIInput(InputFormatPlugin):
|
class MOBIInput(InputFormatPlugin):
|
||||||
|
|
||||||
name = 'MOBI Input'
|
name = 'MOBI Input'
|
||||||
author = 'Kovid Goyal'
|
author = 'Kovid Goyal'
|
||||||
description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
|
description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
|
||||||
file_types = set(['mobi', 'prc', 'azw'])
|
file_types = set(['mobi', 'prc', 'azw'])
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, parse_cache, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
from calibre.ebooks.mobi.reader import MobiReader
|
from calibre.ebooks.mobi.reader import MobiReader
|
||||||
mr = MobiReader(stream, log, options.input_encoding,
|
from lxml import html
|
||||||
|
mr = MobiReader(stream, log, options.input_encoding,
|
||||||
options.debug_input)
|
options.debug_input)
|
||||||
|
parse_cache = {}
|
||||||
mr.extract_content('.', parse_cache)
|
mr.extract_content('.', parse_cache)
|
||||||
raw = parse_cache.get('calibre_raw_mobi_markup', False)
|
raw = parse_cache.pop('calibre_raw_mobi_markup', False)
|
||||||
if raw:
|
if raw:
|
||||||
if isinstance(raw, unicode):
|
if isinstance(raw, unicode):
|
||||||
raw = raw.encode('utf-8')
|
raw = raw.encode('utf-8')
|
||||||
open('debug-raw.html', 'wb').write(raw)
|
open('debug-raw.html', 'wb').write(raw)
|
||||||
for f, root in parse_cache.items():
|
for f, root in parse_cache.items():
|
||||||
if '.' in f:
|
with open(f, 'wb') as q:
|
||||||
accelerators[f] = {'pagebreaks':root.xpath(
|
q.write(html.tostring(root, encoding='utf-8', method='xml',
|
||||||
'//div[@class="mbp_pagebreak"]')}
|
include_meta_content_type=False))
|
||||||
return mr.created_opf_path
|
accelerators['pagebreaks'] = {f: '//div[@class="mbp_pagebreak"]'}
|
||||||
|
return mr.created_opf_path
|
||||||
|
@ -20,6 +20,8 @@ from cssutils import CSSParser
|
|||||||
from calibre.translations.dynamic import translate
|
from calibre.translations.dynamic import translate
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
|
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
|
||||||
|
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, \
|
||||||
|
CSSPreProcessor
|
||||||
|
|
||||||
XML_NS = 'http://www.w3.org/XML/1998/namespace'
|
XML_NS = 'http://www.w3.org/XML/1998/namespace'
|
||||||
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
||||||
@ -207,6 +209,10 @@ class OEBError(Exception):
|
|||||||
"""Generic OEB-processing error."""
|
"""Generic OEB-processing error."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
class NotHTML(OEBError):
|
||||||
|
'''Raised when a file that should be HTML (as per manifest) is not'''
|
||||||
|
pass
|
||||||
|
|
||||||
class NullContainer(object):
|
class NullContainer(object):
|
||||||
"""An empty container.
|
"""An empty container.
|
||||||
|
|
||||||
@ -575,14 +581,7 @@ class Manifest(object):
|
|||||||
def _parse_xhtml(self, data):
|
def _parse_xhtml(self, data):
|
||||||
# Convert to Unicode and normalize line endings
|
# Convert to Unicode and normalize line endings
|
||||||
data = self.oeb.decode(data)
|
data = self.oeb.decode(data)
|
||||||
data = XMLDECL_RE.sub('', data)
|
data = self.oeb.html_preprocessor(data)
|
||||||
# Handle broken XHTML w/ SVG (ugh)
|
|
||||||
if 'svg:' in data and SVG_NS not in data:
|
|
||||||
data = data.replace(
|
|
||||||
'<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
|
|
||||||
if 'xlink:' in data and XLINK_NS not in data:
|
|
||||||
data = data.replace(
|
|
||||||
'<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
|
|
||||||
# Try with more & more drastic measures to parse
|
# Try with more & more drastic measures to parse
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data)
|
data = etree.fromstring(data)
|
||||||
@ -606,7 +605,7 @@ class Manifest(object):
|
|||||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||||
# Force into the XHTML namespace
|
# Force into the XHTML namespace
|
||||||
if barename(data.tag) != 'html':
|
if barename(data.tag) != 'html':
|
||||||
raise OEBError(
|
raise NotHTML(
|
||||||
'File %r does not appear to be (X)HTML' % self.href)
|
'File %r does not appear to be (X)HTML' % self.href)
|
||||||
elif not namespace(data.tag):
|
elif not namespace(data.tag):
|
||||||
data.attrib['xmlns'] = XHTML_NS
|
data.attrib['xmlns'] = XHTML_NS
|
||||||
@ -659,6 +658,7 @@ class Manifest(object):
|
|||||||
|
|
||||||
def _parse_css(self, data):
|
def _parse_css(self, data):
|
||||||
data = self.oeb.decode(data)
|
data = self.oeb.decode(data)
|
||||||
|
data = self.CSSPreProcessor(data)
|
||||||
data = XHTML_CSS_NAMESPACE + data
|
data = XHTML_CSS_NAMESPACE + data
|
||||||
parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING,
|
parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING,
|
||||||
fetcher=self._fetch_css)
|
fetcher=self._fetch_css)
|
||||||
@ -793,7 +793,7 @@ class Manifest(object):
|
|||||||
MIME type which is not one of the OPS core media types. Either the
|
MIME type which is not one of the OPS core media types. Either the
|
||||||
item's data itself may be provided with :param:`data`, or a loader
|
item's data itself may be provided with :param:`data`, or a loader
|
||||||
function for the data may be provided with :param:`loader`, or the
|
function for the data may be provided with :param:`loader`, or the
|
||||||
item's data may latter be set manually via the :attr:`data` attribute.
|
item's data may later be set manually via the :attr:`data` attribute.
|
||||||
"""
|
"""
|
||||||
item = self.Item(
|
item = self.Item(
|
||||||
self.oeb, id, href, media_type, fallback, loader, data)
|
self.oeb, id, href, media_type, fallback, loader, data)
|
||||||
@ -840,6 +840,9 @@ class Manifest(object):
|
|||||||
for item in self.items:
|
for item in self.items:
|
||||||
yield item
|
yield item
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self.items)
|
||||||
|
|
||||||
def values(self):
|
def values(self):
|
||||||
return list(self.items)
|
return list(self.items)
|
||||||
|
|
||||||
@ -1255,17 +1258,22 @@ class OEBBook(object):
|
|||||||
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
|
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
|
||||||
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
|
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
|
||||||
|
|
||||||
def __init__(self, logger, parse_cache={}, encoding='utf-8',
|
def __init__(self, logger,
|
||||||
pretty_print=False):
|
html_preprocessor=HTMLPreProcessor(),
|
||||||
"""Create empty book. Optional arguments:
|
css_preprocessor=CSSPreProcessor(),
|
||||||
|
encoding='utf-8', pretty_print=False):
|
||||||
|
"""Create empty book. Arguments:
|
||||||
|
|
||||||
:param parse_cache: A cache of parsed XHTML/CSS. Keys are absolute
|
|
||||||
paths to the cached files and values are lxml root objects and
|
|
||||||
cssutils stylesheets.
|
|
||||||
:param:`encoding`: Default encoding for textual content read
|
:param:`encoding`: Default encoding for textual content read
|
||||||
from an external container.
|
from an external container.
|
||||||
:param:`pretty_print`: Whether or not the canonical string form
|
:param:`pretty_print`: Whether or not the canonical string form
|
||||||
of XML markup is pretty-printed.
|
of XML markup is pretty-printed.
|
||||||
|
:param html_preprocessor: A callable that takes a unicode object
|
||||||
|
and returns a unicode object. Will be called on all html files
|
||||||
|
before they are parsed.
|
||||||
|
:param css_preprocessor: A callable that takes a unicode object
|
||||||
|
and returns a unicode object. Will be called on all CSS files
|
||||||
|
before they are parsed.
|
||||||
:param:`logger`: A Log object to use for logging all messages
|
:param:`logger`: A Log object to use for logging all messages
|
||||||
related to the processing of this book. It is accessible
|
related to the processing of this book. It is accessible
|
||||||
via the instance data members :attr:`logger,log`.
|
via the instance data members :attr:`logger,log`.
|
||||||
@ -1286,6 +1294,8 @@ class OEBBook(object):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
|
self.html_preprocessor = html_preprocessor
|
||||||
|
self.css_preprocessor = css_preprocessor
|
||||||
self.pretty_print = pretty_print
|
self.pretty_print = pretty_print
|
||||||
self.logger = self.log = logger
|
self.logger = self.log = logger
|
||||||
self.version = '2.0'
|
self.version = '2.0'
|
||||||
|
@ -32,13 +32,13 @@ __all__ = ['OEBReader']
|
|||||||
|
|
||||||
class OEBReader(object):
|
class OEBReader(object):
|
||||||
"""Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""
|
"""Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""
|
||||||
|
|
||||||
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
|
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
|
||||||
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
|
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
|
||||||
|
|
||||||
Container = DirContainer
|
Container = DirContainer
|
||||||
"""Container type used to access book files. Override in sub-classes."""
|
"""Container type used to access book files. Override in sub-classes."""
|
||||||
|
|
||||||
DEFAULT_PROFILE = 'PRS505'
|
DEFAULT_PROFILE = 'PRS505'
|
||||||
"""Default renderer profile for content read with this Reader."""
|
"""Default renderer profile for content read with this Reader."""
|
||||||
|
|
||||||
@ -67,7 +67,7 @@ class OEBReader(object):
|
|||||||
opf = self._read_opf()
|
opf = self._read_opf()
|
||||||
self._all_from_opf(opf)
|
self._all_from_opf(opf)
|
||||||
return oeb
|
return oeb
|
||||||
|
|
||||||
def _clean_opf(self, opf):
|
def _clean_opf(self, opf):
|
||||||
nsmap = {}
|
nsmap = {}
|
||||||
for elem in opf.iter(tag=etree.Element):
|
for elem in opf.iter(tag=etree.Element):
|
||||||
@ -94,7 +94,7 @@ class OEBReader(object):
|
|||||||
for element in xpath(opf, tag):
|
for element in xpath(opf, tag):
|
||||||
nroot.append(element)
|
nroot.append(element)
|
||||||
return nroot
|
return nroot
|
||||||
|
|
||||||
def _read_opf(self):
|
def _read_opf(self):
|
||||||
data = self.oeb.container.read(None)
|
data = self.oeb.container.read(None)
|
||||||
data = self.oeb.decode(data)
|
data = self.oeb.decode(data)
|
||||||
@ -111,7 +111,7 @@ class OEBReader(object):
|
|||||||
raise OEBError('Invalid namespace %r for OPF document' % ns)
|
raise OEBError('Invalid namespace %r for OPF document' % ns)
|
||||||
opf = self._clean_opf(opf)
|
opf = self._clean_opf(opf)
|
||||||
return opf
|
return opf
|
||||||
|
|
||||||
def _metadata_from_opf(self, opf):
|
def _metadata_from_opf(self, opf):
|
||||||
uid = opf.get('unique-identifier', None)
|
uid = opf.get('unique-identifier', None)
|
||||||
self.oeb.uid = None
|
self.oeb.uid = None
|
||||||
@ -161,10 +161,30 @@ class OEBReader(object):
|
|||||||
self.logger.warn('Title not specified')
|
self.logger.warn('Title not specified')
|
||||||
metadata.add('title', self.oeb.translate(__('Unknown')))
|
metadata.add('title', self.oeb.translate(__('Unknown')))
|
||||||
|
|
||||||
def _manifest_add_missing(self):
|
def _manifest_prune_invalid(self):
|
||||||
|
'''
|
||||||
|
Remove items from manifest that contain invalid data. This prevents
|
||||||
|
catastrophic conversion failure, when a few files contain corrupted
|
||||||
|
data.
|
||||||
|
'''
|
||||||
|
bad = []
|
||||||
|
check = OEB_DOCS+OEB_STYLES
|
||||||
|
for item in list(self.oeb.manifest.values()):
|
||||||
|
if item.media_type in check:
|
||||||
|
try:
|
||||||
|
item.data
|
||||||
|
except:
|
||||||
|
self.logger.exception('Failed to parse content in %s'%
|
||||||
|
item.href)
|
||||||
|
bad.append(item)
|
||||||
|
self.oeb.manifest.remove(item)
|
||||||
|
return bad
|
||||||
|
|
||||||
|
def _manifest_add_missing(self, invalid):
|
||||||
manifest = self.oeb.manifest
|
manifest = self.oeb.manifest
|
||||||
known = set(manifest.hrefs)
|
known = set(manifest.hrefs)
|
||||||
unchecked = set(manifest.values())
|
unchecked = set(manifest.values())
|
||||||
|
bad = []
|
||||||
while unchecked:
|
while unchecked:
|
||||||
new = set()
|
new = set()
|
||||||
for item in unchecked:
|
for item in unchecked:
|
||||||
@ -190,6 +210,13 @@ class OEBReader(object):
|
|||||||
unchecked.clear()
|
unchecked.clear()
|
||||||
for href in new:
|
for href in new:
|
||||||
known.add(href)
|
known.add(href)
|
||||||
|
is_invalid = False
|
||||||
|
for item in invalid:
|
||||||
|
if href == item.abshref(urlnormalize(href)):
|
||||||
|
is_invalid = True
|
||||||
|
break
|
||||||
|
if is_invalid:
|
||||||
|
continue
|
||||||
if not self.oeb.container.exists(href):
|
if not self.oeb.container.exists(href):
|
||||||
self.logger.warn('Referenced file %r not found' % href)
|
self.logger.warn('Referenced file %r not found' % href)
|
||||||
continue
|
continue
|
||||||
@ -199,7 +226,7 @@ class OEBReader(object):
|
|||||||
media_type = guessed or BINARY_MIME
|
media_type = guessed or BINARY_MIME
|
||||||
added = manifest.add(id, href, media_type)
|
added = manifest.add(id, href, media_type)
|
||||||
unchecked.add(added)
|
unchecked.add(added)
|
||||||
|
|
||||||
def _manifest_from_opf(self, opf):
|
def _manifest_from_opf(self, opf):
|
||||||
manifest = self.oeb.manifest
|
manifest = self.oeb.manifest
|
||||||
for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
|
for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
|
||||||
@ -222,8 +249,9 @@ class OEBReader(object):
|
|||||||
self.logger.warn(u'Duplicate manifest id %r' % id)
|
self.logger.warn(u'Duplicate manifest id %r' % id)
|
||||||
id, href = manifest.generate(id, href)
|
id, href = manifest.generate(id, href)
|
||||||
manifest.add(id, href, media_type, fallback)
|
manifest.add(id, href, media_type, fallback)
|
||||||
self._manifest_add_missing()
|
invalid = self._manifest_prune_invalid()
|
||||||
|
self._manifest_add_missing(invalid)
|
||||||
|
|
||||||
def _spine_add_extra(self):
|
def _spine_add_extra(self):
|
||||||
manifest = self.oeb.manifest
|
manifest = self.oeb.manifest
|
||||||
spine = self.oeb.spine
|
spine = self.oeb.spine
|
||||||
@ -256,7 +284,7 @@ class OEBReader(object):
|
|||||||
self.logger.warn(
|
self.logger.warn(
|
||||||
'Spine-referenced file %r not in spine' % item.href)
|
'Spine-referenced file %r not in spine' % item.href)
|
||||||
spine.add(item, linear=False)
|
spine.add(item, linear=False)
|
||||||
|
|
||||||
def _spine_from_opf(self, opf):
|
def _spine_from_opf(self, opf):
|
||||||
spine = self.oeb.spine
|
spine = self.oeb.spine
|
||||||
manifest = self.oeb.manifest
|
manifest = self.oeb.manifest
|
||||||
@ -270,7 +298,7 @@ class OEBReader(object):
|
|||||||
if len(spine) == 0:
|
if len(spine) == 0:
|
||||||
raise OEBError("Spine is empty")
|
raise OEBError("Spine is empty")
|
||||||
self._spine_add_extra()
|
self._spine_add_extra()
|
||||||
|
|
||||||
def _guide_from_opf(self, opf):
|
def _guide_from_opf(self, opf):
|
||||||
guide = self.oeb.guide
|
guide = self.oeb.guide
|
||||||
manifest = self.oeb.manifest
|
manifest = self.oeb.manifest
|
||||||
@ -281,7 +309,7 @@ class OEBReader(object):
|
|||||||
self.logger.warn(u'Guide reference %r not found' % href)
|
self.logger.warn(u'Guide reference %r not found' % href)
|
||||||
continue
|
continue
|
||||||
guide.add(elem.get('type'), elem.get('title'), href)
|
guide.add(elem.get('type'), elem.get('title'), href)
|
||||||
|
|
||||||
def _find_ncx(self, opf):
|
def _find_ncx(self, opf):
|
||||||
result = xpath(opf, '/o2:package/o2:spine/@toc')
|
result = xpath(opf, '/o2:package/o2:spine/@toc')
|
||||||
if result:
|
if result:
|
||||||
@ -294,9 +322,9 @@ class OEBReader(object):
|
|||||||
for item in self.oeb.manifest.values():
|
for item in self.oeb.manifest.values():
|
||||||
if item.media_type == NCX_MIME:
|
if item.media_type == NCX_MIME:
|
||||||
self.oeb.manifest.remove(item)
|
self.oeb.manifest.remove(item)
|
||||||
return item
|
return item
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _toc_from_navpoint(self, item, toc, navpoint):
|
def _toc_from_navpoint(self, item, toc, navpoint):
|
||||||
children = xpath(navpoint, 'ncx:navPoint')
|
children = xpath(navpoint, 'ncx:navPoint')
|
||||||
for child in children:
|
for child in children:
|
||||||
@ -314,7 +342,7 @@ class OEBReader(object):
|
|||||||
klass = child.get('class')
|
klass = child.get('class')
|
||||||
node = toc.add(title, href, id=id, klass=klass)
|
node = toc.add(title, href, id=id, klass=klass)
|
||||||
self._toc_from_navpoint(item, node, child)
|
self._toc_from_navpoint(item, node, child)
|
||||||
|
|
||||||
def _toc_from_ncx(self, item):
|
def _toc_from_ncx(self, item):
|
||||||
if item is None:
|
if item is None:
|
||||||
return False
|
return False
|
||||||
@ -328,7 +356,7 @@ class OEBReader(object):
|
|||||||
for navmap in navmaps:
|
for navmap in navmaps:
|
||||||
self._toc_from_navpoint(item, toc, navmap)
|
self._toc_from_navpoint(item, toc, navmap)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _toc_from_tour(self, opf):
|
def _toc_from_tour(self, opf):
|
||||||
result = xpath(opf, 'o2:tours/o2:tour')
|
result = xpath(opf, 'o2:tours/o2:tour')
|
||||||
if not result:
|
if not result:
|
||||||
@ -345,11 +373,11 @@ class OEBReader(object):
|
|||||||
path, _ = urldefrag(urlnormalize(href))
|
path, _ = urldefrag(urlnormalize(href))
|
||||||
if path not in self.oeb.manifest.hrefs:
|
if path not in self.oeb.manifest.hrefs:
|
||||||
self.logger.warn('TOC reference %r not found' % href)
|
self.logger.warn('TOC reference %r not found' % href)
|
||||||
continue
|
continue
|
||||||
id = site.get('id')
|
id = site.get('id')
|
||||||
toc.add(title, href, id=id)
|
toc.add(title, href, id=id)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _toc_from_html(self, opf):
|
def _toc_from_html(self, opf):
|
||||||
if 'toc' not in self.oeb.guide:
|
if 'toc' not in self.oeb.guide:
|
||||||
return False
|
return False
|
||||||
@ -381,7 +409,7 @@ class OEBReader(object):
|
|||||||
for href in order:
|
for href in order:
|
||||||
toc.add(' '.join(titles[href]), href)
|
toc.add(' '.join(titles[href]), href)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _toc_from_spine(self, opf):
|
def _toc_from_spine(self, opf):
|
||||||
toc = self.oeb.toc
|
toc = self.oeb.toc
|
||||||
titles = []
|
titles = []
|
||||||
@ -408,14 +436,14 @@ class OEBReader(object):
|
|||||||
if not item.linear: continue
|
if not item.linear: continue
|
||||||
toc.add(title, item.href)
|
toc.add(title, item.href)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _toc_from_opf(self, opf, item):
|
def _toc_from_opf(self, opf, item):
|
||||||
if self._toc_from_ncx(item): return
|
if self._toc_from_ncx(item): return
|
||||||
if self._toc_from_tour(opf): return
|
if self._toc_from_tour(opf): return
|
||||||
self.logger.warn('No metadata table of contents found')
|
self.logger.warn('No metadata table of contents found')
|
||||||
if self._toc_from_html(opf): return
|
if self._toc_from_html(opf): return
|
||||||
self._toc_from_spine(opf)
|
self._toc_from_spine(opf)
|
||||||
|
|
||||||
def _pages_from_ncx(self, opf, item):
|
def _pages_from_ncx(self, opf, item):
|
||||||
if item is None:
|
if item is None:
|
||||||
return False
|
return False
|
||||||
@ -436,7 +464,7 @@ class OEBReader(object):
|
|||||||
klass = ptarget.get('class')
|
klass = ptarget.get('class')
|
||||||
pages.add(name, href, type=type, id=id, klass=klass)
|
pages.add(name, href, type=type, id=id, klass=klass)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _find_page_map(self, opf):
|
def _find_page_map(self, opf):
|
||||||
result = xpath(opf, '/o2:package/o2:spine/@page-map')
|
result = xpath(opf, '/o2:package/o2:spine/@page-map')
|
||||||
if result:
|
if result:
|
||||||
@ -451,7 +479,7 @@ class OEBReader(object):
|
|||||||
self.oeb.manifest.remove(item)
|
self.oeb.manifest.remove(item)
|
||||||
return item
|
return item
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _pages_from_page_map(self, opf):
|
def _pages_from_page_map(self, opf):
|
||||||
item = self._find_page_map(opf)
|
item = self._find_page_map(opf)
|
||||||
if item is None:
|
if item is None:
|
||||||
@ -472,12 +500,12 @@ class OEBReader(object):
|
|||||||
type = 'front'
|
type = 'front'
|
||||||
pages.add(name, href, type=type)
|
pages.add(name, href, type=type)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _pages_from_opf(self, opf, item):
|
def _pages_from_opf(self, opf, item):
|
||||||
if self._pages_from_ncx(opf, item): return
|
if self._pages_from_ncx(opf, item): return
|
||||||
if self._pages_from_page_map(opf): return
|
if self._pages_from_page_map(opf): return
|
||||||
return
|
return
|
||||||
|
|
||||||
def _cover_from_html(self, hcover):
|
def _cover_from_html(self, hcover):
|
||||||
with TemporaryDirectory('_html_cover') as tdir:
|
with TemporaryDirectory('_html_cover') as tdir:
|
||||||
writer = OEBWriter()
|
writer = OEBWriter()
|
||||||
@ -488,7 +516,7 @@ class OEBReader(object):
|
|||||||
id, href = self.oeb.manifest.generate('cover', 'cover.jpeg')
|
id, href = self.oeb.manifest.generate('cover', 'cover.jpeg')
|
||||||
item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
|
item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
|
||||||
return item
|
return item
|
||||||
|
|
||||||
def _locate_cover_image(self):
|
def _locate_cover_image(self):
|
||||||
if self.oeb.metadata.cover:
|
if self.oeb.metadata.cover:
|
||||||
id = str(self.oeb.metadata.cover[0])
|
id = str(self.oeb.metadata.cover[0])
|
||||||
@ -525,14 +553,14 @@ class OEBReader(object):
|
|||||||
if item is not None and item.media_type in OEB_IMAGES:
|
if item is not None and item.media_type in OEB_IMAGES:
|
||||||
return item
|
return item
|
||||||
return self._cover_from_html(hcover)
|
return self._cover_from_html(hcover)
|
||||||
|
|
||||||
def _ensure_cover_image(self):
|
def _ensure_cover_image(self):
|
||||||
cover = self._locate_cover_image()
|
cover = self._locate_cover_image()
|
||||||
if self.oeb.metadata.cover:
|
if self.oeb.metadata.cover:
|
||||||
self.oeb.metadata.cover[0].value = cover.id
|
self.oeb.metadata.cover[0].value = cover.id
|
||||||
return
|
return
|
||||||
self.oeb.metadata.add('cover', cover.id)
|
self.oeb.metadata.add('cover', cover.id)
|
||||||
|
|
||||||
def _all_from_opf(self, opf):
|
def _all_from_opf(self, opf):
|
||||||
self.oeb.version = opf.get('version', '1.2')
|
self.oeb.version = opf.get('version', '1.2')
|
||||||
self._metadata_from_opf(opf)
|
self._metadata_from_opf(opf)
|
||||||
@ -543,7 +571,7 @@ class OEBReader(object):
|
|||||||
self._toc_from_opf(opf, item)
|
self._toc_from_opf(opf, item)
|
||||||
self._pages_from_opf(opf, item)
|
self._pages_from_opf(opf, item)
|
||||||
self._ensure_cover_image()
|
self._ensure_cover_image()
|
||||||
|
|
||||||
|
|
||||||
def main(argv=sys.argv):
|
def main(argv=sys.argv):
|
||||||
reader = OEBReader()
|
reader = OEBReader()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user