Merge various OEB-processing enhancements from local development branch.

This commit is contained in:
Marshall T. Vandegrift 2008-12-30 19:59:51 -05:00
commit 23d17f0a66
7 changed files with 181 additions and 137 deletions

View File

@ -166,7 +166,7 @@ if __name__ == '__main__':
metadata_sqlite = 'library/metadata_sqlite.sql',
jquery = 'gui2/viewer/jquery.js',
jquery_scrollTo = 'gui2/viewer/jquery_scrollTo.js',
html_css = 'ebooks/lit/html.css',
html_css = 'ebooks/oeb/html.css',
)
DEST = os.path.join('src', APPNAME, 'resources.py')

View File

@ -15,7 +15,7 @@ from lxml import etree
from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
import calibre.ebooks.lit.mssha1 as mssha1
from calibre.ebooks.lit.oeb import urlnormalize
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks import DRMError
from calibre import plugins
lzx, lxzerror = plugins['lzx']

View File

@ -23,11 +23,12 @@ from urllib import unquote as urlunquote
from lxml import etree
from calibre.ebooks.lit.reader import DirectoryEntry
import calibre.ebooks.lit.maps as maps
from calibre.ebooks.lit.oeb import OEB_DOCS, OEB_STYLES, OEB_CSS_MIME, \
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_CSS_MIME, \
CSS_MIME, OPF_MIME, XML_NS, XML
from calibre.ebooks.lit.oeb import namespace, barename, urlnormalize, xpath
from calibre.ebooks.lit.oeb import prefixname, FauxLogger, OEBBook
from calibre.ebooks.lit.stylizer import Stylizer
from calibre.ebooks.oeb.base import namespace, barename, prefixname, \
urlnormalize, xpath
from calibre.ebooks.oeb.base import FauxLogger, OEBBook
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.lit.lzx import Compressor
import calibre
from calibre import LoggingInterface

View File

@ -0,0 +1,2 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

View File

@ -38,12 +38,14 @@ def OPF(name): return '{%s}%s' % (OPF2_NS, name)
def DC(name): return '{%s}%s' % (DC11_NS, name)
def NCX(name): return '{%s}%s' % (NCX_NS, name)
EPUB_MIME = 'application/epub+zip'
XHTML_MIME = 'application/xhtml+xml'
CSS_MIME = 'text/css'
NCX_MIME = 'application/x-dtbncx+xml'
OPF_MIME = 'application/oebps-package+xml'
OEB_DOC_MIME = 'text/x-oeb1-document'
OEB_CSS_MIME = 'text/x-oeb1-css'
OPENTYPE_MIME = 'font/opentype'
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'])
@ -75,7 +77,14 @@ def prefixname(name, nsrmap):
def xpath(elem, expr):
return elem.xpath(expr, namespaces=XPNSMAP)
URL_UNSAFE = r"""`!@#$%^&*[](){}?+=;:'",<>\| """
def xml2str(root):
    """Serialize the element tree rooted at `root` to a string.

    The output is UTF-8 encoded and starts with an XML declaration.
    """
    options = dict(encoding='utf-8', xml_declaration=True)
    return etree.tostring(root, **options)
ASCII_CHARS = set(chr(x) for x in xrange(128))
URL_SAFE = set(u'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
u'abcdefghijklmnopqrstuvwxyz'
u'0123456789' u'_.-/~')
URL_UNSAFE = ASCII_CHARS - URL_SAFE
def urlquote(href):
result = []
for char in href:
@ -116,6 +125,9 @@ class DirContainer(AbstractContainer):
def write(self, path, data):
    """Write `data` to the URL-quoted, root-relative `path`.

    Missing parent directories are created first.  Returns whatever the
    underlying file object's write() returns.
    """
    # Resolve the on-disk location once, unquoting the joined path the same
    # way the sibling exists() does, so the directory that gets created is
    # the one the file is actually opened in.
    path = urlunquote(os.path.join(self.rootdir, path))
    parent = os.path.dirname(path)
    # An empty dirname means "current directory"; os.makedirs('') would raise.
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)
    with open(path, 'wb') as f:
        return f.write(data)
@ -123,6 +135,21 @@ class DirContainer(AbstractContainer):
path = os.path.join(self.rootdir, path)
return os.path.isfile(urlunquote(path))
class DirWriter(object):
    """Dump an OEB book into a directory on disk."""

    def __init__(self, version=2.0):
        # Selects the OPF flavour emitted by dump(): 2 -> to_opf2(),
        # anything else -> to_opf1().
        self.version = version

    def dump(self, oeb, path):
        """Write every manifest item of `oeb`, then its metadata, under `path`.

        `path` is created when it is not already a directory.
        """
        if not os.path.isdir(path):
            os.mkdir(path)
        output = DirContainer(path)
        for item in oeb.manifest.values():
            output.write(item.href, str(item))
        if self.version == 2:
            metadata = oeb.to_opf2()
        else:
            metadata = oeb.to_opf1()
        # Each metadata value is an (href, element tree) pair.
        for href, data in metadata.values():
            output.write(href, xml2str(data))
class Metadata(object):
TERMS = set(['contributor', 'coverage', 'creator', 'date', 'description',
@ -277,12 +304,34 @@ class Manifest(object):
return property(fget, fset, fdel)
data = data()
def __str__(self):
    """Return the item's data as a string.

    Parsed lxml elements are serialized with xml2str(); any other payload
    goes through str().
    """
    payload = self.data
    if not isinstance(payload, etree._Element):
        return str(payload)
    return xml2str(payload)
def __cmp__(self, other):
    """Order items by spine position, breaking ties by id."""
    by_position = cmp(self.spine_position, other.spine_position)
    # cmp() yields 0 on equality, so `or` falls through to the tie-breaker.
    return by_position or cmp(self.id, other.id)
def relhref(self, href):
    """Convert the book-relative `href` into a link relative to this item.

    When this item's own href has no directory part, `href` is returned
    unchanged.  Any fragment on `href` is preserved.
    """
    if '/' not in self.href:
        return href
    base = os.path.dirname(self.href).split('/')
    target, frag = urldefrag(href)
    target = target.split('/')
    # Count the leading directories shared by both paths.  Only the
    # directory part of the target takes part: its last segment names the
    # resource itself and must always survive in the result, even when it
    # happens to be spelled like one of our own directories.
    common = 0
    for ours, theirs in zip(base, target[:-1]):
        if ours != theirs:
            break
        common += 1
    parts = ['..'] * (len(base) - common) + target[common:]
    result = '/'.join(parts)
    if frag:
        result = '#'.join((result, frag))
    return result
def abshref(self, href):
if '/' not in self.href:
return href
@ -361,7 +410,7 @@ class Manifest(object):
def to_opf2(self, parent=None):
elem = element(parent, OPF('manifest'))
for item in self.items.values():
for item in self.ids.values():
attrib = {'id': item.id, 'href': item.href,
'media-type': item.media_type}
if item.fallback:
@ -375,18 +424,35 @@ class Spine(object):
self.oeb = oeb
self.items = []
def add(self, item, linear):
def _linear(self, linear):
if isinstance(linear, StringTypes):
linear = linear.lower()
if linear is None or linear in ('yes', 'true'):
linear = True
elif linear in ('no', 'false'):
linear = False
item.linear = linear
return linear
def add(self, item, linear=None):
item.linear = self._linear(linear)
item.spine_position = len(self.items)
self.items.append(item)
return item
def insert(self, index, item, linear=None):
    """Insert `item` into the spine at `index` and return it.

    `linear` is normalized by _linear(); like add(), it defaults to None.
    Every spine item is renumbered afterwards so that spine_position
    always equals the item's real list index, including when `index` is
    negative or past the end (list.insert() clamps those).
    """
    item.linear = self._linear(linear)
    self.items.insert(index, item)
    for position, entry in enumerate(self.items):
        entry.spine_position = position
    return item
def remove(self, item):
    """Drop `item` from the spine and renumber the items that followed it."""
    position = item.spine_position
    del self.items[position]
    # Everything that sat after the removed item has shifted down by one.
    for i in range(position, len(self.items)):
        self.items[i].spine_position = i
def __iter__(self):
for item in self.items:
yield item
@ -494,6 +560,12 @@ class TOC(object):
self.nodes.append(node)
return node
def iterdescendants(self):
    """Yield every node below this one, depth-first in document order."""
    for child in self.nodes:
        yield child
        # Recurse: each child reports its own subtree before we move on.
        for descendant in child.iterdescendants():
            yield descendant
def __iter__(self):
for node in self.nodes:
yield node
@ -501,6 +573,15 @@ class TOC(object):
def __getitem__(self, index):
return self.nodes[index]
def autolayer(self):
    """Nest consecutive entries that point into the same document.

    Walking the direct children in order, any node whose href (ignoring
    the fragment) matches that of the most recent node kept at this level
    is moved underneath that node.
    """
    kept = []
    prev = None
    prev_doc = None
    for node in self.nodes:
        doc = urldefrag(node.href)[0]
        # Compare against None explicitly: nodes are iterable and indexable,
        # so their truth value should not decide whether one was seen.
        if prev is not None and prev_doc == doc:
            prev.nodes.append(node)
        else:
            kept.append(node)
            prev, prev_doc = node, doc
    # Rebuild in place: one pass instead of a list.remove() per moved node,
    # and other references to the same list object stay valid.
    self.nodes[:] = kept
def depth(self, level=0):
if self.nodes:
return self.nodes[0].depth(level+1)
@ -533,12 +614,13 @@ class TOC(object):
class OEBBook(object):
def __init__(self, opfpath, container=None, logger=FauxLogger()):
if not container:
def __init__(self, opfpath=None, container=None, logger=FauxLogger()):
if opfpath and not container:
container = DirContainer(os.path.dirname(opfpath))
opfpath = os.path.basename(opfpath)
self.container = container
self.logger = logger
if opfpath or container:
opf = self._read_opf(opfpath)
self._all_from_opf(opf)

View File

@ -45,7 +45,6 @@ html, div, map, dt, isindex, form {
body {
display: block;
margin: 8px;
}
p, dl, multicol {
@ -59,7 +58,7 @@ dd {
blockquote {
display: block;
margin: 1em 40px;
margin: 1em;
}
address {
@ -74,7 +73,7 @@ center {
blockquote[type=cite] {
display: block;
margin: 1em 0px;
margin: 1em 0em;
border-color: blue;
border-width: thin;
}
@ -234,14 +233,6 @@ th {
/* inlines */
q:before {
content: open-quote;
}
q:after {
content: close-quote;
}
b, strong {
font-weight: bolder;
}
@ -392,22 +383,6 @@ spacer {
float: none ! important;
}
/* focusable content: anything w/ tabindex >=0 is focusable */
abbr:focus, acronym:focus, address:focus, applet:focus, b:focus,
base:focus, big:focus, blockquote:focus, br:focus, canvas:focus, caption:focus,
center:focus, cite:focus, code:focus, col:focus, colgroup:focus, dd:focus,
del:focus, dfn:focus, dir:focus, div:focus, dl:focus, dt:focus, em:focus,
fieldset:focus, font:focus, form:focus, h1:focus, h2:focus, h3:focus, h4:focus,
h5:focus, h6:focus, hr:focus, i:focus, img:focus, ins:focus,
kbd:focus, label:focus, legend:focus, li:focus, link:focus, menu:focus,
object:focus, ol:focus, p:focus, pre:focus, q:focus, s:focus, samp:focus,
small:focus, span:focus, strike:focus, strong:focus, sub:focus, sup:focus,
table:focus, tbody:focus, td:focus, tfoot:focus, th:focus, thead:focus,
tr:focus, tt:focus, u:focus, ul:focus, var:focus {
/* Don't specify the outline-color, we should always use initial value. */
outline: 1px dotted;
}
/* hidden elements */
area, base, basefont, head, meta, script, style, title,
noembed, param, link {

View File

@ -16,16 +16,19 @@ import itertools
import types
import re
import copy
from itertools import izip
import cssutils
from cssutils.css import CSSStyleRule, CSSPageRule, CSSStyleDeclaration, \
CSSValueList, cssproperties
from lxml import etree
from calibre.ebooks.lit.oeb import XHTML_NS, CSS_MIME, OEB_STYLES
from calibre.ebooks.lit.oeb import barename, urlnormalize
from lxml.cssselect import css_to_xpath, ExpressionError
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES
from calibre.ebooks.oeb.base import barename, urlnormalize
from calibre.resources import html_css
XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
HTML_CSS_STYLESHEET = cssutils.parseString(html_css)
XHTML_CSS_NAMESPACE = "@namespace url(http://www.w3.org/1999/xhtml);\n"
HTML_CSS_STYLESHEET.namespaces['h'] = XHTML_NS
INHERITED = set(['azimuth', 'border-collapse', 'border-spacing',
'caption-side', 'color', 'cursor', 'direction', 'elevation',
@ -82,35 +85,48 @@ DEFAULTS = {'azimuth': 'center', 'background-attachment': 'scroll',
FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large',
'x-large', 'xx-large'])
FONT_SIZE_LIST = [('xx-small', 1, 6.),
('x-small', None, 7.),
('small', 2, 8.),
('medium', 3, 9.),
('large', 4, 11.),
('x-large', 5, 13.),
('xx-large', 6, 15.),
(None, 7, 17.)]
FONT_SIZES = [('xx-small', 1),
('x-small', None),
('small', 2),
('medium', 3),
('large', 4),
('x-large', 5),
('xx-large', 6),
(None, 7)]
FONT_SIZE_BY_NAME = {}
FONT_SIZE_BY_NUM = {}
for name, num, size in FONT_SIZE_LIST:
FONT_SIZE_BY_NAME[name] = size
FONT_SIZE_BY_NUM[num] = size
XPNSMAP = {'h': XHTML_NS,}
def xpath(elem, expr):
return elem.xpath(expr, namespaces=XPNSMAP)
class CSSSelector(etree.XPath):
    """Compiled XPath built from a CSS selector string.

    The selector is translated with css_to_xpath() and, by default,
    evaluated against the module's XPNSMAP namespace map.  The original
    selector text is kept on the `css` attribute.
    """

    def __init__(self, css, namespaces=XPNSMAP):
        etree.XPath.__init__(self, css_to_xpath(css), namespaces=namespaces)
        self.css = css

    def __repr__(self):
        class_name = self.__class__.__name__
        # Strip the leading '0x' from the hex form of the object id.
        ident = hex(abs(id(self)))[2:]
        return '<%s %s for %r>' % (class_name, ident, self.css)
class Page(object):
def __init__(self, width, height, dpi):
self.width = float(width)
self.height = float(height)
def __init__(self, width, height, dpi, fbase, fsizes):
self.width = (float(width) / dpi) * 72.
self.height = (float(height) / dpi) * 72.
self.dpi = float(dpi)
self.fbase = float(fbase)
self.fsizes = []
for (name, num), size in izip(FONT_SIZES, fsizes):
self.fsizes.append((name, num, float(size)))
self.fnames = dict((name, sz) for name, _, sz in self.fsizes if name)
self.fnums = dict((num, sz) for _, num, sz in self.fsizes if num)
class Profiles(object):
PRS500 = Page(584, 754, 168.451)
PRS505 = PRS500
PRS505 = Page(584, 754, 168.451, 12, [7.5, 9, 10, 12, 15.5, 20, 22, 24])
MSLIT = Page(652, 480, 100.0, 13, [10, 11, 13, 16, 18, 20, 22, 26])
class Stylizer(object):
@ -126,12 +142,13 @@ class Stylizer(object):
parser = cssutils.CSSParser()
parser.setFetcher(lambda path: ('utf-8', oeb.container.read(path)))
for elem in head:
tag = barename(elem.tag)
if tag == 'style':
text = ''.join(elem.text)
if elem.tag == XHTML('style') and elem.text \
and elem.get('type', CSS_MIME) in OEB_STYLES:
text = XHTML_CSS_NAMESPACE + elem.text
stylesheet = parser.parseString(text, href=cssname)
stylesheet.namespaces['h'] = XHTML_NS
stylesheets.append(stylesheet)
elif tag == 'link' \
elif elem.tag == XHTML('link') and elem.get('href') \
and elem.get('rel', 'stylesheet') == 'stylesheet' \
and elem.get('type', CSS_MIME) in OEB_STYLES:
href = urlnormalize(elem.attrib['href'])
@ -143,11 +160,13 @@ class Stylizer(object):
data = XHTML_CSS_NAMESPACE
data += oeb.manifest.hrefs[path].data
stylesheet = parser.parseString(data, href=path)
stylesheet.namespaces['h'] = XHTML_NS
self.STYLESHEETS[path] = stylesheet
stylesheets.append(stylesheet)
rules = []
index = 0
self.stylesheets = set()
self.page_rule = {}
for stylesheet in stylesheets:
href = stylesheet.href
self.stylesheets.add(href)
@ -157,6 +176,16 @@ class Stylizer(object):
rules.sort()
self.rules = rules
self._styles = {}
for _, _, cssdict, text, _ in rules:
try:
selector = CSSSelector(text)
except ExpressionError, e:
continue
for elem in selector(tree):
self.style(elem)._update_cssdict(cssdict)
for elem in tree.xpath('//*[@style]'):
self.style(elem)._apply_style_tag()
def flatten_rule(self, rule, href, index):
results = []
@ -169,7 +198,7 @@ class Stylizer(object):
results.append((specificity, selector, style, text, href))
elif isinstance(rule, CSSPageRule):
style = self.flatten_style(rule.style)
results.append(((0, 0, 0, 0), [], style, '@page', href))
self.page_rule.update(style)
return results
def flatten_style(self, cssstyle):
@ -186,7 +215,7 @@ class Stylizer(object):
size = style['font-size']
if size == 'normal': size = 'medium'
if size in FONT_SIZE_NAMES:
style['font-size'] = "%dpt" % FONT_SIZE_BY_NAME[size]
style['font-size'] = "%dpt" % self.page.fnames[size]
return style
def _normalize_edge(self, cssvalue, name):
@ -233,8 +262,9 @@ class Stylizer(object):
return style
def style(self, element):
try: return self._styles[element]
except: pass
try:
return self._styles[element]
except KeyError:
return Style(element, self)
def stylesheet(self, name, font_scale=None):
@ -250,74 +280,23 @@ class Stylizer(object):
rules.append('%s {\n %s;\n}' % (selector, style))
return '\n'.join(rules)
class Style(object):
def __init__(self, element, stylizer):
self._element = element
self._page = stylizer.page
self._stylizer = stylizer
self._style = self._assemble_style(element, stylizer)
self._style = {}
stylizer._styles[element] = self
def _assemble_style(self, element, stylizer):
result = {}
rules = stylizer.rules
for _, selector, style, _, _ in rules:
if self._selects_element(element, selector):
result.update(style)
try:
style = CSSStyleDeclaration(element.attrib['style'])
result.update(stylizer.flatten_style(style))
except KeyError:
pass
return result
def _update_cssdict(self, cssdict):
self._style.update(cssdict)
def _selects_element(self, element, selector):
def _selects_element(element, items, index):
if index == -1:
return True
item = items[index]
if item.type == 'universal':
pass
elif item.type == 'type-selector':
name1 = ("{%s}%s" % item.value).lower()
name2 = element.tag.lower()
if name1 != name2:
return False
elif item.type == 'id':
name1 = item.value[1:]
name2 = element.get('id', '')
if name1 != name2:
return False
elif item.type == 'class':
name = item.value[1:].lower()
classes = element.get('class', '').lower().split()
if name not in classes:
return False
elif item.type == 'child':
parent = element.getparent()
if parent is None:
return False
element = parent
elif item.type == 'descendant':
element = element.getparent()
while element is not None:
if _selects_element(element, items, index - 1):
return True
element = element.getparent()
return False
elif item.type == 'pseudo-class':
if item.value == ':first-child':
e = element.getprevious()
if e is not None:
return False
else:
return False
elif item.type == 'pseudo-element':
return False
else:
return False
return _selects_element(element, items, index - 1)
return _selects_element(element, selector, len(selector) - 1)
def _apply_style_tag(self):
    """Merge the element's inline `style` attribute, if any, into this style."""
    inline = self._element.attrib.get('style')
    if inline is None:
        return
    declaration = CSSStyleDeclaration(inline)
    flattened = self._stylizer.flatten_style(declaration)
    self._style.update(flattened)
def _has_parent(self):
parent = self._element.getparent()
@ -383,18 +362,19 @@ class Style(object):
result = None
factor = None
if value == 'inherit':
value = 'medium'
# We should only see this if the root element
value = self._page.fbase
if value in FONT_SIZE_NAMES:
result = FONT_SIZE_BY_NAME[value]
result = self._page.fnames[value]
elif value == 'smaller':
factor = 1.0/1.2
for _, _, size in FONT_SIZE_LIST:
for _, _, size in self._page.fsizes:
if base <= size: break
factor = None
result = size
elif value == 'larger':
factor = 1.2
for _, _, size in reversed(FONT_SIZE_LIST):
for _, _, size in reversed(self._page.fsizes):
if base >= size: break
factor = None
result = size
@ -410,7 +390,7 @@ class Style(object):
styles = self._stylizer._styles
base = styles[self._element.getparent()].fontSize
else:
base = normalize_fontsize(DEFAULTS['font-size'])
base = self._page.fbase
if 'font-size' in self._style:
size = self._style['font-size']
result = normalize_fontsize(size, base)
@ -441,4 +421,8 @@ class Style(object):
def __str__(self):
items = self._style.items()
items.sort()
return '; '.join("%s: %s" % (key, val) for key, val in items)
def cssdict(self):
    """Return a new dict holding a shallow copy of the style properties."""
    snapshot = {}
    snapshot.update(self._style)
    return snapshot