This commit is contained in:
Kovid Goyal 2011-04-06 08:44:26 -06:00
commit 4b039b4c41

View File

@ -12,10 +12,13 @@ Transform OEB content into a single (more or less) HTML file.
import os import os
from urlparse import urlparse from functools import partial
from lxml import html
from urlparse import urldefrag
from calibre import prepare_string_for_xml from calibre import prepare_string_for_xml
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace,\
OEB_IMAGES, XLINK, rewrite_links
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
@ -40,6 +43,8 @@ class OEB2HTML(object):
self.opts = opts self.opts = opts
self.links = {} self.links = {}
self.images = {} self.images = {}
self.base_hrefs = [item.href for item in oeb_book.spine]
self.map_resources(oeb_book)
return self.mlize_spine(oeb_book) return self.mlize_spine(oeb_book)
@ -47,6 +52,8 @@ class OEB2HTML(object):
output = [u'<html><body><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /></head>'] output = [u'<html><body><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /></head>']
for item in oeb_book.spine: for item in oeb_book.spine:
self.log.debug('Converting %s to HTML...' % item.href) self.log.debug('Converting %s to HTML...' % item.href)
self.rewrite_ids(item.data, item)
rewrite_links(item.data, partial(self.rewrite_link, page=item))
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
output.append('\n\n') output.append('\n\n')
@ -56,43 +63,61 @@ class OEB2HTML(object):
def dump_text(self, elem, stylizer, page): def dump_text(self, elem, stylizer, page):
raise NotImplementedError raise NotImplementedError
def get_link_id(self, href, aid): def get_link_id(self, href, id=''):
aid = '%s#%s' % (href, aid) if id:
if aid not in self.links: href += '#%s' % id
self.links[aid] = 'calibre_link-%s' % len(self.links.keys()) if href not in self.links:
return self.links[aid] self.links[href] = '#calibre_link-%s' % len(self.links.keys())
return self.links[href]
def rewrite_link(self, tag, attribs, page): def map_resources(self, oeb_book):
# Rewrite ids. for item in oeb_book.manifest:
if 'id' in attribs: if item.media_type in OEB_IMAGES:
attribs['id'] = self.get_link_id(page.href, attribs['id']) if item.href not in self.images:
# Rewrite links. ext = os.path.splitext(item.href)[1]
if tag == 'a' and 'href' in attribs:
href = page.abshref(attribs['href'])
if self.url_is_relative(href):
id = ''
if '#' in href:
href, n, id = href.partition('#')
href = '#%s' % self.get_link_id(href, id)
attribs['href'] = href
return attribs
def rewrite_image(self, tag, attribs, page):
if tag == 'img':
src = attribs.get('src', None)
if src:
src = page.abshref(src)
if src not in self.images:
ext = os.path.splitext(src)[1]
fname = '%s%s' % (len(self.images), ext) fname = '%s%s' % (len(self.images), ext)
fname = fname.zfill(10) fname = fname.zfill(10)
self.images[src] = fname self.images[item.href] = fname
attribs['src'] = 'images/%s' % self.images[src] if item in oeb_book.spine:
return attribs self.get_link_id(item.href)
root = item.data.find(XHTML('body'))
link_attrs = set(html.defs.link_attrs)
link_attrs.add(XLINK('href'))
for el in root.iter():
attribs = el.attrib
try:
if not isinstance(el.tag, basestring):
continue
except:
continue
for attr in attribs:
if attr in link_attrs:
href = item.abshref(attribs[attr])
href, id = urldefrag(href)
if href in self.base_hrefs:
self.get_link_id(href, id)
def url_is_relative(self, url): def rewrite_link(self, url, page=None):
o = urlparse(url) if not page:
return False if o.scheme else True return url
abs_url = page.abshref(url)
if abs_url in self.images:
return 'images/%s' % self.images[abs_url]
if abs_url in self.links:
return self.links[abs_url]
return url
def rewrite_ids(self, root, page):
for el in root.iter():
try:
tag = el.tag
except UnicodeDecodeError:
continue
if tag == XHTML('body'):
el.attrib['id'] = self.get_link_id(page.href)[1:]
continue
if 'id' in el.attrib:
el.attrib['id'] = self.get_link_id(page.href, el.attrib['id'])[1:]
def get_css(self, oeb_book): def get_css(self, oeb_book):
css = u'' css = u''
@ -130,12 +155,8 @@ class OEB2HTMLNoCSSizer(OEB2HTML):
tag = barename(elem.tag) tag = barename(elem.tag)
attribs = elem.attrib attribs = elem.attrib
attribs = self.rewrite_link(tag, attribs, page)
attribs = self.rewrite_image(tag, attribs, page)
if tag == 'body': if tag == 'body':
tag = 'div' tag = 'div'
attribs['id'] = self.get_link_id(page.href, '')
tags.append(tag) tags.append(tag)
# Ignore anything that is set to not be displayed. # Ignore anything that is set to not be displayed.
@ -218,13 +239,9 @@ class OEB2HTMLInlineCSSizer(OEB2HTML):
tag = barename(elem.tag) tag = barename(elem.tag)
attribs = elem.attrib attribs = elem.attrib
attribs = self.rewrite_link(tag, attribs, page)
attribs = self.rewrite_image(tag, attribs, page)
style_a = '%s' % style style_a = '%s' % style
if tag == 'body': if tag == 'body':
tag = 'div' tag = 'div'
attribs['id'] = self.get_link_id(page.href, '')
if not style['page-break-before'] == 'always': if not style['page-break-before'] == 'always':
style_a = 'page-break-before: always;' + ' ' if style_a else '' + style_a style_a = 'page-break-before: always;' + ' ' if style_a else '' + style_a
tags.append(tag) tags.append(tag)
@ -279,6 +296,8 @@ class OEB2HTMLClassCSSizer(OEB2HTML):
output = [] output = []
for item in oeb_book.spine: for item in oeb_book.spine:
self.log.debug('Converting %s to HTML...' % item.href) self.log.debug('Converting %s to HTML...' % item.href)
self.rewrite_ids(item.data, item)
rewrite_links(item.data, partial(self.rewrite_link, page=item))
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
output.append('\n\n') output.append('\n\n')
@ -306,17 +325,12 @@ class OEB2HTMLClassCSSizer(OEB2HTML):
# Setup our variables. # Setup our variables.
text = [''] text = ['']
#style = stylizer.style(elem)
tags = [] tags = []
tag = barename(elem.tag) tag = barename(elem.tag)
attribs = elem.attrib attribs = elem.attrib
attribs = self.rewrite_link(tag, attribs, page)
attribs = self.rewrite_image(tag, attribs, page)
if tag == 'body': if tag == 'body':
tag = 'div' tag = 'div'
attribs['id'] = self.get_link_id(page.href, '')
tags.append(tag) tags.append(tag)
# Remove attributes we won't want. # Remove attributes we won't want.