mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
...
This commit is contained in:
commit
4b039b4c41
@ -12,10 +12,13 @@ Transform OEB content into a single (more or less) HTML file.
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from urlparse import urlparse
|
from functools import partial
|
||||||
|
from lxml import html
|
||||||
|
from urlparse import urldefrag
|
||||||
|
|
||||||
from calibre import prepare_string_for_xml
|
from calibre import prepare_string_for_xml
|
||||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace,\
|
||||||
|
OEB_IMAGES, XLINK, rewrite_links
|
||||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||||
from calibre.utils.logging import default_log
|
from calibre.utils.logging import default_log
|
||||||
|
|
||||||
@ -40,6 +43,8 @@ class OEB2HTML(object):
|
|||||||
self.opts = opts
|
self.opts = opts
|
||||||
self.links = {}
|
self.links = {}
|
||||||
self.images = {}
|
self.images = {}
|
||||||
|
self.base_hrefs = [item.href for item in oeb_book.spine]
|
||||||
|
self.map_resources(oeb_book)
|
||||||
|
|
||||||
return self.mlize_spine(oeb_book)
|
return self.mlize_spine(oeb_book)
|
||||||
|
|
||||||
@ -47,6 +52,8 @@ class OEB2HTML(object):
|
|||||||
output = [u'<html><body><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /></head>']
|
output = [u'<html><body><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /></head>']
|
||||||
for item in oeb_book.spine:
|
for item in oeb_book.spine:
|
||||||
self.log.debug('Converting %s to HTML...' % item.href)
|
self.log.debug('Converting %s to HTML...' % item.href)
|
||||||
|
self.rewrite_ids(item.data, item)
|
||||||
|
rewrite_links(item.data, partial(self.rewrite_link, page=item))
|
||||||
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
|
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
|
||||||
output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
|
output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
|
||||||
output.append('\n\n')
|
output.append('\n\n')
|
||||||
@ -56,43 +63,61 @@ class OEB2HTML(object):
|
|||||||
def dump_text(self, elem, stylizer, page):
|
def dump_text(self, elem, stylizer, page):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def get_link_id(self, href, aid):
|
def get_link_id(self, href, id=''):
|
||||||
aid = '%s#%s' % (href, aid)
|
if id:
|
||||||
if aid not in self.links:
|
href += '#%s' % id
|
||||||
self.links[aid] = 'calibre_link-%s' % len(self.links.keys())
|
if href not in self.links:
|
||||||
return self.links[aid]
|
self.links[href] = '#calibre_link-%s' % len(self.links.keys())
|
||||||
|
return self.links[href]
|
||||||
|
|
||||||
def rewrite_link(self, tag, attribs, page):
|
def map_resources(self, oeb_book):
|
||||||
# Rewrite ids.
|
for item in oeb_book.manifest:
|
||||||
if 'id' in attribs:
|
if item.media_type in OEB_IMAGES:
|
||||||
attribs['id'] = self.get_link_id(page.href, attribs['id'])
|
if item.href not in self.images:
|
||||||
# Rewrite links.
|
ext = os.path.splitext(item.href)[1]
|
||||||
if tag == 'a' and 'href' in attribs:
|
|
||||||
href = page.abshref(attribs['href'])
|
|
||||||
if self.url_is_relative(href):
|
|
||||||
id = ''
|
|
||||||
if '#' in href:
|
|
||||||
href, n, id = href.partition('#')
|
|
||||||
href = '#%s' % self.get_link_id(href, id)
|
|
||||||
attribs['href'] = href
|
|
||||||
return attribs
|
|
||||||
|
|
||||||
def rewrite_image(self, tag, attribs, page):
|
|
||||||
if tag == 'img':
|
|
||||||
src = attribs.get('src', None)
|
|
||||||
if src:
|
|
||||||
src = page.abshref(src)
|
|
||||||
if src not in self.images:
|
|
||||||
ext = os.path.splitext(src)[1]
|
|
||||||
fname = '%s%s' % (len(self.images), ext)
|
fname = '%s%s' % (len(self.images), ext)
|
||||||
fname = fname.zfill(10)
|
fname = fname.zfill(10)
|
||||||
self.images[src] = fname
|
self.images[item.href] = fname
|
||||||
attribs['src'] = 'images/%s' % self.images[src]
|
if item in oeb_book.spine:
|
||||||
return attribs
|
self.get_link_id(item.href)
|
||||||
|
root = item.data.find(XHTML('body'))
|
||||||
|
link_attrs = set(html.defs.link_attrs)
|
||||||
|
link_attrs.add(XLINK('href'))
|
||||||
|
for el in root.iter():
|
||||||
|
attribs = el.attrib
|
||||||
|
try:
|
||||||
|
if not isinstance(el.tag, basestring):
|
||||||
|
continue
|
||||||
|
except:
|
||||||
|
continue
|
||||||
|
for attr in attribs:
|
||||||
|
if attr in link_attrs:
|
||||||
|
href = item.abshref(attribs[attr])
|
||||||
|
href, id = urldefrag(href)
|
||||||
|
if href in self.base_hrefs:
|
||||||
|
self.get_link_id(href, id)
|
||||||
|
|
||||||
def url_is_relative(self, url):
|
def rewrite_link(self, url, page=None):
|
||||||
o = urlparse(url)
|
if not page:
|
||||||
return False if o.scheme else True
|
return url
|
||||||
|
abs_url = page.abshref(url)
|
||||||
|
if abs_url in self.images:
|
||||||
|
return 'images/%s' % self.images[abs_url]
|
||||||
|
if abs_url in self.links:
|
||||||
|
return self.links[abs_url]
|
||||||
|
return url
|
||||||
|
|
||||||
|
def rewrite_ids(self, root, page):
|
||||||
|
for el in root.iter():
|
||||||
|
try:
|
||||||
|
tag = el.tag
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
continue
|
||||||
|
if tag == XHTML('body'):
|
||||||
|
el.attrib['id'] = self.get_link_id(page.href)[1:]
|
||||||
|
continue
|
||||||
|
if 'id' in el.attrib:
|
||||||
|
el.attrib['id'] = self.get_link_id(page.href, el.attrib['id'])[1:]
|
||||||
|
|
||||||
def get_css(self, oeb_book):
|
def get_css(self, oeb_book):
|
||||||
css = u''
|
css = u''
|
||||||
@ -130,12 +155,8 @@ class OEB2HTMLNoCSSizer(OEB2HTML):
|
|||||||
tag = barename(elem.tag)
|
tag = barename(elem.tag)
|
||||||
attribs = elem.attrib
|
attribs = elem.attrib
|
||||||
|
|
||||||
attribs = self.rewrite_link(tag, attribs, page)
|
|
||||||
attribs = self.rewrite_image(tag, attribs, page)
|
|
||||||
|
|
||||||
if tag == 'body':
|
if tag == 'body':
|
||||||
tag = 'div'
|
tag = 'div'
|
||||||
attribs['id'] = self.get_link_id(page.href, '')
|
|
||||||
tags.append(tag)
|
tags.append(tag)
|
||||||
|
|
||||||
# Ignore anything that is set to not be displayed.
|
# Ignore anything that is set to not be displayed.
|
||||||
@ -218,13 +239,9 @@ class OEB2HTMLInlineCSSizer(OEB2HTML):
|
|||||||
tag = barename(elem.tag)
|
tag = barename(elem.tag)
|
||||||
attribs = elem.attrib
|
attribs = elem.attrib
|
||||||
|
|
||||||
attribs = self.rewrite_link(tag, attribs, page)
|
|
||||||
attribs = self.rewrite_image(tag, attribs, page)
|
|
||||||
|
|
||||||
style_a = '%s' % style
|
style_a = '%s' % style
|
||||||
if tag == 'body':
|
if tag == 'body':
|
||||||
tag = 'div'
|
tag = 'div'
|
||||||
attribs['id'] = self.get_link_id(page.href, '')
|
|
||||||
if not style['page-break-before'] == 'always':
|
if not style['page-break-before'] == 'always':
|
||||||
style_a = 'page-break-before: always;' + ' ' if style_a else '' + style_a
|
style_a = 'page-break-before: always;' + ' ' if style_a else '' + style_a
|
||||||
tags.append(tag)
|
tags.append(tag)
|
||||||
@ -279,6 +296,8 @@ class OEB2HTMLClassCSSizer(OEB2HTML):
|
|||||||
output = []
|
output = []
|
||||||
for item in oeb_book.spine:
|
for item in oeb_book.spine:
|
||||||
self.log.debug('Converting %s to HTML...' % item.href)
|
self.log.debug('Converting %s to HTML...' % item.href)
|
||||||
|
self.rewrite_ids(item.data, item)
|
||||||
|
rewrite_links(item.data, partial(self.rewrite_link, page=item))
|
||||||
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
|
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
|
||||||
output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
|
output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
|
||||||
output.append('\n\n')
|
output.append('\n\n')
|
||||||
@ -306,17 +325,12 @@ class OEB2HTMLClassCSSizer(OEB2HTML):
|
|||||||
|
|
||||||
# Setup our variables.
|
# Setup our variables.
|
||||||
text = ['']
|
text = ['']
|
||||||
#style = stylizer.style(elem)
|
|
||||||
tags = []
|
tags = []
|
||||||
tag = barename(elem.tag)
|
tag = barename(elem.tag)
|
||||||
attribs = elem.attrib
|
attribs = elem.attrib
|
||||||
|
|
||||||
attribs = self.rewrite_link(tag, attribs, page)
|
|
||||||
attribs = self.rewrite_image(tag, attribs, page)
|
|
||||||
|
|
||||||
if tag == 'body':
|
if tag == 'body':
|
||||||
tag = 'div'
|
tag = 'div'
|
||||||
attribs['id'] = self.get_link_id(page.href, '')
|
|
||||||
tags.append(tag)
|
tags.append(tag)
|
||||||
|
|
||||||
# Remove attributes we won't want.
|
# Remove attributes we won't want.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user