From f89d0efa1f216018aeab84be2be53ab15012e41a Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 4 Apr 2011 19:47:59 -0400 Subject: [PATCH 1/3] HTMLZ Output: Use urldefrag instead of doing it ourself. --- src/calibre/ebooks/htmlz/oeb2html.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/htmlz/oeb2html.py b/src/calibre/ebooks/htmlz/oeb2html.py index 827e57b932..af5867356a 100644 --- a/src/calibre/ebooks/htmlz/oeb2html.py +++ b/src/calibre/ebooks/htmlz/oeb2html.py @@ -12,7 +12,7 @@ Transform OEB content into a single (more or less) HTML file. import os -from urlparse import urlparse +from urlparse import urlparse, urldefrag from calibre import prepare_string_for_xml from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace @@ -70,9 +70,7 @@ class OEB2HTML(object): if tag == 'a': href = page.abshref(attribs['href']) if self.url_is_relative(href): - id = '' - if '#' in href: - href, n, id = href.partition('#') + href, id = urldefrag(href) href = '#%s' % self.get_link_id(href, id) attribs['href'] = href return attribs From 265eabf1a613fcdc3651631fd3f9589bf7d4e7be Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 5 Apr 2011 21:59:11 -0400 Subject: [PATCH 2/3] HTMLZ Output: Rewrite links via oeb.base.rewrite_links function. --- src/calibre/ebooks/htmlz/oeb2html.py | 112 +++++++++++++++------------ 1 file changed, 64 insertions(+), 48 deletions(-) diff --git a/src/calibre/ebooks/htmlz/oeb2html.py b/src/calibre/ebooks/htmlz/oeb2html.py index af5867356a..7d915bcfcb 100644 --- a/src/calibre/ebooks/htmlz/oeb2html.py +++ b/src/calibre/ebooks/htmlz/oeb2html.py @@ -12,10 +12,13 @@ Transform OEB content into a single (more or less) HTML file. import os -from urlparse import urlparse, urldefrag +from functools import partial +from lxml import html +from urlparse import urldefrag from calibre import prepare_string_for_xml -from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace,\ + OEB_IMAGES, XLINK, rewrite_links from calibre.ebooks.oeb.stylizer import Stylizer from calibre.utils.logging import default_log @@ -40,6 +43,8 @@ class OEB2HTML(object): self.opts = opts self.links = {} self.images = {} + self.base_hrefs = [item.href for item in oeb_book.spine] + self.map_resources(oeb_book) return self.mlize_spine(oeb_book) @@ -47,6 +52,8 @@ class OEB2HTML(object): output = [u''] for item in oeb_book.spine: self.log.debug('Converting %s to HTML...' % item.href) + self.rewrite_ids(item.data, item) + rewrite_links(item.data, partial(self.rewrite_link, page=item)) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) output.append('\n\n') @@ -56,41 +63,61 @@ class OEB2HTML(object): def dump_text(self, elem, stylizer, page): raise NotImplementedError - def get_link_id(self, href, aid): - aid = '%s#%s' % (href, aid) - if aid not in self.links: - self.links[aid] = 'calibre_link-%s' % len(self.links.keys()) - return self.links[aid] + def get_link_id(self, href, id=''): + if id: + href += '#%s' % id + if href not in self.links: + self.links[href] = '#calibre_link-%s' % len(self.links.keys()) + return self.links[href] - def rewrite_link(self, tag, attribs, page): - # Rewrite ids. - if 'id' in attribs: - attribs['id'] = self.get_link_id(page.href, attribs['id']) - # Rewrite links. - if tag == 'a': - href = page.abshref(attribs['href']) - if self.url_is_relative(href): - href, id = urldefrag(href) - href = '#%s' % self.get_link_id(href, id) - attribs['href'] = href - return attribs - - def rewrite_image(self, tag, attribs, page): - if tag == 'img': - src = attribs.get('src', None) - if src: - src = page.abshref(src) - if src not in self.images: - ext = os.path.splitext(src)[1] + def map_resources(self, oeb_book): + for item in oeb_book.manifest: + if item.media_type in OEB_IMAGES: + if item.href not in self.images: + ext = os.path.splitext(item.href)[1] fname = '%s%s' % (len(self.images), ext) fname = fname.zfill(10) - self.images[src] = fname - attribs['src'] = 'images/%s' % self.images[src] - return attribs - - def url_is_relative(self, url): - o = urlparse(url) - return False if o.scheme else True + self.images[item.href] = fname + if item in oeb_book.spine: + self.get_link_id(item.href) + root = item.data.find(XHTML('body')) + link_attrs = set(html.defs.link_attrs) + link_attrs.add(XLINK('href')) + for el in root.iter(): + attribs = el.attrib + try: + if not isinstance(el.tag, basestring): + continue + except UnicodeDecodeError: + continue + for attr in attribs: + if attr in link_attrs: + href = item.abshref(attribs[attr]) + href, id = urldefrag(href) + if href in self.base_hrefs: + self.get_link_id(href, id) + + def rewrite_link(self, url, page=None): + if not page: + return url + abs_url = page.abshref(url) + if abs_url in self.images: + return 'images/%s' % self.images[abs_url] + if abs_url in self.links: + return self.links[abs_url] + return url + + def rewrite_ids(self, root, page): + for el in root.iter(): + try: + tag = el.tag + except UnicodeDecodeError: + continue + if tag == XHTML('body'): + el.attrib['id'] = self.get_link_id(page.href)[1:] + continue + if 'id' in el.attrib: + el.attrib['id'] = self.get_link_id(page.href, el.attrib['id'])[1:] def get_css(self, oeb_book): css = u'' @@ -127,13 +154,9 @@ class OEB2HTMLNoCSSizer(OEB2HTML): tags = [] tag = barename(elem.tag) attribs = elem.attrib - - attribs = self.rewrite_link(tag, attribs, page) - attribs = self.rewrite_image(tag, attribs, page) - + if tag == 'body': tag = 'div' - attribs['id'] = self.get_link_id(page.href, '') tags.append(tag) # Ignore anything that is set to not be displayed. @@ -215,14 +238,10 @@ class OEB2HTMLInlineCSSizer(OEB2HTML): tags = [] tag = barename(elem.tag) attribs = elem.attrib - - attribs = self.rewrite_link(tag, attribs, page) - attribs = self.rewrite_image(tag, attribs, page) style_a = '%s' % style if tag == 'body': tag = 'div' - attribs['id'] = self.get_link_id(page.href, '') if not style['page-break-before'] == 'always': style_a = 'page-break-before: always;' + ' ' if style_a else '' + style_a tags.append(tag) @@ -277,6 +296,8 @@ class OEB2HTMLClassCSSizer(OEB2HTML): output = [] for item in oeb_book.spine: self.log.debug('Converting %s to HTML...' % item.href) + self.rewrite_ids(item.data, item) + rewrite_links(item.data, partial(self.rewrite_link, page=item)) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) output.append('\n\n') @@ -304,17 +325,12 @@ class OEB2HTMLClassCSSizer(OEB2HTML): # Setup our variables. text = [''] - #style = stylizer.style(elem) tags = [] tag = barename(elem.tag) attribs = elem.attrib - attribs = self.rewrite_link(tag, attribs, page) - attribs = self.rewrite_image(tag, attribs, page) - if tag == 'body': tag = 'div' - attribs['id'] = self.get_link_id(page.href, '') tags.append(tag) # Remove attributes we won't want. From 739609210ef60dc4d0bb15fa0253d0c1b7940081 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 5 Apr 2011 22:12:50 -0400 Subject: [PATCH 3/3] ... --- src/calibre/ebooks/htmlz/oeb2html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/htmlz/oeb2html.py b/src/calibre/ebooks/htmlz/oeb2html.py index 7d915bcfcb..b8a6362a99 100644 --- a/src/calibre/ebooks/htmlz/oeb2html.py +++ b/src/calibre/ebooks/htmlz/oeb2html.py @@ -88,7 +88,7 @@ class OEB2HTML(object): try: if not isinstance(el.tag, basestring): continue - except UnicodeDecodeError: + except: continue for attr in attribs: if attr in link_attrs: