From a1365513ddde400a0d88c09952d943ca52b69443 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 7 Feb 2010 12:49:54 -0700
Subject: [PATCH] EPUB metadata: Extract the cover image from the html it is embedded in if possible, instead of rendering the html. Removes the white margins on covers and speeds up cover extraction

---
Review notes (text in this section is ignored when the patch is applied):
 * extract_calibre_cover now closes the image file via a with block instead
   of leaking the handle returned by open().
 * extract_calibre_cover gained a docstring.
 * Hunk line counts and the diffstat were updated for the longer function;
   the blob hashes on the index line were NOT recomputed.
 * Indentation of the context lines was reconstructed and should be checked
   against the target tree.

 src/calibre/ebooks/__init__.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/src/calibre/ebooks/__init__.py b/src/calibre/ebooks/__init__.py
index d5651568fa..38a8ef3662 100644
--- a/src/calibre/ebooks/__init__.py
+++ b/src/calibre/ebooks/__init__.py
@@ -70,6 +70,30 @@ def extract_cover_from_embedded_svg(html, base, log):
         if href and os.access(path, os.R_OK):
             return open(path, 'rb').read()
 
+def extract_calibre_cover(raw, base, log):
+    '''
+    Return the bytes of the cover image referenced by the HTML in raw, or
+    None if the HTML does not qualify or the image file does not exist.
+
+    The HTML qualifies only if it contains none of the tags h1-h6, p, span,
+    font or br, and exactly one img tag whose alt attribute is "cover". The
+    src of that image is split on "/" and joined onto the directory base.
+    log is not used here.
+    '''
+    from calibre.ebooks.BeautifulSoup import BeautifulSoup
+    soup = BeautifulSoup(raw)
+    matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
+        'font', 'br'])
+    images = soup.findAll('img')
+    if matches is None and len(images) == 1 and \
+            images[0].get('alt', '')=='cover':
+        img = images[0]
+        img = os.path.join(base, *img['src'].split('/'))
+        if os.path.exists(img):
+            # Use a context manager so the file handle is closed promptly
+            with open(img, 'rb') as f:
+                return f.read()
+
 def render_html_svg_workaround(path_to_html, log, width=590, height=750):
     from calibre.ebooks.oeb.base import SVG_NS
     raw = open(path_to_html, 'rb').read()
@@ -80,6 +104,11 @@ def render_html_svg_workaround(path_to_html, log, width=590, height=750):
                    os.path.dirname(path_to_html), log)
         except:
             pass
+    if data is None:
+        try:
+            data = extract_calibre_cover(raw, os.path.dirname(path_to_html), log)
+        except:
+            pass
     if data is None:
         renderer = render_html(path_to_html, width, height)
         data = getattr(renderer, 'data', None)